Example #1
def draw_bcm_curve(curve_type, input_params, resolution):
    """Draws a BCM curve for you.
    
    If you pass `input_params` as `con`, it randomly draws a theory
    consistent curve; `inc` draws a theory inconsistent curve. If instead
    you pass `[y1, x1, x2, y2, y3, y4]` as `input_params`, it draws that
    curve directly rather than randomly generating one for you.
    """
    if resolution <= 0:
        raise ValueError('Resolution will need to be > 0!')

    # If 'input_params' is 'con' or 'inc', randomly generate a theory consistent
    # or theory inconsistent curve first
    if (input_params == 'con') or (input_params == 'inc'):
        input_params = common_to_all_curves(curve_type, 'auto_generate',
                                            input_params, resolution)

    nParams = family_of_curves(curve_type, 'get_nParams')
    if (np.size(input_params) == 0) or (np.shape(input_params)[1] != nParams):
        raise ValueError('Not a valid input matrix!')

    # If instead [y1, x1, x2, y2, y3, y4] was passed in as 'input_params', the curve is drawn directly
    # rather than randomly generated
    out = family_of_curves(curve_type, 'get_curve_xy_vals', input_params)

    fig, ax = plt.subplots()
    ax.plot(out['xval'], out['yval'])

    ax.set(xlabel='Activation',
           ylabel='Change in Memory Strength',
           title=out['title_string'])
    ax.set_ylim(-1.2, 1.2)
    ax.grid()
    plt.show()
    return out
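
For readers without the toolbox on hand, below is a minimal, self-contained sketch of the same plotting idea. It assumes a hypothetical piecewise-linear six-parameter curve `[y1, x1, x2, y2, y3, y4]` over activations in [0, 1]; the toolbox's actual `family_of_curves` parameterization may differ.

import numpy as np
import matplotlib.pyplot as plt


def sketch_bcm_like_curve(params, resolution=4):
    """Plot a piecewise-linear stand-in for a BCM-style curve.

    `params` is assumed to be [y1, x1, x2, y2, y3, y4] with 0 < x1 <= x2 < 1:
    the curve starts at (0, y1), passes through (x1, y2) and (x2, y3), and
    ends at (1, y4). This is an illustrative parameterization only.
    """
    if resolution <= 0:
        raise ValueError('Resolution will need to be > 0!')
    y1, x1, x2, y2, y3, y4 = params
    xval = np.round(np.linspace(0, 1, 10 ** resolution), resolution)
    yval = np.interp(xval, [0.0, x1, x2, 1.0], [y1, y2, y3, y4])

    fig, ax = plt.subplots()
    ax.plot(xval, yval)
    ax.set(xlabel='Activation', ylabel='Change in Memory Strength',
           title='Sketch of a BCM-like curve')
    ax.set_ylim(-1.2, 1.2)
    ax.grid()
    plt.show()
    return {'xval': xval, 'yval': yval}


# Hypothetical parameter values:
# sketch_bcm_like_curve([-0.8, 0.3, 0.6, -0.2, 0.9, 0.5])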
Example #2
def initial_sampling(curve_type, nParticles, resolution):
    """Uniformly sampling each curve parameter bounded by its respective bounds.
    
    **Arguments**:  
    - curve_type: type of curve, here specifying relevant bounds
    - nParticles: number of parameter particles to sample  
    - resolution: number of decimals output will be rounded to
    
    **Returns** nParticles by nParams array containing sampled parameters.
    """

    if nParticles <= 0:
        raise ValueError('Number of particles will need to be > 0!')

    if resolution <= 0:
        raise ValueError('Resolution will need to be > 0!')

    bounds = family_of_curves(curve_type, 'get_bounds')
    nParams = family_of_curves(curve_type, 'get_nParams')
    out = np.full((nParticles, nParams), np.nan)

    # Uniformly sample each curve parameter within its respective bounds
    for i in range(nParams):
        out[:, i] = np.random.uniform(low=bounds[i, 0],
                                      high=bounds[i, 1],
                                      size=(nParticles))

    out = np.round(out, resolution)

    if np.any(np.isnan(out)):
        raise ValueError('NaNs in initial sampling output matrix!')

    return out
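
A standalone sketch of the same sampling step, with a hypothetical bounds array standing in for `family_of_curves(curve_type, 'get_bounds')`:

import numpy as np

# Hypothetical bounds: one [low, high] row per curve parameter
bounds = np.array([[-1.0, 1.0],   # y1
                   [ 0.0, 1.0],   # x1
                   [ 0.0, 1.0],   # x2
                   [-1.0, 1.0]])  # y2
nParticles, resolution = 5, 4

# Uniformly sample each parameter within its own bounds, one column at a time
out = np.column_stack([
    np.random.uniform(low=lo, high=hi, size=nParticles) for lo, hi in bounds
])
out = np.round(out, resolution)
print(out.shape)  # (5, 4): nParticles x nParams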
Example #3
def sort_horizontal_params(curve_type, input_params):
    "Ensures that x1 <= x2, especially for the horz_indpnt family of curves"

    nParams = family_of_curves(curve_type, 'get_nParams')
    if (input_params.size == 0) or (np.shape(input_params)[1] != nParams):
        raise ValueError('Not a valid input matrix!')

    # Copy so that the element swap below reads the original values
    out = np.copy(input_params)
    horizontal_params = family_of_curves(curve_type,
                                         'get_horizontal_params_only')
    if len(horizontal_params) != 2:
        raise ValueError(
            'Incorrect horizontal parameters count for {} family of curves'.
            format(curve_type))

    # This piece of code ensures that x1 <= x2 especially for the horz_indpnt family of curves
    idx = input_params[:, horizontal_params[
        0]] > input_params[:, horizontal_params[1]]
    out[idx, horizontal_params[0]] = input_params[idx, horizontal_params[1]]
    out[idx, horizontal_params[1]] = input_params[idx, horizontal_params[0]]

    if not np.all(
            out[:, horizontal_params[0]] <= out[:, horizontal_params[1]]):
        raise ValueError(
            'Horizontal parameter 1 is NOT <= Horizontal parameter 2 in {} family of curves'
            .format(curve_type))

    return out
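
The same ordering constraint can also be enforced without an explicit swap by sorting the two horizontal columns row-wise; a small sketch with hypothetical column indices:

import numpy as np

params = np.array([[0.7, 0.2],
                   [0.1, 0.9],
                   [0.5, 0.5]])
h0, h1 = 0, 1  # hypothetical horizontal-parameter columns

# Row-wise sort of the two columns guarantees params[:, h0] <= params[:, h1]
params[:, [h0, h1]] = np.sort(params[:, [h0, h1]], axis=1)
assert np.all(params[:, h0] <= params[:, h1])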
Example #4
def check_if_exceed_bounds(curve_type, params):
    """If a curve parameter is found to exceeding bounds then it is set to the 
    bounds. For example, if a vertical parameter is -1.02 then it is set to -1 
    since -1 is the lower bound for vertical parameters.
    
    **Arguments**:  
    - curve_type: type of curve, here specifying number of relevant params
    - params: data matrix to be checked
    
    **Returns** the modified data matrix.
    """

    nParams = family_of_curves(curve_type, 'get_nParams')
    if (params.size == 0) or (np.shape(params)[1] != nParams):
        raise ValueError('Not a valid input matrix!')

    bounds = family_of_curves(curve_type, 'get_bounds')

    # If a curve parameter is found to exceed the bounds then it is set to the bound
    # E.g. if a vertical parameter is -1.02 then it is set to -1 since -1 is the lower bound for vertical parameters
    for i in range(nParams):
        params[:, i] = np.fmax(params[:, i], bounds[i, 0])
        params[:, i] = np.fmin(params[:, i], bounds[i, 1])

    return params
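
The per-column fmax/fmin pair is equivalent to clipping each column to its bounds; a sketch with a hypothetical bounds array:

import numpy as np

bounds = np.array([[-1.0, 1.0],
                   [ 0.0, 1.0]])  # hypothetical [low, high] per parameter
params = np.array([[-1.02, 0.5],
                   [ 0.30, 1.7]])

# Clip each column to its own [lower, upper] bound in one vectorized call
# (bounds[:, 0] and bounds[:, 1] broadcast across the parameter columns)
clipped = np.clip(params, bounds[:, 0], bounds[:, 1])
print(clipped)  # column 0 clipped to [-1, 1], column 1 clipped to [0, 1]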
Example #5
def flip_vertical_params(curve_type, input_params):
    """Flipping vertical parameters of the curve. 
    If a y1 = -0.4, flipping it will result in 0.4.
    """
    nParams = family_of_curves(curve_type, 'get_nParams')
    if (np.size(input_params) == 0) or (np.shape(input_params)[1] != nParams):
        raise ValueError('Not a valid input matrix!')

    out = input_params
    vertical_params = family_of_curves(curve_type, 'get_vertical_params_only')

    # Flipping vertical parameters of the curve. If a y1 = -0.4, flipping it will result in 0.4
    for i in range(len(vertical_params)):
        out[:, vertical_params[i]] = np.multiply(
            input_params[:, vertical_params[i]], -1)

    return out
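
A vectorized equivalent of the flip, assuming a hypothetical list of vertical-parameter column indices:

import numpy as np

params = np.array([[-0.4, 0.3, 0.8],
                   [ 0.2, 0.6, -0.1]])
vertical_params = [0, 2]  # hypothetical vertical-parameter columns

# Negate all vertical columns at once; y1 = -0.4 becomes 0.4, and so on
params[:, vertical_params] *= -1
print(params)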
Example #6
def curve_volumes(curve_type, resolution):
    """Applies Lebesgue measure to compute curve volume over Euclidean space 
    for arbitrary dimensionality based on associated bounds.
    """

    if resolution <= 0:
        raise ValueError('Resolution will need to be > 0!')

    bounds = family_of_curves(curve_type, 'get_bounds')
    nParams = family_of_curves(curve_type, 'get_nParams')
    total_vol = 1

    # Lebesgue measure http://en.wikipedia.org/wiki/Lebesgue_measure
    for i in range(nParams):
        total_vol = total_vol * len(
            np.arange(bounds[i, 0], bounds[i, 1],
                      1 / np.power(10, resolution)))

    return total_vol
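
As a worked example, assume three hypothetical parameters bounded by [-1, 1], [0, 1] and [0, 1] with resolution = 2 (grid step 0.01): the grid has 200 x 100 x 100 = 2,000,000 cells. A compact sketch using the same arange-based counting:

import numpy as np

bounds = np.array([[-1.0, 1.0],
                   [ 0.0, 1.0],
                   [ 0.0, 1.0]])  # hypothetical per-parameter bounds
resolution = 2
step = 1 / np.power(10, resolution)

# Number of grid points per parameter, multiplied over all parameters
total_vol = np.prod([len(np.arange(lo, hi, step)) for lo, hi in bounds])
print(total_vol)  # 200 * 100 * 100 = 2000000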
Example #7
def preprocessing_setup(data, analysis_settings):
    """
    Performs sanity checks on the input data and the algorithm parameter struct. Massages the data (e.g. drops
    outliers, z-scores data, etc.).

    **Arguments**:  
    - data: Input data matrix (total number of trials x 6 columns)  
    - analysis_settings: Struct with algorithm parameters  

    **Returns**:  
    - data: Input data matrix (if applicable, outlier free, zscored, category specific data only, etc)  
    - analysis_settings: Struct with algorithm parameters; some additional parameters are added to this struct as well  
    """

    print('********** START OF MESSAGES **********')

    # Checks if the data matrix has 6 columns
    number_of_columns = np.shape(data)[1]
    if number_of_columns != 6:
        raise ValueError(
            'Incorrect number of columns ({}) in the input matrix!'.format(
                number_of_columns))

    # Registering which column in the data matrix is carrying which piece of information
    if (not ('data_matrix_columns' in analysis_settings)) or (
            not analysis_settings['data_matrix_columns']):
        # Setting it to the default
        analysis_settings['data_matrix_columns'] = {}
        analysis_settings['data_matrix_columns']['subject_id'] = 0
        analysis_settings['data_matrix_columns']['trials'] = 1
        analysis_settings['data_matrix_columns']['category'] = 2
        analysis_settings['data_matrix_columns']['predictor_var'] = 3
        analysis_settings['data_matrix_columns']['dependent_var'] = 4
        analysis_settings['data_matrix_columns']['net_effect_clusters'] = 5

    subject_id_column = analysis_settings['data_matrix_columns']['subject_id']
    trials_column = analysis_settings['data_matrix_columns']['trials']
    category_column = analysis_settings['data_matrix_columns']['category']
    predictor_var_column = analysis_settings['data_matrix_columns'][
        'predictor_var']
    dependent_var_column = analysis_settings['data_matrix_columns'][
        'dependent_var']
    net_effect_clusters_column = analysis_settings['data_matrix_columns'][
        'net_effect_clusters']

    # Checks if the em iterations is specified; if not specified then it is set to a default of 20
    if (not ('em_iterations' in analysis_settings)) or (
            analysis_settings['em_iterations'] <= 0):
        analysis_settings['em_iterations'] = 20
        print('Missing number of iterations! It is set to a default of {}'.
              format(analysis_settings['em_iterations']))

    # Checks if the no. of particles is specified; if not specified then it is set to a default of 100,000
    if (not ('particles'
             in analysis_settings)) or (analysis_settings['particles'] <= 0):
        analysis_settings['particles'] = 100000
        print(
            'Missing number of particles! It is set to a default of {}'.format(
                analysis_settings['particles']))

    # Checks if the family of curves is specified; if not then set to 'horz_indpnt' (Refer to family of curves)
    if (not ('curve_type'
             in analysis_settings)) or (not analysis_settings['curve_type']):
        analysis_settings['curve_type'] = 'horz_indpnt'
        print('Missing family of curves! It is set to a default of {}'.format(
            analysis_settings['curve_type']))

    # Checks if the family of curves exist by fetching the number of curve parameters. This is just a sanity check
    if not isinstance(
            family_of_curves(analysis_settings['curve_type'], 'get_nParams'),
            int):
        raise ValueError(
            '{} - Does not exist! Check family_of_curves.m script'.format(
                analysis_settings['curve_type']))

    # Checks if the distribution is specified;
    # If not specified and if the dependent variable is binary it's set to 'bernoulli'; otherwise it is set to 'normal'
    if (not ('distribution'
             in analysis_settings)) or (not analysis_settings['distribution']):
        if len(np.unique(data[:, dependent_var_column])) == 2:
            analysis_settings['distribution'] = 'bernoulli'
        else:
            analysis_settings['distribution'] = 'normal'
        print(
            'Missing distribution! Based on the dependent variable it is set to {}'
            .format(analysis_settings['distribution']))

    # Checks if the distribution specific parameters exist
    if (not ('dist_specific_params' in analysis_settings)) or (
            not analysis_settings['dist_specific_params']):
        if analysis_settings['distribution'] == 'bernoulli':

            # For a Bernoulli dist there are no parameters so it is empty. We still need the struct to exist
            analysis_settings['dist_specific_params'] = {}

        elif analysis_settings['distribution'] == 'normal':

            # For normal distribution the additional parameter is sigma. We pass in sigma here.
            analysis_settings['dist_specific_params'] = {}
            analysis_settings['dist_specific_params'][
                'sigma'] = 1  # Default is 1
            print('Missing sigma for normal distribution! It is set to {}'.
                  format(analysis_settings['dist_specific_params']['sigma']))

    # Checks if normal distribution specific parameter is valid i.e. sigma > 0
    if (analysis_settings['distribution'] == 'normal') and (
            analysis_settings['dist_specific_params']['sigma'] <= 0):
        raise ValueError(
            'Normal distribution sigma will need to be > 0! sigma = {}'.format(
                analysis_settings['dist_specific_params']['sigma']))

    # Checks if beta_0 is specified; if not specified then it is set to a default of 0
    if not ('beta_0' in analysis_settings):
        analysis_settings['beta_0'] = 0
        print(
            'Missing initial setting for beta_0! It is set to a default of {}'.
            format(analysis_settings['beta_0']))

    # Checks if beta_1 is specified; if not specified then it is set to a default of 1
    if not ('beta_1' in analysis_settings):
        analysis_settings['beta_1'] = 1
        print(
            'Missing initial setting for beta_1! It is set to a default of {}'.
            format(analysis_settings['beta_1']))

    # Checks if tau is specified; if not specified then it is set to a default of 0.05
    if not ('tau' in analysis_settings):
        analysis_settings['tau'] = 0.05
        print('Missing initial setting for tau! It is set to a default of {}'.
              format(analysis_settings['tau']))

    # Checks if this is a bootstrap run; if not specified then it is set to a default of false
    if not ('bootstrap' in analysis_settings):
        analysis_settings['bootstrap'] = False
        print(
            'Missing initial setting for bootstrap! It is set to a default of {}'.
            format(analysis_settings['bootstrap']))

    # Checks if bootstrap flag is boolean
    if not (type(analysis_settings['bootstrap']) == bool):
        raise ValueError(
            'analysis_settings.bootstrap field will need to be boolean!')

    # Checks if this is a scramble run; if not specified then it is set to a default of false
    if not ('scramble' in analysis_settings):
        analysis_settings['scramble'] = False

    # Checks if scramble flag is boolean
    if not (type(analysis_settings['scramble']) == bool):
        raise ValueError(
            'analysis_settings.scramble field will need to be boolean!')

    # Errors if both bootstrap and scramble flags exist
    if analysis_settings['scramble'] and analysis_settings['bootstrap']:
        raise ValueError(
            'Cannot run both scramble AND bootstrap analyses at the same time! Set any one flag to be false'
        )

    # Builds a bootstrap data matrix from the original data matrix
    if analysis_settings['bootstrap'] and not (analysis_settings['scramble']):

        # We need a bootstrap sample number
        if (not ('bootstrap_run' in analysis_settings)) or (
                not analysis_settings['bootstrap_run']):
            raise ValueError(
                'Missing bootstrap sample number! set analysis_settings.bootstrap_run to a valid sample number'
            )

        bootstrap_data = []
        new_cluster_count = 1
        new_subject_count = 1

        # Get the number of subjects from the data matrix
        number_of_subjects = len(np.unique(data[:, subject_id_column]))

        # Randomly sample with replacement the number of subjects thus generating our bootstrap sample
        subj_num_with_replacement = random.choices(
            np.arange(number_of_subjects), k=number_of_subjects)

        # For each subject in our bootstrap sample gather all relevant information
        for i in range(len(subj_num_with_replacement)):
            subj_idx = np.where(
                data[:, subject_id_column] == subj_num_with_replacement[i])[0]

            # Recreate a new net effect cluster since this will need to be unique in the data matrix
            # (by repeatedly sampling subjects we could be repeating the net effect clusters)
            cluster_vector = data[subj_idx, net_effect_clusters_column]
            cluster_numbers = np.unique(cluster_vector)
            for j in range(len(cluster_numbers)):
                target_idx = np.where(
                    data[subj_idx,
                         net_effect_clusters_column] == cluster_numbers[j])
                cluster_vector[target_idx] = new_cluster_count
                new_cluster_count += 1

            # Recreate a new subject id
            # (by repeatedly sampling subjects we could be repeating the subject id's)
            # Gather all information into a bootstrap_data matrix
            # Columns: new subject id, trials through dependent var (inclusive), new cluster ids
            bootstrap_data.append(
                np.concatenate(
                    (np.tile(new_subject_count, (len(subj_idx), 1)),
                     data[subj_idx, trials_column:dependent_var_column + 1],
                     cluster_vector.reshape(-1, 1)),
                    axis=1))
            new_subject_count += 1

        # Stack the per-subject blocks into a single matrix
        bootstrap_data = np.concatenate(bootstrap_data, axis=0)

        # Perform some sanity checks to ensure that the bootstrap_data matrix is similar to the actual data matrix
        if not np.all(np.shape(bootstrap_data) == np.shape(data)):
            raise ValueError(
                'Size of bootstrap dataset NOT the same as original data!')
        if not (len(np.unique(data[:, net_effect_clusters_column])) == len(
                np.unique(bootstrap_data[:, net_effect_clusters_column]))):
            raise ValueError(
                'The number of clusters are not the same in the original and bootstrap sample!'
            )
        if not np.array_equal(data[:, subject_id_column],
                              bootstrap_data[:, subject_id_column]):
            raise ValueError(
                'The ordering of subjects are not the same in the original and bootstrap sample!'
            )

        # Store away the bootstrap sample subject information for future reference
        analysis_settings['bootstrap_run_subj_id'] = subj_num_with_replacement
        data = bootstrap_data

    # Checks if analysis will be performed for a specific category; if not then set to [] i.e. NOT category specific
    if not ('category' in analysis_settings):
        analysis_settings['category'] = []
        print(
            'Missing category specific analyses information! We are going to ignore the category dimension i.e. all '
            'trials from all categories will be analysed')

    # If this analysis is to be performed for a specific category then filter out data from other, irrelevant categories
    if len(analysis_settings['category']) > 0:
        target_cat_idx = np.empty(0, dtype=int)
        data_cat = np.unique(data[:, category_column])
        for c in range(len(analysis_settings['category'])):
            cat_exist = np.where(
                data_cat == analysis_settings['category'][c])[0]
            if cat_exist.size == 0:
                raise ValueError(
                    'Category does not exist! You have set analysis_settings.category[{}]={}'
                    .format(c, analysis_settings['category'][c]))
            target_cat_idx = np.concatenate(
                (target_cat_idx,
                 np.where(data[:, category_column] ==
                          analysis_settings['category'][c])[0]))
        data = data[target_cat_idx, :]

    # Checks if outliers (i.e. data trials) need to be dropped; if not specified then it defaults to dropping trials
    # more than 3 standard deviations from the group mean
    if not ('drop_outliers' in analysis_settings):
        analysis_settings['drop_outliers'] = 3
        print(
            'Missing drop_outliers specific information! We are dropping outliers that are {} standard deviations away from the group mean'
            .format(analysis_settings['drop_outliers']))

    # If this analysis requires the outliers dropped, then drop the data trials that lie beyond the specified number
    # of std devs from the GROUP MEAN
    if analysis_settings['drop_outliers'] > 0:
        # NaN's do not qualify as outliers so we filter them out and add them at the end of this step
        nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))

        # NaN free data
        nan_free_data = data[nan_free_idx, :]
        std_dev_predictor_var = np.std(
            nan_free_data[:, predictor_var_column],
            ddof=1) * analysis_settings['drop_outliers']
        mean_predictor_var = np.mean(nan_free_data[:, predictor_var_column])
        predictor_var_idx = (nan_free_data[:, predictor_var_column] >
                             (mean_predictor_var - std_dev_predictor_var)) & (
                                 nan_free_data[:, predictor_var_column] <
                                 (mean_predictor_var + std_dev_predictor_var))
        print(
            '{} trials are dropped since they are regarded as outliers'.format(
                np.shape(nan_free_data)[0] -
                np.sum(predictor_var_idx)))
        nan_free_data_outlier_dropped = nan_free_data[predictor_var_idx, :]

        # NaN's trials
        nan_data = data[np.logical_not(nan_free_idx), :]

        # Combine the NaN data with the outlier free data
        data = np.concatenate(
            (nan_free_data_outlier_dropped, nan_data)
        ) if np.shape(nan_data)[0] > 0 else nan_free_data_outlier_dropped

    # Following the 'filter by category' and 'drop outliers', if applicable, we check if the data matrix is empty
    number_of_trials = np.shape(data)[0]
    if number_of_trials <= 0:
        raise ValueError('No input data!')

    # Checks if we need to zscore predictor var within subjects, if not specified then it is set to default of FALSE
    if not ('zscore_within_subjects' in analysis_settings):
        analysis_settings['zscore_within_subjects'] = False
        print(
            'Missing zscore_within_subjects information! We are NOT zscoring within subjects'
        )

    # Verifies if zscore within subjects is boolean
    if not (type(analysis_settings['zscore_within_subjects']) == bool):
        raise ValueError(
            'zscore_within_subjects field will need to be boolean!')

    # Zscore the predictor variable within each subject
    if analysis_settings['zscore_within_subjects']:
        # NaN's do not qualify to be zscored
        nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))
        # NaN free data
        nan_free_data = data[nan_free_idx, :]
        # Get the list of subject id's (used when zscoring the data within each subject, if applicable)
        subject_id_list = np.unique(nan_free_data[:, subject_id_column])
        # We get the number of subjects
        number_of_subjects = len(subject_id_list)
        if number_of_subjects <= 0:
            raise ValueError('Not valid number of subjects!')
        for s in range(number_of_subjects):
            subject_idx = np.where(
                nan_free_data[:, subject_id_column] == subject_id_list[s])[0]
            nan_free_data[subject_idx, predictor_var_column] = stats.zscore(
                nan_free_data[subject_idx, predictor_var_column], ddof=1)
        print('Predictor variables within each subject are zscored!')
        # NaN's trials
        nan_data = data[np.logical_not(nan_free_idx), :]
        # Combine the NaN data with the zscored data
        data = np.concatenate(
            (nan_free_data,
             nan_data)) if np.shape(nan_data)[0] > 0 else nan_free_data

    # Checks if resolution is specified, if not specified then set to default of 4. This translates to 1e-4 = 0.0001
    if (not ('resolution'
             in analysis_settings)) or (analysis_settings['resolution'] <= 0):
        analysis_settings['resolution'] = 4
        print('Missing resolution! It is set to a default of {}'.format(
            analysis_settings['resolution']))

    # if we have normally distributed data, we want to z-score the dependent variable
    if analysis_settings['distribution'] == 'normal':
        data[:,
             dependent_var_column] = stats.zscore(data[:,
                                                       dependent_var_column],
                                                  ddof=1)

    # We scale the predictor var to be between 0 and 1 and round it to 4 digits
    nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))
    nan_free_data = data[nan_free_idx, :]
    nan_free_data[:, predictor_var_column] = np.round(
        scale_data(nan_free_data[:, predictor_var_column], 0, 1),
        analysis_settings['resolution'])
    nan_data = data[np.logical_not(nan_free_idx), :]
    data = np.concatenate(
        (nan_free_data,
         nan_data)) if np.shape(nan_data)[0] > 0 else nan_free_data

    # Scrambling the data matrix
    if analysis_settings['scramble']:
        if (not ('scramble_run' in analysis_settings)) or (
                not analysis_settings['scramble_run']):
            raise ValueError(
                'Missing scramble sample number! set analysis_settings.scramble_run to a valid sample number'
            )
        if (not ('scramble_style' in analysis_settings)) or (
                not analysis_settings['scramble_style']):
            analysis_settings[
                'scramble_style'] = 'within_subjects_within_categories'  # most conservative of all scramble techniques
            print('Missing scramble style! It is set a default of {}'.format(
                analysis_settings['scramble_style']))

        # We get the list of subject id's
        subject_id_list = np.unique(data[:, subject_id_column])
        # We get the number of subjects in this analysis
        number_of_subjects = len(subject_id_list)
        if number_of_subjects <= 0:
            raise ValueError('Not valid number of subjects!')

        if analysis_settings[
                'scramble_style'] == 'within_subjects_within_categories':
            # Here scramble all DVs WHILE respecting the net effect boundaries, subject groupings and category groupings
            categories = np.unique(data[:, category_column])
            for s in range(number_of_subjects):
                for c in range(len(categories)):
                    subject_category_idx = np.where(
                        (data[:, subject_id_column] == subject_id_list[s])
                        & (data[:, category_column] == categories[c]))[0]
                    if len(subject_category_idx) > 1:
                        data[
                            subject_category_idx,
                            dependent_var_column] = scramble_dependent_variable(
                                data[subject_category_idx,
                                     dependent_var_column],
                                data[subject_category_idx,
                                     net_effect_clusters_column])

        elif analysis_settings[
                'scramble_style'] == 'within_subjects_across_categories':
            # Here we scramble all dependent variables WHILE respecting the net effect boundaries and subject groupings
            for s in range(number_of_subjects):
                subject_idx = np.where(
                    data[:, subject_id_column] == subject_id_list[s])[0]
                if len(subject_idx) > 1:
                    data[subject_idx,
                         dependent_var_column] = scramble_dependent_variable(
                             data[subject_idx, dependent_var_column],
                             data[subject_idx, net_effect_clusters_column])

        elif analysis_settings[
                'scramble_style'] == 'across_subjects_across_categories':
            # Here we scramble all dependent variables WHILE respecting the net effect boundaries
            all_idx = np.arange(np.shape(data)[0])
            if len(all_idx) > 1:
                data[all_idx,
                     dependent_var_column] = scramble_dependent_variable(
                         data[all_idx, dependent_var_column],
                         data[all_idx, net_effect_clusters_column])

        else:
            raise ValueError(
                'Invalid analysis_settings.scramble_style={}'.format(
                    analysis_settings['scramble_style']))

    # Our data matrix looks like data = [subject id, item, category, predictor var, dependent var, net effect cluster]
    # We verify if the subject id and dependent var columns are unique for the net effect clusters
    # Below is an example of a valid data matrix (note dependent variable is unique within net effect cluster 111)
    # data(1, :) = [24, 1, 1, 0.3333, 0, 111]
    # data(2, :) = [24, 2, 2, 0.2222, 0, 111]
    # data(3, :) = [24, 3, 1, 0.4444, 0, 111]
    # Below is an example of an invalid data matrix (note dependent variable is not unique within net effect cluster 111)
    # data(1, :) = [24, 1, 1, 0.3333, 0, 111]
    # data(2, :) = [24, 2, 2, 0.2222, 1, 111]
    # data(3, :) = [24, 3, 1, 0.4444, 0, 111]

    # Fetching the net effect clusters
    net_effect_clusters = np.unique(data[:, net_effect_clusters_column])
    analysis_settings['net_effect_clusters'] = net_effect_clusters

    # If net effect clusters exist verify if the Subject Id and dependent variable are unique for those clusters
    if len(net_effect_clusters) != np.shape(data)[0]:
        for i in range(len(net_effect_clusters)):
            cluster_idx = np.where(
                data[:,
                     net_effect_clusters_column] == net_effect_clusters[i])[0]
            if len(
                    np.unique(
                        data[np.ix_(
                            cluster_idx,
                            [subject_id_column, dependent_var_column])],
                        axis=0)) != 1:
                raise ValueError(
                    'Subject Id and/or dependent variable not unique for net effect cluster {}! Check '
                    'the data matrix'.format(net_effect_clusters[i]))
    else:
        # If net effect clusters DO NOT exist then we treat each row as a net effect cluster by itself
        print(
            'Each row will be treated separately. We will NOT be computing the net effect of any rows'
        )

    # We create an analysis id unique to this analysis
    if (not ('analysis_id'
             in analysis_settings)) or (not analysis_settings['analysis_id']):
        time = datetime.datetime.now()
        analysis_settings['analysis_id'] = '{}-{}-{}-{}-{}'.format(
            time.month, time.day, time.hour, time.minute, time.second)

    # We create a results directory if no specific target directory is mentioned
    if (not ('target_dir'
             in analysis_settings)) or (not analysis_settings['target_dir']):
        results_dir = os.path.join(os.getcwd(), 'results')
        if not os.path.isdir(results_dir):
            os.mkdir(results_dir)
        analysis_settings['target_dir'] = results_dir

    # target_directory = 'results/analysis_id'
    analysis_settings['target_dir'] = os.path.join(
        analysis_settings['target_dir'], analysis_settings['analysis_id'])
    if not os.path.isdir(analysis_settings['target_dir']):
        os.mkdir(analysis_settings['target_dir'])

    # Due to memory constraints we perform two chunking tricks

    # Chunking trick I
    # In the curve fitting algorithm we need to compute the p(current iteration curves | previous
    # iteration curves). This matrix is huge when the number of particles (curves) is large, say 100,000. Even with a
    # 8 Gb RAM, dedicated to Matlab, we still get a out of memory errors. To avoid this problem we chunk the matrix
    # into smaller, more manageable matrices. Setting the chunk size to be particles x 0.05 -> 100,000 x 0.05 = 5000,
    # translates to p(current iteration curves(5000 curves at a time) | previous iteration curves).
    analysis_settings['wgt_chunks'] = analysis_settings['particles'] * 0.05
    # If the chunk size is less than 5000 we set it to be the number of particles itself
    if analysis_settings['wgt_chunks'] < 5000:
        analysis_settings['wgt_chunks'] = analysis_settings['particles']

    # Chunking trick II
    if not ('particle_chunks' in analysis_settings):
        analysis_settings['particle_chunks'] = 2
        print('Missing particle chunks! It is set to a default of {}'.format(
            analysis_settings['particle_chunks']))

    # Depending on the number of particle chunks we get the start and end points and the number of particles within
    # each chunk. For instance, 1000 particles divided into 4 chunks will look like:
    # | 0   | 250  | 250 |
    # | 250 | 500  | 250 |
    # | 500 | 750  | 250 |
    # | 750 | 1000 | 250 |
    dummy = np.arange(
        0, analysis_settings['particles'],
        analysis_settings['particles'] / analysis_settings['particle_chunks'])
    analysis_settings['ptl_chunk_idx'] = np.stack(
        (dummy, dummy +
         analysis_settings['particles'] / analysis_settings['particle_chunks'],
         np.full(
             np.shape(dummy), analysis_settings['particles'] /
             analysis_settings['particle_chunks'])),
        axis=1)

    # Storing analysis relevant information into the analysis_settings struct
    # We get the list of subject id's
    subject_id_list = np.unique(data[:, subject_id_column])

    # We get the number of subjects in this analysis
    analysis_settings['nSubjs'] = len(subject_id_list)
    if analysis_settings['nSubjs'] <= 0:
        raise ValueError('Not valid number of subjects!')

    print('********** END OF MESSAGES **********')
    return data, analysis_settings
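
A standalone sketch of "chunking trick II" above, reproducing the 1000-particles / 4-chunks table from the comment (each row holds a chunk's start index, end index and size):

import numpy as np

particles, particle_chunks = 1000, 4
chunk_size = particles / particle_chunks

starts = np.arange(0, particles, chunk_size)
ptl_chunk_idx = np.stack(
    (starts, starts + chunk_size, np.full(np.shape(starts), chunk_size)),
    axis=1)
print(ptl_chunk_idx)
# Rows: [0, 250, 250], [250, 500, 250], [500, 750, 250], [750, 1000, 250]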
Example #8
def importance_sampler(raw_data, analysis_settings):
    """
   Recovers a curve that best explains the relationship between the predictor and dependent variables
   
   **Arguments**:
   - raw_data: The data matrix (total number of trials x 6 columns). Refer to RUN_IMPORTANCE_SAMPLER()
   - analysis_settings: A struct that holds algorithm relevant settings. Refer to RUN_IMPORTANCE_SAMPLER()

    Saves a .mat file in `current_path/analysis_id/analysis_id_importance_sampler.mat`
   """

    time = datetime.datetime.now()
    print('Start time {}/{} {}:{}'.format(time.month, time.day, time.hour,
                                          time.minute))

    # Resetting the random number seed
    random.seed()
    seed = random.getstate()

    # Preprocessing the data matrix and updating the analysis_settings struct with additional/missing information
    preprocessed_data, ana_opt = preprocessing_setup(raw_data,
                                                     analysis_settings)
    del raw_data
    del analysis_settings

    # Housekeeping
    importance_sampler = {}  # Creating the output struct
    hold_betas_per_iter = np.full(
        (ana_opt['em_iterations'] + 1, 2),
        np.nan)  # Matrix to hold betas over em iterations
    exp_max_f_values = np.full(
        (ana_opt['em_iterations'], 1),
        np.nan)  # Matrix to hold the f_values over em iterations
    normalized_w = np.full(
        (ana_opt['em_iterations'] + 1, ana_opt['particles']),
        np.nan)  # to hold the normalized weights

    global tau
    global bounds
    global w
    global net_effects
    global dependent_var

    # fetch parameters
    tau = ana_opt['tau']  # Store the tau for convenience
    bounds = family_of_curves(
        ana_opt['curve_type'],
        'get_bounds')  # Get the curve parameter absolute bounds
    nParam = family_of_curves(
        ana_opt['curve_type'],
        'get_nParams')  # Get the number of curve parameters
    hold_betas = [ana_opt['beta_0'],
                  ana_opt['beta_1']]  # Store the betas into a vector

    for em in range(ana_opt['em_iterations']):  # for every em iteration
        hold_betas_per_iter[
            em, :] = hold_betas  # Store the logreg betas over em iterations
        print('Betas: {}, {}'.format(hold_betas[0], hold_betas[1]))
        print('EM Iteration: {}'.format(em))

        # Initialize the previous iteration curve parameters, weight vector, net_effects and dependent_var matrices
        # Matrix to hold the previous iteration curve parameters
        prev_iter_curve_param = np.full(
            (ana_opt['particles'],
             family_of_curves(ana_opt['curve_type'], 'get_nParams')), np.nan)
        w = np.full((ana_opt['particles']),
                    np.nan)  # Vector to hold normalized weights

        # Matrix to hold the predictor variables (taking net effects if relevant) over all particles
        net_effects = np.full(
            (len(ana_opt['net_effect_clusters']), ana_opt['particles']),
            np.nan)
        dependent_var = np.array(
            []
        )  # can't be initialized in advance as we don't know its length (dropping outliers)

        # Sampling curve parameters
        if em == 0:  # only for the first em iteration
            param = common_to_all_curves(
                ana_opt['curve_type'], 'initial_sampling',
                ana_opt['particles'],
                ana_opt['resolution'])  # Good old uniform sampling
        else:  # for em iterations 2, 3, etc
            # Sample curve parameters from previous iteration's curve parameters based on normalized weights
            prev_iter_curve_param = param  # we need previous iteration's curve parameters to compute likelihood

            # Here we sample curves (with repetitions) based on the weights
            param = prev_iter_curve_param[
                random.choices(np.arange(ana_opt['particles']),
                               k=ana_opt['particles'],
                               weights=normalized_w[em - 1, :]), :]
            # Add Gaussian noise since some curves are going to be identical due to the repetitions
            # NOISE: Sample from truncated normal distribution using individual curve parameter bounds,
            # mean = sampled curve parameters and sigma = tau
            for npm in range(nParam):
                param[:, npm] = truncated_normal(bounds[npm, 0],
                                                 bounds[npm, 1], param[:, npm],
                                                 tau, ana_opt['particles'])

        # Check whether curve parameters lie within the upper and lower bounds
        param = common_to_all_curves(ana_opt['curve_type'],
                                     'check_if_exceed_bounds', param)
        if ana_opt['curve_type'] == 'horz_indpnt':
            # Check if the horizontal curve parameters are following the right trend i.e. x1 < x2
            param = common_to_all_curves(ana_opt['curve_type'],
                                         'sort_horizontal_params', param)

        # Compute the likelihood over all subjects (i.e. log probability mass function if logistic regression)
        # This is where we use chunking trick II
        for ptl_idx in range(np.shape(ana_opt['ptl_chunk_idx'])[0]):
            output_struct = family_of_curves(
                ana_opt['curve_type'], 'compute_likelihood',
                ana_opt['net_effect_clusters'],
                ana_opt['ptl_chunk_idx'][ptl_idx, 2],
                param[int(ana_opt['ptl_chunk_idx'][
                    ptl_idx, 0]):int(ana_opt['ptl_chunk_idx'][ptl_idx, 1]), :],
                hold_betas, preprocessed_data, ana_opt['distribution'],
                ana_opt['dist_specific_params'],
                ana_opt['data_matrix_columns'])

            # Gather weights
            w[int(ana_opt['ptl_chunk_idx'][ptl_idx,
                                           0]):int(ana_opt['ptl_chunk_idx'][
                                               ptl_idx,
                                               1])] = output_struct['w']

            # Gather predictor variable
            net_effects[:, int(ana_opt['ptl_chunk_idx'][ptl_idx, 0]):int(ana_opt['ptl_chunk_idx'][ptl_idx, 1])] = \
                  output_struct['net_effects']
            if ptl_idx == 0:
                # Gather dependent variable only once, since it is the same across all ptl_idx
                dependent_var = output_struct['dependent_var']

        del output_struct
        if np.any(np.isnan(w)):
            raise ValueError('NaNs in normalized weight vector w!')

        # Compute the p(theta) and q(theta) weights
        if em > 0:
            p_theta_minus_q_theta = compute_weights(
                ana_opt['curve_type'], ana_opt['particles'],
                normalized_w[em - 1, :], prev_iter_curve_param, param,
                ana_opt['wgt_chunks'], ana_opt['resolution'])
            w += p_theta_minus_q_theta

        w = np.exp(
            w - special.logsumexp(w)
        )  # Normalize the weights using logsumexp to avoid numerical underflow
        normalized_w[em, :] = w  # Store the normalized weights

        # Optimize betas using fminunc
        optimizing_function = family_of_distributions(
            ana_opt['distribution'], 'fminunc_both_betas', w, net_effects,
            dependent_var, ana_opt['dist_specific_params'])

        result = optimize.minimize(optimizing_function,
                                   np.array(hold_betas),
                                   jac=True,
                                   options={
                                       'disp': True,
                                       'return_all': True
                                   })
        hold_betas = result.x
        f_value = result.fun

        exp_max_f_values[
            em] = f_value  # gather the f_values over em iterations

    hold_betas_per_iter[
        em + 1, :] = hold_betas  # Store away the last em iteration betas
    print('>>>>>>>>> Final Betas: {}, {} <<<<<<<<<'.format(
        hold_betas[0], hold_betas[1]))

    # Flipping the vertical curve parameters if beta_1 is negative
    importance_sampler['flip'] = False
    neg_beta_idx = hold_betas[1] < 0
    if neg_beta_idx:
        print('!!!!!!!!!!!!!!!!!!!! Beta 1 is flipped !!!!!!!!!!!!!!!!!!!!')
        hold_betas[1] = hold_betas[1] * -1
        param = common_to_all_curves(ana_opt['curve_type'],
                                     'flip_vertical_params', param)
        importance_sampler['flip'] = True

    w = np.full((ana_opt['particles']), np.nan)  # Clearing the weight vector

    # Used for a likelihoods ratio test to see if our beta1 value is degenerate
    w_null_hypothesis = np.full((ana_opt['particles']), np.nan)

    # The null hypothesis for the likelihoods ratio test states that our model y_hat = beta_0 + beta_1 * predictor
    # variable is no different than the simpler model y_hat = beta_0 + beta_1 * predictor variable WHERE BETA_1 =
    # ZERO i.e. our model is y_hat = beta_0
    null_hypothesis_beta = [hold_betas[0], 0]

    for ptl_idx in range(np.shape(ana_opt['ptl_chunk_idx'])[0]):
        output_struct = family_of_curves(
            ana_opt['curve_type'], 'compute_likelihood',
            ana_opt['net_effect_clusters'],
            ana_opt['ptl_chunk_idx'][ptl_idx, 2],
            param[int(ana_opt['ptl_chunk_idx'][ptl_idx, 0]):int(
                ana_opt['ptl_chunk_idx'][ptl_idx, 1]), :],
            hold_betas, preprocessed_data, ana_opt['distribution'],
            ana_opt['dist_specific_params'], ana_opt['data_matrix_columns'])
        w[int(ana_opt['ptl_chunk_idx'][ptl_idx, 0]):int(
            ana_opt['ptl_chunk_idx'][ptl_idx, 1])] = output_struct['w']

    # this code computes the log likelihood of the data under the null hypothesis i.e. using null_hypothesis_beta
    # instead of hold_betas -- it's "lazy" because, unlike the alternative hypothesis, we don't have to compute the
    # data likelihood for each particle because it's exactly the same for each particle (b/c compute_likelihood uses
    # z = beta_1 * x + beta_0, but (recall that our particles control the value of x in this equation) beta_1 is zero
    # for the null hypothesis) that's why we pass in the zero vector representing a single particle with irrelevant
    # weights so we don't have to do it for each particle unnecessarily
    output_struct_null_hypothesis_lazy = family_of_curves(
        ana_opt['curve_type'], 'compute_likelihood',
        ana_opt['net_effect_clusters'], 1, [0, 0, 0, 0, 0, 0],
        null_hypothesis_beta, preprocessed_data, ana_opt['distribution'],
        ana_opt['dist_specific_params'], ana_opt['data_matrix_columns'])
    data_likelihood_null_hypothesis = output_struct_null_hypothesis_lazy['w']
    data_likelihood_alternative_hypothesis = w

    w = w + p_theta_minus_q_theta
    if np.any(np.isnan(w)):
        raise ValueError('NaNs in normalized weight vector w!')

    w = np.exp(
        w - special.logsumexp(w)
    )  # Normalize the weights using logsumexp to avoid numerical underflow
    normalized_w[em + 1, :] = w  # Store the normalized weights

    # Added for debugging chi-sq, might remove eventually
    importance_sampler[
        'data_likelihood_alternative_hypothesis'] = data_likelihood_alternative_hypothesis
    importance_sampler[
        'data_likelihood_null_hypothesis'] = data_likelihood_null_hypothesis

    # we calculate the data_likelihood over ALL particles by multiplying the data_likelihood for each particle by
    # that particle's importance weight
    dummy_var, importance_sampler['likratiotest'] = likratiotest(
        w * np.transpose(data_likelihood_alternative_hypothesis),
        data_likelihood_null_hypothesis, 2, 1)

    if np.any(np.isnan(normalized_w)):
        raise ValueError('NaNs in normalized weights vector!')
    if np.any(np.isnan(exp_max_f_values)):
        raise ValueError('NaNs in Expectation maximization fval matrix!')
    if np.any(np.isnan(hold_betas_per_iter)):
        raise ValueError('NaNs in hold betas matrix!')

    importance_sampler['normalized_weights'] = normalized_w
    importance_sampler['exp_max_fval'] = exp_max_f_values
    importance_sampler['hold_betas_per_iter'] = hold_betas_per_iter
    importance_sampler['curve_params'] = param
    importance_sampler['analysis_settings'] = ana_opt

    if ana_opt['bootstrap']:
        sio.savemat(
            '{}/{}_b{}_importance_sampler.mat'.format(
                ana_opt['target_dir'], ana_opt['analysis_id'],
                ana_opt['bootstrap_run']),
            {'importance_sampler': importance_sampler})
    elif ana_opt['scramble']:
        sio.savemat(
            '{}/{}_s{}_importance_sampler.mat'.format(ana_opt['target_dir'],
                                                      ana_opt['analysis_id'],
                                                      ana_opt['scramble_run']),
            {'importance_sampler': importance_sampler})
    else:
        sio.savemat(
            '{}/{}_importance_sampler.mat'.format(ana_opt['target_dir'],
                                                  ana_opt['analysis_id']),
            {'importance_sampler': importance_sampler})
    print('Results are stored in {}'.format(ana_opt['target_dir']))

    time = datetime.datetime.now()
    print('Finish time {}/{} {}:{}'.format(time.month, time.day, time.hour,
                                           time.minute))
Example #9
def compute_weights(curve_name, nParticles, normalized_w,
                    prev_iter_curve_param, param, wgt_chunks, resolution):
    """
   Computes (P_theta - Q_theta)

   **Arguments**:  
   - curve_name: Name of the family of curves (explicitly passed in)
   - nParticles: Number of particles to be used (explicitly passed in)
   - normalized_w: Previous iteration's normalized weights
   - prev_iter_curve_param: Curve parameters held for the previous iteration
   - param: Curve parameters held for the current iteration
   - wgt_chunks: Size of chunk. To deal with limited RAM we break up matrix into smaller matrices
   - resolution: Resolution to which the activations are rounded of
   
   **Returns** p_theta_minus_q_theta: Vector of length P (particles)
   """
    global which_param
    total_vol = common_to_all_curves(
        curve_name, 'curve_volumes',
        resolution)  # Get the curve volumes (Lebesgue measure)
    nParam = family_of_curves(
        curve_name, 'get_nParams')  # Get the number of curve parameters

    # Computing q(theta), i.e. what is the probability of a curve given all curves from the previous iteration
    # P(theta|old_theta)
    q_theta = np.zeros((nParticles, 1))
    reduced_nParticles = int(nParticles / wgt_chunks)
    reduced_nParticles_idx = np.vstack(
        (np.arange(0, nParticles, reduced_nParticles),
         np.arange(0, nParticles, reduced_nParticles) + reduced_nParticles))

    print(datetime.datetime.now())
    for idx in range(np.shape(reduced_nParticles_idx)[1]):
        prob_grp_lvl_curve = np.zeros((nParticles, reduced_nParticles))
        target_indices = np.arange(reduced_nParticles_idx[0, idx],
                                   reduced_nParticles_idx[1, idx])
        for npm in range(nParam):
            which_param = npm
            nth_grp_lvl_param = np.tile(param[:, npm].reshape(-1, 1),
                                        (1, reduced_nParticles))
            nth_prev_iter_curve_param = prev_iter_curve_param[target_indices,
                                                              npm]
            trunc_likes = compute_trunc_likes(nth_grp_lvl_param,
                                              nth_prev_iter_curve_param, tau,
                                              bounds, which_param)
            prob_grp_lvl_curve = np.add(prob_grp_lvl_curve, trunc_likes)

            if np.any(np.isnan(prob_grp_lvl_curve)):
                raise ValueError(
                    'NaNs in probability of group level curves matrix!')

        # Weighted sum over this chunk of previous-iteration particles
        # (matrix-vector product keeps q_theta as an nParticles x 1 column)
        q_theta = np.add(
            q_theta,
            np.matmul(np.exp(prob_grp_lvl_curve),
                      normalized_w[target_indices]).reshape(-1, 1))

    if np.any(np.isnan(q_theta)):
        raise ValueError('NaNs in q_theta vector!')

    # Computing p(theta) prior i.e. what is the probability of a curve in the curve space
    p_theta = np.ones((nParticles, 1))
    p_theta = np.multiply(p_theta, (1 / total_vol))
    if len(np.unique(p_theta)) != 1:
        raise ValueError('p_theta is NOT unique!')
    if np.any(np.isnan(p_theta)):
        raise ValueError('NaNs in p_theta vector!')

    p_theta_minus_q_theta = np.log(p_theta.flatten()) - np.log(
        q_theta.flatten())
    return p_theta_minus_q_theta
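
A toy, self-contained sketch of the chunking idea used here: the full nParticles x nParticles likelihood matrix is never materialized; q(theta) is instead accumulated one block of previous-iteration particles at a time. The likelihood below is a stand-in (a squared-distance kernel), not the toolbox's truncated-normal likelihood.

import numpy as np

nParticles, wgt_chunks = 8, 4
rng = np.random.default_rng(0)
param = rng.uniform(size=(nParticles, 1))             # current particles
prev_param = rng.uniform(size=(nParticles, 1))        # previous particles
normalized_w = np.full(nParticles, 1.0 / nParticles)  # previous weights

q_theta = np.zeros(nParticles)
chunk = nParticles // wgt_chunks
for start in range(0, nParticles, chunk):
    prev_block = prev_param[start:start + chunk, 0]         # (chunk,)
    # Stand-in log-likelihood of each current particle given each previous one
    log_like = -(param[:, 0:1] - prev_block[None, :]) ** 2  # (nParticles, chunk)
    # Weighted sum over this block of previous particles
    q_theta += np.exp(log_like) @ normalized_w[start:start + chunk]

print(q_theta.shape)  # (8,) -- one q(theta) value per current particle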
Example #10
def auto_generate(curve_type, input_params, resolution):
    """Generate 100 curves and randomly pick a theory consistent or 
    inconsistent curve depending on the request.  
    
    If you pass `input_params` as `con`, it randomly draws a theory
    consistent curve; `inc` draws a theory inconsistent curve.
    """
    if resolution <= 0:
        raise ValueError('Resolution will need to be > 0!')

    nSamples = 100
    nParam = family_of_curves(curve_type, 'get_nParams')
    params = np.full((nSamples, nParam), np.nan)
    out = np.full((nParam), np.nan)

    # Generate 100 curves and randomly pick a theory consistent or inconsistent curve depending on the request
    params = common_to_all_curves(curve_type, 'initial_sampling', nSamples,
                                  resolution)
    if curve_type == 'horz_indpnt':  # Enforce the right ordering for the horizontal curve parameters i.e. x1 < x2
        params = common_to_all_curves(curve_type, 'sort_horizontal_params',
                                      params)

    if np.any(np.isnan(params)):
        raise ValueError('NaNs in curve parameter matrix!')
    params_indices = family_of_curves(curve_type, 'count_particles', params)

    if input_params == 'con':
        # Finding the theory consistent trial indices
        th_con_params_indices = np.where(params_indices != 0)[0]
        if th_con_params_indices.size == 0:
            raise ValueError('Did not generate any theory consistent indices!')

        # Randomly permuting the th_con trial indices
        th_con_params_indices = th_con_params_indices[np.random.permutation(
            np.shape(th_con_params_indices)[0])]
        out = params[
            th_con_params_indices[0], :]  # picking one consistent particle

    elif input_params == 'inc':
        # Finding theory inconsistent trial indices
        th_inc_params_indices = np.where(np.logical_not(params_indices))[0]
        if th_inc_params_indices.size == 0:
            raise ValueError(
                'Did not generate any theory inconsistent indices!')

        # Randomly permuting the th_inc trial indices
        th_inc_params_indices = th_inc_params_indices[np.random.permutation(
            np.shape(th_inc_params_indices)[0])]
        out = params[
            th_inc_params_indices[0], :]  # picking one inconsistent particle
    else:
        raise ValueError(
            "Invalid string! Valid ones include 'con' or 'inc' only")

    if np.any(np.isnan(out)):
        raise ValueError('NaNs in curve parameters!')

    return out
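
A self-contained sketch of the same pick-one-at-random pattern, with a hypothetical "theory consistent" predicate standing in for `family_of_curves(curve_type, 'count_particles', ...)`:

import numpy as np

rng = np.random.default_rng(0)
params = rng.uniform(-1, 1, size=(100, 6))  # 100 candidate parameter vectors

# Hypothetical consistency rule: consistent if the last parameter is positive
params_indices = params[:, 5] > 0

con_idx = np.where(params_indices)[0]   # theory consistent candidates
inc_idx = np.where(~params_indices)[0]  # theory inconsistent candidates
if con_idx.size == 0 or inc_idx.size == 0:
    raise ValueError('Did not generate both kinds of candidates!')

out_con = params[rng.permutation(con_idx)[0], :]  # one consistent pick
out_inc = params[rng.permutation(inc_idx)[0], :]  # one inconsistent pick
print(out_con)
print(out_inc)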