def regional_linregress(df, x, aparc_names):
    '''
    regional_linregress

    Run a simple linear regression of every column named in aparc_names
    (dependent variable) on column x (independent variable) and collect
    the per-region results, together with FDR corrected p values.

    INPUTS: 
        df ------------- pandas data frame
        x -------------- independent variable name (must be column in df)
        aparc_names ---- list of variable names (columns in df) to loop
                           through as dependent variables for the regression

    RETURNS (as a tuple, in this order):
        m_array -------------- numpy array containing slopes for each region
        c_array -------------- numpy array containing intercepts (at 0) for each region
        r_array -------------- numpy array containing pearson r values for each region
        p_array -------------- numpy array containing raw p values for each region
        p_fdr_array ---------- numpy array containing fdr corrected p values for each region
        m_masked_array ------- numpy array containing the slope values for regions which
                                 are individually significant (p <= 0.05), otherwise
                                 -99 markers
        m_fdr_masked_array --- numpy array containing the slope values for regions which
                                 pass fdr correction (corrected p <= 0.05), otherwise
                                 -99 markers
    '''

    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np
    from scipy.stats import linregress

    # Set up some arrays (one entry per region) to contain
    # the slope of the regression line (m)
    # the intercept at x = 0 (c)
    # the list of raw p values (p)
    # and the r values (r) for each region.
    n_regions = len(aparc_names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    r_array = np.ones(n_regions)

    # The independent variable is the same for every region
    x_values = df[x].values

    # Loop through all the regions and record m, c, r and p for each region
    # (the standard error returned by linregress is not needed)
    for i, roi in enumerate(aparc_names):
        m, c, r, p, _ = linregress(x_values, df[roi].values)
        m_array[i] = m
        c_array[i] = c
        r_array[i] = r
        p_array[i] = p

    # Calculate the fdr p values
    # fdr returns (rejection mask, corrected p values); only the corrected
    # p values are used here, so the correction is run just once
    _, p_fdr_array = fdr(p_array)

    # Create two masked versions of the slope array
    m_masked_array = np.copy(m_array)
    m_masked_array[p_array > 0.05] = -99

    m_fdr_masked_array = np.copy(m_array)
    m_fdr_masked_array[p_fdr_array > 0.05] = -99

    # Return the arrays
    return m_array, c_array, r_array, p_array, p_fdr_array, m_masked_array, m_fdr_masked_array
def regional_ttest(df, cols_list):
    '''
    Run a one sample t-test against 0 for every column in cols_list,
    using only the non-null values of that column.

    INPUTS:
        df ------------- pandas data frame
        cols_list ------ list of column names in df to test

    RETURNS:
        ttest_dict ----- dictionary with the keys
                           'regions': cols_list as passed in
                           'means':   numpy array of column means
                           'stds':    numpy array of column standard deviations
                           'ts':      numpy array of t values
                           'ps':      numpy array of raw p values
                           'fdr_ps':  numpy array of fdr corrected p values
                           'stars':   numpy array of significance markers
                                        ('ns', '*', '**', '***') based on the
                                        raw (uncorrected) p values
    '''
    means, stds, ts, ps, stars = [], [], [], [], []

    for col in cols_list:

        # Work on the non-missing observations only
        values = df.loc[df[col].notnull(), col]

        means.append(values.mean())
        stds.append(values.std())

        # Test whether the regional mean differs from zero
        t, p = ttest_1samp(values, 0)
        ts.append(t)
        ps.append(p)

        # Translate the UNCORRECTED p value into a printable marker,
        # checking the strictest threshold first
        if p < 0.001:
            star = '***'
        elif p < 0.01:
            star = '**'
        elif p < 0.05:
            star = '*'
        else:
            star = 'ns'
        stars.append(star)

    # fdr returns (rejection mask, corrected p values)
    fdr_mask, fdr_ps = fdr(np.array(ps))

    # Package everything up for the caller
    ttest_dict = {}
    ttest_dict['regions'] = cols_list
    ttest_dict['means'] = np.array(means)
    ttest_dict['stds'] = np.array(stds)
    ttest_dict['ts'] = np.array(ts)
    ttest_dict['ps'] = np.array(ps)
    ttest_dict['fdr_ps'] = np.array(fdr_ps)
    ttest_dict['stars'] = np.array(stars)

    return ttest_dict
示例#3
0
# Report the time elapsed since start_time (set earlier in the script)
print(time() - start_time)

# Cache the condensed correlation and p-value vectors on disk.
# 'with' guarantees the files are flushed and closed even if dump fails.
with open('condensed_corr.pkl', 'wb') as out:
    dump(condensed_corr, out)
with open('condensed_pval.pkl', 'wb') as out:
    dump(condensed_pval, out)

print("\t**Loading condensed matrices...")
# Read the pickles back in binary mode (they were written with 'wb') and
# close the handles afterwards instead of leaking them
with open('condensed_corr.pkl', 'rb') as pkl_in:
    condensed_corr = load(pkl_in)
with open('condensed_pval.pkl', 'rb') as pkl_in:
    condensed_pval = load(pkl_in)
print("\t... done!\n")

print("\t**Correcting p-values...")
# fdr returns (rejection mask, corrected p values)
(rejecteds, corrected_pval) = fdr(condensed_pval, alpha=0.01)
print("\t... done!\n")

print("\t**Organizing condensed matrices into something you can understand (you dumbass!)...")
# Expand the condensed vector into a square matrix and set the
# diagonal (each entry against itself) to 1
uncorrected_corr_matrix = squareform(condensed_corr)
for n in range(uncorrected_corr_matrix.shape[0]):
    uncorrected_corr_matrix[n][n] = 1.
uncorrected_corr_df = pd.DataFrame(index=all_species_distances.index, columns=all_species_distances.index, data=uncorrected_corr_matrix)

# Boolean frame marking the pairs whose null hypothesis was rejected;
# indexing with it keeps those correlations and blanks out the rest
rejecteds_df = pd.DataFrame(index=all_species_distances.index, columns=all_species_distances.index, data=squareform(rejecteds) > 0)
corr_df = uncorrected_corr_df[rejecteds_df]
corr_df.to_csv('significant_group_correlations-based_on_distances-bak.tab', sep='\t')
print("\t... done!\n")

#
# network time!
def regional_linregress(df,
                        x,
                        names,
                        covars=[],
                        n_perm=1000,
                        categorical=False):
    '''
    A function that calls a multiple regression model repeatedly for
    all variable names passed (as names) in the data frame as the dependent
    variable, with the x column as the independent variable and the 
    names in covars as covariates.

    INPUTS: 
        df ------------- pandas data frame
        x -------------- independent variable name (must be column in df)
        names ---------- list of variable names (columns in df) to loop
                           through as dependent variables (ys) for the regression
        covars --------- list containing variable names that should be controlled for
                           (must be columns in df and the same for all 
                           dependent variables)
                           Default value: [ ] (never modified by this function)
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- accepted for signature compatibility but not used
                           by this function
                           Default value: False

    RETURNS:
        regional_linregress_dict -- dictionary of numpy arrays, each with one
                                    entry per name in names:
            'm' --------------------- slopes for each region
            'c' --------------------- intercepts (at 0) for each region as
                                        returned by ols_correlation
            'c14' ------------------- intercepts at 14 for each region as
                                        returned by ols_correlation
            'r' --------------------- partial r (correcting for covariates)
            'p' --------------------- raw p values
            'perm_p' ---------------- raw permutation p values
            'p_fdr' ----------------- fdr corrected p values
            'perm_p_fdr' ------------ fdr corrected permutation p values
            'm_masked_p' ------------ slopes for regions with p <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p' ------- slopes for regions with perm_p <= 0.05,
                                        otherwise -99 markers
            'm_masked_p_fdr' -------- slopes for regions with fdr corrected
                                        p <= 0.05, otherwise -99 markers
            'm_masked_perm_p_fdr' --- slopes for regions with fdr corrected
                                        perm_p <= 0.05, otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Set up your covars_list
    # This should contain all the data for each covar as a different
    # element in the list
    # You're going to save the index at which wbic appears in the list
    # because you'll need to add the param for this covar to the
    # intercept value later to get a value for a woman (if male is
    # included in the covariates list) at wbic.
    # (You don't have to correct your intercept for being a woman
    # because the male covariate is coded as 0 for women, but you do
    # have to correct for wbic because there 0 represents CBU)
    covars_list = []
    wbic_i = None

    for i, covar in enumerate(covars):
        covars_list += [df[covar].values]
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some arrays (one entry per region) to contain:
    #    m: slope of the regression line
    #    c: intercept as x = 0 + the parameter estimate
    #                          for the wbic scanner location if passed
    #    c14: intercept when x = 14 (c + 14 * m)
    #    r: partial r
    #    p: p from the ols regression (for x)
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    #----------------------------------------------------------------
    # Loop through all the regions and record m, c, c14, r, p and perm_p
    # for each region
    for i, roi in enumerate(names):

        # Run the permutation test (gives r, p and perm_p)
        perm_results = permutation_correlation(df[x].values,
                                               df[roi].values,
                                               covars_orig=covars_list,
                                               n_perm=n_perm)

        # Run the regular ols to get the slope and the correct
        # intercept values
        results, c, c14 = ols_correlation(df[x].values,
                                          df[roi].values,
                                          covars=covars_list,
                                          wbic_covars_index=wbic_i)

        # Fill up your arrays with the useful values
        m_array[i] = results.params['x']
        c_array[i] = c
        c14_array[i] = c14
        r_array[i] = perm_results['r']
        p_array[i] = perm_results['p']
        perm_p_array[i] = perm_results['perm_p']

    #----------------------------------------------------------------
    # Calculate the fdr p values
    # fdr returns (rejection mask, corrected p values); only the
    # corrected p values are needed
    _, p_fdr_array = fdr(p_array)
    _, perm_p_fdr_array = fdr(perm_p_array)

    #----------------------------------------------------------------
    # Create masked versions of the slope array: keep the slope where
    # the given p values are <= 0.05 and put a -99 marker elsewhere
    def _mask_slopes(p_values):
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    # (note that 'perm_p_fdr' holds the corrected permutation p values;
    # it used to be filled with the slopes by mistake)
    regional_linregress_dict = {}
    regional_linregress_dict['m'] = m_array
    regional_linregress_dict['c'] = c_array
    regional_linregress_dict['c14'] = c14_array
    regional_linregress_dict['r'] = r_array
    regional_linregress_dict['p'] = p_array
    regional_linregress_dict['perm_p'] = perm_p_array
    regional_linregress_dict['p_fdr'] = p_fdr_array
    regional_linregress_dict['perm_p_fdr'] = perm_p_fdr_array
    regional_linregress_dict['m_masked_p'] = _mask_slopes(p_array)
    regional_linregress_dict['m_masked_perm_p'] = _mask_slopes(perm_p_array)
    regional_linregress_dict['m_masked_p_fdr'] = _mask_slopes(p_fdr_array)
    regional_linregress_dict['m_masked_perm_p_fdr'] = _mask_slopes(perm_p_fdr_array)

    # Return the regional regression dictionary
    return regional_linregress_dict
def regional_linregress_byregion(df_x,
                                 df_y,
                                 names,
                                 covars=[],
                                 n_perm=1000,
                                 categorical=False):
    '''
    A function that calls a multiple regression model repeatedly for
    each variable name (in names) with the data in df_y as the dependent
    variable, and the data in df_x as the independent variable. Data in 
    df_x named as in covars are passed as covariates.

    The two data frames are inner merged on their 'nspn_id' column, and
    for each name the merged '<name>_x' column is regressed against the
    merged '<name>_y' column.

    INPUTS: 
        df_x ----------- pandas data frame containing x axis values
        df_y ----------- pandas data frame containing y axis values
        covars --------- list containing variable names that should be controlled for
                           (must be columns in df_x and the same for all 
                           dependent variables)
                           Default value: [ ] (never modified by this function)
        names ---------- list of variable names (columns in df_x and df_y)
                           to loop though and conduct pairwise regressions
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- boolean indicating whether you want to permute
                           and return the Fstatistic (if True) or the parameter
                           estimate of the x variable (if False)
                           (passed straight through to permutation_ols)
                           Default value: False

    RETURNS:
        regional_linregress_dict -- dictionary of numpy arrays, each with one
                                    entry per name in names:
            'm' --------------------- slopes for each region
            'c' --------------------- intercepts (at 0) for each region, plus
                                        the wbic parameter estimate if 'wbic'
                                        is one of the covariates
            'c14' ------------------- intercepts at 14 (c + 14 * m)
            'r' --------------------- partial r (correcting for covariates)
            'p' --------------------- raw p values
            'perm_p' ---------------- raw permutation p values
            'p_fdr' ----------------- fdr corrected p values
            'perm_p_fdr' ------------ fdr corrected permutation p values
            'm_masked_p' ------------ slopes for regions with p <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p' ------- slopes for regions with perm_p <= 0.05,
                                        otherwise -99 markers
            'm_masked_p_fdr' -------- slopes for regions with fdr corrected
                                        p <= 0.05, otherwise -99 markers
            'm_masked_perm_p_fdr' --- slopes for regions with fdr corrected
                                        perm_p <= 0.05, otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Set up your covars_list
    # This should contain all the data for each covar as a different
    # element in the list
    # You're going to save the index at which wbic appears in the list
    # because you'll need to add the param for this covar to the
    # intercept value later to get a value for a woman (if male is
    # included in the covariates list) at wbic.
    # (You don't have to correct your intercept for being a woman
    # because the male covariate is coded as 0 for women, but you do
    # have to correct for wbic because there 0 represents CBU)
    #
    # NOTE(review): the covariate values are taken from df_x as passed
    # in, while x and y come from the merged data frame below. These
    # only line up row for row if the inner merge keeps every row of
    # df_x in its original order - confirm with the callers.
    covars_list = []
    wbic_i = None

    for i, covar in enumerate(covars):
        covars_list += [df_x[covar].values]
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some arrays (one entry per region) to contain:
    #    m: slope of the regression line
    #    c: intercept as x = 0 + the parameter estimate
    #                          for the wbic scanner location if passed
    #    c14: intercept when x = 14 (c + 14 * m)
    #    r: partial r
    #    p: p from the ols regression (for x)
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    #----------------------------------------------------------------
    # Merge the data frames together
    df_xy = df_x.merge(df_y, on='nspn_id', how='inner')

    #----------------------------------------------------------------
    # Loop through all the regions and record m, c, c14, r, p and
    # perm_p for each region
    for i, roi in enumerate(names):

        results, perm_p = permutation_ols(df_xy['{}_x'.format(roi)].values,
                                          df_xy['{}_y'.format(roi)].values,
                                          covars_orig=covars_list,
                                          categorical=categorical,
                                          n_perm=n_perm)

        # Fill up your arrays with the useful values
        # from the OLS results
        #== Beta =========
        m_array[i] = results.params['x']

        #== Intercept ====
        # Compare against None explicitly: wbic may legitimately be the
        # FIRST covariate (index 0), which a plain truth test would skip
        if wbic_i is not None:
            wbic_param = results.params['c_{}'.format(wbic_i)]
        else:
            wbic_param = 0
        c_array[i] = results.params['Intercept'] + wbic_param

        #== Int at 14 ====
        c14_array[i] = c_array[i] + 14 * m_array[i]

        #== Partial r ====
        # r = sign(t) * sqrt(t^2 / (t^2 + residual degrees of freedom))
        t = results.tvalues['x']
        direction = -1 if t < 0 else 1
        r_array[i] = np.sqrt(t**2 / (t**2 + results.df_resid)) * direction

        #== p & perm_p ===
        p_array[i] = results.pvalues['x']
        perm_p_array[i] = perm_p

    #----------------------------------------------------------------
    # Calculate the fdr p values
    # fdr returns (rejection mask, corrected p values); only the
    # corrected p values are needed
    _, p_fdr_array = fdr(p_array)
    _, perm_p_fdr_array = fdr(perm_p_array)

    #----------------------------------------------------------------
    # Create masked versions of the slope array: keep the slope where
    # the given p values are <= 0.05 and put a -99 marker elsewhere
    def _mask_slopes(p_values):
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    # (note that 'perm_p_fdr' holds the corrected permutation p values;
    # it used to be filled with the slopes by mistake)
    regional_linregress_dict = {}
    regional_linregress_dict['m'] = m_array
    regional_linregress_dict['c'] = c_array
    regional_linregress_dict['c14'] = c14_array
    regional_linregress_dict['r'] = r_array
    regional_linregress_dict['p'] = p_array
    regional_linregress_dict['perm_p'] = perm_p_array
    regional_linregress_dict['p_fdr'] = p_fdr_array
    regional_linregress_dict['perm_p_fdr'] = perm_p_fdr_array
    regional_linregress_dict['m_masked_p'] = _mask_slopes(p_array)
    regional_linregress_dict['m_masked_perm_p'] = _mask_slopes(perm_p_array)
    regional_linregress_dict['m_masked_p_fdr'] = _mask_slopes(p_fdr_array)
    regional_linregress_dict['m_masked_perm_p_fdr'] = _mask_slopes(perm_p_fdr_array)

    # Return the regional regression dictionary
    return regional_linregress_dict
示例#6
0
# Cache the condensed correlation and p-value vectors on disk.
# ('out' for the first dump is opened before this fragment begins.)
dump(condensed_corr, out)
out.close()
out = open('condensed_pval.pkl', 'wb')
dump(condensed_pval, out)
out.close()

#
# To load pre-computed condensed correlation and p-value matrices instead,
# uncomment the lines below
#print "\t**Loading condensed matrices..."
#condensed_corr = load( open( 'condensed_corr.pkl' ) )
#condensed_pval = load( open( 'condensed_pval.pkl' ) )
#print "\t... done!\n"

print "\t**Correcting p-values..."
# Only the non-null p-values are passed to the FDR correction
pvals_tested = condensed_pval[pd.notnull(condensed_pval)]
# fdr returns (rejection mask, corrected p values)
(rejecteds, corrected_pval) = fdr(pvals_tested, alpha=0.01)

# Re-expand the rejection mask to the full condensed length: NaN
# correlations are marked as not rejected, every other position takes the
# next entry of 'rejecteds' in order.
# NOTE(review): this assumes condensed_corr is NaN at exactly the positions
# where condensed_pval is null - confirm where both are computed.
pos = 0
should_reject_rho = []
for uncorrect in condensed_corr:
    if np.isnan(uncorrect):
        should_reject_rho.append(False)
    else:
        should_reject_rho.append(rejecteds[pos])
        pos += 1
print "\t... done!\n"

print "\t**Organizing condensed matrices into something you can understand (you dumbass!)..."
# Expand the condensed vector into a square matrix and set the
# diagonal (each entry against itself) to 1
uncorrected_corr_matrix = squareform(condensed_corr)
for n in range(uncorrected_corr_matrix.shape[0]):
    uncorrected_corr_matrix[n][n] = 1.
    # Fetch the fits for this data/fitter pair (no new computation allowed)
    # and keep the ones stored under the 'kang2011' key
    fits = get_all_fits(data, fitter, allow_new_computation=False)
    ds_fits = fits['kang2011']

    # Run the comparison twice: V1C vs OFC, then with the regions swapped
    for b_reversed in [False, True]:
        regions = ['V1C', 'OFC']
        if b_reversed:
            regions = regions[::-1]

        # One (gene, p value) pair per gene
        scores = []
        for i, g in enumerate(data.gene_names):
            # Row 2 of theta_samples for this gene in each of the two regions
            mu1 = ds_fits[(g, regions[0])].theta_samples[2, :]
            mu2 = ds_fits[(g, regions[1])].theta_samples[2, :]
            t, pval = ttest_ind(mu1, mu2)
            # Turn the p value from ttest_ind into a one sided one for
            # "regions[0] < regions[1]" (presumably ttest_ind is two sided)
            if mu1.mean() < mu2.mean():  # make it one sided: regions[0] < regions[1]
                pval = pval / 2
            else:
                pval = 1 - pval / 2
            scores.append((g, pval))

        # add FDR correction
        # (second element returned by fdr is stored next to each p value)
        _, qvals = fdr([pval for g, pval in scores])
        scores = [(g, pval, qval) for (g, pval), qval in zip(scores, qvals)]

        # The reversed comparison gets a '-reversed' filename suffix
        filename_suffix = '-reversed' if b_reversed else ''
        create_top_genes_html(data,
                              fitter,
                              fits,
                              scores,
                              regions,
                              filename_suffix=filename_suffix)
# For every pathway: load the restricted data set, fetch the existing
# fits, and compare V1C against OFC gene by gene (in both directions).
for pathway in lst_pathways:
    data = GeneData.load('both')
    data = data.restrict_pathway(pathway)
    data = data.restrict_ages('EF3', PCW(10))
    data = data.scale_ages(age_scaler)

    shape = Sigmoid(priors='sigmoid_wide')
    fitter = Fitter(shape, sigma_prior='normal')
    fits = get_all_fits(data, fitter, allow_new_computation=False)
    ds_fits = fits['kang2011']

    for b_reversed in [False, True]:
        # Second pass swaps the two regions and tags the output file
        regions = ['OFC', 'V1C'] if b_reversed else ['V1C', 'OFC']
        filename_suffix = '-reversed' if b_reversed else ''

        scores = []
        for i, g in enumerate(data.gene_names):
            mu1 = ds_fits[(g, regions[0])].theta_samples[2, :]
            mu2 = ds_fits[(g, regions[1])].theta_samples[2, :]
            t, pval = ttest_ind(mu1, mu2)
            # One sided p value for "regions[0] < regions[1]"
            pval = pval / 2 if mu1.mean() < mu2.mean() else 1 - pval / 2
            scores.append((g, pval))

        # Attach the FDR corrected value to every (gene, p value) pair
        _, qvals = fdr([pval for g, pval in scores])
        scores = [(g, pval, qval) for (g, pval), qval in zip(scores, qvals)]

        create_top_genes_html(
            data,
            fitter,
            fits,
            scores,
            regions,
            filename_suffix=filename_suffix,
        )
示例#9
0
def parcelwise_analysis(dirz, srch_str, varsheet, varlist):
    '''
    For every directory in dirz, sum each row of every matching text file
    into a (rows x files) density matrix, fit one OLS model per row against
    the variables in varsheet, and correct the resulting p values.

    INPUTS:
        dirz ----------- iterable of directory paths to process
        srch_str ------- glob pattern (relative to each directory) selecting
                           the tab-delimited text files to read; the last
                           column of every file is discarded before summing
        varsheet ------- path to the spreadsheet of subject variables; read
                           with pandas.read_csv if its extension is .csv,
                           otherwise as an Excel file (sheet 'Sheet1').
                           Must have one row per matched text file
        varlist -------- list of column names in varsheet, passed to
                           build_statement to build each model formula

    RETURNS:
        res ------------ dictionary mapping each directory's base name to a
                           data frame (index 1..number of rows) with columns
                           't', 'p', 'fdr' and 'fwe'
        sigz ----------- dictionary mapping each directory's base name to a
                           list of (row index, fdr, fwe) tuples for the rows
                           where either corrected value is below 0.1

    RAISES:
        IOError -------- if no files match srch_str in a directory, if the
                           varsheet row count differs from the number of
                           files, or if an item of varlist is not a column
                           of varsheet
    '''
    res = {}
    sigz = {}
    for direc in dirz:
        ref = os.path.split(direc)[1]
        print('working on directory %s' % (ref))
        print('forming matrix')
        mapz = sorted(glob(os.path.join(direc, srch_str)))
        if not mapz:
            raise IOError(
                'no files matching %s found in %s' % (srch_str, direc)
            )

        # Row sums of every file (after dropping its last column)
        densities = []
        for map_path in mapz:
            mdf = pandas.read_table(map_path, header=None)
            mdf.drop(mdf.columns[-1], axis=1, inplace=True)
            densities.append(mdf.sum(axis=1).values)

        # The first file fixes the number of rows; one column per file.
        # Rows a shorter file does not provide stay NaN.
        mtx = np.full((len(densities[0]), len(mapz)), np.nan)
        for i, denz in enumerate(densities):
            mtx[:len(denz), i] = denz
        n_parcels = len(mtx)

        print('creating spreadsheet')
        # Decide on the real file extension; splitting the whole path on
        # '.' breaks for paths with other dots or with no dot at all
        if os.path.splitext(varsheet)[1].lower() == '.csv':
            cdf = pandas.read_csv(varsheet)
        else:
            cdf = pandas.ExcelFile(varsheet).parse('Sheet1')

        if len(cdf) != mtx.shape[1]:
            raise IOError(
                'input varsheet must have the same number of rows as there are text files in your directories'
            )
        for v in varlist:
            if v not in cdf.columns:
                raise IOError(
                    'all items in varlist must correspond to columns in varsheet'
                )

        # One 'p<i>_dens' column per matrix row (0-based i), holding that
        # row's value for every subject, in spreadsheet row order
        for i in range(n_parcels):
            cdf['p%s_dens' % (i)] = mtx[i, :]

        print('running models')
        t_vals = []
        p_vals = []
        for i in range(n_parcels):
            stmnt = build_statement('p%s_dens' % (i), varlist)
            lm = smf.ols(stmnt, data=cdf).fit()
            # Position 1 is the second model term; presumably the first
            # term after the intercept - depends on build_statement
            t_vals.append(lm.tvalues.iloc[1])
            p_vals.append(lm.pvalues.iloc[1])

        # Results are indexed 1..n_parcels (the density columns are 0-based)
        rdf = pandas.DataFrame({'t': t_vals, 'p': p_vals},
                               columns=['t', 'p'],
                               index=range(1, n_parcels + 1))

        print('correcting models')
        # Both correction helpers return the corrected values as their
        # second element
        rdf['fdr'] = fdr(np.array(p_vals))[1]
        rdf['fwe'] = fwe(np.array(p_vals))[1]

        res.update({ref: rdf})

        # Keep the rows that survive either correction at the 0.1 level
        sig = []
        for parc in rdf.index.tolist():
            fdr_p = rdf.loc[parc, 'fdr']
            fwe_p = rdf.loc[parc, 'fwe']
            if fdr_p < 0.1 or fwe_p < 0.1:
                sig.append((parc, fdr_p, fwe_p))

        sigz.update({ref: sig})

    return res, sigz
def regional_linregress(df, x, names, covars=[], n_perm=1000, categorical=False):
    '''
    A function that calls a multiple regression model repeatedly for
    all variable names passed (as names) in the data frame as the dependent
    variable, with the x column as the independent variable and the 
    names in covars as covariates.

    INPUTS: 
        df ------------- pandas data frame
        x -------------- independent variable name (must be column in df)
        names ---------- list of variable names (columns in df) to loop
                           through as dependent variables (ys) for the regression
        covars --------- list containing variable names that should be controlled for
                           (must be columns in df and the same for all 
                           dependent variables)
                           Default value: [ ] (never modified by this function)
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- accepted for signature compatibility but not used
                           by this function
                           Default value: False

    RETURNS:
        regional_linregress_dict -- dictionary of numpy arrays, each with one
                                    entry per name in names:
            'm' --------------------- slopes for each region
            'c' --------------------- intercepts (at 0) for each region as
                                        returned by ols_correlation
            'c14' ------------------- intercepts at 14 for each region as
                                        returned by ols_correlation
            'r' --------------------- partial r (correcting for covariates)
            'p' --------------------- raw p values
            'perm_p' ---------------- raw permutation p values
            'p_fdr' ----------------- fdr corrected p values
            'perm_p_fdr' ------------ fdr corrected permutation p values
            'm_masked_p' ------------ slopes for regions with p <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p' ------- slopes for regions with perm_p <= 0.05,
                                        otherwise -99 markers
            'm_masked_p_fdr' -------- slopes for regions with fdr corrected
                                        p <= 0.05, otherwise -99 markers
            'm_masked_perm_p_fdr' --- slopes for regions with fdr corrected
                                        perm_p <= 0.05, otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Set up your covars_list
    # This should contain all the data for each covar as a different
    # element in the list
    # You're going to save the index at which wbic appears in the list
    # because you'll need to add the param for this covar to the 
    # intercept value later to get a value for a woman (if male is 
    # included in the covariates list) at wbic.
    # (You don't have to correct your intercept for being a woman 
    # because the male covariate is coded as 0 for women, but you do
    # have to correct for wbic because there 0 represents CBU)
    covars_list = []
    wbic_i = None

    for i, covar in enumerate(covars):
        covars_list += [df[covar].values]
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some arrays (one entry per region) to contain:
    #    m: slope of the regression line
    #    c: intercept as x = 0 + the parameter estimate
    #                          for the wbic scanner location if passed
    #    c14: intercept when x = 14 (c + 14 * m)
    #    r: partial r
    #    p: p from the ols regression (for x)
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    #----------------------------------------------------------------
    # Loop through all the regions and record m, c, c14, r, p and perm_p
    # for each region
    for i, roi in enumerate(names):

        # Run the permutation test (gives r, p and perm_p)
        perm_results = permutation_correlation(df[x].values,
                                               df[roi].values,
                                               covars_orig=covars_list,
                                               n_perm=n_perm)

        # Run the regular ols to get the slope and the correct
        # intercept values
        results, c, c14 = ols_correlation(df[x].values,
                                          df[roi].values,
                                          covars=covars_list,
                                          wbic_covars_index=wbic_i)

        # Fill up your arrays with the useful values
        m_array[i] = results.params['x']
        c_array[i] = c
        c14_array[i] = c14
        r_array[i] = perm_results['r']
        p_array[i] = perm_results['p']
        perm_p_array[i] = perm_results['perm_p']

    #----------------------------------------------------------------
    # Calculate the fdr p values
    # fdr returns (rejection mask, corrected p values); only the
    # corrected p values are needed
    _, p_fdr_array = fdr(p_array)
    _, perm_p_fdr_array = fdr(perm_p_array)

    #----------------------------------------------------------------
    # Create masked versions of the slope array: keep the slope where
    # the given p values are <= 0.05 and put a -99 marker elsewhere
    def _mask_slopes(p_values):
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    # (note that 'perm_p_fdr' holds the corrected permutation p values;
    # it used to be filled with the slopes by mistake)
    regional_linregress_dict = {}
    regional_linregress_dict['m'] = m_array
    regional_linregress_dict['c'] = c_array
    regional_linregress_dict['c14'] = c14_array
    regional_linregress_dict['r'] = r_array
    regional_linregress_dict['p'] = p_array
    regional_linregress_dict['perm_p'] = perm_p_array
    regional_linregress_dict['p_fdr'] = p_fdr_array
    regional_linregress_dict['perm_p_fdr'] = perm_p_fdr_array
    regional_linregress_dict['m_masked_p'] = _mask_slopes(p_array)
    regional_linregress_dict['m_masked_perm_p'] = _mask_slopes(perm_p_array)
    regional_linregress_dict['m_masked_p_fdr'] = _mask_slopes(p_fdr_array)
    regional_linregress_dict['m_masked_perm_p_fdr'] = _mask_slopes(perm_p_fdr_array)

    # Return the regional regression dictionary
    return regional_linregress_dict
def regional_linregress_byregion(df_x, df_y, names, covars=[], n_perm=1000, categorical=False):
    '''
    Run a permutation-tested multiple regression separately for each
    variable name in names, with the column from df_x as the independent
    variable and the same-named column from df_y as the dependent
    variable. Columns named in covars are passed as covariates.

    The two data frames are inner-merged on 'nspn_id' before any
    regression is run, so only participants present in both are used.

    INPUTS:
        df_x ----------- pandas data frame containing x axis values
        df_y ----------- pandas data frame containing y axis values
        names ---------- list of variable names (columns in df_x and df_y)
                           to loop though and conduct pairwise regressions
        covars --------- list containing variable names that should be controlled for
                           (must be columns in df_x and the same for all
                           dependent variables)
                           Default value: [ ] (never modified by this function)
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- boolean indicating whether you want to permute
                           and return the Fstatistic (if True) or the parameter
                           estimate of the x variable (if False)
                           Default value: False

    RETURNS:
        regional_linregress_dict, a dictionary of numpy arrays (one entry
        per name in names) with the following keys:
        m ---------------------- slopes for each region
        c ---------------------- intercepts (at x = 0) for each region, plus the
                                   wbic parameter estimate if 'wbic' is a covariate
        c14 -------------------- intercepts at x = 14 (c + 14 * m) for each region
        r ---------------------- partial r (correcting for covariates) for each region
        p ---------------------- raw p values for each region
        perm_p ----------------- raw permutation p values for each region
        p_fdr ------------------ fdr corrected p values for each region
        perm_p_fdr ------------- fdr corrected permutation p values for each region
        m_masked_p ------------- slope values for regions which are individually
                                   significant (p <= 0.05), otherwise -99 markers
        m_masked_perm_p -------- slope values for regions which are individually
                                   significant according to permutation testing,
                                   otherwise -99 markers
        m_masked_p_fdr --------- slope values for regions which pass fdr
                                   correction, otherwise -99 markers
        m_masked_perm_p_fdr ---- slope values for regions which pass fdr correction
                                   according to permutation testing,
                                   otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Merge the data frames together
    # Columns that appear in both frames (other than the merge key)
    # come out of the merge with '_x' and '_y' suffixes
    df_xy = df_x.merge(df_y, on='nspn_id', how='inner')

    #----------------------------------------------------------------
    # Set up your covars_list
    # This should contain all the data for each covar as a different
    # element in the list
    # The values are taken from the MERGED data frame so that they
    # always line up row for row with the x and y values used in the
    # regressions, even if the merge dropped some participants.
    # You're going to save the index at which wbic appears in the list
    # because you'll need to add the param for this covar to the
    # intercept value later to get a value for a woman (if male is
    # included in the covariates list) at wbic.
    # (You don't have to correct your intercept for being a woman
    # because the male covariate is coded as 0 for women, but you do
    # have to correct for wbic because there 0 represents CBU)
    covars_list = []
    wbic_i = None

    for i, covar in enumerate(covars):
        # A covariate that is also a column in df_y gets the '_x'
        # suffix from the merge, otherwise it keeps its own name
        if covar in df_xy.columns:
            covar_col = covar
        else:
            covar_col = '{}_x'.format(covar)
        covars_list.append(df_xy[covar_col].values)
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some empty arrays to contain:
    #    m: slope of the regression line
    #    c: intercept as x = 0 + the parameter estimate
    #                          for the wbic scanner location if passed
    #    c14: intercept when x = 14 (c + 14 * m)
    #    r: partial r
    #    p: p from the ols regression (for x)
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    #----------------------------------------------------------------
    # Loop through all the regions and first regress out the
    # covariates and then record m, c, p and perm_p
    # for each region
    for i, roi in enumerate(names):

        results, perm_p = permutation_ols(df_xy['{}_x'.format(roi)].values,
                                            df_xy['{}_y'.format(roi)].values,
                                            covars_orig=covars_list,
                                            categorical=categorical,
                                            n_perm=n_perm)

        # Fill up your empty arrays with the useful values
        # from the OLS results
        #== Beta =========
        m_array[i] = results.params['x']

        #== Intercept ====
        # Compare against None explicitly: wbic may legitimately be the
        # first covariate (index 0), which a plain truth test would skip.
        # NOTE: this relies on permutation_ols naming the covariate
        # parameters 'c_0', 'c_1', ... in the order they were passed.
        if wbic_i is not None:
            wbic_param = results.params['c_{}'.format(wbic_i)]
        else:
            wbic_param = 0
        c_array[i] = results.params['Intercept'] + wbic_param

        #== Int at 14 ====
        c14_array[i] = c_array[i] + 14*m_array[i]

        #== Partial r ====
        # Magnitude from the t statistic and residual degrees of
        # freedom, sign taken from the t statistic
        t = results.tvalues['x']
        df_resid = results.df_resid
        direction = -1 if t < 0 else 1
        r_array[i] = np.sqrt(t**2 / (t**2 + df_resid)) * direction

        #== p & perm_p ===
        p_array[i] = results.pvalues['x']
        perm_p_array[i] = perm_p

    #----------------------------------------------------------------
    # Calculate the fdr p values
    # (fdr returns the rejection mask first and the corrected
    # p values second; only the corrected p values are needed)
    p_fdr_array = fdr(p_array)[1]
    perm_p_fdr_array = fdr(perm_p_array)[1]

    #----------------------------------------------------------------
    # Create masked versions of the slope array
    def mask_slopes(p_values):
        # Copy of the slopes with -99 wherever p_values is above 0.05
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    regional_linregress_dict = {
        'm': m_array,
        'c': c_array,
        'c14': c14_array,
        'r': r_array,
        'p': p_array,
        'perm_p': perm_p_array,
        'p_fdr': p_fdr_array,
        'perm_p_fdr': perm_p_fdr_array,
        'm_masked_p': mask_slopes(p_array),
        'm_masked_perm_p': mask_slopes(perm_p_array),
        'm_masked_p_fdr': mask_slopes(p_fdr_array),
        'm_masked_perm_p_fdr': mask_slopes(perm_p_fdr_array),
    }

    # Return the regional regression dictionary
    return regional_linregress_dict
def parcelwise_analysis(dirz,srch_str,varsheet,varlist):
    '''
    Fit one OLS model per parcel for every directory in dirz and
    correct the resulting p values for multiple comparisons.

    For each directory, every file matching srch_str is read as a
    headerless table, its last column is dropped, and each row is summed
    to give one value per parcel (row) per file. These values are added
    to the variable spreadsheet as columns 'p0_dens', 'p1_dens', ... and
    each one is modelled with the statement returned by
    build_statement(<column name>, varlist).

    INPUTS:
        dirz ----------- iterable of directory paths to process
        srch_str ------- glob pattern (relative to each directory) that
                           selects the text files to read; files are
                           processed in sorted order
        varsheet ------- path to a spreadsheet with one row per text file.
                           Read with pandas.read_csv if the extension is
                           .csv, otherwise 'Sheet1' is parsed as Excel
        varlist -------- list of column names in varsheet that are passed
                           to build_statement

    RETURNS:
        res ------------ dictionary keyed by the final path component of
                           each directory. Each value is a pandas data
                           frame indexed by parcel number (starting at 1)
                           with columns 't', 'p', 'fdr' and 'fwe'
        sigz ----------- dictionary with the same keys. Each value is a
                           list of (parcel, fdr, fwe) tuples for parcels
                           where either corrected value is below 0.1

    RAISES:
        IOError -------- if no files match srch_str in a directory, if the
                           number of rows in varsheet differs from the
                           number of files, or if an item in varlist is
                           not a column in varsheet
    '''
    res = {}
    sigz = {}
    for direc in dirz:
        ref = os.path.basename(direc)
        print('working on directory %s'%(ref))
        print('forming matrix')
        mapz = sorted(glob(os.path.join(direc,srch_str)))
        if not mapz:
            raise IOError('no files matching %s found in %s'%(srch_str,direc))

        # mtx is parcels (rows) x files (columns). It is sized from the
        # first file; shorter files leave their trailing entries as NaN.
        mtx = None
        for i,map_path in enumerate(mapz):
            mdf = pandas.read_table(map_path,header=None)
            # NOTE(review): the last column is discarded before summing,
            # presumably an artefact of a trailing delimiter -- confirm
            mdf.drop(mdf.columns[-1],axis=1,inplace=True)
            denz = mdf.sum(axis=1).values
            if mtx is None:
                mtx = np.full((len(denz),len(mapz)),np.nan)
            mtx[:len(denz),i] = denz

        print('creating spreadsheet')
        # The sheet is re-read for every directory so that the density
        # columns from a previous directory never carry over.
        # splitext looks only at the real extension, so dots elsewhere
        # in the path do not confuse the file type detection.
        if os.path.splitext(varsheet)[1].lower() == '.csv':
            cdf = pandas.read_csv(varsheet)
        else:
            cdf = pandas.ExcelFile(varsheet).parse('Sheet1')

        if len(cdf) != mtx.shape[1]:
            raise IOError('input varsheet must have the same number of rows as there are text files in your directories')
        if any(v not in cdf.columns for v in varlist):
            raise IOError('all items in varlist must correspond to columns in varsheet')

        # One column per parcel; row order of the sheet is assumed to
        # match the sorted order of the text files
        for i,parcel_vals in enumerate(mtx):
            cdf['p%s_dens'%(i)] = parcel_vals

        print('running models')
        n_parcels = len(mtx)
        tvals = np.full(n_parcels,np.nan)
        pvals = np.full(n_parcels,np.nan)
        for i in range(n_parcels):
            stmnt = build_statement('p%s_dens'%(i),varlist)
            lm = smf.ols(stmnt,data=cdf).fit()
            # Position 1 is the second term of the fitted model
            # (presumably the first predictor after the intercept;
            # this depends on what build_statement produces)
            tvals[i] = lm.tvalues[1]
            pvals[i] = lm.pvalues[1]

        # Parcels are numbered from 1 in the results
        rdf = pandas.DataFrame({'t': tvals, 'p': pvals},
            columns=['t','p'],
            index=range(1,n_parcels+1))

        print('correcting models')
        # fdr and fwe are expected to return the corrected p values as
        # their second element, one per parcel.
        # np.asarray forces positional (not index aligned) assignment.
        fdrtst = fdr(pvals)
        fwetst = fwe(pvals)
        rdf['fdr'] = np.asarray(fdrtst[1])
        rdf['fwe'] = np.asarray(fwetst[1])

        res[ref] = rdf

        sig = []
        for parc,p_fdr,p_fwe in zip(rdf.index.tolist(),
                                    rdf['fdr'].values,
                                    rdf['fwe'].values):
            if p_fdr < 0.1 or p_fwe < 0.1:
                sig.append((parc,p_fdr,p_fwe))

        sigz[ref] = sig

    return res,sigz