def regional_linregress(df, x, aparc_names):
    '''
    regional_linregress

    Run a simple linear regression of every column named in aparc_names
    (dependent variable) against the x column (independent variable) and
    collect the per-region statistics.

    INPUTS:
        df ------------- pandas data frame
        x -------------- independent variable name (must be column in df)
        aparc_names ---- list of variable names (columns in df) to
                           loop through as dependent variables
                           for the regression

    RETURNS (as a tuple, in this order):
        m_array -------------- numpy array containing slopes for each region
        c_array -------------- numpy array containing intercepts (at 0)
                                 for each region
        r_array -------------- numpy array containing pearson r values
                                 for each region
        p_array -------------- numpy array containing raw p values
                                 for each region
        p_fdr_array ---------- numpy array containing fdr corrected p values
                                 for each region
        m_masked_array ------- numpy array containing the slope values for
                                 regions which are individually significant
                                 (raw p <= 0.05), otherwise -99 markers
        m_fdr_masked_array --- numpy array containing the slope values for
                                 regions which pass fdr correction
                                 (fdr p <= 0.05), otherwise -99 markers
    '''
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np
    from scipy.stats import linregress

    # Set up some arrays to contain the slope of the regression line (m),
    # the intercept at x = 0 (c), the raw p values (p) and the r values (r)
    # for each region.
    n_regions = len(aparc_names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    r_array = np.ones(n_regions)

    # The independent variable is the same for every region
    x_values = df[x].values

    # Loop through all the regions and record m, c, r and p for each region
    for i, roi in enumerate(aparc_names):
        m, c, r, p, _ = linregress(x_values, df[roi].values)
        m_array[i] = m
        c_array[i] = c
        r_array[i] = r
        p_array[i] = p

    # Calculate the fdr p values (run the correction once; only the
    # corrected p values are needed, not the rejection mask)
    _, p_fdr_array = fdr(p_array)

    # Create two masked versions of the slope array
    m_masked_array = np.copy(m_array)
    m_masked_array[p_array > 0.05] = -99
    m_fdr_masked_array = np.copy(m_array)
    m_fdr_masked_array[p_fdr_array > 0.05] = -99

    # Return the arrays
    return (m_array, c_array, r_array, p_array, p_fdr_array,
            m_masked_array, m_fdr_masked_array)
def regional_ttest(df, cols_list):
    '''
    Run a one sample t-test against 0 on the non-null values of every
    column named in cols_list and gather the results in a dictionary.

    INPUTS:
        df ------------- pandas data frame
        cols_list ------ list of column names in df to test

    RETURNS:
        ttest_dict ----- dictionary with the keys
                           'regions' : cols_list as passed in
                           'means'   : numpy array of column means
                           'stds'    : numpy array of column standard
                                         deviations
                           'ts'      : numpy array of t values
                           'ps'      : numpy array of raw p values
                           'fdr_ps'  : numpy array of fdr corrected p values
                           'stars'   : numpy array of significance markers
                                         based on the raw (uncorrected)
                                         p values
    '''
    means = []
    stds = []
    t_values = []
    p_values = []
    stars = []

    for col in cols_list:
        # Only the rows that actually have a value for this column
        observed = df.loc[df[col].notnull(), col]

        means.append(observed.mean())
        stds.append(observed.std())

        # Test the regional values against a population mean of 0
        t, p = ttest_1samp(observed, 0)
        t_values.append(t)
        p_values.append(p)

        # Pick a "star" marker so the result can be printed nicely
        # NOTE that these are not corrected
        if p < 0.001:
            marker = '***'
        elif p < 0.01:
            marker = '**'
        elif p < 0.05:
            marker = '*'
        else:
            marker = 'ns'
        stars.append(marker)

    # Calculate the fdr corrected p values
    fdr_mask, fdr_ps = fdr(np.array(p_values))

    # Bundle everything up
    ttest_dict = {'regions': cols_list,
                  'means': np.array(means),
                  'stds': np.array(stds),
                  'ts': np.array(t_values),
                  'ps': np.array(p_values),
                  'fdr_ps': np.array(fdr_ps),
                  'stars': np.array(stars)}

    return ttest_dict
# Report how long the preceding computation took
print(time() - start_time)

# Save the condensed correlation and p-value matrices.
# The with blocks guarantee the files are closed even if dump fails.
with open('condensed_corr.pkl', 'wb') as out:
    dump(condensed_corr, out)

with open('condensed_pval.pkl', 'wb') as out:
    dump(condensed_pval, out)

print("\t**Loading condensed matrices...")
# Read the files back in binary mode (they were written with 'wb') and
# close the handles once loaded instead of leaking them.
with open('condensed_corr.pkl', 'rb') as pkl:
    condensed_corr = load(pkl)
with open('condensed_pval.pkl', 'rb') as pkl:
    condensed_pval = load(pkl)
print("\t... done!\n")

print("\t**Correcting p-values...")
(rejecteds, corrected_pval) = fdr(condensed_pval, alpha=0.01)
print("\t... done!\n")

print("\t**Organizing condensed matrices into something you can understand (you dumbass!)...")
# Expand the condensed vector into a square matrix and set the diagonal to 1
uncorrected_corr_matrix = squareform(condensed_corr)
for n in range(uncorrected_corr_matrix.shape[0]):
    uncorrected_corr_matrix[n][n] = 1.

uncorrected_corr_df = pd.DataFrame(index=all_species_distances.index,
                                   columns=all_species_distances.index,
                                   data=uncorrected_corr_matrix)

# Boolean frame marking the pairs whose null hypothesis was rejected
rejecteds_df = pd.DataFrame(index=all_species_distances.index,
                            columns=all_species_distances.index,
                            data=squareform(rejecteds) > 0)

# Keep only the correlations flagged in rejecteds_df (the rest become NaN)
corr_df = uncorrected_corr_df[rejecteds_df]
corr_df.to_csv('significant_group_correlations-based_on_distances-bak.tab', sep='\t')
print("\t... done!\n")

#
# network time!
def regional_linregress(df, x, names, covars=[], n_perm=1000, categorical=False):
    '''
    A function that calls a multiple regression model repeatedly for
    all variable names passed (as names) in the data frame as the
    dependent variable, with the x column as the independent variable
    and the names in covars as covariates.

    INPUTS:
        df ------------- pandas data frame
        x -------------- independent variable name (must be column in df)
        names ---------- list of variable names (columns in df) to
                           loop through as dependent variables (ys)
                           for the regression
        covars --------- list containing variable names that should be
                           controlled for (must be columns in df and
                           the same for all dependent variables)
                           Default value: [ ]
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- not used by this function
                           Default value: False

    RETURNS:
        regional_linregress_dict -- dictionary of numpy arrays (one value
                                      per entry in names) with the keys:
            'm' --------------------- slopes for each region
            'c' --------------------- intercepts as returned by
                                        ols_correlation
            'c14' ------------------- intercepts at 14 as returned by
                                        ols_correlation
            'r' --------------------- r values from permutation_correlation
            'p' --------------------- raw p values
            'perm_p' ---------------- raw permutation p values
            'p_fdr' ----------------- fdr corrected p values
            'perm_p_fdr' ------------ fdr corrected permutation p values
            'm_masked_p' ------------ slopes where p <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p' ------- slopes where perm_p <= 0.05,
                                        otherwise -99 markers
            'm_masked_p_fdr' -------- slopes where p_fdr <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p_fdr' --- slopes where perm_p_fdr <= 0.05,
                                        otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Set up your covars_list
    # This contains the data for each covar as a different element.
    # The index at which wbic appears is saved because it is passed on
    # to ols_correlation (as wbic_covars_index).
    covars_list = []
    wbic_i = None
    for i, covar in enumerate(covars):
        covars_list += [df[covar].values]
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some arrays to contain:
    #    m:      slope of the regression line
    #    c:      intercept
    #    c14:    intercept when x = 14
    #    r:      r from the permutation_correlation output
    #    p:      p from the permutation_correlation output
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    # The independent variable is the same for every region
    x_values = df[x].values

    #----------------------------------------------------------------
    # Loop through all the regions and record m, c, c14, r, p and perm_p
    # for each region
    for i, roi in enumerate(names):
        y_values = df[roi].values

        # Run the permutation test
        linregress_dict = permutation_correlation(x_values,
                                                  y_values,
                                                  covars_orig=covars_list,
                                                  n_perm=n_perm)

        # Run the regular ols to get the intercept values
        results, c, c14 = ols_correlation(x_values,
                                          y_values,
                                          covars=covars_list,
                                          wbic_covars_index=wbic_i)

        # Fill up your arrays with the useful values
        m_array[i] = results.params['x']
        c_array[i] = c
        c14_array[i] = c14
        r_array[i] = linregress_dict['r']
        p_array[i] = linregress_dict['p']
        perm_p_array[i] = linregress_dict['perm_p']

    #----------------------------------------------------------------
    # Calculate the fdr p values (one correction per set of p values;
    # only the corrected p values are needed, not the rejection masks)
    _, p_fdr_array = fdr(p_array)
    _, perm_p_fdr_array = fdr(perm_p_array)

    #----------------------------------------------------------------
    # Create masked versions of the slope array
    def _mask_slopes(p_values):
        # Copy of the slopes with -99 wherever the p value is above 0.05
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    # ('perm_p_fdr' previously held a second copy of the slopes by mistake)
    regional_linregress_dict = {
        'm': m_array,
        'c': c_array,
        'c14': c14_array,
        'r': r_array,
        'p': p_array,
        'perm_p': perm_p_array,
        'p_fdr': p_fdr_array,
        'perm_p_fdr': perm_p_fdr_array,
        'm_masked_p': _mask_slopes(p_array),
        'm_masked_perm_p': _mask_slopes(perm_p_array),
        'm_masked_p_fdr': _mask_slopes(p_fdr_array),
        'm_masked_perm_p_fdr': _mask_slopes(perm_p_fdr_array),
    }

    # Return the regional regression dictionary
    return regional_linregress_dict
def regional_linregress_byregion(df_x, df_y, names, covars=[], n_perm=1000, categorical=False):
    '''
    A function that calls a multiple regression model repeatedly for
    each variable name (in names) with the data in df_y as the dependent
    variable, and the data in df_x as the independent variable.
    Data in df_x named as in covars are passed as covariates.

    The two data frames are joined (inner merge) on their 'nspn_id'
    column, so both must contain it.

    INPUTS:
        df_x ----------- pandas data frame containing x axis values
        df_y ----------- pandas data frame containing y axis values
        names ---------- list of variable names (columns in df_x and df_y)
                           to loop though and conduct pairwise regressions
        covars --------- list containing variable names that should be
                           controlled for (must be columns in df_x and
                           the same for all dependent variables)
                           Default value: [ ]
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- boolean passed on to permutation_ols, indicating
                           whether you want to permute and return the
                           Fstatistic (if True) or the parameter estimate
                           of the x variable (if False)
                           Default value: False

    RETURNS:
        regional_linregress_dict -- dictionary of numpy arrays (one value
                                      per entry in names) with the keys:
            'm' --------------------- slopes for each region
            'c' --------------------- intercepts (at 0) plus the wbic
                                        parameter estimate if 'wbic' is
                                        one of the covariates
            'c14' ------------------- intercepts at 14 (c + 14 * m)
            'r' --------------------- partial r (correcting for covariates)
            'p' --------------------- raw p values
            'perm_p' ---------------- raw permutation p values
            'p_fdr' ----------------- fdr corrected p values
            'perm_p_fdr' ------------ fdr corrected permutation p values
            'm_masked_p' ------------ slopes where p <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p' ------- slopes where perm_p <= 0.05,
                                        otherwise -99 markers
            'm_masked_p_fdr' -------- slopes where p_fdr <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p_fdr' --- slopes where perm_p_fdr <= 0.05,
                                        otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Merge the data frames together
    # Columns present in both frames get the '_x' / '_y' suffixes
    df_xy = df_x.merge(df_y, on='nspn_id', how='inner')

    #----------------------------------------------------------------
    # Set up your covars_list
    # This contains the data for each covar as a different element.
    # The values are read from the MERGED frame so that they line up row
    # for row with the x and y values used below (an inner merge can drop
    # rows, which would leave covariates taken from df_x misaligned).
    # The index at which wbic appears is saved because the param for this
    # covar is added to the intercept value later.
    covars_list = []
    wbic_i = None
    for i, covar in enumerate(covars):
        # A covariate that also exists in df_y is renamed '<covar>_x'
        # by the merge; the df_x version is the one we want.
        if covar in df_xy.columns:
            covar_col = covar
        else:
            covar_col = '{}_x'.format(covar)
        covars_list += [df_xy[covar_col].values]
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some arrays to contain:
    #    m:      slope of the regression line
    #    c:      intercept as x = 0 + the parameter estimate
    #               for the wbic scanner location if passed
    #    c14:    intercept when x = 14 (c + 14 * m)
    #    r:      partial r
    #    p:      p from the ols regression (for x)
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    #----------------------------------------------------------------
    # Loop through all the regions and record m, c, c14, r, p and perm_p
    # for each region
    for i, roi in enumerate(names):
        results, perm_p = permutation_ols(df_xy['{}_x'.format(roi)].values,
                                          df_xy['{}_y'.format(roi)].values,
                                          covars_orig=covars_list,
                                          categorical=categorical,
                                          n_perm=n_perm)

        #== Beta =========
        m_array[i] = results.params['x']

        #== Intercept ====
        # Compare against None: wbic may legitimately be the first
        # covariate (index 0), which a plain truth test would skip.
        if wbic_i is not None:
            wbic_param = results.params['c_{}'.format(wbic_i)]
        else:
            wbic_param = 0
        c_array[i] = results.params['Intercept'] + wbic_param

        #== Int at 14 ====
        c14_array[i] = c_array[i] + 14 * m_array[i]

        #== Partial r ====
        # Magnitude from the t statistic and residual degrees of freedom,
        # sign taken from the t statistic
        t = results.tvalues['x']
        df_resid = results.df_resid
        direction = -1 if t < 0 else 1
        r_array[i] = np.sqrt(t**2 / (t**2 + df_resid)) * direction

        #== p & perm_p ===
        p_array[i] = results.pvalues['x']
        perm_p_array[i] = perm_p

    #----------------------------------------------------------------
    # Calculate the fdr p values (one correction per set of p values;
    # only the corrected p values are needed, not the rejection masks)
    _, p_fdr_array = fdr(p_array)
    _, perm_p_fdr_array = fdr(perm_p_array)

    #----------------------------------------------------------------
    # Create masked versions of the slope array
    def _mask_slopes(p_values):
        # Copy of the slopes with -99 wherever the p value is above 0.05
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    # ('perm_p_fdr' previously held a second copy of the slopes by mistake)
    regional_linregress_dict = {
        'm': m_array,
        'c': c_array,
        'c14': c14_array,
        'r': r_array,
        'p': p_array,
        'perm_p': perm_p_array,
        'p_fdr': p_fdr_array,
        'perm_p_fdr': perm_p_fdr_array,
        'm_masked_p': _mask_slopes(p_array),
        'm_masked_perm_p': _mask_slopes(perm_p_array),
        'm_masked_p_fdr': _mask_slopes(p_fdr_array),
        'm_masked_perm_p_fdr': _mask_slopes(perm_p_fdr_array),
    }

    # Return the regional regression dictionary
    return regional_linregress_dict
# Finish writing the condensed correlation matrix
# (the file handle `out` is opened before this section)
dump(condensed_corr, out)
out.close()

# The with block guarantees the file is closed even if dump fails
with open('condensed_pval.pkl', 'wb') as out:
    dump(condensed_pval, out)

#
# if you want to load pre-computed condensed correlation and p-value
# matrices, uncomment below
#print "\t**Loading condensed matrices..."
#condensed_corr = load( open( 'condensed_corr.pkl' ) )
#condensed_pval = load( open( 'condensed_pval.pkl' ) )
#print "\t... done!\n"

print("\t**Correcting p-values...")
# Only the non-null p values go through the FDR correction
pvals_tested = condensed_pval[pd.notnull(condensed_pval)]
(rejecteds, corrected_pval) = fdr(pvals_tested, alpha=0.01)

# Expand the rejection flags back to the full condensed length:
# NaN correlations are never rejected, every other entry consumes the
# next flag from rejecteds.
# NOTE(review): this assumes the NaN entries of condensed_corr sit at the
# same positions as the null entries of condensed_pval -- confirm.
pos = 0
should_reject_rho = []
for uncorrect in condensed_corr:
    if np.isnan(uncorrect):
        should_reject_rho.append(False)
    else:
        should_reject_rho.append(rejecteds[pos])
        pos += 1
print("\t... done!\n")

print("\t**Organizing condensed matrices into something you can understand (you dumbass!)...")
# Expand the condensed vector into a square matrix and set the diagonal to 1
uncorrected_corr_matrix = squareform(condensed_corr)
np.fill_diagonal(uncorrected_corr_matrix, 1.)
# Fetch the fits (without triggering new computation) and keep the
# entry stored under 'kang2011'
fits = get_all_fits(data, fitter, allow_new_computation=False)
ds_fits = fits['kang2011']

for b_reversed in [False, True]:
    # First pass compares V1C with OFC, second pass swaps the two regions
    regions = ['V1C', 'OFC']
    if b_reversed:
        regions = regions[::-1]
    first_region, second_region = regions

    scores = []
    for g in data.gene_names:
        # Row 2 of the theta samples for this gene in each region
        mu1 = ds_fits[(g, first_region)].theta_samples[2, :]
        mu2 = ds_fits[(g, second_region)].theta_samples[2, :]
        t, pval = ttest_ind(mu1, mu2)

        # Turn the two sided p value into a one sided one
        # (alternative: mean of the first region < mean of the second)
        pval = pval / 2 if mu1.mean() < mu2.mean() else 1 - pval / 2
        scores.append((g, pval))

    # add FDR correction
    raw_pvals = [pval for g, pval in scores]
    _, qvals = fdr(raw_pvals)
    scores = [(g, pval, qval) for (g, pval), qval in zip(scores, qvals)]

    if b_reversed:
        filename_suffix = '-reversed'
    else:
        filename_suffix = ''
    create_top_genes_html(data, fitter, fits, scores, regions,
                          filename_suffix=filename_suffix)
for pathway in lst_pathways:
    # Load the data restricted to this pathway and age range, with the
    # ages rescaled by age_scaler
    data = GeneData.load('both')
    data = data.restrict_pathway(pathway)
    data = data.restrict_ages('EF3', PCW(10))
    data = data.scale_ages(age_scaler)

    shape = Sigmoid(priors='sigmoid_wide')
    fitter = Fitter(shape, sigma_prior='normal')

    # Fetch the fits (without triggering new computation) and keep the
    # entry stored under 'kang2011'
    fits = get_all_fits(data, fitter, allow_new_computation=False)
    ds_fits = fits['kang2011']

    for b_reversed in [False, True]:
        # First pass compares V1C with OFC, second pass swaps the regions
        regions = ['V1C', 'OFC']
        if b_reversed:
            regions = regions[::-1]
        first_region, second_region = regions

        scores = []
        for g in data.gene_names:
            # Row 2 of the theta samples for this gene in each region
            mu1 = ds_fits[(g, first_region)].theta_samples[2, :]
            mu2 = ds_fits[(g, second_region)].theta_samples[2, :]
            t, pval = ttest_ind(mu1, mu2)

            # Turn the two sided p value into a one sided one
            # (alternative: mean of the first region < mean of the second)
            pval = pval / 2 if mu1.mean() < mu2.mean() else 1 - pval / 2
            scores.append((g, pval))

        # add FDR correction
        raw_pvals = [pval for g, pval in scores]
        _, qvals = fdr(raw_pvals)
        scores = [(g, pval, qval) for (g, pval), qval in zip(scores, qvals)]

        if b_reversed:
            filename_suffix = '-reversed'
        else:
            filename_suffix = ''
        create_top_genes_html(data, fitter, fits, scores, regions,
                              filename_suffix=filename_suffix)
def parcelwise_analysis(dirz, srch_str, varsheet, varlist):
    '''
    For every directory in dirz, build a (row x file) matrix of row sums
    from the text tables matching srch_str, fit one OLS model per row
    against the variables in varsheet, and correct the resulting p values.

    INPUTS:
        dirz ----------- iterable of directory paths to process
        srch_str ------- glob pattern (relative to each directory) selecting
                           the text tables to read
        varsheet ------- path to a spreadsheet with one row per text table;
                           read with pandas.read_csv if the extension is
                           .csv, otherwise 'Sheet1' of an Excel file
        varlist -------- list of column names in varsheet, passed to
                           build_statement to build each model formula

    RETURNS:
        res ------------ dict mapping each directory's base name to a data
                           frame indexed 1..n (one row per table row) with
                           columns 't', 'p', 'fdr' and 'fwe'
        sigz ----------- dict mapping each directory's base name to a list
                           of (index, fdr, fwe) tuples for the rows where
                           fdr < 0.1 or fwe < 0.1

    RAISES:
        IOError -------- if varsheet does not have one row per text table,
                           or if an item of varlist is not a column in
                           varsheet
    '''
    res = {}
    sigz = {}
    for direc in dirz:
        # The directory's base name labels its results
        ref = os.path.split(direc)[1]
        print('working on directory %s' % (ref))

        print('forming matrix')
        mapz = sorted(glob(os.path.join(direc, srch_str)))
        # The first table fixes the number of rows of the matrix
        sz = len(pandas.read_table(mapz[0], header=None))
        mtx = np.full((sz, len(mapz)), np.nan)
        for i, map_path in enumerate(mapz):
            mdf = pandas.read_table(map_path, header=None)
            # The last column is discarded before summing
            mdf.drop(mdf.columns[-1], axis=1, inplace=True)
            denz = mdf.sum(axis=1)
            for x, d in enumerate(denz.tolist()):
                mtx[x, i] = d

        print('creating spreadsheet')
        # Use the real file extension: splitting on the first '.' breaks
        # for paths such as './data/vars.csv' or names with several dots
        if os.path.splitext(varsheet)[1].lower() == '.csv':
            cdf = pandas.read_csv(varsheet)
        else:
            cdf = pandas.ExcelFile(varsheet).parse('Sheet1')

        if len(cdf) != mtx.shape[1]:
            raise IOError('input varsheet must have the same number of rows as there are text files in your directories')

        for v in varlist:
            if v not in cdf.columns.tolist():
                raise IOError('all items in varlist must correspond to columns in varsheet')

        # One column per matrix row; spreadsheet row j receives the value
        # from the j-th text table
        for i in range(len(mtx)):
            cdf['p%s_dens' % (i)] = mtx[i, :]

        print('running models')
        # Results are indexed from 1
        rdf = pandas.DataFrame(np.full((len(mtx), 2), np.nan),
                               index=range(1, len(mtx) + 1),
                               columns=['t', 'p'])
        for i in range(len(mtx)):
            stmnt = build_statement('p%s_dens' % (i), varlist)
            lm = smf.ols(stmnt, data=cdf).fit()
            # Position 1 of the fitted terms
            # NOTE(review): presumably the first term after the intercept;
            # depends on the formula built by build_statement -- confirm
            rdf.loc[i + 1, 't'] = lm.tvalues.iloc[1]
            rdf.loc[i + 1, 'p'] = lm.pvalues.iloc[1]

        print('correcting models')
        fdrtst = fdr(np.array(rdf['p'].tolist()))
        fwetst = fwe(np.array(rdf['p'].tolist()))
        # Element 1 of each result holds the corrected p values
        for col, corrected in (('fdr', fdrtst[1]), ('fwe', fwetst[1])):
            for i in range(len(corrected)):
                rdf.loc[i + 1, col] = corrected[i]

        res.update({ref: rdf})

        # Keep the rows that survive either correction at the 0.1 level
        sig = []
        for parc in rdf.index.tolist():
            fdr_p = rdf.loc[parc, 'fdr']
            fwe_p = rdf.loc[parc, 'fwe']
            if fdr_p < 0.1 or fwe_p < 0.1:
                sig.append((parc, fdr_p, fwe_p))
        sigz.update({ref: sig})

    return res, sigz
def regional_linregress(df, x, names, covars=[], n_perm=1000, categorical=False):
    '''
    A function that calls a multiple regression model repeatedly for
    all variable names passed (as names) in the data frame as the
    dependent variable, with the x column as the independent variable
    and the names in covars as covariates.

    INPUTS:
        df ------------- pandas data frame
        x -------------- independent variable name (must be column in df)
        names ---------- list of variable names (columns in df) to
                           loop through as dependent variables (ys)
                           for the regression
        covars --------- list containing variable names that should be
                           controlled for (must be columns in df and
                           the same for all dependent variables)
                           Default value: [ ]
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- not used by this function
                           Default value: False

    RETURNS:
        regional_linregress_dict -- dictionary of numpy arrays (one value
                                      per entry in names) with the keys:
            'm' --------------------- slopes for each region
            'c' --------------------- intercepts as returned by
                                        ols_correlation
            'c14' ------------------- intercepts at 14 as returned by
                                        ols_correlation
            'r' --------------------- r values from permutation_correlation
            'p' --------------------- raw p values
            'perm_p' ---------------- raw permutation p values
            'p_fdr' ----------------- fdr corrected p values
            'perm_p_fdr' ------------ fdr corrected permutation p values
            'm_masked_p' ------------ slopes where p <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p' ------- slopes where perm_p <= 0.05,
                                        otherwise -99 markers
            'm_masked_p_fdr' -------- slopes where p_fdr <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p_fdr' --- slopes where perm_p_fdr <= 0.05,
                                        otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Set up your covars_list
    # This contains the data for each covar as a different element.
    # The index at which wbic appears is saved because it is passed on
    # to ols_correlation (as wbic_covars_index).
    covars_list = []
    wbic_i = None
    for i, covar in enumerate(covars):
        covars_list += [df[covar].values]
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some arrays to contain:
    #    m:      slope of the regression line
    #    c:      intercept
    #    c14:    intercept when x = 14
    #    r:      r from the permutation_correlation output
    #    p:      p from the permutation_correlation output
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    # The independent variable is the same for every region
    x_values = df[x].values

    #----------------------------------------------------------------
    # Loop through all the regions and record m, c, c14, r, p and perm_p
    # for each region
    for i, roi in enumerate(names):
        y_values = df[roi].values

        # Run the permutation test
        linregress_dict = permutation_correlation(x_values,
                                                  y_values,
                                                  covars_orig=covars_list,
                                                  n_perm=n_perm)

        # Run the regular ols to get the intercept values
        results, c, c14 = ols_correlation(x_values,
                                          y_values,
                                          covars=covars_list,
                                          wbic_covars_index=wbic_i)

        # Fill up your arrays with the useful values
        m_array[i] = results.params['x']
        c_array[i] = c
        c14_array[i] = c14
        r_array[i] = linregress_dict['r']
        p_array[i] = linregress_dict['p']
        perm_p_array[i] = linregress_dict['perm_p']

    #----------------------------------------------------------------
    # Calculate the fdr p values (one correction per set of p values;
    # only the corrected p values are needed, not the rejection masks)
    _, p_fdr_array = fdr(p_array)
    _, perm_p_fdr_array = fdr(perm_p_array)

    #----------------------------------------------------------------
    # Create masked versions of the slope array
    def _mask_slopes(p_values):
        # Copy of the slopes with -99 wherever the p value is above 0.05
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    # ('perm_p_fdr' previously held a second copy of the slopes by mistake)
    regional_linregress_dict = {
        'm': m_array,
        'c': c_array,
        'c14': c14_array,
        'r': r_array,
        'p': p_array,
        'perm_p': perm_p_array,
        'p_fdr': p_fdr_array,
        'perm_p_fdr': perm_p_fdr_array,
        'm_masked_p': _mask_slopes(p_array),
        'm_masked_perm_p': _mask_slopes(perm_p_array),
        'm_masked_p_fdr': _mask_slopes(p_fdr_array),
        'm_masked_perm_p_fdr': _mask_slopes(perm_p_fdr_array),
    }

    # Return the regional regression dictionary
    return regional_linregress_dict
def regional_linregress_byregion(df_x, df_y, names, covars=[], n_perm=1000, categorical=False):
    '''
    A function that calls a multiple regression model repeatedly for
    each variable name (in names) with the data in df_y as the dependent
    variable, and the data in df_x as the independent variable.
    Data in df_x named as in covars are passed as covariates.

    The two data frames are joined (inner merge) on their 'nspn_id'
    column, so both must contain it.

    INPUTS:
        df_x ----------- pandas data frame containing x axis values
        df_y ----------- pandas data frame containing y axis values
        names ---------- list of variable names (columns in df_x and df_y)
                           to loop though and conduct pairwise regressions
        covars --------- list containing variable names that should be
                           controlled for (must be columns in df_x and
                           the same for all dependent variables)
                           Default value: [ ]
        n_perm --------- number of permutations for permutation testing
                           Default value: 1000
        categorical ---- boolean passed on to permutation_ols, indicating
                           whether you want to permute and return the
                           Fstatistic (if True) or the parameter estimate
                           of the x variable (if False)
                           Default value: False

    RETURNS:
        regional_linregress_dict -- dictionary of numpy arrays (one value
                                      per entry in names) with the keys:
            'm' --------------------- slopes for each region
            'c' --------------------- intercepts (at 0) plus the wbic
                                        parameter estimate if 'wbic' is
                                        one of the covariates
            'c14' ------------------- intercepts at 14 (c + 14 * m)
            'r' --------------------- partial r (correcting for covariates)
            'p' --------------------- raw p values
            'perm_p' ---------------- raw permutation p values
            'p_fdr' ----------------- fdr corrected p values
            'perm_p_fdr' ------------ fdr corrected permutation p values
            'm_masked_p' ------------ slopes where p <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p' ------- slopes where perm_p <= 0.05,
                                        otherwise -99 markers
            'm_masked_p_fdr' -------- slopes where p_fdr <= 0.05,
                                        otherwise -99 markers
            'm_masked_perm_p_fdr' --- slopes where perm_p_fdr <= 0.05,
                                        otherwise -99 markers
    '''
    #----------------------------------------------------------------
    # Import what you need
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0 as fdr
    import numpy as np

    #----------------------------------------------------------------
    # Merge the data frames together
    # Columns present in both frames get the '_x' / '_y' suffixes
    df_xy = df_x.merge(df_y, on='nspn_id', how='inner')

    #----------------------------------------------------------------
    # Set up your covars_list
    # This contains the data for each covar as a different element.
    # The values are read from the MERGED frame so that they line up row
    # for row with the x and y values used below (an inner merge can drop
    # rows, which would leave covariates taken from df_x misaligned).
    # The index at which wbic appears is saved because the param for this
    # covar is added to the intercept value later.
    covars_list = []
    wbic_i = None
    for i, covar in enumerate(covars):
        # A covariate that also exists in df_y is renamed '<covar>_x'
        # by the merge; the df_x version is the one we want.
        if covar in df_xy.columns:
            covar_col = covar
        else:
            covar_col = '{}_x'.format(covar)
        covars_list += [df_xy[covar_col].values]
        if covar == 'wbic':
            wbic_i = i

    #----------------------------------------------------------------
    # Set up some arrays to contain:
    #    m:      slope of the regression line
    #    c:      intercept as x = 0 + the parameter estimate
    #               for the wbic scanner location if passed
    #    c14:    intercept when x = 14 (c + 14 * m)
    #    r:      partial r
    #    p:      p from the ols regression (for x)
    #    perm_p: p from the permutation test
    n_regions = len(names)
    m_array = np.ones(n_regions)
    c_array = np.ones(n_regions)
    c14_array = np.ones(n_regions)
    r_array = np.ones(n_regions)
    p_array = np.ones(n_regions)
    perm_p_array = np.ones(n_regions)

    #----------------------------------------------------------------
    # Loop through all the regions and record m, c, c14, r, p and perm_p
    # for each region
    for i, roi in enumerate(names):
        results, perm_p = permutation_ols(df_xy['{}_x'.format(roi)].values,
                                          df_xy['{}_y'.format(roi)].values,
                                          covars_orig=covars_list,
                                          categorical=categorical,
                                          n_perm=n_perm)

        #== Beta =========
        m_array[i] = results.params['x']

        #== Intercept ====
        # Compare against None: wbic may legitimately be the first
        # covariate (index 0), which a plain truth test would skip.
        if wbic_i is not None:
            wbic_param = results.params['c_{}'.format(wbic_i)]
        else:
            wbic_param = 0
        c_array[i] = results.params['Intercept'] + wbic_param

        #== Int at 14 ====
        c14_array[i] = c_array[i] + 14 * m_array[i]

        #== Partial r ====
        # Magnitude from the t statistic and residual degrees of freedom,
        # sign taken from the t statistic
        t = results.tvalues['x']
        df_resid = results.df_resid
        direction = -1 if t < 0 else 1
        r_array[i] = np.sqrt(t**2 / (t**2 + df_resid)) * direction

        #== p & perm_p ===
        p_array[i] = results.pvalues['x']
        perm_p_array[i] = perm_p

    #----------------------------------------------------------------
    # Calculate the fdr p values (one correction per set of p values;
    # only the corrected p values are needed, not the rejection masks)
    _, p_fdr_array = fdr(p_array)
    _, perm_p_fdr_array = fdr(perm_p_array)

    #----------------------------------------------------------------
    # Create masked versions of the slope array
    def _mask_slopes(p_values):
        # Copy of the slopes with -99 wherever the p value is above 0.05
        masked = np.copy(m_array)
        masked[p_values > 0.05] = -99
        return masked

    #----------------------------------------------------------------
    # Now save each of these arrays into the dictionary
    # ('perm_p_fdr' previously held a second copy of the slopes by mistake)
    regional_linregress_dict = {
        'm': m_array,
        'c': c_array,
        'c14': c14_array,
        'r': r_array,
        'p': p_array,
        'perm_p': perm_p_array,
        'p_fdr': p_fdr_array,
        'perm_p_fdr': perm_p_fdr_array,
        'm_masked_p': _mask_slopes(p_array),
        'm_masked_perm_p': _mask_slopes(perm_p_array),
        'm_masked_p_fdr': _mask_slopes(p_fdr_array),
        'm_masked_perm_p_fdr': _mask_slopes(perm_p_fdr_array),
    }

    # Return the regional regression dictionary
    return regional_linregress_dict
def parcelwise_analysis(dirz,srch_str,varsheet,varlist):
    '''
    For every directory in dirz, build a (row x file) matrix of row sums
    from the text tables matching srch_str, fit one OLS model per row
    against the variables in varsheet, and correct the resulting p values.

    INPUTS:
        dirz ----------- iterable of directory paths to process
        srch_str ------- glob pattern (relative to each directory) selecting
                           the text tables to read
        varsheet ------- path to a spreadsheet with one row per text table;
                           read with pandas.read_csv if the extension is
                           .csv, otherwise 'Sheet1' of an Excel file
        varlist -------- list of column names in varsheet, passed to
                           build_statement to build each model formula

    RETURNS:
        res ------------ dict mapping each directory's base name to a data
                           frame indexed 1..n (one row per table row) with
                           columns 't', 'p', 'fdr' and 'fwe'
        sigz ----------- dict mapping each directory's base name to a list
                           of (index, fdr, fwe) tuples for the rows where
                           fdr < 0.1 or fwe < 0.1

    RAISES:
        IOError -------- if varsheet does not have one row per text table,
                           or if an item of varlist is not a column in
                           varsheet
    '''
    res = {}
    sigz = {}
    for direc in dirz:
        # The directory's base name labels its results
        ref = os.path.split(direc)[1]
        print('working on directory %s'%(ref))

        print('forming matrix')
        mapz = sorted(glob(os.path.join(direc,srch_str)))
        # The first table fixes the number of rows of the matrix
        sz = len(pandas.read_table(mapz[0],header=None))
        mtx = np.full((sz,len(mapz)),np.nan)
        for i,map_path in enumerate(mapz):
            mdf = pandas.read_table(map_path,header=None)
            # The last column is discarded before summing
            mdf.drop(mdf.columns[-1],axis=1,inplace=True)
            denz = mdf.sum(axis=1)
            for x,d in enumerate(denz.tolist()):
                mtx[x,i] = d

        print('creating spreadsheet')
        # Use the real file extension: splitting on the first '.' breaks
        # for paths such as './data/vars.csv' or names with several dots
        if os.path.splitext(varsheet)[1].lower() == '.csv':
            cdf = pandas.read_csv(varsheet)
        else:
            cdf = pandas.ExcelFile(varsheet).parse('Sheet1')

        if len(cdf) != mtx.shape[1]:
            raise IOError('input varsheet must have the same number of rows as there are text files in your directories')

        for v in varlist:
            if v not in cdf.columns.tolist():
                raise IOError('all items in varlist must correspond to columns in varsheet')

        # One column per matrix row; spreadsheet row j receives the value
        # from the j-th text table
        for i in range(len(mtx)):
            cdf['p%s_dens'%(i)] = mtx[i,:]

        print('running models')
        # Results are indexed from 1
        rdf = pandas.DataFrame(np.full((len(mtx),2),np.nan),
                               index=range(1,len(mtx)+1),
                               columns=['t','p'])
        for i in range(len(mtx)):
            stmnt = build_statement('p%s_dens'%(i),varlist)
            lm = smf.ols(stmnt,data=cdf).fit()
            # Position 1 of the fitted terms
            # NOTE(review): presumably the first term after the intercept;
            # depends on the formula built by build_statement -- confirm
            rdf.loc[i+1,'t'] = lm.tvalues.iloc[1]
            rdf.loc[i+1,'p'] = lm.pvalues.iloc[1]

        print('correcting models')
        fdrtst = fdr(np.array(rdf['p'].tolist()))
        fwetst = fwe(np.array(rdf['p'].tolist()))
        # Element 1 of each result holds the corrected p values
        for col,corrected in (('fdr',fdrtst[1]),('fwe',fwetst[1])):
            for i in range(len(corrected)):
                rdf.loc[i+1,col] = corrected[i]

        res.update({ref: rdf})

        # Keep the rows that survive either correction at the 0.1 level
        sig = []
        for parc in rdf.index.tolist():
            fdr_p = rdf.loc[parc,'fdr']
            fwe_p = rdf.loc[parc,'fwe']
            if fdr_p < 0.1 or fwe_p < 0.1:
                sig.append((parc,fdr_p,fwe_p))
        sigz.update({ref: sig})

    return res,sigz