def correct_p_values(pvalues):
    """Apply Benjamini-Hochberg FDR correction to a nested list of p-values."""
    # import the R stats package
    stats = importr('stats')
    # collapse the nested p-value list into a single flat list
    temp_p_values = []
    for i in range(len(pvalues)):
        for j in range(len(pvalues[0])):
            temp_p_values.append(pvalues[i][j])
    # adjust the collapsed p-values using the Benjamini-Hochberg correction
    p = stats.p_adjust(FloatVector(temp_p_values), method='BH')
    # rebuild the original nested structure with the corrected values
    iter_p = iter(p)
    for i in range(len(pvalues)):
        for j in range(len(pvalues[0])):
            pvalues[i][j] = next(iter_p)
    # return the corrected p-values
    return pvalues
def get_padjust(pvalue_list):
    stats = importr('stats')
    p_adjust = stats.p_adjust(FloatVector(pvalue_list), method='BH')
    return p_adjust
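# NOTE (added sketch, not part of the original code): the rpy2 wrappers above simply delegate
# to R's p.adjust(..., method='BH'). A minimal numpy-only equivalent, assuming only numpy is
# available, mirrors R's cumulative-minimum formulation of the Benjamini-Hochberg adjustment.
import numpy as np

def bh_adjust(pvalues):
    """Benjamini-Hochberg adjustment; returns adjusted p-values in the input order."""
    p = np.asarray(pvalues, dtype=float)
    n = p.size
    order = np.argsort(p)[::-1]                    # indices of p-values, largest first
    ranked = p[order] * n / np.arange(n, 0, -1)    # p * n / rank, with rank = n..1
    adjusted = np.minimum.accumulate(ranked)       # running minimum enforces monotonicity
    adjusted = np.clip(adjusted, 0, 1)
    out = np.empty(n)
    out[order] = adjusted                          # undo the sort
    return out

# e.g. bh_adjust([0.01, 0.04, 0.03, 0.005]) -> array([0.02, 0.04, 0.04, 0.02]),
# matching R's p.adjust(c(0.01, 0.04, 0.03, 0.005), method='BH').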
        PA_list.append(IPA_lst)
        pvalue_list.append(IPA_lst[7])
    return PA_list, pvalue_list


def Get_output(filename, PA_list, padj_list):
    out_IPUI = open(filename, "w")
    first_line = ["SYMBOL", "intron_rank", "Terminal_exon", "IPAtype",
                  args.bamfile1.split("/")[-1].split(".")[0],
                  args.bamfile2.split("/")[-1].split(".")[0],
                  "diff", "Pvalue", "padj", "change"]
    out_IPUI.writelines("\t".join(first_line) + "\n")
    for i in range(len(PA_list)):
        SYMBOL, intron_rank, Terminal_exon, IPAtype, IPUIctrl, IPUIcase, IPUIdiff, pvalue = PA_list[i]
        padj = padj_list[i]
        change = "NOT"
        if padj < 0.05 and IPUIdiff > 0.1:
            change = "UP"
        if padj < 0.05 and IPUIdiff < (-0.1):
            change = "DOWN"
        out_IPUI.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            SYMBOL, intron_rank, Terminal_exon, IPAtype, IPUIctrl, IPUIcase, IPUIdiff, pvalue, padj, change))
    out_IPUI.close()


pool = Pool(args.proc)
input_tuple = list(zip(annot.keys(), [all_bamfiles] * len(annot)))
result_list = pool.map(Get_IPAevent, input_tuple)
PA_list, pvalue_list = Get_PAlist(result_list)
stats = importr('stats')
padj_list = stats.p_adjust(FloatVector(pvalue_list), method='BH')
Get_output(args.outfile, PA_list, padj_list)
def shared_bait_feature(feature_table, bait_id_column, id_column, bh_correct=False):
    # kdrew: best to enforce ids as strings
    feature_table['bait_id_column_str'] = feature_table[bait_id_column].apply(str)

    # kdrew: merge table to itself to get pairs of proteins with same bait
    feature_shared_bait_table = feature_table.merge(feature_table, on='bait_id_column_str')
    print(feature_shared_bait_table)

    feature_shared_bait_table['gene_id1_str'] = feature_shared_bait_table[id_column + '_x'].apply(str)
    feature_shared_bait_table['gene_id2_str'] = feature_shared_bait_table[id_column + '_y'].apply(str)
    feature_shared_bait_table['frozenset_geneids'] = list(map(
        frozenset, feature_shared_bait_table[['gene_id1_str', 'gene_id2_str']].values))
    feature_shared_bait_table['frozenset_geneids_str_order'] = feature_shared_bait_table[
        'frozenset_geneids'].apply(list).apply(sorted).apply(str)

    # kdrew: this way actually fails to deal with duplicate gene pairs properly, using above frozenset_geneids_str_order method
    ## kdrew: create set of id pairs (need set because merge generates duplicate gene pairs, also deals with order)
    #df_tmp = map(frozenset, feature_shared_bait_table[[args.id_column+'_x',args.id_column+'_y']].values)
    #feature_shared_bait_table['gene_id_set'] = df_tmp
    ## kdrew: generate tuple of set so groupby works, apparently cannot use cmp on sets
    #feature_shared_bait_table['gene_id_tup'] = feature_shared_bait_table['gene_id_set'].apply(tuple)
    ## kdrew: number of times pair is found (unique baits), 'k' in Hart etal 2007
    #ks = feature_shared_bait_table.groupby('gene_id_tup')[args.bait_id_column].nunique()
    ## kdrew: number of times individual id is found (unique baits), 'm' and 'n' in Hart etal 2007
    #ms = feature_shared_bait_table.groupby(args.id_column+'_x')[args.bait_id_column].nunique()
    ## kdrew: number of total experiments (unique baits), 'N' in Hart etal 2007
    #N = feature_shared_bait_table[args.bait_id_column].nunique()

    # kdrew: number of times pair is found (unique baits), 'k' in Hart etal 2007
    ks = feature_shared_bait_table.groupby('frozenset_geneids_str_order')['bait_id_column_str'].nunique()
    # kdrew: number of times individual id is found (unique baits), 'm' and 'n' in Hart etal 2007
    ms = feature_shared_bait_table.groupby('gene_id1_str')['bait_id_column_str'].nunique()
    # kdrew: number of total experiments (unique baits), 'N' in Hart etal 2007
    N = feature_shared_bait_table['bait_id_column_str'].nunique()

    #for gene_ids in bioplex_feature_shared_bait_table.gene_id_tup:
    output_dict = dict()
    output_dict['gene_id1'] = []
    output_dict['gene_id2'] = []
    output_dict['pair_count'] = []
    output_dict['neg_ln_pval'] = []
    output_dict['pval'] = []

    print(ks)
    for gene_ids_str in ks.index:
        # strip the "['id1', 'id2']" formatting to recover the two gene ids
        gene_ids_clean = gene_ids_str.translate(str.maketrans('', '', "['],"))
        gene_ids = gene_ids_clean.split()
        if len(gene_ids) == 2:
            print(gene_ids)
            k = ks[gene_ids_str]
            m = ms[gene_ids[0]]
            n = ms[gene_ids[1]]
            #p = stats.hypergeom.cdf(k, N, m, n)
            p = pval(k, n, m, N)
            neg_ln_p = -1.0 * math.log(p)
            #print "%s k:%s n:%s m:%s -ln(p):%s" % (gene_ids, k, m, n, neg_ln_p)
            output_dict['gene_id1'].append(gene_ids[0])
            output_dict['gene_id2'].append(gene_ids[1])
            output_dict['pair_count'].append(k)
            output_dict['neg_ln_pval'].append(neg_ln_p)
            output_dict['pval'].append(p)

    if bh_correct:
        stats = importr('stats')
        p_adjust = stats.p_adjust(FloatVector(output_dict['pval']), method='BH')
        output_dict['pval_corr'] = list(p_adjust)
        output_dict['neg_ln_pval_corr'] = [-1.0 * math.log(p) for p in p_adjust]

    output_df = pd.DataFrame(output_dict)
    return output_df
def DaPars_Filtering(input_file, num_samples, num_group1, output_file):
    #cfg_file = 'CFIm25_Configure.txt'
    #Group1_Tophat_aligned_file,Group2_Tophat_aligned_file,output_directory,Annotated_3UTR_file,Output_result_file,Num_least_in_group1_local,Num_least_in_group2_local,Coverage_cutoff_local,FDR_cutoff_local,Fold_change_cutoff_local,PDUI_cutoff_local = parse_cfgfile(cfg_file)
    #input_file = 'CFIm25_KD_vs_Control_3UTR_All_prediction.txt'
    #input_file = 'Wagner_3UTR_New_Nov_5_2012_All_prediction.txt'
    #output_file = 'filtered.txt'
    #num_samples = 2
    #num_group1 = 1

    # if FDR_cutoff_local != '':
    #     global FDR_cutoff
    #     FDR_cutoff = FDR_cutoff_local
    #     print FDR_cutoff
    # if PDUI_cutoff_local != '':
    #     global PDUI_cutoff
    #     PDUI_cutoff = PDUI_cutoff_local
    #     print PDUI_cutoff

    output_write = open(output_file, 'w')
    num_line = 0

    result_dict = {}
    All_P_values = []
    Selected_events_id = []
    All_mean_abundance = []

    for line in open(input_file, 'r'):
        if num_line > 0:
            fields = line.strip('\n').split('\t')
            group1_coverages = np.zeros(2)
            group2_coverages = np.zeros(2)
            num_group1_pass = 0
            group1_PDUIs = 0
            for i in range(num_group1):
                curr_long = fields[4 + i * 3]
                curr_short = fields[5 + i * 3]
                if curr_long != 'NA':
                    curr_long = float(curr_long)
                    curr_short = float(curr_short)
                    if curr_long + curr_short >= Coverage_cutoff:
                        group1_PDUIs = group1_PDUIs + float(fields[6 + i * 3])
                        num_group1_pass += 1
                        group1_coverages[0] = group1_coverages[0] + curr_long
                        group1_coverages[1] = group1_coverages[1] + curr_short
                    else:
                        fields[4 + i * 3] = 'NA'
                        fields[5 + i * 3] = 'NA'
                        fields[6 + i * 3] = 'NA'

            num_group2_pass = 0
            group2_PDUIs = 0
            for i in range(num_samples - num_group1):
                curr_long = fields[4 + (i + num_group1) * 3]
                curr_short = fields[5 + (i + num_group1) * 3]
                if curr_long != 'NA':
                    curr_long = float(curr_long)
                    curr_short = float(curr_short)
                    if curr_long + curr_short >= Coverage_cutoff:
                        group2_PDUIs = group2_PDUIs + float(fields[6 + (i + num_group1) * 3])
                        num_group2_pass += 1
                        group2_coverages[0] = group2_coverages[0] + curr_long
                        group2_coverages[1] = group2_coverages[1] + curr_short
                    else:
                        fields[4 + (i + num_group1) * 3] = 'NA'
                        fields[5 + (i + num_group1) * 3] = 'NA'
                        fields[6 + (i + num_group1) * 3] = 'NA'

            if num_group1_pass >= Num_least_in_group1 and num_group2_pass >= Num_least_in_group2:
                Final_group_diff = str(group1_PDUIs / num_group1_pass - group2_PDUIs / num_group2_pass)
                All_mean_abundance.append([group1_PDUIs / num_group1_pass, group2_PDUIs / num_group2_pass])
                fields[-1] = str(Final_group_diff)
                ratio_val, P_val = sp.stats.fisher_exact(
                    [group1_coverages / num_group1_pass, group2_coverages / num_group2_pass])
                All_P_values.append(P_val)
                Selected_events_id.append(fields[0])
                #print P_val
                #print ratio_val
            else:
                fields[-1] = 'NA'

            result_dict[fields[0]] = fields
        else:
            first_line = line.strip('\n').split('\t')
        num_line += 1

    ## Filtering
    stats = importr('stats')
    All_p_adjust = stats.p_adjust(FloatVector(All_P_values), method='BH')

    first_line.insert(-1, 'Group_A_Mean_PDUI')
    first_line.insert(-1, 'Group_B_Mean_PDUI')
    first_line.extend(['P_val', 'adjusted.P_val', 'Pass_Filter'])
    output_write.writelines('\t'.join(first_line) + '\n')

    for curr_event_id in result_dict:
        mean_PDUI_group1 = 'NA'
        mean_PDUI_group2 = 'NA'
        curr_P_val = 'NA'
        curr_FDR_val = 'NA'
        Pass_filter = 'N'

        curr_fields = result_dict[curr_event_id]
        if curr_event_id in Selected_events_id:
            sel_ind = Selected_events_id.index(curr_event_id)
            curr_P_val = str(All_P_values[sel_ind])
            curr_FDR_val = str(All_p_adjust[sel_ind])
            mean_PDUI_group1 = All_mean_abundance[sel_ind][0]
            mean_PDUI_group2 = All_mean_abundance[sel_ind][1]

            if float(curr_FDR_val) <= FDR_cutoff and abs(float(curr_fields[-1])) >= PDUI_cutoff and abs(
                    math.log((mean_PDUI_group1 + 1e-5) / (mean_PDUI_group2 + 1e-5), 2)) >= Fold_change_cutoff:
                Pass_filter = 'Y'

        curr_fields.insert(-1, str(mean_PDUI_group1))
        curr_fields.insert(-1, str(mean_PDUI_group2))
        curr_fields.append(curr_P_val)
        curr_fields.append(curr_FDR_val)
        curr_fields.append(Pass_filter)

        output_write.writelines('\t'.join(curr_fields) + '\n')

    output_write.close()
def volcano(df, group1: list, group2: list, fc=1, p_value=0.05, str1="grp1", str2="grp2", pair=False, adjust=True):
    """
    :param df: np.log2 matrix data after replacing zero with np.nan
    :param group1: column positions of the first group
    :param group2: column positions of the second group
    :param fc: log2 fold-change cutoff
    :param p_value: p-value cutoff
    :param str1: label for group1 (used in the plot title)
    :param str2: label for group2 (used in the plot title)
    :param pair: use a paired t-test if True
    :param adjust: use BH-adjusted p-values if True
    :return:
    """
    # TODO: param, test 'pair', simplify
    cols = group1 + group2
    columns = df.columns[cols]
    df = pd.DataFrame(df, columns=columns).T
    df = df.dropna(axis=1, how='all')
    pd.set_option('mode.chained_assignment', None)
    df['index'] = df.index
    df['index'][group1] = 'grp1'
    df['index'][group2] = 'grp2'
    df.index = df['index']
    df = df.drop('index', axis=1)

    data = df.applymap(lambda x: 2 ** x if type(x) == float else np.nan)
    m = np.nanmin(data.min().values) * 0.8
    data.fillna(m, inplace=True)
    dff = data.T
    dff.columns = columns
    data = data.groupby(data.index).agg(np.mean).T
    dff['fd'] = np.log2(data['grp1'] / data['grp2'])

    m = np.nanmin(df.min().values) * 0.8
    df.fillna(m, inplace=True)

    from scipy import stats
    if pair:
        x = stats.ttest_rel(df[df.index == 'grp1'], df[df.index == 'grp2'])
    else:
        x = stats.ttest_ind(df[df.index == 'grp1'], df[df.index == 'grp2'], equal_var=False)

    dff['sig'] = 'normal'
    dff['p_value'] = x.pvalue
    try:
        from rpy2.robjects.packages import importr
        from rpy2.robjects.vectors import FloatVector
        stats = importr('stats')
        dff['p_adjust_value'] = stats.p_adjust(FloatVector(dff['p_value']), method='BH')
    except ImportError:
        print("R doesn't work\nplease install rpy2")
        return None

    # Benjamini and Hochberg (BH) FDR
    # TODO: p_adjust in python
    # m = dff['p_value'].count()
    # dff['p_rank'] = dff['p_value'].rank(ascending=True)
    # dff['p_adjust_value'] = dff['p_value'] * (m / dff['p_rank'])
    # dff['p_k'] = 0.05 * dff['p_rank'] / m
    # min_rank = min(dff[dff['p_adjust_value'] > dff['p_k']]['p_rank'])
    # dff[dff['p_rank'] > min_rank]['p_adjust_value'] = dff['p_value']
    # dff = dff.drop(['p_rank', 'p_k'], axis=1)

    if adjust:
        dff['P'] = dff['p_adjust_value']
        y_text = '-log10 ( adjust p )'
    else:
        dff['P'] = dff['p_value']
        y_text = '-log10 ( p_value )'

    dff.loc[(dff['fd'] > fc) & (dff['P'] < p_value), 'sig'] = 'up'
    dff.loc[(dff['fd'] < -1 * fc) & (dff['P'] < p_value), 'sig'] = 'down'
    # dff.to_csv('../test/all.csv')

    title = str1 + '_' + str2
    x_text = 'log2 (fold change)'
    fig, ax = fig_ax(title, x_text, y_text)
    ax = sns.scatterplot(x='fd', y=-np.log10(dff['P']), hue='sig', markers='O',
                         hue_order=('down', 'normal', 'up'),
                         palette=("#377EB8", "grey", "#E41A1C"),
                         data=dff, legend=False)
    line_args = {'color': 'black', 'linestyle': '--', 'linewidth': 0.8, 'alpha': 0.4}
    plt.axhline(y=-1 * np.log10(p_value), **line_args)
    plt.axvline(x=-1 * fc, **line_args)
    plt.axvline(x=fc, **line_args)
    plt.savefig('../test/volcano.pdf', dpi=200)
    dff[dff['sig'] == 'up'].drop('sig', axis=1).to_csv('../test/up.csv')
    dff[dff['sig'] == 'down'].drop('sig', axis=1).to_csv('../test/down.csv')
    plt.show()
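# NOTE (added sketch, not part of the original function): the "TODO: p_adjust in python"
# block above scales each p-value by m / rank but never applies the running minimum that
# R's p.adjust uses, so it can disagree with the rpy2 result. A pandas-only version of the
# same BH adjustment, assuming only pandas/numpy, could look like this hypothetical helper:
import numpy as np
import pandas as pd

def bh_adjust_series(pvals: pd.Series) -> pd.Series:
    """BH-adjust a Series of p-values, preserving the original index."""
    s = pvals.sort_values(ascending=False)                    # largest p first
    m = s.size
    ranks = np.arange(m, 0, -1)                               # ranks m..1 for the sorted values
    adjusted = np.minimum.accumulate(s.values * m / ranks)    # running minimum from the top
    return pd.Series(np.clip(adjusted, 0, 1), index=s.index).reindex(pvals.index)

# e.g. dff['p_adjust_value'] = bh_adjust_series(dff['p_value']) would remove the rpy2 dependency.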
# spearman p calc based on two tailed t-test
def spearmanp(r, n):
    tstat = r * numpy.sqrt((n - 2) / (1 - r ** 2))
    return t.cdf(-abs(tstat), n - 2) * 2

# read in the data
df = pandas.read_table(path_of_file, index_col=False)
df.rename(columns={old_name: 'splitter'}, inplace=True)
#df = pandas.read_table("~/Desktop/temp_df.txt", index_col=False)
splitters = numpy.unique(df.splitter)
df_final = pandas.DataFrame(columns=["otus", "variable", "value", "p.value", "splitter"])

for numb in range(0, len(splitters)):
    df_sub = df.loc[df.splitter == splitters[numb]]
    df_data_only = df_sub.drop(df_sub.columns[range(0, last_metadata_column)], axis=1)
    df_corr_matrix = df_data_only.corr(method="spearman")
    df_corr_matrix["otus"] = df_corr_matrix.index
    # melt dataframe but maintain indices now called otus
    df_melt = pandas.melt(df_corr_matrix, id_vars="otus")
    # remove NAs or NaNs which are result of non-existent otus (all 0 values)
    df_melt = df_melt[numpy.isfinite(df_melt.value)]
    df_melt['pvalue'] = spearmanp(df_melt.value, df_sub.shape[0])
    df_melt['fdr'] = stats.p_adjust(FloatVector(df_melt.pvalue), method='fdr')
    df_melt_positive = df_melt[df_melt['value'] > 0]
    df_melt_corrected = df_melt_positive[df_melt_positive['fdr'] < 0.05]
    df_melt_corrected['splitter'] = splitters[numb]
    df_final = df_final.append(df_melt_corrected, ignore_index=True)
    # write the file
    df_final.to_csv(path_to_write + splitters[numb] + "final_co-occurrence_results.csv", index=False)

# df_final.to_csv(path_to_write, index=False)
import sys

import pandas
import scipy
import numpy
from scipy import stats
from scipy.stats import t
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

path_of_file = sys.argv[1]
old_name = str(sys.argv[2])
cutoff = float(sys.argv[3])
path_to_write = sys.argv[4]

# rpy2's R 'stats' package shadows scipy.stats from here on
stats = importr('stats')

df = pandas.read_csv(path_of_file, index_col=False)
df.rename(columns={old_name: 'pvalue'}, inplace=True)
df['fdr'] = stats.p_adjust(FloatVector(df.pvalue), method='fdr')
df_sub = df[df['fdr'] < cutoff]
df_sub.to_csv(path_to_write, index=False)
def CallRegion(wigs, refmap, genome_size_path, output, alias=None, process=8):
    """
    :param wigs: a dictionary of wig file paths; key is the group name, value is the list of wig file paths in the group
    :param refmap: the path for the reference map
    :param genome_size_path: the path for the genome size file
    :param alias: a map of group -> sample aliases for the output table
    :return: variant df, region df and region correlation df
    """
    # Load refmap
    with open(refmap, 'rb') as f:
        region_map = pickle.load(f)

    for key, value in wigs.items():
        new_wig_objs = []
        for wig in value:
            cur_wig = Wig(wig, genome_size_path)
            new_wig_objs.append(cur_wig)
        wigs[key] = new_wig_objs

    rownames_region = [region.id for region in region_map]
    rownames_variant = [variant.id for region in region_map for variant in region.variants]

    dfs_region_error = pd.DataFrame(index=rownames_region)
    dfs_variant = pd.DataFrame(index=rownames_variant)
    dfs_region = pd.DataFrame(index=rownames_region)

    groupnames = defaultdict(list)

    for key, value in wigs.items():
        # print wigs
        for i in range(len(value)):
            cur_wig = value[i]
            colname = key + '_' + cur_wig.file_name if alias is None else alias[key][i]
            # print colname
            groupnames[key].append(colname)

            region, region_error, variant = CallVariants(cur_wig, region_map, process)

            df_region_error = pd.DataFrame(region_error, columns=['region_id', colname + "_error"])
            df_region_error = df_region_error.set_index(['region_id'])

            df_region = pd.DataFrame(region, columns=['region_id', colname])
            df_region = df_region.set_index(['region_id'])

            df_variant = pd.DataFrame(variant, columns=['variant_id', colname, 'region_id'])
            df_variant = df_variant.set_index(['variant_id'])

            dfs_region_error = dfs_region_error.join(df_region_error)
            if 'region_id' in dfs_variant.columns:
                del dfs_variant['region_id']
            dfs_variant = dfs_variant.join(df_variant)
            dfs_region = dfs_region.join(df_region)

    for key in groupnames.keys():
        dfs_variant[key] = dfs_variant[groupnames[key]].mean(axis=1)
        dfs_region[key] = dfs_region[groupnames[key]].mean(axis=1)

    min_variant = min(dfs_variant[[key for key in groupnames.keys()]].min())
    min_region = min(dfs_region[[key for key in groupnames.keys()]].min())

    stats = importr('stats')

    # dict_keys is not subscriptable in Python 3, so materialize the group names first
    group_keys = list(groupnames.keys())
    for i in range(len(group_keys)):
        key1 = group_keys[i]
        for j in range(i + 1, len(group_keys)):
            key2 = group_keys[j]
            dfs_variant[key1 + "_vs_" + key2 + "_log2FC"] = np.log2(
                (dfs_variant[key1] + min_variant) / (dfs_variant[key2] + min_variant))
            dfs_variant[key1 + '_vs_' + key2 + "_P"] = 1 - scipy.stats.poisson.cdf(
                dfs_variant[[key1, key2]].max(axis=1), dfs_variant[[key1, key2]].min(axis=1))
            dfs_variant[key1 + '_vs_' + key2 + "_log10P"] = np.log10(dfs_variant[key1 + '_vs_' + key2 + "_P"])
            dfs_variant[key1 + '_vs_' + key2 + "_FDR"] = stats.p_adjust(
                FloatVector(dfs_variant[key1 + '_vs_' + key2 + "_P"].tolist()), method='BH')

            dfs_region[key1 + "_vs_" + key2 + "_log2FC"] = np.log2(
                (dfs_region[key1] + min_region) / (dfs_region[key2] + min_region))
            dfs_region[key1 + '_vs_' + key2 + "_P"] = 1 - scipy.stats.poisson.cdf(
                dfs_region[[key1, key2]].max(axis=1), dfs_region[[key1, key2]].min(axis=1))
            dfs_region[key1 + '_vs_' + key2 + "_log10P"] = np.log10(dfs_region[key1 + '_vs_' + key2 + "_P"])
            dfs_region[key1 + '_vs_' + key2 + "_FDR"] = stats.p_adjust(
                FloatVector(dfs_region[key1 + '_vs_' + key2 + "_P"].tolist()), method='BH')

    dfs_region_error.to_csv(output + '_region_error.csv')
    dfs_variant.to_csv(output + '_variant.csv')
    dfs_region.to_csv(output + '_region.csv')

    df_region_correlation = DiffVariant(region_map, dfs_variant, groupnames)

    return dfs_variant, dfs_region, df_region_correlation
alpha = float(sys.argv[3])
partial_kruskal = partial(calc_kruskal, sample_num_l=sample_num_l, alpha=alpha)
pool = Pool(processes=int(sys.argv[4]))
result = pool.map(partial_kruskal, [row for row in reader])

p_val_list = []
for elem in result:
    p_val_list += [float(elem[-2])]

stats = importr('stats')
p_adjust = stats.p_adjust(FloatVector(p_val_list), method='fdr')
# rej, pval_corr = smm.multipletests(p_val_list, alpha=alpha, method=sys.argv[6])[:2]

for index in range(len(result)):
    result[index] = result[index] + [str(p_adjust[index])]

with open(sys.argv[5], 'w') as f_out:
    f_out.write(header_line)
    f_out.writelines('\t'.join(i) + '\n' for i in result)

# with open(sys.argv[5], 'r') as correc:
#     correc_reader = csv.reader(correc, delimiter="\t")
#     correc_header_line = next(correc)
#     correc_header_line = correc_header_line.rstrip() + '\tp.adj'
alpha = float(sys.argv[3])
partial_kruskal = partial(calc_kruskal, sample_num_l=sample_num_l, alpha=alpha)
pool = Pool(processes=int(sys.argv[4]))
result = pool.map(partial_kruskal, [row for row in reader])

p_val_list = []
for elem in result:
    p_val_list += [float(elem[-1])]

stats = importr('stats')
p_adjust = stats.p_adjust(FloatVector(p_val_list), method=sys.argv[6])
# rej, pval_corr = smm.multipletests(p_val_list, alpha=alpha, method=sys.argv[6])[:2]

for index in range(len(result)):
    result[index] = result[index] + [str(p_adjust[index])]

with open(sys.argv[5], 'w') as f_out:
    f_out.write(header_line)
    f_out.writelines('\t'.join(i) + '\n' for i in result)

# with open(sys.argv[5], 'r') as correc:
#     correc_reader = csv.reader(correc, delimiter="\t")
#     correc_header_line = next(correc)
#     correc_header_line = correc_header_line.rstrip() + '\tp.adj'
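# NOTE (added sketch, not part of the original script): the commented-out smm.multipletests
# call above hints at a statsmodels alternative that drops the rpy2/R dependency. Assuming
# statsmodels is installed, the FDR step could instead be written as:
from statsmodels.stats.multitest import multipletests

rej, pval_corr = multipletests(p_val_list, alpha=alpha, method='fdr_bh')[:2]
# 'fdr_bh' is statsmodels' name for the Benjamini-Hochberg procedure; unlike R's p.adjust,
# a method string passed via sys.argv[6] (e.g. 'BH' or 'fdr') would first need to be mapped
# onto statsmodels' naming. pval_corr then plays the role of p_adjust above.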