import math

from scipy.stats import fisher_exact


def snp_count(nucs, species, recomp15, recomp16):
    SNPs = {x: 0 for x in ('A', 'T', 'C', 'G')}
    countahead = 0
    indel = ""
    consensus_num = len(recomp16.findall(nucs))
    for m in recomp15.finditer(nucs):
        prev = str(m.group(1))
        SNP = str(m.group(2))
        if prev != "None":  # nucleotide belongs to an indel
            countahead = int(prev[1]) - 1
            indel = str(prev) + str(SNP)
        elif countahead > 0:  # continue adding nucleotides 1 at a time to indel
            indel += str(SNP)
            countahead -= 1
            if countahead == 0:
                if indel in SNPs:
                    SNPs[indel] += 1
                else:
                    SNPs[indel] = 1
        else:  # SNP not part of an indel
            SNPs[SNP] += 1
    SNPcount = sum(SNPs.values())
    maxSNP = max(SNPs, key=lambda k: SNPs[k])
    maxSNP_value = SNPs[maxSNP]
    totalusedreads = consensus_num + maxSNP_value
    totalreads = consensus_num + SNPcount
    oddsratio = pvalue = nullvalue = 0
    mapping = "-"
    if species == "t":
        # Fisher's Exact Test for correctly mapped reads in a duplicated region on
        # A. thaliana. Testing for a 50/50 Con:SNP distribution.
        #
        #              Actual   Ideal (for 50/50)
        #   Consensus    a        b
        #   SNPs         c        d
        if consensus_num == totalreads and maxSNP_value == 0:
            return mapping  # time-saver
        oddsratio, pvalue = fisher_exact([[consensus_num, math.ceil(totalreads * 0.5)],
                                          [maxSNP_value, math.ceil(totalreads * 0.5)]])
    elif species == "l":
        # Fisher's Exact Test for mapped reads on A. lyrata. Testing for a 0/100
        # Con:SNP distribution.
        #
        #              Actual   Ideal (for 0/100)
        #   Consensus    a        0
        #   SNPs         c        d
        if consensus_num == 0:
            # SciPy's implementation of Fisher's Exact Test will error if this is 0;
            # a variable in scipy becomes infinity and is not handled properly, so
            # increment all variables in the test by 1.
            consensus_num += 1
            maxSNP_value += 1
            totalreads += 1
            nullvalue += 1
        if consensus_num == totalreads and maxSNP_value == 0:
            return mapping  # time-saver
        oddsratio, pvalue = fisher_exact([[consensus_num, nullvalue],
                                          [maxSNP_value, totalreads]])
    if pvalue < 0:
        # For ratios enormously different from the ideal, the p-value can come back
        # slightly negative; probably a floating-point error.
        pvalue = 0
    if pvalue >= 0.10:
        # To a 90% confidence level, cannot reject the hypothesis that the observed
        # SNP frequency is the same as the 0/100 (A. lyrata) or 50/50 (A. thaliana)
        # consensus/SNP distribution.
        mapping = maxSNP
    return mapping
from decimal import Decimal

from scipy import stats


# I don't know if this is a valid way to report the Fisher's exact test statistic,
# but the idea is that if getFisher returns a positive number over .95, there's a
# 95% chance that the group's better-than-average treatment is not due to chance.
# If it returns a number under -.95, there's a 95% chance that the group's
# worse-than-average treatment is not due to chance. I think it should be easier
# to create a color scale to show the scores on a map this way.
# The getFisher function assumes wrongly that everyone can have only one punishment
# (of each type). If the number of punishments exceeds the number of kids, it
# reduces the number of punishments (and assumes wrongly that every kid has been
# punished). But maybe the results are still close enough to correct to use for
# scaling?
def getFisher(distPop, racePop, all_punishments, group_punishments):
    """
    >>> getFisher(20, 5, 20, 10)
    0.904604
    >>> getFisher(20, 0, 20, 0)
    (None, None)
    """
    # Note: the original guards read `== 0 or None`, which always evaluates as
    # just `== 0` because a bare `None` is falsy; the redundant operand is dropped.
    if max(racePop, group_punishments) == 0:
        return None, None
    elif all_punishments == 0:
        return 1, 0
    else:
        table = [[racePop, max(distPop - racePop, 0)],
                 [group_punishments, max(all_punishments - group_punishments, 0)]]
        oddsratio, pvalueG = stats.fisher_exact(table, alternative='greater')
        oddsratio, pvalueL = stats.fisher_exact(table, alternative='less')
        if pvalueL < pvalueG:
            pv = 1 - pvalueL
        else:
            pv = pvalueG - 1
        pv = Decimal(pv).quantize(Decimal('0.000001'))
        return float(pv)
import logging

from scipy import stats


def contam_contig(nmapped_sink, nmapped_source, contam_libs, KS_THRESHOLD=0.001, P_RATIO=0.0001):
    """Determine if a contig is contaminated."""
    # count # of sources
    nsource = 0
    prop = 0.
    N = 0
    mssg = ''
    for key in contam_libs:
        mssg += key + ','
    logging.info(mssg)
    max_f = 0  # record the library that contributed most weight to prop
    for lib_scr in nmapped_source:
        if lib_scr in contam_libs:  # found a source lib
            nsource += 1
            f = float(contam_libs[lib_scr][0]) / float(contam_libs[lib_scr][1]) * nmapped_source[lib_scr].n_mapped_reads
            prop += f  # proportion
            N += nmapped_source[lib_scr].n_mapped_reads
            mssg = '%s %d' % (lib_scr, N)
            logging.info(mssg)
            if f > max_f:
                max_f = f
                lib = lib_scr
    if N != 0:
        prop /= N
    else:
        return False
    mssg = 'max lib %s, # of sources %d, P is %f, nmap_sink is %d, nmap_src is %d' % (
        lib, nsource, prop, nmapped_sink[1], N)
    logging.info(mssg)
    mssg = 'bin.test %g' % stats.binom_test(nmapped_sink[1], nmapped_sink[1] + N, prop / (1. + prop))
    logging.info(mssg)
    mssg = '%g ~ %g, p-value %g' % (
        prop, nmapped_sink[1] / float(N),
        stats.binom_test(contam_libs[lib][0], contam_libs[lib][0] + contam_libs[lib][1], prop / (1. + prop)))
    logging.info(mssg)
    if nsource == 1:
        mssg = 'fisher exact %g' % stats.fisher_exact(
            [[contam_libs[lib][0], contam_libs[lib][1]],
             [nmapped_sink[1], nmapped_source[lib].n_mapped_reads]])[1]
    else:
        mssg = 'bin.test %g' % stats.binom_test(nmapped_sink[1], nmapped_sink[1] + N, prop / (1. + prop))
    logging.info(mssg)
    if nsource > 1 and stats.binom_test(contam_libs[lib][0],
                                        contam_libs[lib][0] + contam_libs[lib][1],
                                        prop / (1. + prop)) > 0.05:
        nsource = 1
    if (nsource == 0
            or (nsource == 1 and nmapped_source[lib].similar < KS_THRESHOLD)
            or (nsource == 1 and stats.fisher_exact(
                [[contam_libs[lib][0], contam_libs[lib][1]],
                 [nmapped_sink[1], nmapped_source[lib].n_mapped_reads]])[1] < P_RATIO)
            or (stats.binom_test(nmapped_sink[1], nmapped_sink[1] + N, prop / (1. + prop)) < P_RATIO)):
        # not a contam
        return False
    slist = []
    for lib_scr in nmapped_source:
        if lib_scr in contam_libs:
            slist.append(lib_scr)
    logging.info(','.join(slist))
    return True
from scipy import stats


def fisher(donors, gene_1_mutated, other_mutated, cooc):
    a = donors - gene_1_mutated - other_mutated + cooc  # both wt (we subtracted the overlap twice)
    b = gene_1_mutated - cooc  # p53 mutated and rpl5 wt
    c = other_mutated - cooc   # rpl5 mutated and p53 wt
    d = cooc                   # both mutated
    odds, pval_lt = stats.fisher_exact([[a, b], [c, d]], "less")
    odds, pval_gt = stats.fisher_exact([[a, b], [c, d]], "greater")
    return pval_lt, pval_gt
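# A minimal usage sketch for the co-occurrence test above; the counts are
# hypothetical, not taken from the original source.
pval_lt, pval_gt = fisher(donors=500, gene_1_mutated=60, other_mutated=40, cooc=15)
# A small pval_gt suggests the two mutations co-occur more often than expected
# by chance; a small pval_lt suggests mutual exclusivity.
print(pval_lt, pval_gt)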
import re

import numpy as np
import pandas as pd
from scipy import stats


def target_enrichment_peak2peak(peak1_table, peak2_table, motif_table):
    """Perform the enrichment analysis on two samples.

    Args:
        peak1_table: pandas dataframe, motifscan result table on sample1
        peak2_table: pandas dataframe, motifscan result table on sample2
        motif_table: pandas dataframe, motif information table
    Returns:
        motif_table: pandas dataframe, table containing both motif information
            and Fisher exact test statistics
    """
    n_motif = len(motif_table)
    n_peak1 = len(peak1_table)
    n_peak2 = len(peak2_table)
    fold_change = np.zeros(n_motif)
    enrich_pvalue = np.zeros(n_motif)
    deplete_pvalue = np.zeros(n_motif)
    oddsratio = np.ones(n_motif)
    pvalue_corrected = np.ones(n_motif)
    peak1_tarnum = np.zeros(n_motif)
    peak2_tarnum = np.zeros(n_motif)
    peak1_tarnum_table = peak1_table[
        pd.Index([i for i in peak1_table.columns if re.search(r'\.tarnum', i)])]
    peak2_tarnum_table = peak2_table[
        pd.Index([i for i in peak2_table.columns if re.search(r'\.tarnum', i)])]
    for mti, motif_name in zip(range(n_motif), motif_table['name']):
        peak1_tarnum[mti] = len(
            [i for i in peak1_tarnum_table['%s.tarnum' % motif_name] if i > 0])
        peak2_tarnum[mti] = len(
            [i for i in peak2_tarnum_table['%s.tarnum' % motif_name] if i > 0])
        if peak1_tarnum[mti] != 0 and peak2_tarnum[mti] != 0:
            fold_change[mti] = float(peak1_tarnum[mti] * n_peak2) / (
                peak2_tarnum[mti] * n_peak1)
        else:
            fold_change[mti] = np.nan
        table = [[peak1_tarnum[mti], n_peak1 - peak1_tarnum[mti]],
                 [peak2_tarnum[mti], n_peak2 - peak2_tarnum[mti]]]
        oddsratio[mti], enrich_pvalue[mti] = stats.fisher_exact(table, 'greater')
        oddsratio[mti], deplete_pvalue[mti] = stats.fisher_exact(table, 'less')
        # Bonferroni correction of the smaller one-sided p-value, capped at 1.
        pvalue_corrected[mti] = min(
            min(deplete_pvalue[mti], enrich_pvalue[mti]) * n_motif, 1)
    motif_table['peak1_target_number'] = peak1_tarnum
    motif_table['peak2_target_number'] = peak2_tarnum
    motif_table['fold_change'] = fold_change
    motif_table['enrich_pvalue'] = enrich_pvalue
    motif_table['deplete_pvalue'] = deplete_pvalue
    motif_table['oddsratio'] = oddsratio
    motif_table['pvalue_corrected'] = pvalue_corrected
    # DataFrame.sort() was removed in recent pandas; sort_values() is the modern call.
    motif_table.sort_values('enrich_pvalue', inplace=True)
    return motif_table
import numpy as np
import scipy.stats as st


def statistic_analysis(np_snp_info, np_feature_snp, np_label_classifyProgress, np_label_classifyPhenotype):
    ### proportion
    np_proportion = np.average(np_feature_snp, axis=0).reshape(
        np_snp_info.shape[0], np_snp_info.shape[1])

    ### get 2x2 matrices (allele counts per class)
    # np.zeros, not np.empty: these matrices are accumulated into below, so they
    # must start from zero rather than from uninitialized memory.
    np_2_2_matrix_classifyProgress = np.zeros([np_snp_info.shape[0], 4], dtype='float')
    np_2_2_matrix_classifyPhenotype = np.zeros([np_snp_info.shape[0], 4], dtype='float')
    for idxSNP in range(0, np_snp_info.shape[0]):
        for idxSample in range(0, np_feature_snp.shape[0]):
            if np_label_classifyProgress[idxSample] == 0:
                np_2_2_matrix_classifyProgress[idxSNP, 0] += np_feature_snp[idxSample, idxSNP*3] * 2 + np_feature_snp[idxSample, idxSNP*3+1]
                np_2_2_matrix_classifyProgress[idxSNP, 2] += np_feature_snp[idxSample, idxSNP*3+1] + np_feature_snp[idxSample, idxSNP*3+2] * 2
            else:
                np_2_2_matrix_classifyProgress[idxSNP, 1] += np_feature_snp[idxSample, idxSNP*3] * 2 + np_feature_snp[idxSample, idxSNP*3+1]
                np_2_2_matrix_classifyProgress[idxSNP, 3] += np_feature_snp[idxSample, idxSNP*3+1] + np_feature_snp[idxSample, idxSNP*3+2] * 2
            if np_label_classifyPhenotype[idxSample] == 0:
                np_2_2_matrix_classifyPhenotype[idxSNP, 0] += np_feature_snp[idxSample, idxSNP*3] * 2 + np_feature_snp[idxSample, idxSNP*3+1]
                np_2_2_matrix_classifyPhenotype[idxSNP, 2] += np_feature_snp[idxSample, idxSNP*3+1] + np_feature_snp[idxSample, idxSNP*3+2] * 2
            else:
                np_2_2_matrix_classifyPhenotype[idxSNP, 1] += np_feature_snp[idxSample, idxSNP*3] * 2 + np_feature_snp[idxSample, idxSNP*3+1]
                np_2_2_matrix_classifyPhenotype[idxSNP, 3] += np_feature_snp[idxSample, idxSNP*3+1] + np_feature_snp[idxSample, idxSNP*3+2] * 2

    ### chi-square; fisher; odds ratio
    np_chi2 = np.empty([np_snp_info.shape[0], 2], dtype='float')
    np_fisher = np.empty([np_snp_info.shape[0], 2], dtype='float')
    np_oddsratio = np.empty([np_snp_info.shape[0], 2], dtype='float')
    for idxSNP in range(0, np_snp_info.shape[0]):
        np_this_2_2_matrix = np_2_2_matrix_classifyProgress[idxSNP, :].reshape(2, 2)
        print(np_this_2_2_matrix)
        chi2, p, dof, ex = st.chi2_contingency(np_this_2_2_matrix, correction=False)
        np_chi2[idxSNP, 0] = p
        oddsratio, pvalue = st.fisher_exact(np_this_2_2_matrix)
        np_fisher[idxSNP, 0] = pvalue
        np_oddsratio[idxSNP, 0] = (np_this_2_2_matrix[0, 0] * np_this_2_2_matrix[1, 1]) / (np_this_2_2_matrix[1, 0] * np_this_2_2_matrix[0, 1])
        np_this_2_2_matrix = np_2_2_matrix_classifyPhenotype[idxSNP, :].reshape(2, 2)
        chi2, p, dof, ex = st.chi2_contingency(np_this_2_2_matrix, correction=False)
        np_chi2[idxSNP, 1] = p
        oddsratio, pvalue = st.fisher_exact(np_this_2_2_matrix)
        np_fisher[idxSNP, 1] = pvalue
        np_oddsratio[idxSNP, 1] = (np_this_2_2_matrix[0, 0] * np_this_2_2_matrix[1, 1]) / (np_this_2_2_matrix[1, 0] * np_this_2_2_matrix[0, 1])

    # proportion (AA:AB:BB); ClassifyProgress (Chi2, Fisher, OddsRatio);
    # ClassifyPhenotype (Chi2, Fisher, OddsRatio)
    np_statistic_result = np.empty([np_snp_info.shape[0], 9], dtype='float')
    np_statistic_result[:, :3] = np_proportion
    np_statistic_result[:, 3] = np_chi2[:, 0]
    np_statistic_result[:, 4] = np_fisher[:, 0]
    np_statistic_result[:, 5] = np_oddsratio[:, 0]
    np_statistic_result[:, 6] = np_chi2[:, 1]
    np_statistic_result[:, 7] = np_fisher[:, 1]
    np_statistic_result[:, 8] = np_oddsratio[:, 1]
    return np_statistic_result
import re

from scipy.stats import fisher_exact


def fisher_test_file(infile, mini_site):
    ofile = open('%s.fisher_test.txt' % infile, 'w')
    # (context, unmethylated col, methylated col, site-count col) for sample 1;
    # sample 2 uses the same layout shifted 12 columns to the right. This loop
    # replaces the three near-identical CG/CHG/CHH blocks of the original.
    contexts = [('CG', 3, 4, 5), ('CHG', 6, 7, 8), ('CHH', 9, 10, 11)]
    with open(infile, 'r') as filehd:
        for line in filehd:
            line = line.rstrip()
            if len(line) > 2:
                unit = re.split(r'\t', line)
                for ctx, c_i, mc_i, n_i in contexts:
                    fields = [unit[c_i], unit[mc_i], unit[c_i + 12], unit[mc_i + 12]]
                    if 'NA' in fields or int(unit[n_i]) <= mini_site or int(unit[n_i + 12]) <= mini_site:
                        pvalue = 'NA'
                    else:
                        c1, mc1, c2, mc2 = (int(x) for x in fields)
                        oddsratio, pvalue = fisher_exact([[c1, mc1], [c2, mc2]])
                        fields = [str(c1), str(mc1), str(c2), str(mc2)]
                    ofile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                        unit[0], unit[1], unit[2], ctx,
                        fields[0], fields[1], fields[2], fields[3], str(pvalue)))
    ofile.close()
    return 1
import operator

from scipy.stats import fisher_exact


def get_sorted_fisher_dct(clus_genes, go_dct):
    '''
    Returns a list of (GO term, p-value) pairs, sorted by the p-value of
    Fisher's test for GO enrichment of the set of genes in the cluster.
    '''
    # Keys are GO terms, and values are the enrichment p-values.
    # Note: gene_universe is a module-level global in the original code.
    fisher_dct = {}
    for go_label in go_dct:
        go_genes = set(go_dct[go_label])
        # # Skip bad GO terms.
        # if len(go_genes) > MAX_GO_SIZE or len(go_genes) < MIN_GO_SIZE:
        #     continue
        # Compute the four cells for Fisher's test.
        clus_and_go = len(clus_genes.intersection(go_genes))
        clus_not_go = len(clus_genes.difference(go_genes))
        go_not_clus = len(go_genes.difference(clus_genes))
        neither = len(gene_universe) - len(go_genes.union(clus_genes))
        # Run Fisher's test.
        f_table = [[clus_and_go, clus_not_go], [go_not_clus, neither]]
        o_r, p_value = fisher_exact(f_table, alternative='greater')
        # Handle underflow issues.
        p_value = max(p_value, 1e-300)
        fisher_dct[go_label] = p_value
    return sorted(fisher_dct.items(), key=operator.itemgetter(1))
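# Hypothetical usage sketch for get_sorted_fisher_dct. gene_universe is a
# module-level global in the original code; it is defined here only to make the
# example self-contained, and all names and values are illustrative.
gene_universe = {'g%d' % i for i in range(100)}
go_dct = {'GO:0000001': ['g1', 'g2', 'g3', 'g4'], 'GO:0000002': ['g50', 'g51']}
clus_genes = {'g1', 'g2', 'g3', 'g10'}
for go_label, p_value in get_sorted_fisher_dct(clus_genes, go_dct):
    print(go_label, p_value)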
import pandas as pd
import seaborn as sb
from scipy.stats import fisher_exact


def fishers_exact_plot(data, condition1, condition2):
    """
    Perform a Fisher's exact test to compare two binary columns.

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from

    condition1: str
        First binary column to compare

    condition2: str
        Second binary column to compare
    """
    sb.factorplot(
        x=condition1,
        y=condition2,
        kind='bar',
        data=data
    )
    count_table = pd.crosstab(data[condition1], data[condition2])
    print(count_table)
    oddsratio, pvalue = fisher_exact(count_table)
    print("Fisher's Exact Test: OR: {}, p-value={}".format(oddsratio, pvalue))
    return (oddsratio, pvalue)
import numpy as np
from rpy2 import robjects
from scipy.stats import fisher_exact

import utils


def fishers(ocean, conn, interval, zone, zoneExpected, zoneObserved, globalCount):
    """
    The null hypothesis is that the relative proportions of one variable are
    independent of the second variable. For example, if you counted the number
    of male and female mice in two barns, the null hypothesis would be that the
    proportion of male mice is the same in the two barns.
    http://udel.edu/~mcdonald/statfishers.html

    inputs: zone expected, zone observed, not zone expected, not zone observed

                 in zone   not in zone
    observed        a          b
    expected        c          d
    """
    notZoneExpected = globalCount - zoneExpected
    notZoneObserved = globalCount - zoneObserved
    if zoneExpected < 1 and zoneObserved == 0:
        print("** values too low")
    else:
        table = np.array([[zoneObserved, notZoneObserved],
                          [zoneExpected, notZoneExpected]])
        odds, pval = fisher_exact(table)
        # Cross-check against R's fisher.test via rpy2 (renamed from `fishers`
        # to avoid shadowing this function).
        fishers_r = robjects.r['fisher.test']
        res_r = fishers_r(table)
        r_p = res_r[0][0]
        r_odds = res_r[2][0]
        sql = ("insert into fisherResults (ocean, zone, period, fisher, sig) values('"
               + ocean + "', '" + zone.get('name') + "', '" + str(interval) + "', "
               + formatOdds(r_odds) + ", " + str(r_p) + ")")
        utils.executeMysql_All(conn, sql)
        print("** Fishers Exact: %s: odds: %s, r_odds: %s, r_p: %s, p: %s"
              % (zone.get('name'), formatOdds(odds), r_odds, r_p, pval))
import numpy as np
from scipy.stats import fisher_exact, scoreatpercentile


def phyper_at_fpr(fg_vals, bg_vals, fpr=0.01):
    """
    Computes the hypergeometric p-value at a specific FPR (default 1%).

    Parameters
    ----------
    fg_vals : array_like
        The list of values for the positive set.

    bg_vals : array_like
        The list of values for the negative set.

    fpr : float, optional
        The FPR (between 0.0 and 1.0).

    Returns
    -------
    pvalue : float
        The one-sided ('greater') Fisher's exact p-value at the specified FPR.
    """
    fg_vals = np.array(fg_vals)
    bg_vals = np.array(bg_vals)
    s = scoreatpercentile(bg_vals, 100 - fpr * 100)
    table = [
        [sum(fg_vals >= s), sum(bg_vals >= s)],
        [sum(fg_vals < s), sum(bg_vals < s)],
    ]
    return fisher_exact(table, alternative="greater")[1]
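# Quick sanity check of phyper_at_fpr with synthetic scores (assumed data, not
# from the original project): positives drawn from a shifted distribution
# should yield a small one-sided p-value.
import numpy as np

rng = np.random.default_rng(0)
fg = rng.normal(1.0, 1.0, 500)   # positive-set scores
bg = rng.normal(0.0, 1.0, 5000)  # negative-set scores
print(phyper_at_fpr(fg, bg, fpr=0.01))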
from scipy import stats


def calculate_differential_methylation_fisher_exact(self, weighted=False):
    sum_meth_control = 0
    sum_meth_affected = 0
    sum_cov_control = 0
    sum_cov_affected = 0
    for cpg in self.cpgs:
        if weighted:
            sum_meth_control += cpg.weighted_methylation_control
            sum_meth_affected += cpg.weighted_methylation_affected
        else:
            sum_meth_control += cpg.meth_control
            sum_meth_affected += cpg.meth_affected
        # coverage is accumulated identically in both branches
        sum_cov_control += cpg.cov_control
        sum_cov_affected += cpg.cov_affected
    control = sum_meth_control / sum_cov_control
    affected = sum_meth_affected / sum_cov_affected

    control_methylated = sum_cov_control * control / 100
    control_unmethylated = sum_cov_control - control_methylated
    affected_methylated = sum_cov_affected * affected / 100
    affected_unmethylated = sum_cov_affected - affected_methylated
    try:
        # Try to use the much faster fisher module from
        # http://pypi.python.org/pypi/fisher/ (imported here as fisher_exact).
        p = fisher_exact.pvalue(control_methylated, control_unmethylated,
                                affected_methylated, affected_unmethylated)
        pvalue = p.two_tail
    except Exception:
        oddsratio, pvalue = stats.fisher_exact(
            [(control_methylated, control_unmethylated),
             (affected_methylated, affected_unmethylated)],
            alternative='two-sided')
    return pvalue
from scipy import stats


def calc(self, c_table):
    # two-sided
    p = stats.fisher_exact(c_table)[1]
    html = "<b>P-value</b> = %s" % format(p, 'g')
    tex = "$P = %s$" % format(p, 'g')
    res = dict(p=p,
               X=", ".join(self.x_cats),
               Y=", ".join(self.y_cats),
               report=dict(tex=tex, html=html))
    return res
import random
import sys

from scipy import stats


def calc_qval_dbl(study_n, pop_n, pop, assoc, term_pop, obo_dag, T=500):
    """
    :param study_n: Integer (number of ANs from sample frequency)
    :param pop_n: Integer (number of ANs from background frequency = sample freq.)
    :param pop:
    :param assoc:
    :param term_pop:
    :param obo_dag:
    :param T:
    :return: list of the smallest p-value from each of T random samples
    """
    distribution = []
    for i in range(T):
        new_study = random.sample(pop, study_n)  # add pop and study
        new_term_study = count_terms(new_study, assoc, obo_dag)[0]  #!!!
        smallest_p = 1
        for term, study_count in list(new_term_study.items()):
            pop_count = term_pop[term]
            a = study_count
            b = study_n - study_count
            c = pop_count
            d = pop_n - pop_count
            # Despite the variable name, this is the one-sided ('greater') p-value.
            p_two_tail = stats.fisher_exact([[a, b], [c, d]], alternative='greater')[1]
            if p_two_tail < smallest_p:
                smallest_p = p_two_tail
        distribution.append(smallest_p)
        if i % 10 == 0:
            print("Sample {0} / {1}: p-value {2}".format(i, T, smallest_p),
                  file=sys.stderr)
    return distribution
from scipy.stats import fisher_exact


def fip(envoSize, clusterSize, tp):
    # n is a module-level global (total population size); prf() is a helper
    # that returns precision/recall/F-score.
    fn = envoSize - tp
    fp = clusterSize - tp
    tn = n - (tp + fn + fp)
    fi = fisher_exact([[tp, fn], [fp, tn]])[1]
    fscore = prf(tp, fp, fn)[-1]
    return fi, fscore
import numpy as np
from scipy import stats


def computePropTest(data):
    mC = np.asarray(data[['mC.r1', 'mC.r2']])
    tC = np.asarray(data[['t.r1', 't.r2']])
    uC = tC - mC
    # chi2, p, dof, ex = stats.chi2_contingency(np.array([mC, uC]), correction=True)
    odr, p = stats.fisher_exact(np.array([mC, uC]))
    return p
from scipy import stats


def call_variant(self):
    """
    Mirrors AminoAcidCaller::CallVariants() in
    https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp

    For each position (that has sufficient coverage), do a Fisher exact test
    with correction; if p-val < threshold, then store it.

    Stores results in self.variant as:
        self.variant[position] = desc list of (base, count).

    NOTE: base must be either all in lower case (which means - strand) or all
    upper case (+ strand). If - strand and ('a', 10), it means the ref base is
    A on the + strand, and the transcript should be T on the - strand.

    Only positions with more than the ref base are stored.
    """
    for pos in self.positions_to_call:
        r = self.record_by_pos[pos]
        alt_variant = []
        for base, count in r.clean_counts.most_common()[1:]:
            # clean counts should NOT have indels
            assert not base.startswith('+') and not base.startswith('-')
            exp = r.clean_cov * self.err_sub
            odds, pval = stats.fisher_exact(
                [[count, r.clean_cov - count], [exp, r.clean_cov - exp]],
                alternative='greater')
            pval *= self.number_of_tests  # Bonferroni-style correction
            if pval < self.pval_cutoff:  # store variant if below cutoff
                alt_variant.append((base, count))
        if len(alt_variant) > 0:  # only record this variant if there's at least two haps
            self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant
            self.ref_base[pos] = r.ref
from scipy import stats


def MK_test(SNPs, test_mode):
    '''
    (dict, str) -> dict

    Take a dict of gene : [PN, PS, DN, DS] pairs and a string, 'fisher' or
    'G_test', and return a new dict with gene : [PN, PS, DN, DS, p-val] pairs,
    with PN and DN being respectively replacement polymorphisms and divergence,
    PS and DS being respectively synonymous polymorphisms and divergence, and
    p-val being the p-value of the contingency test using either Fisher's
    two-sided exact test or the G-test with Yates' correction
    '''
    # create new dict
    MK = {}
    # loop over genes in dict
    for gene in SNPs:
        # initialize list with PN, PS
        polym = [SNPs[gene][0], SNPs[gene][1]]
        # initialize list with DN, DS
        diverg = [SNPs[gene][2], SNPs[gene][3]]
        # perform the MK test according to fisher 2-tailed or G-test
        if test_mode == 'fisher':
            # get the p-value
            P = stats.fisher_exact([polym, diverg])[1]
        elif test_mode == 'G_test':
            P = stats.chi2_contingency([polym, diverg], lambda_='log-likelihood')[1]
        # add p-val to list
        MK[gene] = list(SNPs[gene])
        MK[gene].append(P)
    return MK
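# Toy MK_test call with illustrative counts only (gene : [PN, PS, DN, DS]):
toy_SNPs = {'geneA': [12, 30, 40, 25], 'geneB': [5, 5, 10, 10]}
mk = MK_test(toy_SNPs, 'fisher')
print(mk['geneA'])  # [PN, PS, DN, DS, p-value]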
from scipy.stats import fisher_exact


def scoreGenePair(gene_symbol_pair, variantList):
    variantList = list(variantList)
    patientsID_dictionnary = patientsID_dictionnary_b.value
    patientsID_split_index = patientsID_split_index_b.value
    score = 0
    if len(variantList) == 2:
        (genes, variantList1) = variantList[0]
        (genes, variantList2) = variantList[1]
        gene1 = genes[0]
        gene2 = genes[1]
        variantList1 = list(variantList1)
        variantList2 = list(variantList2)
        genoSum1 = getGenotypeVectorByGene(gene1, variantList1, patientsID_dictionnary, patientsID_split_index)
        genoSum2 = getGenotypeVectorByGene(gene2, variantList2, patientsID_dictionnary, patientsID_split_index)
        genoSum = [int(x > 0 and y > 0) for x, y in zip(genoSum1, genoSum2)]
        sumCase = float(sum([int(x > 0) for x in genoSum[0:patientsID_split_index]]))
        ratioCase = sumCase / patientsID_split_index
        # note: this slice starts at split_index + 1, so it skips one control sample
        sumControl = float(sum([int(x > 0) for x in genoSum[(patientsID_split_index + 1):len(patientsID_dictionnary)]]))
        ratioControl = sumControl / (len(patientsID_dictionnary) - patientsID_split_index)
        score = ratioCase - ratioControl
        pvalue = fisher_exact([[sumCase, patientsID_split_index - sumCase],
                               [sumControl, len(patientsID_dictionnary) - patientsID_split_index]],
                              'greater')[1]
        if score > 0:
            return (gene_symbol_pair,
                    ((gene1, gene2), score, pvalue, ratioCase, ratioControl, sumCase, sumControl))
import numpy as np


def enrichment_analysis(regions, data_to_scatter, synnonsyn, reference, minor_af):
    '''Enrichment of nonsynonymous mutations at globally variable but
    intrapatient conserved sites'''
    from scipy.stats import fisher_exact
    E = {}
    for region in regions:
        # returns count matrix [syn/nonsyn, low/high fitness, low/high entropy (><0.1)]
        E[region] = scatter_vs_entropy(region, data_to_scatter, synnonsyn, reference,
                                       xlabel='fitness cost', xlim=(1e-4, 1.2),
                                       enrichment_thresholds=(0.0, 0.03, 10.0))
    print('NonSyn enrichment among variable sites with high fitness costs')
    with open('../data/fitness_pooled/enrichment_st_' + args.subtype + '.tsv', 'w') as ofile:
        ofile.write('\t'.join(['region', 'nonsyn-low cost', 'nonsyn-large cost',
                               'syn-low cost', 'syn-large cost', 'OR', 'invOR', 'pval']) + '\n')
        for region, ctable in E.items():
            print(region)
            print('non-syn:\n', ctable[0])
            print('syn:\n', ctable[1])
            print('nonsyn/syn among diverse:\n', ctable[:, :, 1])
            OR, pval = fisher_exact(ctable[:, :, 1])
            print('odds ratio:', OR)
            ofile.write('\t'.join([region] + list(map(str, [
                ctable[0, 0, 1], ctable[0, 1, 1], ctable[1, 0, 1], ctable[1, 1, 1],
                OR, np.round(1.0 / OR, 2), pval]))) + '\n')
        ctable = np.sum(list(E.values()), axis=0)
        # Recompute the test on the pooled table; the original reused the last
        # region's OR and p-value here, which looks like an oversight.
        OR, pval = fisher_exact(ctable[:, :, 1])
        ofile.write('\t'.join(['all'] + list(map(str, [
            ctable[0, 0, 1], ctable[0, 1, 1], ctable[1, 0, 1], ctable[1, 1, 1],
            OR, np.round(1.0 / OR, 2), pval]))) + '\n')
    return E
import numpy as np
from scipy import stats


def fisherExact():
    '''Fisher's Exact Test: Data are taken from Altman, Table 10.14.
    Spectacle wearing among juvenile delinquents and non-delinquents who
    failed a vision test:
    spectacle wearers: 1 delinquent, 5 non-delinquents
    non-spectacle wearers: 8 delinquents, 2 non-delinquents
    '''
    # Enter the data
    obs = np.array([[1, 5], [8, 2]])

    # --- >>> START stats <<< ---
    # Calculate the Fisher Exact Test
    # Note that by default, the option "alternative='two-sided'" is set;
    # other options are 'less' or 'greater'.
    fisher_result = stats.fisher_exact(obs)
    # --- >>> STOP stats <<< ---

    # Print the result
    print('\nFISHER --------------------------------------------------------')
    print('The probability of obtaining a distribution at least as extreme '
          'as the one that was actually observed, assuming that the null '
          'hypothesis is true, is: {0:5.3f}.'.format(fisher_result[1]))
    return fisher_result
import csv
from math import log

import numpy as np
from scipy import stats
from scipy.stats import chi2_contingency


def main(contingency_table):
    """Compute statistics for a 2x2 contingency table."""
    SRS_types = set([])
    tables = {}
    for row in csv.reader(open(contingency_table), delimiter='\t'):
        ID, non_can, can = row
        SRS, tag = ID.split("_")
        SRS_types.add(SRS)
        tables[ID] = [int(non_can), int(can)]

    for srs in SRS_types:
        table = []
        table.append(tables[srs + "_YES"])
        table.append(tables[srs + "_NO"])
        obs = np.array(table)
        chi2, chi2_pvalue, chi2_dof, chi2_ex = chi2_contingency(obs, correction=False)
        chi2_yates, chi2_yates_pvalue, chi2_yates_dof, chi2_yates_ex = chi2_contingency(obs, correction=True)
        fisher_oddsratio, fisher_pvalue = stats.fisher_exact(table)
        # print(srs, table, fisher_oddsratio, fisher_pvalue, chi2, chi2_pvalue, chi2_dof, chi2_ex)
        print(srs, fisher_oddsratio, log(fisher_oddsratio, 2), fisher_pvalue,
              chi2, chi2_pvalue, chi2_yates, chi2_yates_pvalue)
import numpy as np
from scipy.stats import fisher_exact


def get_vocab(self, vectorizer, input_text, input_scores):
    train_mat = vectorizer.transform(input_text)
    input_score_med = np.median(input_scores)
    new_scores = [0 if i <= input_score_med else 1 for i in input_scores]

    pvalues = []
    for i in range(train_mat.shape[1]):
        lcol = np.asarray(train_mat.getcol(i).todense().transpose())[0]
        good_lcol = lcol[[n for n in range(len(new_scores)) if new_scores[n] == 1]]
        bad_lcol = lcol[[n for n in range(len(new_scores)) if new_scores[n] == 0]]
        good_lcol_present = len(good_lcol[good_lcol > 0])
        good_lcol_missing = len(good_lcol[good_lcol == 0])
        bad_lcol_present = len(bad_lcol[bad_lcol > 0])
        bad_lcol_missing = len(bad_lcol[bad_lcol == 0])
        oddsratio, pval = fisher_exact(
            [[good_lcol_present, bad_lcol_present],
             [good_lcol_missing, bad_lcol_missing]]
        )
        pvalues.append(pval)

    col_inds = range(train_mat.shape[1])
    p_frame = np.array([col_inds, pvalues]).transpose()
    p_frame = p_frame[p_frame[:, 1].argsort()]
    rows = p_frame.shape[0]
    selection = min(self.max_features, rows)
    feature_names = vectorizer.get_feature_names()
    vocab = [feature_names[int(i)] for i in p_frame[:, 0][-selection:]]
    return vocab
from scipy.stats import fisher_exact


def scoreVariantPair(variantIDpair, value_GenotypeListPair):
    genotypeListPair = list(value_GenotypeListPair)
    patientsID_dictionnary = patientsID_dictionnary_b.value
    patientsID_split_index = patientsID_split_index_b.value
    score = 0
    if len(genotypeListPair) == 2:
        (variantID, genotypeList1) = genotypeListPair[0]
        (variantID, genotypeList2) = genotypeListPair[1]
        variantID1 = variantID[0]
        variantID2 = variantID[1]
        genotypeList1 = list(genotypeList1)
        genotypeList2 = list(genotypeList2)
        genotypeVector1 = getGenotypeVector(genotypeList1)
        genotypeVector2 = getGenotypeVector(genotypeList2)
        genotypeVector = [int(x > 0 and y > 0) for x, y in zip(genotypeVector1, genotypeVector2)]
        sumCase = float(sum([int(x > 0) for x in genotypeVector[0:patientsID_split_index]]))
        ratioCase = sumCase / patientsID_split_index
        # note: this slice starts at split_index + 1, so it skips one control sample
        sumControl = float(sum([int(x > 0) for x in genotypeVector[(patientsID_split_index + 1):len(patientsID_dictionnary)]]))
        ratioControl = sumControl / (len(patientsID_dictionnary) - patientsID_split_index)
        score = ratioCase - ratioControl
        pvalue = fisher_exact([[sumCase, patientsID_split_index - sumCase],
                               [sumControl, len(patientsID_dictionnary) - patientsID_split_index]],
                              'greater')[1]
        # if score > 0:
        return (variantIDpair,
                ((variantID1, variantID2), score, pvalue, ratioCase, ratioControl, sumCase, sumControl))
from scipy.stats import fisher_exact


def scoreGene(block):
    block = list(block)
    lenb = len(block)
    scores = []
    patientsID_dictionnary = patientsID_dictionnary_b.value
    patientsID_split_index = patientsID_split_index_b.value
    if lenb > 0:
        for i in range(0, lenb):
            listLoadBlock = block[i]
            sumCase = float(sum([int(x > 0) for x in listLoadBlock[1][0:patientsID_split_index]]))
            sumControl = float(sum([int(x > 0) for x in listLoadBlock[1][patientsID_split_index:len(patientsID_dictionnary)]]))
            ratioCase = sumCase / patientsID_split_index
            ratioControl = sumControl / (len(patientsID_dictionnary) - patientsID_split_index)
            score = ratioCase - ratioControl
            pvalue = fisher_exact([[sumCase, patientsID_split_index - sumCase],
                                   [sumControl, len(patientsID_dictionnary) - patientsID_split_index]],
                                  'greater')[1]
            # pvalue = ttest_ind(genotypeVectorByGene[0:patientsID_split_index],
            #                    genotypeVectorByGene[patientsID_split_index:len(patientsID_dictionnary)])[1] / 2
            if score > 0:
                scores.append((listLoadBlock[0],
                               (score, pvalue, ratioCase, ratioControl, sumCase, sumControl)))
    return scores
from scipy.stats import fisher_exact


def scoreGenePair(block1, block2):
    block1 = list(block1)
    lenb1 = len(block1)
    lenb2 = len(block2)
    scores = []
    patientsID_dictionnary = patientsID_dictionnary_b.value
    patientsID_split_index = patientsID_split_index_b.value
    if lenb1 > 0 and lenb2 > 0:
        for i in range(0, lenb1):
            for j in range(0, lenb2):
                listLoadBlock1 = block1[i]
                listLoadBlock2 = block2[j]
                if listLoadBlock1[0] > listLoadBlock2[0]:
                    genoSum = [int(x > 0 and y > 0) for x, y in zip(listLoadBlock1[1], listLoadBlock2[1])]
                    sumCase = float(sum([int(x > 0) for x in genoSum[0:patientsID_split_index]]))
                    sumControl = float(sum([int(x > 0) for x in genoSum[patientsID_split_index:len(patientsID_dictionnary)]]))
                    ratioCase = sumCase / patientsID_split_index
                    ratioControl = sumControl / (len(patientsID_dictionnary) - patientsID_split_index)
                    score = ratioCase - ratioControl
                    pvalue = fisher_exact([[sumCase, patientsID_split_index - sumCase],
                                           [sumControl, len(patientsID_dictionnary) - patientsID_split_index]],
                                          'greater')[1]
                    if score > 0:
                        scores.append(((listLoadBlock1[0], listLoadBlock2[0]),
                                       ((listLoadBlock1[0], listLoadBlock2[0]), score, pvalue,
                                        ratioCase, ratioControl, sumCase, sumControl)))
    return scores
from scipy import stats


def chi_mode(data, depth, low=lcut, alpha=chi, f=freq):
    # lcut, chi and freq are module-level globals captured as default arguments.
    result = dict()
    plus = data['A'][0] + data['T'][0] + data['G'][0] + data['C'][0]
    minus = data['A'][1] + data['T'][1] + data['G'][1] + data['C'][1]
    for key in ['A', 'T', 'G', 'C']:
        if data[key][0] >= low[0] * depth and data[key][1] >= low[1] * depth:
            ndep = data[key][2]
            frequency = ndep / float(data['cover'])
            if frequency >= f:
                # add chi-square test:
                if frequency > 0.5:
                    result[key] = frequency
                else:
                    a = data[key][0]
                    b = data[key][1]
                    c = plus - data[key][0]
                    d = minus - data[key][1]
                    least = sorted([a, b, c, d])[0]
                    table = [[a, b], [c, d]]
                    if least < 5:
                        # small counts: fall back to Fisher's exact test
                        pvalue = stats.fisher_exact(table)[1]
                    else:
                        pvalue = stats.chi2_contingency(table)[1]
                    if pvalue > alpha:
                        result[key] = frequency
    return result
def get_confusion_matrix_fisher_significance(table, alternative='two-sided'):
    """
    Returns the value of the fisher_exact test on table.

    Parameters
    ----------
    table : array_like of ints
        A 2x2 contingency table. Elements should be non-negative integers.

    alternative : {'two-sided', 'less', 'greater'}, optional
        Which alternative hypothesis to the null hypothesis the test uses.
        Default is 'two-sided'.

    Returns
    -------
    oddsratio : float
        This is the prior odds ratio and not a posterior estimate.

    p_value : float
        P-value, the probability of obtaining a distribution at least as
        extreme as the one that was actually observed, assuming that the
        null hypothesis is true.
    """
    from scipy.stats import fisher_exact
    return fisher_exact(table, alternative)
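# Example call with a hypothetical 2x2 confusion-matrix summary:
odds, p = get_confusion_matrix_fisher_significance([[10, 2], [3, 15]])
print(odds, p)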
from scipy import stats


def fisherExact(g1r1, g1r2, g2r1, g2r2, nTails=1):
    # Returns a p-value.
    # Params: group 1 result 1, group 1 result 2, etc.
    table = [[g1r1, g1r2], [g2r1, g2r2]]
    if nTails == 1:
        pGreater = stats.fisher_exact(table, alternative='greater')[1]
        pLess = stats.fisher_exact(table, alternative='less')[1]
        p = min(pGreater, pLess)
    else:  # default is two-tailed
        p = stats.fisher_exact(table)[1]
    return p
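# One-tailed vs two-tailed usage of the wrapper above (illustrative counts):
p1 = fisherExact(8, 2, 1, 5, nTails=1)  # smaller of the 'greater'/'less' p-values
p2 = fisherExact(8, 2, 1, 5, nTails=2)  # two-tailed
print(p1, p2)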
import numpy as np
from scipy import stats


def check_diff(tl, bg, total_num=100015):
    # total_num = 100015 with clusters, total_num = 37391 without them
    tl_mean = np.mean(tl)
    bg_mean = np.mean(bg)
    is_different = False
    # Note: scipy's fisher_exact coerces the table to integers, so these float
    # means are truncated before the test is run.
    oddsratio, pvalue = stats.fisher_exact([[tl_mean, total_num - tl_mean],
                                            [bg_mean, total_num - bg_mean]])
    if pvalue < 0.00005:
        is_different = True
    return is_different
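# Caveat for check_diff above: scipy's fisher_exact coerces the table to an
# integer array (at least in the scipy versions this code targets), so float
# means are truncated. Minimal demonstration of the truncation:
from scipy import stats

print(stats.fisher_exact([[2.9, 7.9], [5.0, 5.0]])[1])  # behaves like [[2, 7], [5, 5]]
print(stats.fisher_exact([[2, 7], [5, 5]])[1])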
        if gene not in subject:
            c.append(gene)
    for gene in all_genes:
        if gene not in subject and gene not in gene_set:
            d.append(gene)
    return a, b, c, d


subject = create_subject()
gene_sets = create_gene_sets()
all_genes = create_all_genes()

for set_name in gene_sets:
    experiment = []
    gene_set = create_gene_set(set_name)
    title = gene_set[0]
    description = gene_set[1]
    genes = gene_set[2:]
    a, b, c, d = contingency_matrix(subject, genes, all_genes)
    experiment.append(title)                                 # 0
    experiment.append(description)                           # 1
    experiment.append([[a, b], [c, d]])                      # 2
    experiment.append([[len(a), len(b)], [len(c), len(d)]])  # 3
    oddsratio, p_value = fisher_exact(experiment[3])
    experiment.append(p_value)                               # 4
    # print('Genes in subject and in gene set:', len(a))
    # print('Genes in subject and not in gene set:', len(b))
    # print('Genes not in subject and in gene set:', len(c))
    # print('Genes not in subject and not in gene set:', len(d))
    experiments.append(experiment)
        len(disease_module_genelist),
        ",".join(disease_module_snplist),
        ",".join(disease_module_genelist)
    ]
    temp += [",".join(module_geneset)]  # module gene set
    fisher_test_matrix = [
        [len(disease_module_genelist),
         num_eGene - len(disease_module_genelist)],
        [module_nodes.shape[0] - len(disease_module_genelist),
         num_gg_nodeset - num_eGene - module_nodes.shape[0] + len(disease_module_genelist)]
    ]
    oddsratio, pvalue = stats.fisher_exact(fisher_test_matrix)
    # temp += [pvalue, oddsratio, len(set(module_geneset) & PD_familial),
    #          ",".join(list(set(module_geneset) & PD_familial))]
    temp += [pvalue, oddsratio]
    print(label, ' : ', fisher_test_matrix)
    result = pd.concat([result, pd.DataFrame([temp], columns=result.columns)])  # insert row

fdr = statsmodels.stats.multitest.multipletests(result['p-value'], method='fdr_bh', is_sorted=False)
result['FDR'] = fdr[1]

## save result
result.sort_values(['p-value'], ascending=[1], inplace=True)  # sort
result.insert(0, 'label', list(range(1, result.shape[0] + 1, 1)))  # insert col
from scipy.stats import fisher_exact


def fisher_enrich(sample, annotations, depletions=True, background=None,
                  restrict=False, min_fold=None, min_overlap=None, fdr=None):
    """
    Given a sample of elements (members) and an annotation mapping
    [term][member] (nested dictionary), compute significant enrichments among
    the members.
    : If not specified, background is all annotated terms
    : If not specified, analysis is "open" (terms can appear in sample/annotations not in background)
    : Default is not FDR corrected for multiple tests
    : Default is positive enrichment of 1.1+, pvalue < 0.05, and overlap must be at least two members
    """
    if not background:
        background = generate_background(annotations)
    if restrict:
        # restrict sample and annotation space to background; may result in
        # empty annotations (which are removed)
        sample = [k for k in sample if k in background]
        annotations = {term: {k: 1 for k in annotations[term] if k in background}
                       for term in annotations}
        annotations = {term: dictMembers for term, dictMembers in annotations.items()
                       if len(dictMembers) > 0}
    # calculate results (enrichment stats for each term in the annotations map)
    results = []
    for term, members in annotations.items():
        # overlap between members with term and members of sample
        overlap = list(set(sample) & set(members))
        # counts
        count_overlap = len(overlap)
        count_background = len(background)
        count_sample = len(sample)
        count_term = len(members)
        count_sample_not_term = count_sample - count_overlap
        count_term_not_sample = count_term - count_overlap
        count_remainder = count_background - count_overlap - count_term_not_sample - count_sample_not_term
        # frequencies
        freq_sample = count_overlap / float(count_sample)
        freq_background = count_term / float(count_background)
        # fold enrichment
        fold_enrichment = freq_sample / freq_background
        # contingency table for fisher exact
        table = [[count_overlap, count_sample_not_term],
                 [count_term_not_sample, count_remainder]]
        # calculate pvalue and store results
        pvalue = fisher_exact(table)[1]  # [0] is an odds ratio; do not want
        results.append([term, len(overlap), fold_enrichment, pvalue])
    # convert pvalues to qvalues
    pvalues = [r[-1] for r in results]
    qvalues = pvalues2qvalues(pvalues)
    # attach qvalues
    for i, result in enumerate(results):
        results[i].append(qvalues[i])
    # filter results
    results2 = []
    for result in sorted(results, key=lambda x: x[-1]):
        include = True
        term, overlap, fe, pvalue, qvalue = result
        if min_overlap is not None and overlap < min_overlap:
            include = False
        if min_fold is not None and 1 / float(min_fold) < fe < min_fold:
            include = False
        if not depletions and fe < 1:
            include = False
        if fdr is not None and qvalue > fdr:
            include = False
        if include:
            results2.append(result)
    return results2
from scipy import stats


def np_fisher(a, b, c, d):
    oddsratio, pvalue = stats.fisher_exact([[a, b], [c, d]])
    return (oddsratio, pvalue)
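# Example (arbitrary counts): np_fisher simply wraps a 2x2 table given as four scalars.
odds, p = np_fisher(8, 2, 1, 9)
print(odds, p)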
from scipy.stats import fisher_exact


def do_fisher_exact(x):
    return fisher_exact([[x[0], x[1]], [x[2], x[3]]], alternative='greater')
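# do_fisher_exact expects any length-4 sequence (a, b, c, d); counts here are
# illustrative. It returns scipy's (odds ratio, one-sided p-value) tuple.
odds, p = do_fisher_exact((12, 5, 29, 2))
print(odds, p)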
import sys

import numpy as np
import pandas as pd
from scipy.stats import fisher_exact


def mutations_association(args):
    # position to handle (will process interval of 250 positions ahead)
    input_x = args.position
    freqs = pd.read_csv(args.freqs_file, sep="\t")
    freqs = freqs[freqs['Pos'] == np.round(freqs['Pos'])]  # remove insertions  # TODO- maoz- why not simply ref != '-'
    if (input_x < freqs["Pos"].min()) or (input_x > freqs["Pos"].max()):
        sys.exit()
    # blast files (all .fasta.blast files joined together)
    all_mappings = pd.read_csv(args.blast_output, names=["read_id", "start", "end"], sep="\t")
    # summary of all observed mutations from ref, including mappings to origin reads
    all_mutations = pd.read_csv(args.mutations_all,
                                names=["pos", "read_id", "mutant", "read_positions"],
                                sep="\t")  # TODO- what mutations are included in mutations_all? is there a threshold?
    cons = freqs[(freqs["Rank"] == 0) & (freqs["Base"] != "-")]
    cons.insert(0, "pos", pd.to_numeric(cons.loc[:, "Pos"]))
    all_mutations = pd.merge(all_mutations, cons[["pos", "Ref"]], on="pos")  # adding Ref\Cons to all_mutations
    # remove C>A and G>T
    # all_mutations = all_mutations[~(((all_mutations["Ref"] == "C") & (all_mutations["mutant"] == "A")) |
    #                                 ((all_mutations["Ref"] == "G") & (all_mutations["mutant"] == "T")))]
    # variants = all_mutations["pos"].unique()
    variants_combinations = range(input_x + 1, input_x + 2)  # x -> (x+1, x+2) instead of (x+1, x+250)
    for y in variants_combinations:
        x = input_x
        # reads surrounding the [x, y] interval
        maps_for_two_pos = all_mappings[(all_mappings["start"] <= x) & (all_mappings["end"] >= y)]
        merge_read_id = pd.DataFrame({"read_id": maps_for_two_pos["read_id"].unique()})
        merge_x = all_mutations[all_mutations["pos"] == x][["pos", "read_id"]]
        merged = pd.merge(merge_read_id, merge_x, on="read_id", how="left")
        merge_y = all_mutations[all_mutations["pos"] == y][["pos", "read_id"]]
        merged = pd.merge(merged, merge_y, on="read_id", how="left")
        x_label = "pos_" + str(x)
        y_label = "pos_" + str(y)
        merged[x_label] = np.where(merged["pos_x"] == x, 1, 0)
        merged[y_label] = np.where(merged["pos_y"] == y, 1, 0)
        ct = pd.crosstab(merged[x_label], merged[y_label])
        # output columns: statistic ('odds ratio'), p-value, shared_freq
        if ct.shape == (2, 2):
            fisher_test = fisher_exact(ct, alternative='greater')  # TODO- review fisher's test
            print('\t'.join([str(x) for x in [
                x, y, fisher_test[0], fisher_test[1],
                ct[1][1] * 1.0 / (ct[0][0] + ct[0][1] + ct[1][0] + ct[1][1])
            ]]))
        else:
            print('\t'.join([str(x) for x in [x, y, 0.0, 1.0, 0.0]]))
    counttot = 0
    for i in dic:
        countid += dic[i][intid][0]
        counttot += dic[i][intid][1]
    return [countid, counttot]


## runs tests and produces list of pvals corresponding to ids list
## cor_pval is a list of pvals corresponding to ids list corrected for multiple tests
pval = []
tests = {}
for i in ids:
    test = [pm_dict[i], return_sums(i, names)]
    odds, p = stats.fisher_exact(test)  # one call instead of two identical ones
    tests[i] = [p, odds]
    pval.append(tests[i][0])
cor_pval = pcorrect.multipletests(pval, alpha=0.05, method='fdr_bh')[1]

species = []
for i in range(1, len(sys.argv) - 1):
    species.append(sys.argv[i].split('/')[-1][:3])
species.remove('Pma')

outfile.write('InterPro ID\tPma\t' + '\t'.join(species) + '\tpvalue\tcorrected P value\tup/down ratio\n')
st = 0
import scipy.stats as stats

ref = items[2].upper()
depth = int(items[3])
if depth > 0:
    match = items[4]
    quality = items[5]
    count, pos_n, in_base, del_base = translate_bases(ref, depth, match)
    print(count)
    print(pos_n)
    print(in_base)
    print(del_base)
    num_ref = count[gt_ref] + count[gt_ref.lower()]
    num_alt = count[gt_alt] + count[gt_alt.lower()]
    print(num_ref, num_alt)
    ci_lower, ci_upper = wilson_binom_interval(num_alt, num_alt + num_ref, alpha=0.05)
    print(ci_lower, ci_upper)
    # strand-bias test: forward/reverse counts for ref vs alt alleles
    num_ref_for = count[gt_ref]
    num_ref_rev = count[gt_ref.lower()]
    num_alt_for = count[gt_alt]
    num_alt_rev = count[gt_alt.lower()]
    oddsratio, pvalue = stats.fisher_exact([[num_ref_for, num_ref_rev],
                                            [num_alt_for, num_alt_rev]])
    print(oddsratio, pvalue)
    quality_ref, quality_alt = translate_qualities(gt_ref, gt_alt, pos_n, quality)
    print(quality_ref, quality_alt)
    scores = compute_table(gt_ref, gt_alt, quality_ref, quality_alt)
    print(scores)
import numpy as np
from scipy import stats

odds_good = []
p_val_bad = []
odds_bad = []
p_val_pre = []
odds_pre = []
genes_listy = []
for row in matrix_gene_counts:
    # 0 = gene name, 1 = good, 2 = bad, 3 = pre-chemo, 4 = control
    row = list(np.array(row).reshape(-1, ))
    genes_listy.append(row[0])
    absent_good = good_total - int(row[1])  # total good individuals minus individuals with variant gene present
    absent_bad = bad_total - int(row[2])
    absent_pre = pre_total - int(row[3])
    absent_control = control_total - int(row[4])
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[1]), int(row[2])], [absent_good, absent_bad]]
    )  # alternative may be 'less' or 'greater'
    p_values_good_bad.append(pvalue)
    odds_good_bad.append(oddsratio)
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[1]), int(row[4])], [absent_good, absent_control]]
    )
    p_val_good.append(pvalue)
    odds_good.append(oddsratio)
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[2]), int(row[4])], [absent_bad, absent_control]]
    )
    p_val_bad.append(pvalue)
    odds_bad.append(oddsratio)
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[3]), int(row[4])], [absent_pre, absent_control]]
    )  # the snippet was truncated here; these appends follow the pattern of the tests above
    p_val_pre.append(pvalue)
    odds_pre.append(oddsratio)
from scipy.stats import fisher_exact


def producer(queue, arrs):
    for comAr in arrs:
        com, ar = comAr
        # fisher_exact returns an (odds ratio, p-value) tuple
        p = fisher_exact(ar, 'greater')
        queue.put([com, ar[0, 0], p])
import numpy as np
from scipy.stats import fisher_exact

chromosome_length = np.loadtxt(open("../Data/chromosome_file.txt", 'r'), delimiter="\t", dtype=str)
chr_number = np.loadtxt(open("../Data/chr_distribution_number.txt", 'r'), delimiter="\t", dtype=str, skiprows=1)
chr_length_list = chromosome_length[:, 2].astype(float)
variant_number_list = chr_number[:, 3].astype(float)  # 1-pos 2-neg 3-all
chr_length_total = np.sum(chr_length_list, axis=0)
variant_number_total = np.sum(variant_number_list, axis=0)

p_value_g_list = []
p_value_l_list = []
for x in range(24):
    a = variant_number_list[x]
    b = chr_length_list[x] - a
    c = variant_number_total - a
    d = chr_length_total - variant_number_total - b
    pvalue_greater = fisher_exact([[a, b], [c, d]], alternative="greater")[1]
    pvalue_less = fisher_exact([[a, b], [c, d]], alternative="less")[1]
    p_value_g_list.append(pvalue_greater)
    p_value_l_list.append(pvalue_less)

# bh_qvalues() is a Benjamini-Hochberg helper defined elsewhere in this project.
q_value_g_list = bh_qvalues(p_value_g_list)
q_value_l_list = bh_qvalues(p_value_l_list)

out = []
out.append(["chromosome", "variant number", "p-value-g", "FDR-g", "p-value-l", "FDR-l"])
for x in range(24):
    out.append([chr_number[x][0], chr_number[x][3], p_value_g_list[x],
                q_value_g_list[x], p_value_l_list[x], q_value_l_list[x]])
np.savetxt("all_distribution.txt", np.array(out), delimiter="\t", fmt="%s")
print("Finished.")
from scipy import stats


def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData):
    assert len(combined_data.vars) == 2

    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(xs) == 1
    assert len(ys) == 1

    x = xs[0]
    y = ys[0]

    # Get the count for each category
    x_cat = [k for k, v in x.metadata[categories].items()]
    y_cat = [k for k, v in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = []  # labels for the order in which data is stored in data array (defined above)
    for xc in x_cat:
        table_row = []
        table_row_key = []
        for yc in y_cat:
            data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{xc}'",
                                         f"{y.metadata[name]} == '{yc}'"])
            table_row.append(len(data))
            x_y_key = (str(x.metadata[name]) + ':' + str(xc) + ' by '
                       + str(y.metadata[name]) + ':' + str(yc))
            table_row_key.append(x_y_key)
        assert len(table_row_key) == len(table_row)
        assert len(table_row) == len(y_cat)
        contingency_table.append(table_row)
        contingency_table_key.append(table_row_key)

    # odds_ratio, p_value = stats.fisher_exact(contingency_table, alternative='two-sided')
    # return FishersResult(odds_ratio, p_value)
    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    odds_ratio, p_val = stats.fisher_exact(contingency_table, alternative='two-sided')
    dof = None
    test_result = TestResult(name=fisher_exact_name,
                             test_statistic=odds_ratio,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)
    return test_result
import os

import pandas as pd
from scipy import stats


def gse_marker_handle(gse_data, organism, cell_marker_dict,
                      odds_ratio_threshold=2, p_value_threshold=0.01,
                      method='greater'):
    """Fisher exact test for every cluster."""
    assert method in {'two-sided', 'less', 'greater'}
    all_gse_data = gse_data
    for count, gse in enumerate(all_gse_data, 1):
        marker_genes_file = os.path.join(gse, 'marker_genes.csv')
        if os.path.isdir(gse) and not os.path.isfile(marker_genes_file):
            print(f'Missing: {marker_genes_file}!')
        else:
            if organism not in cell_marker_dict:
                print(f'{gse}: Did not find marker genes.txt of {organism} in cell_marker!')
                continue
            print(f'Handling: {gse} {organism} ({count}/{len(all_gse_data)})')
            with open(marker_genes_file, 'r', encoding='utf8') as f:
                marker_genes_data = pd.read_csv(f, sep=',')
            item_list = []
            all_marker = cell_marker_dict['all'][organism]  # all markers
            for cluster, data in marker_genes_data.groupby('cluster'):
                cluster_marker = set(data['gene']) & all_marker  # markers in one cluster
                n_all_marker = len(all_marker)
                n_cluster_marker = len(cluster_marker)
                if n_cluster_marker == 0:
                    continue
                # proportion of cluster markers among all markers
                cluster_marker_prop = n_cluster_marker / n_all_marker
                for cell_type, cell_type_marker in cell_marker_dict[organism].items():
                    n_cell_type_marker = len(cell_type_marker)  # markers in one cell type
                    # expected hits under the random (null) condition
                    n_expected_hit = cluster_marker_prop * n_cell_type_marker
                    hit_genes = cluster_marker & cell_type_marker
                    n_hit = len(hit_genes)
                    odds_ratio = n_hit / n_expected_hit
                    if odds_ratio > odds_ratio_threshold:
                        n_non_hit_cell_type_marker = n_cell_type_marker - n_hit
                        n_non_hit_cluster_marker = n_cluster_marker - n_hit
                        n_other_marker = (n_all_marker - n_hit
                                          - n_non_hit_cell_type_marker
                                          - n_non_hit_cluster_marker)
                        table = [[n_other_marker, n_non_hit_cell_type_marker],
                                 [n_non_hit_cluster_marker, n_hit]]
                        p_value = stats.fisher_exact(table, method)[1]
                        if p_value < p_value_threshold:
                            item = [cluster, organism, cell_type[0], cell_type[1],
                                    n_all_marker, n_cluster_marker,
                                    n_cell_type_marker, n_hit, n_expected_hit,
                                    odds_ratio, p_value, '|'.join(hit_genes)]
                            item_list.append(item)
            if item_list:
                item_data = pd.DataFrame(item_list)
                item_data.columns = [
                    'cluster', 'organism', 'tissueType', 'cellName',
                    'n_all_marker', 'n_cluster_marker', 'n_cell_type_marker',
                    'n_hit', 'n_expected_hit', 'odds_ratio', 'p_value', 'hits'
                ]
                item_data.sort_values(by=['cluster', 'p_value'], inplace=True)
                cells_type_file = os.path.join(gse, 'cells_type.csv')
                with open(cells_type_file, 'w', encoding='utf8') as f:
                    item_data.to_csv(f, index=False)
                print(f'Finished: {gse}')
            else:
                print(f'No cluster could be mapped to a cell type: {gse}!')
import warnings

import numpy as np
import pandas as pd
from scipy.stats import fisher_exact, hypergeom


def rfet(container, relative_risk=1, min_events=1, decision_metric='fdr',
         decision_thres=0.05, mid_pval=False,
         expected_method='mantel-haentzel', method_alpha=1):
    '''
    Calculate the reporting Fisher's exact test (RFET) signal scores.

    Arguments:
        container: A DataContainer object produced by the convert() function
            from data_prep.py
        relative_risk (int/float): The relative risk value
        min_events: The min number of AE reports to be considered a signal
        decision_metric (str): The metric used for detecting signals:
            {fdr = false detection rate, signals = number of signals,
             rank = ranking statistic}
        decision_thres (float): The min thres value for the decision_metric
        expected_method: The method of calculating the expected counts for
            the disproportionality analysis.
        method_alpha: If the expected_method is negative-binomial, this
            parameter is the alpha parameter of the distribution.
    '''
    DATA = container.data
    N = container.N
    if min_events > 1:
        DATA = DATA[DATA.events >= min_events]
    n11 = np.asarray(DATA['events'], dtype=np.float64)
    n1j = np.asarray(DATA['product_aes'], dtype=np.float64)
    ni1 = np.asarray(DATA['count_across_brands'], dtype=np.float64)
    num_cell = len(n11)
    expected = calculate_expected(N, n1j, ni1, n11, expected_method, method_alpha)
    n10 = n1j - n11
    n01 = ni1 - n11 + 1e-7
    n00 = N - (n11 + n10 + n01)
    log_rfet = np.log(n11 * n00 / (n10 * n01))
    pval_fish_uni = np.empty(num_cell)
    for p in range(num_cell):
        table = [[n11[p], n10[p]], [n01[p], n00[p]]]
        pval_fish_uni[p] = fisher_exact(table, alternative='greater')[1]
    if mid_pval:
        for p in range(num_cell):
            pval_fish_uni[p] = (pval_fish_uni[p] - .5 * hypergeom.pmf(
                n11[p], n11[p] + n10[p], n11[p] + n01[p], n10[p] + n00[p]))
    pval_uni = pval_fish_uni
    pval_uni[pval_uni > 1] = 1
    pval_uni[pval_uni < 0] = 0
    RankStat = pval_uni
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        results = lbe(2 * np.minimum(pval_uni, 1 - pval_uni))
    pi_c = results[1]
    fdr = (pi_c * np.sort(pval_uni[pval_uni <= .5])
           / (np.arange(1, (pval_uni <= .5).sum() + 1) / num_cell))
    fdr = np.concatenate(
        (fdr,
         (pi_c / (2 * np.arange((pval_uni <= .5).sum(), num_cell) / num_cell)
          + 1 - (pval_uni <= .5).sum() / np.arange((pval_uni <= .5).sum(), num_cell))),
        axis=None)
    FDR = np.minimum(fdr, np.ones((len(fdr),)))
    if decision_metric == 'fdr':
        num_signals = (FDR <= decision_thres).sum()
    elif decision_metric == 'signals':
        num_signals = min(decision_thres, num_cell)
    elif decision_metric == 'rank':
        num_signals = (RankStat <= decision_thres).sum()
    RC = Container()
    RC.all_signals = pd.DataFrame({
        'Product': DATA['product_name'].values,
        'Adverse Event': DATA['ae_name'].values,
        'Count': n11,
        'Expected Count': expected,
        'p_value': RankStat,
        'PRR': np.exp(log_rfet),
        'product margin': n1j,
        'event margin': ni1,
        'FDR': FDR
    }, index=np.arange(len(n11))).sort_values(by=['p_value'])
    RC.signals = RC.all_signals.iloc[0:num_signals, ]
    RC.num_signals = num_signals
    return RC
            if oro3p[1] == 0:
                continue
            # if ctable[0][1] > ctable[0][0]:  # this junction has more counts in the mutant
            totalnumsites += 1
            if chrom[0] == '-':
                closest_annotated = find_wiggle(threeprime1, annotmin, maxdist=150)
                if closest_annotated == 150:
                    nocanonical += 1
                diff = min(closest_annotated, int(oro3p[0]) - int(threeprime1))
                entries += [[chrom[1:], threeprime1, fiveprime,
                             sps.fisher_exact(ctable)[1], chrom[0]]
                            + ctable[0] + ctable[1] + [name] + [diff] + [oro3p[0]]]
                # writer.writerow([chrom[1:], threprime1, fiveprime, sps.fisher_exact(ctable)[1],
                #                  chrom[0]] + ctable[0] + ctable[1] + [name]
                #                 + [int(oro3p[0]) - int(threeprime1)] + [oro3p[0]])
            else:
                closest_annotated = find_wiggle(threeprime1, annotpos, maxdist=150)
                if closest_annotated == 150:
                    nocanonical += 1
                diff = min(closest_annotated, int(threeprime1) - int(oro3p[0]))
                entries += [[chrom[1:], fiveprime, threeprime1,
                             sps.fisher_exact(ctable)[1], chrom[0]]
                            + ctable[0] + ctable[1] + [name] + [diff] + [oro3p[0]]]
import matplotlib.pyplot as plt
from scipy import stats


def bar_plot_enrichment(vals, xticklabels, figname):
    a1, a2, b1, b2, c1, c2 = vals
    plt.figure(figsize=(2.6, 2.6))
    plot_positions = [1, 2, 3]
    width = 0.6
    plot_vals = [100 * a2 / a1, 100 * b2 / b1, 100 * c2 / c1]
    p = plt.bar(plot_positions, plot_vals, width=width,
                color=['silver', 'k', 'firebrick'])

    ###
    s, p = stats.fisher_exact([[a1 - a2, a2], [b1 - b2, b2]])
    print('Up:', s, p)
    p_label = '{:.1e}'.format(p)
    if p_label[-2] == '0':
        p_label = p_label[:-2] + p_label[-1]
    if p < 0.05:
        star_mark = "*"
        if p < 0.001:
            star_mark = "**"
        plt.text(plot_positions[1], plot_vals[1] * 1.02, star_mark,
                 ha='center', fontsize=16)
    ###
    s, p = stats.fisher_exact([[a1 - a2, a2], [c1 - c2, c2]])
    print('CTCF Up:', s, p)
    p_label = '{:.1e}'.format(p)
    if p_label[-2] == '0':
        p_label = p_label[:-2] + p_label[-1]
    if p < 0.05:
        star_mark = "*"
        if p < 0.001:
            star_mark = "**"
        plt.text(plot_positions[2], plot_vals[2] * 1.02, star_mark,
                 ha='center', fontsize=16)

    plt.axes().set_xticks(plot_positions)
    plt.axes().set_xticklabels(xticklabels, rotation=25, ha='right', fontsize=15)
    # sns.despine(offset=None, trim=False)
    plt.ylabel('Down-regulated \n genes in shCTCF (%)', fontsize=16)
    plt.xlim([0.35, 3.6])
    plt.ylim([0, 46])
    plt.axes().tick_params(axis='x', direction='out', length=3, width=.8, colors='black')
    plt.axes().tick_params(axis='y', direction='out', length=3, width=.8, colors='black')
    plt.savefig(figname, bbox_inches='tight', pad_inches=0.02, dpi=600, transparent=True)
    plt.close()
from scipy import stats


def pval_fisher(tbl, *args):
    return stats.fisher_exact(tbl, *args)[1]
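# pval_fisher passes extra positional args straight through to
# scipy.stats.fisher_exact, e.g. the alternative hypothesis:
print(pval_fisher([[8, 2], [1, 5]]))             # two-sided (default)
print(pval_fisher([[8, 2], [1, 5]], 'greater'))  # one-sided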
inner = list()
outer = list()
p_list = list()

df1 = genetic_df[genetic_df['c1'] == genetic_df['c2']]
genetic_inner = df1.shape[0]
genetic_outer = genetic_df.shape[0] - genetic_inner

df1 = snp_df[snp_df['c1'] == snp_df['c2']]
a = df1.shape[0]
b = snp_df.shape[0] - a
inner.append(float(a) / genetic_inner)
outer.append(float(b) / genetic_outer)
c = genetic_inner - a
d = genetic_df.shape[0] - a - b - c
odds, p = fisher_exact([[a, b], [c, d]])
print([odds, p])
p_list.append(p)

df1 = gene_df.loc[gene_df['c1'] == gene_df['c2']]
a = df1.shape[0]
b = gene_df.shape[0] - a
inner.append(float(a) / genetic_inner)
outer.append(float(b) / genetic_outer)
c = genetic_inner - a
d = genetic_df.shape[0] - a - b - c
odds, p = fisher_exact([[a, b], [c, d]])
print([odds, p])
p_list.append(p)

df1 = ppi_df.loc[ppi_df['c1'] == ppi_df['c2']]
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests


async def enrichment(bedfile: pd.DataFrame, bedcol=8, groups=None, correct=True, okpval=10**-3):
    """
    Compute pairwise enrichment for a set of peaks mapped to a consensus, with
    each column after the 7th one representing the signal of a given ChIP
    experiment over this consensus. Will present enrichment of row values in
    col values.

    Args:
    ----
        bedfile: df bed-like representing a consensus set of peaks, and a set
            of values/foldchanges over it
        bedcol: int col where the bed information ends and signal information
            columns start
        groups: optional array of group labels, one per signal column
        correct: bool whether to correct for multiple hypothesis testing or not
        okpval: float max pvalue over which to set the enrichment to 0

    Returns:
    -------
        a dataframe[values_name x values_name] of enrichment of row values in col values
        a dataframe[values_name x values_name] of the associated p-values
    """
    dat = bedfile[bedfile.columns[bedcol:]].values
    prob = dat.astype(bool).sum(0) / len(dat)
    enrichment = np.zeros(
        (dat.shape[1] if groups is None else len(set(groups)), dat.shape[1]))
    pvals = np.zeros(
        (dat.shape[1] if groups is None else len(set(groups)), dat.shape[1]))
    if groups is not None:
        for i in set(groups):
            overlapping = dat[groups == i]
            for j, val in enumerate(overlapping.T):
                # enrichment of j in i
                e, p = fisher_exact(
                    [[len(val[val != 0]), len(val[val == 0])],
                     [prob[j] * len(dat), (1 - prob[j]) * len(dat)]])
                enrichment[i, j] = np.log2(e)
                pvals[i, j] = p
    else:
        for i, col in enumerate(dat.T):
            overlapping = np.delete(dat, i, axis=1)[col != 0]
            col = col[col != 0]
            add = 0
            for j, val in enumerate(overlapping.T):
                if j == i:
                    add = 1
                    enrichment[i, i] = 0
                e, p = fisher_exact(
                    [[len(val[val != 0]), len(val[val == 0])],
                     [prob[j + add] * len(dat), (1 - prob[j + add]) * len(dat)]])
                enrichment[i, j + add] = np.log2(e)
                pvals[i, j + add] = p
            enrichment[i, i] = 0
    enrichment = pd.DataFrame(
        data=enrichment,
        index=bedfile.columns[bedcol:] if groups is None else set(groups),
        columns=bedfile.columns[bedcol:]).T
    enrichment[enrichment == -np.inf] = -1000
    enrichment[enrichment.isna()] = 0
    enrichment[enrichment == np.inf] = 1000
    if correct:
        pvals = np.reshape(
            multipletests(pvals.ravel(), 0.1, method="bonferroni")[1],
            pvals.shape)
    pvals = pd.DataFrame(
        data=pvals,
        index=bedfile.columns[bedcol:] if groups is None else set(groups),
        columns=bedfile.columns[bedcol:]).T
    enrichment[pvals > okpval] = 0
    return enrichment, pvals
for seg_index in breaks_freq.seg_index.unique():
    a = breaks_freq.loc[breaks_freq.seg_index == seg_index,
                        "enh_id"].to_list()[0]  # num simple enhancers
    b = enh_totals - a  # num complex enhancers
    c = total_shuf_breaks.loc[total_shuf_breaks.seg_index == seg_index,
                              "enh_id"].to_list()
    if len(c) > 0:
        c = c[0]  # num simple shuffle
    else:
        c = 0
    d = total_shuf_breaks.enh_id.sum() - c  # num complex shuffle

    obs = [[a, b], [c, d]]
    OR, P = stats.fisher_exact(obs)
    table = sm.stats.Table2x2(obs)  # get confidence interval
    odds_ci = table.oddsratio_confint()
    newdf = pd.DataFrame({
        "seg_index": [seg_index],
        "a": [a], "b": [b], "c": [c], "d": [d],
        "OR": [OR], "P": [P],
        "ci_lower": [odds_ci[0]],
        "ci_upper": [odds_ci[1]],
    })
    OR_dict[seg_index] = newdf
    # print(seg_index, obs, OR, P)
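# The 2x2 layout used in the loop above, on made-up counts, with the
# statsmodels confidence interval it relies on (sm is assumed to be
# `import statsmodels.api as sm`):
#
#                 simple   complex
#   observed        a         b
#   shuffled        c         d
import statsmodels.api as sm
from scipy import stats

obs = [[30, 70], [20, 80]]
OR, P = stats.fisher_exact(obs)
lo, hi = sm.stats.Table2x2(obs).oddsratio_confint()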
def call_differential_editing_sites(config_file):
    stability_value = 0.03  # value below which you may use a lower coverage for adding more samples to increase power
    min_disease_people = 5  # min number of disease people at the higher coverage off which stability may be based
    min_control_people = 5  # min number of control people at the higher coverage off which stability may be based
    min_disease_people_5_cov = 10  # min number of disease people at 5x coverage required when falling back to unstable 5x coverage
    min_control_people_5_cov = 10  # min number of control people at 5x coverage required when falling back to unstable 5x coverage
    editing_file = './temp.csv'
    output_file = './editing_sites.with_stats_converted_disease.csv'

    # read in files
    editing_table = pd.read_csv(editing_file, sep='\t')
    # config_table = pd.read_csv(config_file, sep=',', header=None)
    config_table = pd.read_csv(config_file, sep=',', skiprows=1, header=None)
    all_people = config_table[0]
    disease_people = config_table[0][config_table[1] == "DIS"].reset_index(
        drop=True)  # TODO: change to disease!!!
    control_people = config_table[0][config_table[1] == "CTRL"].reset_index(
        drop=True)  # TODO: change to control!!!

    # now get just an editing table and coverage table
    edit_level_table = editing_table[all_people]
    # edit_level_table = editing_table[np.r_[all_people]]

    def get_editing_levels_for_cov_table(i):
        # each cell looks like "level^...^coverage"; field 2 is the coverage
        info = i.astype(str).str.split(pat="\\^")
        editing_levels = info.apply(
            lambda x: float('nan') if x[0] == "nan" else x[2])
        return editing_levels

    cov_table = edit_level_table.apply(get_editing_levels_for_cov_table)
    cov_table = cov_table.apply(
        lambda x: pd.to_numeric(x))  # TODO: check if R's as.numeric and pandas to_numeric do the same

    def get_editing_levels(i):
        # field 0 is the editing level itself
        info = i.astype(str).str.split(pat="\\^")
        editing_levels = info.apply(
            lambda x: float('nan') if x[0] == "nan" else x[0])
        return editing_levels

    edit_level_table = edit_level_table.apply(get_editing_levels)
    edit_level_table = edit_level_table.apply(
        lambda x: pd.to_numeric(x))  # TODO: check precision on R and python

    # go down line by line and get the prevalence info and mean editing levels
    # based off of stable coverages
    # NOTE: floats (not ints) are used throughout so that NaN can be represented
    # per-site result arrays (plain numpy float arrays, indexed by editing-site row)
    coverage_threshold_used = np.repeat(0., edit_level_table.shape[0])  # coverage threshold required for this editing site
    stability_based_on = np.repeat(0., edit_level_table.shape[0])  # coverage at which the stability requirements were determined
    stable_mean_disease_editing_level = np.repeat(0., edit_level_table.shape[0])  # mean disease editing level using individuals passing the coverage threshold
    stable_std_dev_disease_editing_level = np.repeat(0., edit_level_table.shape[0])  # std dev of disease editing level using individuals passing the coverage threshold
    stable_mean_control_editing_level = np.repeat(0., edit_level_table.shape[0])  # mean control editing level using individuals passing the coverage threshold
    stable_std_dev_control_editing_level = np.repeat(0., edit_level_table.shape[0])  # std dev of control editing level using individuals passing the coverage threshold
    stable_number_disease_with_at_least_min_coverage = np.repeat(0., edit_level_table.shape[0])  # number of disease individuals passing the coverage threshold
    stable_number_disease_nonzero_editing_and_min_coverage = np.repeat(0., edit_level_table.shape[0])  # number of disease individuals with nonzero editing level and passing the coverage threshold
    stable_disease_prevalence = np.repeat(0., edit_level_table.shape[0])  # proportion of disease individuals with nonzero editing
    stable_number_control_with_at_least_min_coverage = np.repeat(0., edit_level_table.shape[0])  # same as disease but for control subjects
    stable_number_control_nonzero_editing_and_min_coverage = np.repeat(0., edit_level_table.shape[0])
    stable_control_prevalence = np.repeat(0., edit_level_table.shape[0])
    stable_total_number_individuals_nonzero_editing_and_min_coverage = np.repeat(0., edit_level_table.shape[0])  # total number of disease and control subjects passing the coverage threshold with nonzero editing level
    stable_mann_whitney_p_value = np.repeat(0., edit_level_table.shape[0])  # Wilcoxon rank-sum p-value using individuals passing the coverage threshold
    stable_editing_level_effect_size = np.repeat(0., edit_level_table.shape[0])  # difference between mean disease and mean control editing level
    stable_frequency_fishers_p_value = np.repeat(0., edit_level_table.shape[0])  # prevalence p-value from a two-tailed Fisher's exact test
    stable_frequency_OR = np.repeat(0., edit_level_table.shape[0])  # odds ratio of the Fisher's exact test
    stable_prevalence_effect_size = np.repeat(0., edit_level_table.shape[0])  # difference in editing prevalence between disease and control subjects
    for i in range(0, edit_level_table.shape[0]):
        print(i)  # keep track of progress
        disease_edit_row = edit_level_table.loc[i, disease_people]
        control_edit_row = edit_level_table.loc[i, control_people]
        disease_cov_row = cov_table.loc[i, disease_people]
        control_cov_row = cov_table.loc[i, control_people]

        # find what coverage we can base stability off of
        number_disease_20_cov = disease_cov_row[disease_cov_row >= 20].count()
        number_control_20_cov = control_cov_row[control_cov_row >= 20].count()
        number_disease_15_cov = disease_cov_row[disease_cov_row >= 15].count()
        number_control_15_cov = control_cov_row[control_cov_row >= 15].count()
        number_disease_10_cov = disease_cov_row[disease_cov_row >= 10].count()
        number_control_10_cov = control_cov_row[control_cov_row >= 10].count()
        number_disease_5_cov = disease_cov_row[disease_cov_row >= 5].count()
        number_control_5_cov = control_cov_row[control_cov_row >= 5].count()

        if number_disease_20_cov >= min_disease_people and number_control_20_cov >= min_control_people:
            stability_based_on[i] = 20
        elif number_disease_15_cov >= min_disease_people and number_control_15_cov >= min_control_people:
            stability_based_on[i] = 15
        elif number_disease_10_cov >= min_disease_people and number_control_10_cov >= min_control_people:
            stability_based_on[i] = 10
        elif number_disease_5_cov >= min_disease_people_5_cov and number_control_5_cov >= min_control_people_5_cov:
            stability_based_on[i] = 5
        else:
            # there is no integer NaN representation, only float, hence the float arrays
            stability_based_on[i] = float('nan')

        # deal with cases where there are not enough disease or control
        # individuals to calculate a stable mean
        if np.isnan(stability_based_on[i]):
            # Users are warned not to use editing sites that have no
            # stability_based_on measurement; a minimum coverage of 5 is used
            # here just to report statistical information anyway.
            coverage_threshold_used[i] = 5
        else:
            # otherwise find the stable_min_cov used for calculating all statistics
            current_stability_cov = stability_based_on[i]
            stability_disease_mean = disease_edit_row[
                disease_cov_row >= current_stability_cov].mean()
            stability_control_mean = control_edit_row[
                control_cov_row >= current_stability_cov].mean()
            # the 1e-4 offset makes the range inclusive of the stop value
            for j in np.arange(5, stability_based_on[i] + 1e-4, 5):
                disease_mean = disease_edit_row[disease_cov_row >= j].mean()
                control_mean = control_edit_row[control_cov_row >= j].mean()
                if np.absolute(disease_mean - stability_disease_mean) <= stability_value \
                        and np.absolute(control_mean - stability_control_mean) <= stability_value:
                    coverage_threshold_used[i] = j
                    break

        # now calculate all statistics based on the stable coverage threshold
        stable_min_cov = coverage_threshold_used[i]
        disease_adju_edit_row = disease_edit_row[np.logical_and(
            np.logical_and((~np.isnan(disease_edit_row)),
                           (~np.isnan(disease_cov_row))),
            (disease_cov_row >= stable_min_cov))]
        disease_adju_cov_row = disease_cov_row[np.logical_and(
            (~np.isnan(disease_cov_row)), (disease_cov_row >= stable_min_cov))]
        control_adju_edit_row = control_edit_row[np.logical_and(
            np.logical_and((~np.isnan(control_edit_row)),
                           (~np.isnan(control_cov_row))),
            (control_cov_row >= stable_min_cov))]
        control_adju_cov_row = control_cov_row[np.logical_and(
            (~np.isnan(control_cov_row)), (control_cov_row >= stable_min_cov))]

        stable_mean_disease_editing_level[i] = disease_adju_edit_row.mean()
        stable_std_dev_disease_editing_level[i] = disease_adju_edit_row.std()
        stable_mean_control_editing_level[i] = control_adju_edit_row.mean()
        stable_std_dev_control_editing_level[i] = control_adju_edit_row.std()

        stable_number_disease_with_at_least_min_coverage[i] = disease_adju_cov_row[
            disease_adju_cov_row >= stable_min_cov].count()
        stable_number_disease_nonzero_editing_and_min_coverage[i] = disease_adju_cov_row[
            (~np.isnan(disease_adju_cov_row))
            & (disease_adju_cov_row >= stable_min_cov)
            & (disease_adju_edit_row > 0)].count()
        stable_disease_prevalence[i] = (
            stable_number_disease_nonzero_editing_and_min_coverage[i]
            / stable_number_disease_with_at_least_min_coverage[i])

        stable_number_control_with_at_least_min_coverage[i] = control_adju_cov_row[
            control_adju_cov_row >= stable_min_cov].count()
        stable_number_control_nonzero_editing_and_min_coverage[i] = control_adju_cov_row[
            (~np.isnan(control_adju_cov_row))
            & (control_adju_cov_row >= stable_min_cov)
            & (control_adju_edit_row > 0)].count()
        stable_control_prevalence[i] = (
            stable_number_control_nonzero_editing_and_min_coverage[i]
            / stable_number_control_with_at_least_min_coverage[i])

        stable_total_number_individuals_nonzero_editing_and_min_coverage[i] = (
            stable_number_disease_nonzero_editing_and_min_coverage[i]
            + stable_number_control_nonzero_editing_and_min_coverage[i])

        if (len(disease_adju_edit_row) >= 1) & (len(control_adju_edit_row) >= 1):
            if np.all(disease_adju_edit_row.values == control_adju_edit_row.values):
                # mannwhitneyu errors when all values are identical
                stable_mann_whitney_p_value[i] = float('nan')
            else:
                temp, stable_mann_whitney_p_value[i] = mannwhitneyu(
                    disease_adju_edit_row, control_adju_edit_row,
                    alternative='two-sided')
        else:
            stable_mann_whitney_p_value[i] = float('nan')

        stable_editing_level_effect_size[i] = np.absolute(
            stable_mean_disease_editing_level[i]
            - stable_mean_control_editing_level[i])

        # 2x2 table: rows = disease/control, cols = edited / not edited
        # (np.array instead of the deprecated np.matrix)
        fisher_table = np.array(
            [[stable_number_disease_nonzero_editing_and_min_coverage[i],
              stable_number_disease_with_at_least_min_coverage[i]
              - stable_number_disease_nonzero_editing_and_min_coverage[i]],
             [stable_number_control_nonzero_editing_and_min_coverage[i],
              stable_number_control_with_at_least_min_coverage[i]
              - stable_number_control_nonzero_editing_and_min_coverage[i]]])
        stable_frequency_OR[i], stable_frequency_fishers_p_value[i] = fisher_exact(
            fisher_table)
        # print(stable_frequency_OR[i])
        # print(stable_frequency_fishers_p_value[i])
        stable_prevalence_effect_size[i] = np.absolute(
            stable_disease_prevalence[i] - stable_control_prevalence[i])

    # now put everything back together as a table
    header_info = editing_table[['chromosome', 'position', 'type_editing']]
    stats_table = pd.DataFrame(coverage_threshold_used)
    stats_table = stats_table.rename(
        columns={stats_table.columns[0]: 'coverage_threshold_used'})
    stats_table['stability_based_on'] = pd.DataFrame(stability_based_on)
    stats_table['stable_mean_disease_editing_level'] = pd.DataFrame(
        stable_mean_disease_editing_level)
    stats_table['stable_std_dev_disease_editing_level'] = pd.DataFrame(
        stable_std_dev_disease_editing_level)
    stats_table['stable_mean_control_editing_level'] = pd.DataFrame(
        stable_mean_control_editing_level)
    stats_table['stable_std_dev_control_editing_level'] = pd.DataFrame(
        stable_std_dev_control_editing_level)
    stats_table['stable_number_disease_with_at_least_min_coverage'] = pd.DataFrame(
        stable_number_disease_with_at_least_min_coverage)
    stats_table['stable_number_disease_nonzero_editing_and_min_coverage'] = pd.DataFrame(
        stable_number_disease_nonzero_editing_and_min_coverage)
    stats_table['stable_disease_prevalence'] = pd.DataFrame(
        stable_disease_prevalence)
    stats_table['stable_number_control_with_at_least_min_coverage'] = pd.DataFrame(
        stable_number_control_with_at_least_min_coverage)
    stats_table['stable_number_control_nonzero_editing_and_min_coverage'] = pd.DataFrame(
        stable_number_control_nonzero_editing_and_min_coverage)
    stats_table['stable_control_prevalence'] = pd.DataFrame(
        stable_control_prevalence)
    stats_table['stable_total_number_individuals_nonzero_editing_and_min_coverage'] = pd.DataFrame(
        stable_total_number_individuals_nonzero_editing_and_min_coverage)
    stats_table['stable_mann_whitney_p_value'] = pd.DataFrame(
        stable_mann_whitney_p_value)
    stats_table['stable_editing_level_effect_size'] = pd.DataFrame(
        stable_editing_level_effect_size)
    stats_table['stable_frequency_fishers_p_value'] = pd.DataFrame(
        stable_frequency_fishers_p_value)
    stats_table['stable_frequency_OR'] = pd.DataFrame(stable_frequency_OR)
    stats_table['stable_prevalence_effect_size'] = pd.DataFrame(
        stable_prevalence_effect_size)

    full_table = pd.concat(
        [header_info, stats_table, editing_table[all_people]], axis=1)

    # write the full_table to output
    full_table.to_csv(output_file, sep='\t', index=False)
    print("job completed\n")
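# Hypothetical usage sketch: the config file is assumed to be a two-column CSV
# (sample id, "DIS"/"CTRL") with one header row, matching the parsing above;
# note the editing table path is currently hard-coded to ./temp.csv.
call_differential_editing_sites('samples_config.csv')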
def pathway_enrichment_analysis_pw_bg(input_genes, input_bg, db, enrichment,
                                      p_threshold, exclude_unique_pw):
    enriched_pathways = {}
    input_genes = set(input_genes)
    input_count = len(input_genes)
    pathways = db['dict']
    bg = set(input_bg)
    for pathway in pathways:
        if exclude_unique_pw:
            # consider only pathway genes annotated in the input background
            pathway_genes = pathways[pathway]['genes'].intersection(bg)
        else:
            pathway_genes = pathways[pathway]['genes']
        # note: the background grows as pathways are scanned
        bg = bg.union(pathway_genes)
        pathway_count = len(pathway_genes)
        bg_count = len(bg)
        overlap = input_genes.intersection(pathway_genes)
        overlap_count = len(overlap)
        if not overlap_count:
            continue
        pathway_only_count = pathway_count - overlap_count
        input_only_count = input_count - overlap_count
        bg_only_count = bg_count - pathway_count - input_count + overlap_count
        enrichment_coefficient = find_enrichment_coefficient(
            overlap_count, pathway_count, input_count, bg_count)
        if enrichment_coefficient <= 0 and enrichment == 'p':
            continue
        if enrichment_coefficient > 0 and enrichment == 'n':
            continue
        tail = find_tail(overlap_count, pathway_count, input_count, bg_count)
        try:
            odds_ratio, pvalue = stats.fisher_exact(
                [[overlap_count, pathway_only_count],
                 [input_only_count, bg_only_count]],
                alternative=tail)
        except ValueError:
            print('Something is wrong with this 2x2 table: \n')
            print([overlap_count, pathway_only_count],
                  [input_only_count, bg_only_count])
            break
        if pvalue > float(p_threshold):
            continue
        enriched_pathways[pathway] = {
            'input_genes': input_count,
            'db': pathways[pathway]['db'],
            'length': pathway_count,
            'overlap': overlap_count,
            'pval': pvalue,
            'enrichment': enrichment_coefficient,
            'db_u_input': bg_count,
            'metabolites': pathways[pathway]['metabolites'],
            'overlap_genes': overlap,
        }
    return enriched_pathways
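# Hypothetical usage sketch. The db layout is inferred from the lookups above;
# find_enrichment_coefficient and find_tail are not shown in this file, so
# trivial stand-ins are stubbed here purely to make the sketch run — the real
# module versions should be used instead.
def find_enrichment_coefficient(overlap, pw, inp, bg):
    # stand-in: positive when the overlap beats the expectation inp*pw/bg
    return overlap - inp * pw / bg

def find_tail(overlap, pw, inp, bg):
    return 'greater'  # stand-in: always test for over-representation

toy_db = {'dict': {'glycolysis': {'genes': {'HK1', 'PFKM', 'PKM'},
                                  'db': 'toy', 'metabolites': []}}}
hits = pathway_enrichment_analysis_pw_bg(
    input_genes=['HK1', 'PKM', 'TP53'],
    input_bg=['HK1', 'PFKM', 'PKM', 'TP53', 'BRCA1'],
    db=toy_db, enrichment='p', p_threshold=1.0, exclude_unique_pw=True)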
def compare_groups(labels, results, low=None, high=None, num=100,
                   comp_groups=None, print_skips=False):
    """
    Function to plot proportion of largest and smallest bias groups and get
    relative z scores

    Parameters
    --------
    labels : array_like
        contains categorical values like ['M', 'F']
    results : array_like
        contains real numbers, e.g. threshold scores or floats in (0,1)
    low : float
        lower threshold value
    high : float
        upper threshold value
    num : int
        number of thresholds to check
    comp_groups : list of strings, optional
        subset of labels to compare, e.g. ['white', 'black']
    print_skips : bool
        whether to display thresholds skipped

    Returns
    --------
    min_props : dict
        contains (key, value) of (threshold : max group/min group proportions)
    z_ps : dict
        contains (key, value) of (threshold : p-value of two tailed z test)
    fisher_ps : dict
        contains (key, value) of (threshold : p-value of fisher exact test)
    chi_ps : dict
        contains (key, value) of (threshold : p-value of chi squared test)
    bayes_facts : dict
        contains (key, value) of (threshold : bayes factor)
    """
    # cast labels and scores to pandas Series
    df = pd.DataFrame(list(zip(labels, results)), columns=['label', 'result'])

    min_props = {}
    fisher_ps = {}
    chi_ps = {}
    z_ps = {}
    bayes_facts = {}

    if comp_groups is not None:
        df = df[df['label'].isin(comp_groups)]

    # define the range of values to test over if not provided
    if low is None:
        low = min(results)
    if high is None:
        high = max(results)
    thresholds = np.linspace(low, high, num)

    skip_thresholds = []
    for thresh in thresholds:
        # threshold the (possibly comp_groups-filtered) frame, not the raw
        # results list, so the lengths always match
        df['dec'] = df['result'] >= thresh
        # compare rates of passing across groups
        ctabs = pd.crosstab(df['label'], df['dec'])
        # skip any thresholds for which the crosstabs are one-dimensional
        if 1 in ctabs.shape:
            skip_thresholds.append(thresh)
            continue
        normed_ctabs = ctabs.div(ctabs.sum(axis=1), axis=0)
        true_val = max(set(df['dec']))
        max_group = normed_ctabs[true_val].max()
        normed_proportions = normed_ctabs[true_val] / max_group
        min_proportion = normed_proportions.min()

        # run statistical tests
        if ctabs.shape == (2, 2):
            test_results = test_multiple(df['label'].values, df['dec'].values)
            z_pval = test_results.get('z_score')[1]
            fisher_pval = test_results.get('fisher_p')[1]
            chi2_pval = test_results.get('chi2_p')[1]
            bayes_fact = test_results.get('BF')
        else:
            top_bottom_ctabs = top_bottom_crosstab(df['label'], df['dec'])
            z_pval = crosstab_ztest(top_bottom_ctabs)[1]
            fisher_pval = fisher_exact(top_bottom_ctabs)[1]
            chi2_pval = chi2_contingency(ctabs)[1]
            bayes_fact = crosstab_bayes_factor(ctabs)

        min_props[thresh] = min_proportion
        z_ps[thresh] = z_pval
        fisher_ps[thresh] = fisher_pval
        chi_ps[thresh] = chi2_pval
        bayes_facts[thresh] = bayes_fact

    if len(skip_thresholds) > 0 and print_skips:
        print('One-dimensional thresholds were skipped: %s' % skip_thresholds)

    return min_props, z_ps, fisher_ps, chi_ps, bayes_facts
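# Hypothetical usage sketch on toy data; test_multiple, top_bottom_crosstab,
# crosstab_ztest and crosstab_bayes_factor are assumed importable from the
# same module.
import numpy as np

labels = ['M'] * 50 + ['F'] * 50
results = list(np.random.rand(100))
min_props, z_ps, fisher_ps, chi_ps, bayes_facts = compare_groups(
    labels, results, num=10)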
def test_multiple(labels, decisions,
                  tests=('ztest', 'fisher', 'chi2', 'BF', 'prop'),
                  display=False):
    """
    Function that returns p_values for z-score, fisher exact, and chi2 test
    of 2x2 crosstab of passing rate by labels and decisions

    See docs for z_test_ctabs, fisher_exact, chi2_contingency and bf_ctabs
    for details of specific tests

    Parameters
    ----------
    labels : array_like
        categorical labels for each corresponding value of `decision`, e.g. M/F
    decisions : array_like
        binary decision values, e.g. True/False or 0/1
    tests : list
        a list of strings specifying the tests to run, valid options are
        'ztest', 'fisher', 'chi2', 'BF' and 'prop'. Defaults to all five.
        - ztest: p-value for two-sided z-score for proportions
        - fisher: p-value for Fisher's exact test for proportions
        - chi2: p-value for chi-squared test of independence for proportions
        - BF: Bayes factor for independence assuming uniform prior
        - prop: proportion of lowest to highest passing rates by group
    display : bool
        print the results of each test in addition to returning them

    Returns
    -------
    results : dict
        dictionary of values, one for each test. Valid keys are:
        'z_score', 'fisher_p', 'chi2_p', 'BF', and 'prop'

    Examples
    --------
    >>> # no real difference between groups
    >>> labels = ['group1']*100 + ['group2']*100 + ['group3']*100
    >>> decisions = [1,0,0]*100
    >>> all_test_ctabs(dependent_ctabs)
    (0.0, 1.0, 1.0, 0.26162148804907587)

    >>> # massively biased ratio of hits/misses by group
    >>> ind_ctabs = np.array([[75,50],[25,50]])
    >>> all_test_ctabs(ind_ctabs)
    (-3.651483716701106, 0.0004203304586999487,
     0.0004558800052056139, 202.95548692414306)

    >>> # correcting with a biased prior
    >>> biased_prior = np.array([[5,10],[70,10]])
    >>> all_test_ctabs(ind_ctabs, biased_prior)
    (-3.651483716701106, 0.0004203304586999487,
     0.0004558800052056139, 0.00012159518854984268)
    """
    decisions = boolean_array(decisions)
    crosstab = pd.crosstab(pd.Series(labels), pd.Series(decisions))
    crosstab = crosstab.values

    # can only perform 2-group z-tests and fisher tests, so take the crosstab
    # for the groups with the highest and lowest pass rates, as any difference
    # between groups is considered biased
    tb_crosstab = top_bottom_crosstab(labels, decisions)

    results = {}
    if 'ztest' in tests:
        results['z_score'] = crosstab_ztest(tb_crosstab)
    if 'fisher' in tests:
        # although fisher's exact test can be generalized to multiple groups,
        # scipy is limited to shape (2, 2)
        # TODO make generalized fisher's exact test
        # returns oddsratio and p-value
        results['fisher_p'] = fisher_exact(tb_crosstab)[:2]
    if 'chi2' in tests:
        # returns chi2 test statistic and p-value
        results['chi2_p'] = chi2_contingency(crosstab)[:2]
    if 'BF' in tests:
        results['BF'] = crosstab_bayes_factor(crosstab)
    if 'prop' in tests:
        results['prop'] = min(proportion_test(labels, decisions))
    if display:
        for key in results:
            # %s, not %f: some entries are (statistic, p-value) tuples
            print("%s: %s" % (key, results[key]))
    return results
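# Minimal usage sketch on toy data (boolean_array, top_bottom_crosstab and the
# other helpers are assumed in scope):
labels = ['A'] * 60 + ['B'] * 40
decisions = [1] * 45 + [0] * 15 + [1] * 20 + [0] * 20
res = test_multiple(labels, decisions, tests=('fisher', 'chi2'), display=True)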
c = {}
fraction1 = (len(cna_ais) - cna_ais.count(0)) / len(cna_ais)
fraction2 = (len(cna_inv) - cna_inv.count(0)) / len(cna_inv)
ais_gain = (cna_ais.count(1)) / len(cna_ais)
ais_loss = (cna_ais.count(-1)) / len(cna_ais)
inv_gain = (cna_inv.count(1)) / len(cna_inv)
inv_loss = (cna_inv.count(-1)) / len(cna_inv)
c["Gain"] = inv_gain - ais_gain
c["Lost"] = inv_loss - ais_loss
t, pval = stats.fisher_exact(
    [[len(cna_ais) - cna_ais.count(0), cna_ais.count(0)],
     [len(cna_inv) - cna_inv.count(0), cna_inv.count(0)]])
print(i, cna_ais.count(0), cna_inv.count(0), c, t, pval)
# Fisher's exact test needs raw counts, not the fractions computed above, so
# the gain/loss tables use count(1)/count(-1) directly.
t, pval_gain = stats.fisher_exact([[cna_ais.count(1), cna_ais.count(0)],
                                   [cna_inv.count(1), cna_inv.count(0)]])
print(i, pval_gain)
t, pval_loss = stats.fisher_exact([[cna_ais.count(-1), cna_ais.count(0)],
                                   [cna_inv.count(-1), cna_inv.count(0)]])
print(i, pval_loss)
p_3[i] = c
if i == "6p":  # was `i in "6p"`, which also matched "6" and "p"
    gain = df3[df3["CNA"] == 1]
def get_info_gatk(info, format, tumor, normal):
    '''extracts the relevant fields from a MuTect VCF record'''
    mutect = Variantcaller()
    mutect.GT = tumor[format.index('GT')]
    if mutect.GT == '0|1' or mutect.GT == '1|0':
        mutect.GT = '0/1'
    if len(mutect.GT.split('/')) > 2:
        mutect.GT = '0/1'
    [mutect.RO, mutect.AO] = [int(a) for a in tumor[format.index('AD')].split(',')]
    mutect.DP = mutect.AO + mutect.RO
    mutect.AF = round(float(mutect.AO) / float(mutect.DP), 4)
    try:
        mutect.GQ = float(tumor[format.index('GQ')])
    except Exception:
        mutect.GQ = '.'
    try:
        mutect.PGT = tumor[format.index('PGT')]
    except Exception:
        mutect.PGT = '.'
    try:
        mutect.PID = tumor[format.index('PID')]
    except Exception:
        mutect.PID = '.'
    try:
        mutect.RO_f, mutect.RO_r, mutect.AO_f, mutect.AO_r = tumor[
            format.index('SB')].split(',')
    except Exception:
        # if SB is missing, the strand counts used below are undefined
        print(tumor)

    # GATK-style StrandOddsRatio; the +1 acts as a pseudocount
    # (parentheses added: the original `a * b / c * d` divided by c only)
    R = ((float(mutect.RO_f) + 1) * (float(mutect.AO_r) + 1)) / (
        (float(mutect.RO_r) + 1) * (float(mutect.AO_f) + 1))
    SymmetricRatio = R + 1 / R
    RefRatio = min((float(mutect.RO_f) + 1), (float(mutect.RO_r) + 1)) / max(
        (float(mutect.RO_f) + 1), (float(mutect.RO_r) + 1))
    AltRatio = min((float(mutect.AO_f) + 1), (float(mutect.AO_r) + 1)) / max(
        (float(mutect.AO_f) + 1), (float(mutect.AO_r) + 1))
    mutect.StOR = np.log(SymmetricRatio) + np.log(RefRatio) - np.log(AltRatio)

    mutect.DP_r = float(mutect.RO_r) + float(mutect.AO_r)
    mutect.DP_f = float(mutect.RO_f) + float(mutect.AO_f)
    # Fisher strand-bias filter; opts comes from the module-level option parser
    if opts.amplicon:
        if min(mutect.DP_r, mutect.DP_f) / (mutect.DP_r + mutect.DP_f) >= 0.05:
            mutect.FStBias = 1 - stats.fisher_exact(
                [[mutect.RO_f, mutect.RO_r], [mutect.AO_f, mutect.AO_r]])[1]
        else:
            mutect.FStBias = '1.0'
    else:
        if min(mutect.DP_r, mutect.DP_f) / (mutect.DP_r + mutect.DP_f) > 0:
            mutect.FStBias = 1 - stats.fisher_exact(
                [[mutect.RO_f, mutect.RO_r], [mutect.AO_f, mutect.AO_r]])[1]
        else:
            mutect.FStBias = '1.0'

    for ind in info:
        if ind.startswith("CONTQ="):
            mutect.CONTQ = ind.split('=')[1]
        if ind.startswith("ECNT="):
            mutect.ECNT = ind.split('=')[1]
        if ind.startswith("GERMQ="):
            mutect.GERMQ = ind.split('=')[1]
        if ind.startswith("MBQ="):
            mutect.MBQ_ref, mutect.MBQ_alt = ind.split('=')[1].split(',')
        if ind.startswith("MFRL="):
            mutect.MFRL_ref, mutect.MFRL_alt = ind.split('=')[1].split(',')
        if ind.startswith("MMQ="):
            mutect.MMQ_ref, mutect.MMQ_alt = ind.split('=')[1].split(',')
        if ind.startswith("MPOS="):
            mutect.MPOS = ind.split('=')[1]
        if ind.startswith("POPAF="):
            mutect.POPAF = ind.split('=')[1]
        if ind.startswith("PON"):
            mutect.PON = '1'
        if ind.startswith("RPA="):
            mutect.RPA_ref, mutect.RPA_alt = ind.split('=')[1].split(',')
        if ind.startswith("RU="):
            mutect.RU = ind.split('=')[1]
        if ind == "STR":
            mutect.STR = '1'
        if ind.startswith("SEQQ="):
            mutect.SEQQ = float(ind.split('=')[1])
        if ind.startswith("STRANDQ="):
            mutect.STRANDQ = float(ind.split('=')[1])
        if ind.startswith("STRQ="):
            mutect.STRQ = float(ind.split('=')[1])
        if ind.startswith("TLOD="):
            mutect.tumor_lod = float(ind.split('=')[1])
    return mutect
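# Worked sketch of the strand-bias statistic above (a GATK-style
# StrandOddsRatio), on made-up strand counts; the +1 terms are pseudocounts.
import numpy as np

ro_f, ro_r, ao_f, ao_r = 30., 28., 12., 2.  # hypothetical ref/alt forward/reverse counts
R = ((ro_f + 1) * (ao_r + 1)) / ((ro_r + 1) * (ao_f + 1))
symmetric_ratio = R + 1 / R
ref_ratio = min(ro_f + 1, ro_r + 1) / max(ro_f + 1, ro_r + 1)
alt_ratio = min(ao_f + 1, ao_r + 1) / max(ao_f + 1, ao_r + 1)
StOR = np.log(symmetric_ratio) + np.log(ref_ratio) - np.log(alt_ratio)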
def pvalue_fisher(x, a):
    # n1 and n0 are assumed to be module-level totals for the two groups
    a0 = x - a
    oddsratio, pvalue_f = stats.fisher_exact([[a, n1 - a], [a0, n0 - a0]])
    return pvalue_f
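# Minimal usage sketch, with the globals the function relies on made explicit
# (all values hypothetical):
n1, n0 = 100, 200              # group totals
p = pvalue_fisher(x=30, a=20)  # 20 of the 30 events fell in group 1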
def frac_correct(bdata):
    '''
    Count the correct and valid trials overall for a bdata object.

    Args:
        bdata (jaratoolbox.loadbehavior.BehaviorData dict): the behavior data to use
    Returns:
        nCorrect (int): Number of correct trials
        nValid (int): Number of valid trials
    '''
    correct = bdata['outcome'] == bdata.labels['outcome']['correct']
    nCorrect = sum(correct)
    valid = bdata['valid']
    nValid = sum(valid)
    # return nCorrect/float(nValid)
    return nCorrect, nValid


ncs, nvs = frac_correct(sdata)
ncm, nvm = frac_correct(mdata)
nis = nvs - ncs  # incorrect trials, first session
nim = nvm - ncm  # incorrect trials, second session

from scipy.stats import fisher_exact

oddsratio, pval = fisher_exact([[ncs, nis], [ncm, nim]])
print(oddsratio)
print(pval)
def time_fisher_exact(self, alternative):
    # timing hook; self.a is assumed to hold a prepared 2x2 table
    oddsratio, pvalue = stats.fisher_exact(self.a, alternative=alternative)
def main():
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", e.g. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-g", "--group", action="store", type="string",
                      dest="group_file",
                      help="Group file defining the biological groups of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\".")
    parser.add_option("-o", "--output", action="store", type='string',
                      dest="out_file", help="Prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # print (options.paired)
    # print (options.welch_ttest)
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    FOUT = open(options.out_file + '.pval.txt', 'w')
    # ROUT = open(options.out_file + '.r', 'w')

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (s, g) = read_grp_file1(options.group_file)
    s2g = dict(zip(s, g))
    g2s = collections.defaultdict(list)
    for k, v in s2g.items():
        g2s[v].append(k)

    group_IDs = sorted(g2s.keys())
    for g in group_IDs:
        print("\tGroup %s has %d samples:" % (g, len(g2s[g])))
        print('\t\t' + ','.join(g2s[g]))
    if len(group_IDs) != 2:
        printlog("You must have two groups!", file=sys.stderr)
        sys.exit(1)

    line_num = 1
    probe_list = []
    p_list = []
    or_list = []
    for l in ireader.reader(options.input_file):
        f = l.split()
        if line_num == 1:
            sample_IDs = f[1:]
            # check if sample ID matches
            for s in s2g:
                if s not in sample_IDs:
                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                    sys.exit(3)
        else:
            cg_id = f[0]
            probe_list.append(cg_id)
            proportions = f[1:]
            methyl_reads = 0
            unmethyl_reads = 0
            g2values = collections.defaultdict(dict)
            for g in group_IDs:
                g2values[g]['methyl'] = 0
                g2values[g]['unmethyl'] = 0
            for s, p in zip(sample_IDs, proportions):
                gid = s2g[s]
                m = re.match(r'(\d+)\s*\,\s*(\d+)', p)
                if m is None:
                    continue
                else:
                    c = int(m.group(1))
                    n = int(m.group(2))
                    if n >= c and n > 0:
                        g2values[gid]['methyl'] += c
                        g2values[gid]['unmethyl'] += (n - c)
                    else:
                        printlog("Incorrect data format!")
                        print(f)
                        sys.exit(1)
            (odds, pval) = stats.fisher_exact(
                [[g2values[group_IDs[0]]['methyl'], g2values[group_IDs[0]]['unmethyl']],
                 [g2values[group_IDs[1]]['methyl'], g2values[group_IDs[1]]['unmethyl']]])
            # print (g2values[group_IDs[0]]['methyl'], g2values[group_IDs[0]]['unmethyl'], g2values[group_IDs[1]]['methyl'], g2values[group_IDs[1]]['unmethyl'])
            p_list.append(pval)
            or_list.append(odds)
        line_num += 1

    printlog("Perform Benjamini-Hochberg (aka FDR) correction ...")
    adjusted_p = {}
    q_list = padjust.multiple_testing_correction(p_list)
    for id, o, p, q in zip(probe_list, or_list, p_list, q_list):
        adjusted_p[id] = '\t'.join([str(i) for i in (o, p, q)])

    printlog("Writing to %s" % (options.out_file + '.pval.txt'))
    line_num = 1
    for l in ireader.reader(options.input_file):
        if line_num == 1:
            print(l + '\tOddsRatio\tpval\tadj.pval', file=FOUT)
        else:
            f = l.split()
            probe_ID = f[0]
            print(l + '\t' + adjusted_p[probe_ID], file=FOUT)
        line_num += 1
    FOUT.close()
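# Hypothetical input sketch matching the format the parser above expects
# ("methyl_count,total_count" cells, whitespace-separated, header row of
# sample IDs):
#
#   probeID   s1      s2      s3      s4
#   cg001     20,30   15,30   5,30    8,30
#   cg002     1,25    2,25    20,25   18,25
#
# and a group file like:
#
#   sample,group
#   s1,case
#   s2,case
#   s3,ctrl
#   s4,ctrl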