def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
    """
    Collect the summary statistics for one set of p-values.

    All of the actual computation is delegated to helpers in
    analyzePhenotype; this function only wires them together.

    pvals         -- the p-values to summarise.
    exp_quantiles -- expected quantiles, handed to _estAreaBetweenCurves_
                     together with 1000 quantiles taken from pvals.
    exp_median    -- expected median, handed to _calcMedian_.
    exp_pvals     -- expected p-values, handed to _calcKS_ and
                     _estLogSlope_.

    Returns the tuple (m, a, ks_stat, ks_pvalue, s) where
      m         is the result of _calcMedian_,
      a         is the result of _estAreaBetweenCurves_,
      ks_stat   is the "D" entry of the _calcKS_ result,
      ks_pvalue is the "p.value" entry of the _calcKS_ result,
      s         is the result of _estLogSlope_ minus 1.0.
    """
    median_stat = analyzePhenotype._calcMedian_(pvals, exp_median)
    ks_result = analyzePhenotype._calcKS_(pvals, exp_pvals)
    slope_stat = analyzePhenotype._estLogSlope_(pvals, exp_pvals) - 1.0
    ks_d, ks_p = ks_result["D"], ks_result["p.value"]

    # The expected quantiles are supplied by the caller; only the observed
    # ones are computed here.
    observed_quantiles = analyzePhenotype._getQuantiles_(pvals, 1000)
    area_stat = analyzePhenotype._estAreaBetweenCurves_(observed_quantiles, exp_quantiles)

    return (median_stat, area_stat, ks_d, ks_p, slope_stat)
def _perm_test_(all_snps,phenVals,numPerm,outputFile,filter=0.1,test_type="KW",savePermutations=False,useSameSnps=False):
    """
    Permutation test for the p-value distribution of an association scan.

    The association test is run once on the observed phenotype values and
    then numPerm times on shuffled phenotype values.  The p-values of all
    permutations are pooled and used as the expected distribution; the
    statistics (m, a, ks_stat, s) of the observed run and of every
    permutation are computed against that pooled distribution, and an
    empirical p-value is estimated for each statistic.

    all_snps         -- sequence of SNPs passed to the association test.
    phenVals         -- phenotype values.  A copy is shuffled for the
                        permutations; the caller's object is not modified.
    numPerm          -- number of permutations (must be >= 1).
    outputFile       -- file name prefix for all output files.
    filter           -- if < 1.0, a random sample of
                        int(len(all_snps)*filter) SNPs is used instead of
                        all SNPs.  A fresh sample is drawn for the observed
                        run and for every permutation.
    test_type        -- "KW" (util.kruskal_wallis) or "Fisher" (run_fet).
    savePermutations -- if true, the p-values of every permutation are
                        written to outputFile+".perm.pvals", one
                        comma-separated line per permutation.
    useSameSnps      -- NOTE(review): accepted but not used anywhere in this
                        function; SNPs are re-sampled for every permutation
                        regardless of its value.  Confirm the intent.

    Files written:
      outputFile+".perm.stat.txt"  -- per-permutation statistics, observed
                                      values and estimated p-values.
      <stat file>+".perm.m.png", ".perm.a.png", ".perm.ks.png",
      ".perm.s.png"                -- histograms of each statistic.  The
                                      PNG names are built from the stat
                                      file name (i.e. they contain
                                      ".perm.stat.txt"); this is kept as is
                                      so existing output paths do not move.

    Raises ValueError if test_type is not recognised or numPerm < 1.
    Returns None.
    """
    import copy

    # Fail early: an unknown test_type used to surface much later as an
    # unbound-variable error, and numPerm < 1 as a ZeroDivisionError after
    # output had already been partially written.
    if test_type not in ("KW", "Fisher"):
        raise ValueError("Unknown test_type %r (expected 'KW' or 'Fisher')" % (test_type,))
    if numPerm < 1:
        raise ValueError("numPerm must be at least 1, got %r" % (numPerm,))

    num_quantiles = 1000

    def _calc_statistics_(pvals, exp_quantiles, exp_median=0.5, exp_pvals=None):
        # Returns (m, a, ks_stat, ks_pvalue, s) for one set of p-values.
        m = analyzePhenotype._calcMedian_(pvals, exp_median)
        ks_res = analyzePhenotype._calcKS_(pvals, exp_pvals)
        s = analyzePhenotype._estLogSlope_(pvals, exp_pvals) - 1.0
        ks_stat = ks_res["D"]
        ks_pvalue = ks_res["p.value"]
        obs_quantiles = analyzePhenotype._getQuantiles_(pvals, num_quantiles)
        a = analyzePhenotype._estAreaBetweenCurves_(obs_quantiles, exp_quantiles)
        return (m, a, ks_stat, ks_pvalue, s)

    def _pick_snps_():
        # Draw a random subset of the SNPs when filtering is requested.
        if filter < 1.0:
            subset = random.sample(all_snps, int(len(all_snps) * filter))
            print("Number of SNPs: %s" % len(subset))
            return subset
        return all_snps

    def _run_assoc_test_(snps, phen):
        # Run the selected association test, report the time it took and
        # return its p-values.
        if test_type == "KW":
            print("running KW")
            t1 = time.time()
            result = util.kruskal_wallis(snps, phen)["ps"]
        else:
            print("running Fisher's exact test")
            t1 = time.time()
            result = run_fet(snps, phen)
        t2 = time.time()
        print("Took %s seconds." % (t2 - t1))
        return result

    # Observed (unpermuted) p-values.
    true_pvals = _run_assoc_test_(_pick_snps_(), phenVals)

    # Shuffle a copy so the caller's phenotype values keep their order.
    # copy.copy preserves the container type (list, array, ...).
    perm_phen = copy.copy(phenVals)
    perm_pvalues_list = []
    for i in range(numPerm):
        snps = _pick_snps_()
        print(i)
        random.shuffle(perm_phen)  # Permute phenotype
        perm_pvalues_list.append(_run_assoc_test_(snps, perm_phen))

    # Pool the permuted p-values; they serve as the expected distribution.
    print("Combining p-values")
    all_pvals = [pval for pvals in perm_pvalues_list for pval in pvals]
    print("%s permuted pvals in all" % len(all_pvals))
    quantiles = analyzePhenotype._getQuantiles_(all_pvals, num_quantiles)
    print("len(quantiles): %s" % len(quantiles))
    mid = num_quantiles // 2
    exp_median = (quantiles[mid - 1] + quantiles[mid]) / 2.0

    (true_m, true_a, true_ks_stat, _true_ks_pvalue, true_s) = _calc_statistics_(
        true_pvals, quantiles, exp_median, all_pvals)
    perm_stats = [_calc_statistics_(pvals, quantiles, exp_median, all_pvals)
                  for pvals in perm_pvalues_list]
    m_list = [stats[0] for stats in perm_stats]
    a_list = [stats[1] for stats in perm_stats]
    ks_stat_list = [stats[2] for stats in perm_stats]
    s_list = [stats[4] for stats in perm_stats]
    # The pooled p-values can be large; release them before plotting.
    del all_pvals, quantiles, perm_stats

    if savePermutations:
        permOutputFile = outputFile + ".perm.pvals"
        print("Writing to %s" % permOutputFile)
        with open(permOutputFile, "w") as f:
            for pvals in perm_pvalues_list:
                f.write(",".join(map(str, pvals)) + "\n")
        print("Done writing to %s" % permOutputFile)

    def _tail_fraction_(observed, permuted):
        # Fraction of permuted values that are at least as large as the
        # observed one.  Counting first and dividing once avoids the
        # rounding drift of repeatedly adding 1.0/numPerm.
        hits = sum(1 for value in permuted if observed <= value)
        return hits / float(numPerm)

    # Output results
    outputFile = outputFile + ".perm.stat.txt"
    with open(outputFile, "w") as f:
        f.write("Perm_nr, median, area, ks_stat, s_stat \n")
        for i in range(numPerm):
            row = [i, m_list[i], a_list[i], ks_stat_list[i], s_list[i]]
            f.write(", ".join(map(str, row)) + "\n")
        f.write("\n" + "Observed values: " + str((true_m, true_a, true_ks_stat, true_s)) + "\n")

        est_pvals = [
            # M stat: two sided, compares absolute values (assumes a
            # symmetric distribution).
            _tail_fraction_(abs(true_m), [abs(m) for m in m_list]),
            # A stat: one tailed.
            _tail_fraction_(true_a, a_list),
            # KS stat: one tailed.
            _tail_fraction_(true_ks_stat, ks_stat_list),
            # S stat: compares |log(s+1)|, i.e. deviations in either
            # direction on the log scale.
            _tail_fraction_(abs(math.log(true_s + 1.0)),
                            [abs(math.log(s + 1.0)) for s in s_list]),
        ]
        # No permutation reached the observed value: report half of the
        # smallest resolvable p-value instead of zero.
        est_pvals = [p if p != 0.0 else 0.5 * (1.0 / numPerm) for p in est_pvals]
        f.write("\n" + "Estimated p-values: " + ",".join(map(str, est_pvals)) + "\n")

    # Plot results
    pngFile_median = outputFile + ".perm.m.png"
    pngFile_area = outputFile + ".perm.a.png"
    pngFile_ks = outputFile + ".perm.ks.png"
    pngFile_s = outputFile + ".perm.s.png"

    def _getBinning_(n_bins, min_val, max_val):
        # n_bins+2 equally spaced bin edges, starting half a bin width
        # below min_val.
        delta = (max_val - min_val) / float(n_bins)
        start_val = min_val - delta * 0.5
        bins = [start_val + delta * k for k in range(n_bins + 2)]
        return (bins, delta)

    n_bins = 20 + int(4 * (math.log(numPerm)))

    import matplotlib
    matplotlib.use('Agg')  # Non-interactive backend; only files are written.
    import matplotlib.pyplot as plt

    def _plot_histogram_(perm_values, observed, png_file):
        # Histogram of the permuted statistic with the observed value
        # drawn on top of it, saved as a PNG.
        min_val = min(min(perm_values), observed)
        max_val = max(max(perm_values), observed)
        (bins, delta) = _getBinning_(n_bins, min_val, max_val)
        print(str((bins, delta)))
        fig = plt.figure(figsize=(10, 7))
        plt.hist(perm_values + [observed], bins=bins, label="permutations + observed")
        plt.hist([observed], bins=bins, label="observed")
        # The legend has to be drawn before saving to show up in the file.
        plt.legend()
        plt.savefig(png_file, format="png")
        # Close (not just clear) the figure so it does not stay in memory.
        plt.close(fig)

    _plot_histogram_(m_list, true_m, pngFile_median)
    _plot_histogram_(a_list, true_a, pngFile_area)
    _plot_histogram_(ks_stat_list, true_ks_stat, pngFile_ks)
    _plot_histogram_(s_list, true_s, pngFile_s)