def run_kw(snpsd,phend,phen_i,chromosome,with_missing_vals=True): pvals = [] print "Running KW on",len(snpsd.snps),"snps." if with_missing_vals: snpsd_indices = [] for i in range(len(snpsd.snps)): (snp,phen_vals) = snpsd.get_snp_phen_pair(i,phend,phen_i,missingVal='NA') # print "Phen NA count:",phen_vals.count("NA") # print "SNP NA count:",snp.count("NA") # print "Running KW on",len(phen_vals),"phenotype values." if len(set(snp))>1 and len(set(phen_vals))>1: res = util.kruskal_wallis([snp],phen_vals) pvals.append(res["ps"][0]) snpsd_indices.append(i) else: pvals.append(1) #snpsd.filter_snp_indices(snpsd_indices) else: res = util.kruskal_wallis(snpsd.snps,phen_vals) pvals = res["ps"] #print pvals gwas_result = gwaResults.Result(snpsds=[snpsd],name="KW_"+str(phen_i),phenotypeID=phen_i,scores=pvals,chromosomes=[chromosome]) #gwas_result = gwaResults.Result(name="KW_"+str(phen_i),phenotypeID=phen_i,scores=pvals,chromosomes=[chromosome]) #return {'ps':pvals,'positions':positions,'snpsd_indices':snpsd_indices} return gwas_result, snpsd
def get_perm_pvals(snps, phen_vals, mapping_method='kw', num_perm=100, snps_filter=0.05):
    """
    Collect p-values from `num_perm` phenotype permutations.

    snps -- sequence of SNPs; a random subset is used when snps_filter < 1.0.
    phen_vals -- phenotype values; a copy is shuffled, the argument itself is
        left untouched.
    mapping_method -- 'kw' uses util.kruskal_wallis, 'ft' uses run_fet; any
        other value performs no test and yields an empty list.
    num_perm -- number of permutations.
    snps_filter -- fraction of SNPs to sample when below 1.0.

    Returns a flat list with the p-values of all permutations concatenated.
    """
    import random

    if snps_filter < 1.0:
        snps = random.sample(snps, int(snps_filter * len(snps)))

    # Shuffle a private copy so the caller's phenotype order is not destroyed.
    shuffled_vals = list(phen_vals)
    pvals = []
    if mapping_method == 'kw':
        for _ in range(num_perm):
            random.shuffle(shuffled_vals)
            kw_res = util.kruskal_wallis(snps, shuffled_vals, verbose=False)
            pvals.extend(kw_res['ps'])
    elif mapping_method == 'ft':
        for _ in range(num_perm):
            random.shuffle(shuffled_vals)
            pvals.extend(run_fet(snps, shuffled_vals))
    return pvals
def _robustness_test_(all_snps,phenVals,outputFile,filter=0.1,test_type = "KW",): """ Leave one out test.. """ new_all_snps = [] for snp in all_snps: if snp.count(0)>1 and snp.count(1)>1: new_all_snps.append(snp) print "Filtered",len(all_snps)-len(new_all_snps)," with minor allele count <2." all_snps = new_all_snps if filter <1.0: snps = random.sample(all_snps,int(len(all_snps)*filter)) print "Number of SNPs:",len(snps) else: snps = all_snps if test_type=="KW": print "running KW" t1 = time.time() true_pvals = util.kruskal_wallis(snps,phenVals)["ps"] t2 = time.time() print "Took",t2-t1,"seconds." elif test_type=="Fisher": print "running Fisher's exact test" t1 = time.time() true_pvals = run_fet(snps,phenVals) t2 = time.time() print "Took",t2-t1,"seconds." log_true_pvals = [] for pval in true_pvals: log_true_pvals.append(-math.log(pval,10)) perm_pvalues_list = [] for i in range(0,len(phenVals)): newPhenvals = phenVals[:] newPhenvals.pop(i) newSNPs = [] for snp in snps: newSNP = snp[:] newSNP.pop(i) newSNPs.append(newSNP) print i if test_type=="KW": print "running KW" t1 = time.time() pvals = util.kruskal_wallis(newSNPs,newPhenvals)["ps"] t2 = time.time() print "Took",t2-t1,"seconds." elif test_type=="Fisher": print "running Fisher's exact test" t1 = time.time() pvals = run_fet(newSNPs,newPhenvals) t2 = time.time() print "Took",t2-t1,"seconds." perm_pvalues_list.append(pvals) delta_pvals_list = [] delta_log_pvals_list = [] for perm_pvals in perm_pvalues_list: log_pvals = [] delta_pvals = [] delta_log_pvals = [] for i in range(0,len(true_pvals)): pval = perm_pvals[i] true_pval = true_pvals[i] delta_pvals.append(true_pval-pval) log_true_pval = log_true_pvals[i] if pval > 0.0: log_pval = -math.log(pval,10) else: print "Damn those random 0 prob. 
events: event #", i log_pval = -math.log(true_pval,10) log_pvals.append(log_pval) delta_log_pvals.append(log_true_pval-log_pval) delta_pvals_list.append(delta_pvals) delta_log_pvals_list.append(delta_log_pvals) sd_log_pvals = [] sd_pvals = [] t_delta_log_pvals_list = map(list,zip(*delta_log_pvals_list)) t_delta_pvals_list = map(list,zip(*delta_pvals_list)) for i in range(0,len(true_pvals)): sd_log_pvals.append(util.calcSD(t_delta_log_pvals_list[i])) sd_pvals.append(util.calcSD(t_delta_pvals_list[i])) #Write SDs out to file, to be able to replot, or plot together with other methods... etc import csv sd_log_pval_file = outputFile+".rob.log_pvals_sd" f = open(sd_log_pval_file,"w") w = csv.writer(f) w.writerow(["log_true_pval","sd_log_pvals"]) l = zip(log_true_pvals,sd_log_pvals) w.writerows(l) f.close() #Plot things.... pngFile_log_pvals = outputFile+".rob.log_pval.png" pngFile_pval = outputFile+".rob.pval.png" pngFile_sd_log_pval = outputFile+".rob.sd_log_pval.png" pngFile_sd_pval = outputFile+".rob.sd_pval.png" min_val = min(true_pvals) max_val = max(true_pvals) val_range = max_val-min_val min_log_val = min(log_true_pvals) max_log_val = max(log_true_pvals) log_val_range = max_val-min_val import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt plt.figure(figsize=(10,7)) max_perm_val = 0 min_perm_val = 0 for i in range(0,len(perm_pvalues_list)): delta_log_pvals = delta_log_pvals_list[i] plt.plot(log_true_pvals,delta_log_pvals,"b.") max_perm_val = max(max_perm_val,max(delta_log_pvals)) min_perm_val = min(min_perm_val,min(delta_log_pvals)) perm_val_range = max_perm_val - min_perm_val plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range]) plt.savefig(pngFile_log_pvals, format = "png") plt.figure(figsize=(10,7)) max_perm_val = 0 min_perm_val = 0 for i in range(0,len(perm_pvalues_list)): delta_pvals = delta_pvals_list[i] plt.plot(true_pvals,delta_pvals,"b.") max_perm_val 
= max(max_perm_val,max(delta_pvals)) min_perm_val = min(min_perm_val,min(delta_pvals)) perm_val_range = max_perm_val - min_perm_val plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range]) plt.savefig(pngFile_pval, format = "png") plt.figure(figsize=(10,7)) max_sd_log_pval = max(sd_log_pvals) min_sd_log_pval = min(sd_log_pvals) sd_val_range = max_sd_log_pval-min_sd_log_pval plt.plot(log_true_pvals,sd_log_pvals,"b.") plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_sd_log_pval-0.02*sd_val_range, max_sd_log_pval+0.02*sd_val_range]) plt.savefig(pngFile_sd_log_pval, format = "png") plt.figure(figsize=(10,7)) max_sd_pval = max(sd_pvals) min_sd_pval = min(sd_pvals) sd_val_range = max_sd_pval-min_sd_pval plt.plot(true_pvals,sd_pvals,"b.") plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_sd_pval-0.02*sd_val_range, max_sd_pval+0.02*sd_val_range]) plt.savefig(pngFile_sd_pval, format = "png")
def _run_(): if len(sys.argv)==1: print __doc__ sys.exit(2) long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "addToDB", "callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , "subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", "onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "permTest=", "savePermutations", "permutationFilter=", "testRobustness", "memReq=","walltimeReq=",] try: opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType=1 outputFile=None delim="," missingVal="NA" help=0 parallel=None parallelAll=False addToDB=False callMethodID=None comment="" subSample=None onlyOriginal96=False onlyOriginal192 = False subSampleLikePhenotype = None subsampleTest = False numSubSamples = None complement = False onlyBelowLatidue = None onlyAboveLatidue = None sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 permTest = None savePermutations = False permutationFilter = 1.0 testRobustness = False memReq = "5g" walltimeReq = "100:00:00" for opt, arg in opts: if opt in ("-h", "--help"): help=1 print __doc__ elif opt in ("-o", "--outputFile"): outputFile=arg elif opt in ("--phenotypeFileType"): phenotypeFileType=int(arg) elif opt in ("--parallel"): parallel=arg elif opt in ("--parallelAll"): parallelAll=True elif opt in ("--addToDB"): addToDB=True elif opt in ("--onlyOriginal96"): onlyOriginal96=True elif opt in ("--onlyOriginal192"): onlyOriginal192=True elif opt in ("--complement"): complement=True elif opt in ("--subSample"): subSample=int(arg) elif opt in ("--subsampleTest"): subsampleTest = True l = arg.split(",") subSample=int(l[0]) numSubSamples=int(l[1]) elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue=float(arg) elif opt in ("--onlyAboveLatidue"): 
onlyAboveLatidue=float(arg) elif opt in ("--subSampleLikePhenotype"): subSampleLikePhenotype=int(arg) elif opt in ("--callMethodID"): callMethodID=int(arg) elif opt in ("--comment"): comment=arg elif opt in ("-d", "--delim"): delim=arg elif opt in ("-m", "--missingval"): missingVal=arg elif opt in ("--sr"): sr = True elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permTest"): permTest = int(arg) elif opt in ("--savePermutations"): savePermutations = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--memReq"): memReq=arg elif opt in ("--walltimeReq"): walltimeReq=arg else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) snpsDataFile=args[0] phenotypeDataFile=args[1] print "Kruskal-Wallis is being set up with the following parameters:" print "phenotypeDataFile:",phenotypeDataFile print "snpsDataFile:",snpsDataFile print "parallel:",parallel print "parallelAll:",parallelAll print "onlyOriginal96:",onlyOriginal96 print "onlyOriginal192:",onlyOriginal192 print "onlyBelowLatidue:",onlyBelowLatidue print "onlyAboveLatidue:",onlyAboveLatidue print "complement:",complement print "subSampleLikePhenotype:",subSampleLikePhenotype print "subsampleTest:",subsampleTest print "numSubSamples:",numSubSamples print "subSample:",subSample print "sr:",sr print "srSkipFirstRun:",srSkipFirstRun print "srInput:",srInput print "srOutput:",srOutput print "srTopQuantile:",srTopQuantile print "srWindowSize:",srWindowSize print "permTest:",permTest print "savePermutations:",savePermutations print "permutationFilter:",permutationFilter print "testRobustness:",testRobustness 
print "walltimeReq:",walltimeReq print "memReq:",memReq def runParallel(phenotypeIndex,id=""): #Cluster specific parameters phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data phenName=phed.getPhenotypeName(phenotypeIndex) print phenName outputFile=resultDir+"KW_"+parallel+"_"+phenName+id shstr = "#!/bin/csh\n" shstr += "#PBS -l walltime="+walltimeReq+"\n" shstr += "#PBS -l mem="+memReq+"\n" shstr +="#PBS -q cmb\n" shstr+="#PBS -N K"+phenName+"_"+parallel+"\n" shstr+="set phenotypeName="+parallel+"\n" shstr+="set phenotype="+str(phenotypeIndex)+"\n" shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" " if subSample: shstr+=" --subSample="+str(subSample)+" " elif onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " elif onlyAboveLatidue: shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" " if complement: shstr+=" --complement " if permTest: shstr+=" --permTest="+str(permTest)+" " if savePermutations: shstr+=" --savePermutations " shstr+=" --permutationFilter="+str(permutationFilter)+" " if testRobustness: shstr+=" --testRobustness " if sr: shstr += " --sr " if not srOutput: output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"KW_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n" f=open(parallel+".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") if parallel: #Running on the cluster.. 
if parallelAll: phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) elif subsampleTest: phenotypeIndex=int(args[2]) for i in range(0,numSubSamples): runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i)) else: phenotypeIndex=int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex=int(args[2]) print "phenotypeIndex:",phenotypeIndex print "output:",outputFile print "\nStarting program now!\n" #Load phenotype file phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str,original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str,original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) 
phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) elif onlyAboveLatidue: print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if subSampleLikePhenotype: p_name = phed.getPhenotypeName(subSampleLikePhenotype) print "Picking sample as in",p_name ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype) print ecotypes phed.filterAccessions(ecotypes) print "len(phed.accessions)", len(phed.accessions) if subSample: sample_ecotypes = [] ecotypes = phed.getNonNAEcotypes(phenotypeIndex) sample_ecotypes = random.sample(ecotypes,subSample) phed.filterAccessions(sample_ecotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() #Load genotype file snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal) #Checking overlap between phenotype and genotype accessions. phenotype=phed.getPhenIndex(phenotypeIndex) accIndicesToKeep=[] phenAccIndicesToKeep=[] numAcc=len(snpsds[0].accessions) sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1=snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2=phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. 
for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all." print "Filtering phenotype data." phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping=[] i=0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i+=1 phed.orderAccessions(accessionMapping) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Converting format to 01 newSnpsds=[] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" #Double check genotype file: problems = 0 for i in range(0,len(newSnpsds)): snpsd = newSnpsds[i] for j in range(0,len(snpsd.snps)): snp = snpsd.snps[j] sc = snp.count(0) if sc==0 or sc==len(snp): print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i] problems += 1 if problems >0: print "Genotype file appears to have potential problems" else: print "Genotype file appears to be good" if permTest: print "Starting a permutation test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" permTest = 100 _perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter) sys.exit(0) if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" 
_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter) sys.exit(0) sys.stdout.flush() print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun if (not sr) or (sr and not srSkipFirstRun): #Writing files #phed and phenotype sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5]) phenotypeName=phed.getPhenotypeName(phenotypeIndex) if phed.isBinary(phenotypeIndex): pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex)) else: snps = sd.getSnps() phen_vals = phed.getPhenVals(phenotypeIndex) try: kw_res = util.kruskal_wallis(snps,phen_vals) pvals = kw_res['ps'] except: print snps print phen_vals print len(snps),len(snps[0]),len(phen_vals) raise Exception res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False) pvalFile=outputFile+".pvals" res.writeToFile(pvalFile) print "Generating a GW plot." res.negLogTransform() pngFile = pvalFile+".png" plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False) srInput = pvalFile else: print "Skipping first stage analysis." sys.stdout.flush() if sr: _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary) print "Generating second run GW plot." res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex) srRes.negLogTransform() srPngFile = pvalFile+".sr.png" plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
def _perm_test_(all_snps,phenVals,numPerm,outputFile,filter=0.1,test_type = "KW",savePermutations=False,useSameSnps=False): def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None): m = analyzePhenotype._calcMedian_(pvals,exp_median) ks_res = analyzePhenotype._calcKS_(pvals,exp_pvals) s = analyzePhenotype._estLogSlope_(pvals,exp_pvals)-1.0 ks_stat = ks_res["D"] ks_pvalue = ks_res["p.value"] quantiles = analyzePhenotype._getQuantiles_(pvals, 1000) #exp_quantiles = analyzePhenotype.__getExpectedPvalueQuantiles__(1000) a = analyzePhenotype._estAreaBetweenCurves_(quantiles,exp_quantiles) return (m,a,ks_stat,ks_pvalue,s) if filter <1.0: snps = random.sample(all_snps,int(len(all_snps)*filter)) print "Number of SNPs:",len(snps) else: snps = all_snps #Calc norm stats, and est. p-value # print "running old KW" # t1 = time.time() # pvals = analyzeHaplotype._run_kw_(snps,phenVals) # t2 = time.time() # print "Took",t2-t1,"seconds." if test_type=="KW": print "running KW" t1 = time.time() true_pvals = util.kruskal_wallis(snps,phenVals)["ps"] t2 = time.time() print "Took",t2-t1,"seconds." elif test_type=="Fisher": print "running Fisher's exact test" t1 = time.time() true_pvals = run_fet(snps,phenVals) t2 = time.time() print "Took",t2-t1,"seconds." perm_pvalues_list = [] for i in range(0,numPerm):#For every perm if filter <1.0: snps = random.sample(all_snps,int(len(all_snps)*filter)) print "Number of SNPs:",len(snps) print i random.shuffle(phenVals) #Permute phenotype #pvals = analyzeHaplotype._run_kw_(snps,phenVals) #Run KW if test_type=="KW": print "running KW" t1 = time.time() pvals = util.kruskal_wallis(snps,phenVals)["ps"] t2 = time.time() print "Took",t2-t1,"seconds." elif test_type=="Fisher": print "running Fisher's exact test" t1 = time.time() pvals = run_fet(snps,phenVals) t2 = time.time() print "Took",t2-t1,"seconds." 
perm_pvalues_list.append(pvals) print "Combining p-values" quantiles = [] all_pvals = [] for pvals in perm_pvalues_list: for pval in pvals: all_pvals.append(pval) print len(all_pvals),"permuted pvals in all" quantiles = analyzePhenotype._getQuantiles_(all_pvals, 1000) print "len(quantiles):", len(quantiles) exp_median = (quantiles[499]+quantiles[500])/2.0 (true_m,true_a,true_ks_stat,true_ks_pvalue,true_s) = _calc_statistics_(true_pvals,quantiles,exp_median,all_pvals) m_list = [] a_list = [] ks_stat_list = [] ks_pvalue_list = [] s_list = [] for i in range(0,numPerm): pvals = perm_pvalues_list[i] (m,a,ks_stat,ks_pvalue,s) = _calc_statistics_(pvals,quantiles,exp_median,all_pvals) #Calc. statistic m_list.append(m) a_list.append(a) s_list.append(s) ks_stat_list.append(ks_stat) ks_pvalue_list.append(ks_pvalue) del all_pvals,quantiles if savePermutations: permOutputFile = outputFile+".perm.pvals" print "Writing to",permOutputFile f = open(permOutputFile,"w") i = 0 for pvals in perm_pvalues_list: pvals_str = map(str,pvals) f.write(",".join(pvals_str)+"\n") print "Done writing to",permOutputFile f.close() #Output results outputFile = outputFile+".perm.stat.txt" f = open(outputFile,"w") f.write("Perm_nr, median, area, ks_stat, s_stat \n") for i in range(0,numPerm): str_l = map(str,[i, m_list[i],a_list[i],ks_stat_list[i],s_list[i]]) f.write(", ".join(str_l)+"\n") f.write("\n"+"Observed values: "+str((true_m,true_a,true_ks_stat,true_s))+"\n") pvals = [0.0,0.0,0.0,0.0] #M stat p-value (two sided) #Assuming symm. dist. 
for i in range(0,numPerm): if abs(true_m) <= abs(m_list[i]): pvals[0]+=1.0/numPerm #A stat p-value (one tailed) for i in range(0,numPerm): if true_a <= a_list[i]: pvals[1]+=1.0/numPerm #KS stat p-value (one tailed) for i in range(0,numPerm): if true_ks_stat <= ks_stat_list[i]: pvals[2]+=1.0/numPerm #S stat p-value (one tailed) for i in range(0,numPerm): if abs(math.log(true_s+1.0)) <= abs(math.log(s_list[i]+1.0)): pvals[3]+=1.0/numPerm for i in range(0,len(pvals)): if pvals[i] == 0.0: pvals[i] = 0.5*(1.0/numPerm) str_pvals = map(str,pvals) f.write("\n"+"Estimated p-values: "+",".join(str_pvals)+"\n") f.close() #Plot results pngFile_median = outputFile+".perm.m.png" pngFile_area = outputFile+".perm.a.png" pngFile_ks = outputFile+".perm.ks.png" pngFile_s = outputFile+".perm.s.png" def _getBinning_(n_bins,min_val,max_val): bins = [] delta = (max_val-min_val)/n_bins start_val = min_val-delta*0.5 for i in range(0,n_bins+2): bins.append(start_val+delta*i) return (bins,delta) n_bins = 20+int(4*(math.log(numPerm))) import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt min_val = min(min(m_list),true_m) max_val = max(max(m_list),true_m) (bins,delta) = _getBinning_(n_bins,min_val,max_val) print (bins,delta) plt.figure(figsize=(10,7)) plt.hist(m_list+[true_m], bins = bins)#, range=[start_val,end_val]) plt.hist([true_m], bins = bins)#, range=[start_val,end_val]) plt.savefig(pngFile_median, format = "png") plt.legend() plt.clf() min_val = min(min(a_list),true_a) max_val = max(max(a_list),true_a) (bins,delta) = _getBinning_(n_bins,min_val,max_val) print (bins,delta) plt.figure(figsize=(10,7)) plt.hist(a_list+[true_a], bins = bins) plt.hist([true_a], bins = bins) plt.savefig(pngFile_area, format = "png") plt.clf() min_val = min(min(ks_stat_list),true_ks_stat) max_val = max(max(ks_stat_list),true_ks_stat) (bins,delta) = _getBinning_(n_bins,min_val,max_val) print (bins,delta) plt.figure(figsize=(10,7)) plt.hist(ks_stat_list+[true_ks_stat], bins = bins) 
plt.hist([true_ks_stat], bins = bins) plt.savefig(pngFile_ks, format = "png") plt.clf() min_val = min(min(s_list),true_s) max_val = max(max(s_list),true_s) (bins,delta) = _getBinning_(n_bins,min_val,max_val) print (bins,delta) plt.figure(figsize=(10,7)) plt.hist(s_list+[true_s], bins = bins) plt.hist([true_s], bins = bins) plt.savefig(pngFile_s, format = "png") plt.clf()
from util import kruskal_wallis

# Script entry point: runs kruskal_wallis once for 'opentabs' and once for
# 'openwindows', passing the built-in `int` type as the second argument.
# NOTE(review): this call shape (name, type) differs from the
# kruskal_wallis(snps, phen_vals) calls elsewhere in this file -- presumably a
# different `util` module; confirm before relying on it.
if __name__ == '__main__':
    kruskal_wallis('opentabs', int)
    kruskal_wallis('openwindows', int)
def _perform_gwas_(phen_id, phenData, analysis_method, transformation, genotype, kinship_type, kinshipFile=None, messenger=None, outputfile=None): additional_columns = {} messenger.update_status(progress=0.0, task_status='Loading genotype data') genotypeData = dataParsers.load_snps_call_method(genotype) #genotypeData = dataParsers.load_hdf5_snps_call_method(genotype) K = None messenger.update_status(step=0.05, task_status='Preparing data') n_filtered_snps = _prepare_data_(genotypeData, phenData, phen_id) phen_vals = phenData.get_values(phen_id) if analysis_method in [ 'emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm', 'amm' ]: #Load genotype file (in binary format) sys.stdout.write("Retrieving the Kinship matrix K.\n") sys.stdout.flush() if kinshipFile: #Kinship file was supplied.. messenger.update_status( progress=0.15, task_status='Loading supplied kinship file: %s' % kinshipFile) print 'Loading supplied kinship file: %s' % kinshipFile K = kinship.load_kinship_from_file(kinshipFile, genotypeData.accessions) else: messenger.update_status(progress=0.15, task_status='Loading kinship file') print 'Loading kinship file.' 
K = kinship.get_kinship(call_method_id=genotype, method=kinship_type, n_removed_snps=n_filtered_snps, remain_accessions=genotypeData.accessions) sys.stdout.flush() sys.stdout.write("Done!\n") snps = genotypeData.getSnps() positions = genotypeData.getPositions() chromosomes = [] for i, (s, c) in enumerate( itertools.izip(genotypeData.snpsDataList, genotypeData.chromosomes)): chromosomes.extend([c] * len(s.snps)) maf_dict = genotypeData.get_mafs() if analysis_method in ['kw']: messenger.update_status(progress=0.7, task_status='Performing KW') res = util.kruskal_wallis(snps, phen_vals) elif analysis_method in ['loc_glob_mm']: raise NotImplementedError elif analysis_method in ['emma']: res = lm.emma(snps, phen_vals, K) elif analysis_method in ['emmax', 'amm']: d = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100) res = d['res'] #additional_columns['stats'] = d['stats'] elif analysis_method in ['lm']: d = lm.lin_reg_step(phen_vals, genotypeData, []) res = d['res'] #additional_columns['stats'] = d['stats'] else: raise Exception('analysis method %s not supported' % analysis_method) pvals = res['ps'] #Calculate Benjamini-Hochberg threshold bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05) #Calculate Median p-value med_pval = agr.calc_median(res['ps']) #Calculate the Kolmogorov-Smirnov statistic ks_res = agr.calc_ks_stats(res['ps']) quantiles_dict = _calculate_qqplot_data_(pvals) scores = map(lambda x: -math.log10(x), pvals) if analysis_method in ['lm', 'emma', 'emmax', 'amm']: additional_columns['genotype_var_perc'] = res['var_perc'] if 'betas' in res: betas = map(list, zip(*res['betas'])) additional_columns['beta0'] = betas[0] if len(betas) > 1: additional_columns['beta1'] = betas[1] #calculate ld if outputfile is None: outputfile = "%s.hdf5" % phen_id messenger.update_status(progress=0.8, task_status='Processing and saving results') _save_hdf5_pval_file(outputfile, analysis_method, transformation, chromosomes, positions, scores, maf_dict['marfs'], 
maf_dict['mafs'], quantiles_dict, ks_res, bh_thres_d['thes_pval'], med_pval, additional_columns)
def map_phenotype(p_i, phed, mapping_method, trans_method, p_dict): import copy phed = copy.deepcopy(phed) phenotype_name = phed.get_name(p_i) phen_is_binary = phed.is_binary(p_i) if trans_method == 'most_normal': trans_method, shapiro_pval = phed.most_normal_transformation(p_i, perform_trans=False) file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.get_name(p_i), mapping_method, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'], p_dict['call_method_id']) result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method) emmax_perm_threshold = None k = None res = None #Check whether result already exists. if p_dict['use_existing_results']: if p_dict['region_plots']: sd = _get_genotype_data_(p_dict) num_outliers = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'], p_dict['with_replicates']) if p_dict['remove_outliers']: assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA." snps = sd.getSnps() else: snps = None print "\nChecking for existing results." result_file = file_prefix + ".pvals" if os.path.isfile(result_file): res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps) pvals = True else: result_file = file_prefix + ".scores" if os.path.isfile(result_file): res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps) pvals = False if res: print "Found existing results.. (%s)" % (result_file) sys.stdout.flush() #Loading candidate genes cand_genes = None if p_dict['cand_genes_file']: cand_genes, tair_ids = gwaResults.load_cand_genes_file(p_dict['cand_genes_file']) else: cand_genes = None tair_ids = None if not res: #If results weren't found in a file... then do GWA. #Loading data sd = _get_genotype_data_(p_dict) num_outliers, n_filtered_snps = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'], p_dict['with_replicates']) #Do we need to calculate the K-matrix? 
if mapping_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm']: #Load genotype file (in binary format) sys.stdout.write("Retrieving the Kinship matrix K.\n") sys.stdout.flush() if p_dict['kinship_file']: #Kinship file was supplied.. print 'Loading supplied kinship file: %s' % p_dict['kinship_file'] k = kinship.load_kinship_from_file(p_dict['kinship_file'], sd.accessions) else: print 'Loading kinship file.' if p_dict['data_file'] != None: if p_dict['kinship_type'] == 'ibs': k = sd.get_ibs_kinship_matrix() elif p_dict['kinship_type'] == 'ibd': k = sd.get_ibd_kinship_matrix() else: k = kinship.get_kinship(call_method_id=p_dict['call_method_id'], data_format=p_dict['data_format'], method=p_dict['kinship_type'], n_removed_snps=n_filtered_snps, remain_accessions=sd.accessions) sys.stdout.flush() sys.stdout.write("Done!\n") if p_dict['remove_outliers']: if num_outliers == 0: print "No outliers were removed!" phen_vals = phed.get_values(p_i) if p_dict['local_gwas']: #Filter SNPs, etc.. sd = snpsdata.SNPsDataSet([sd.get_region_snpsd(*p_dict['local_gwas'])], [p_dict['local_gwas'][0]], data_format=sd.data_format) snps = sd.getSnps() sys.stdout.write("Finished loading and handling data!\n") print "Plotting a histogram" p_her = None hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'], p_dict['call_method_id']) hist_png_file = hist_file_prefix + "_hist.png" if k is not None: p_her = phed.get_pseudo_heritability(p_i, k)['pseudo_heritability'] p_her_pval = phed.get_pseudo_heritability(p_i, k)['pval'] phed.plot_histogram(p_i, png_file=hist_png_file, p_her=p_her, p_her_pval=p_her_pval) else: phed.plot_histogram(p_i, png_file=hist_png_file) print "Applying %s to data." 
% (mapping_method) sys.stdout.flush() kwargs = {} additional_columns = [] if "kw" == mapping_method: if phen_is_binary: warnings.warn("Warning, applying KW to a binary phenotype") kw_res = util.kruskal_wallis(snps, phen_vals) pvals = kw_res['ps'] kwargs['statistics'] = kw_res['ds'] additional_columns.append('statistics') elif "ft" == mapping_method: raise NotImplementedError # pvals, or_est = run_fet(snps, phen_vals) # kwargs['odds_ratio_est'] = or_est # additional_columns.append('odds_ratio_est') else: #Parametric tests below: if mapping_method in ['emma', 'emmax', 'emmax_perm', 'emmax_step', 'emmax_anova', 'loc_glob_mm']: r = lm.mm_lrt_test(phen_vals, k) if r['pval'] > 0.05: print "Performing EMMA, even though a mixed model does not fit the data significantly better" print 'p-value: %0.3f' % r['pval'] else: print 'The mixed model fits the data significantly better than the simple linear model.' print 'p-value: %f' % r['pval'] if mapping_method in ['loc_glob_mm']: res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix=file_prefix, global_k=k, window_size=p_dict['loc_glob_ws'], jump_size=p_dict['loc_glob_ws'] / 2, kinship_method=p_dict['kinship_type']) res_file_name = file_prefix + '.csv' _write_res_dict_to_file_(res_file_name, res_dict) return elif mapping_method in ['emma']: res = lm.emma(snps, phen_vals, k) elif mapping_method in ['emmax']: if p_dict['emmax_perm']: perm_sd = _get_genotype_data_(p_dict) num_outliers = prepare_data(perm_sd, phed, p_i, 'none', 0, p_dict['with_replicates']) perm_sd.filter_mac_snps(p_dict['mac_threshold']) t_snps = perm_sd.getSnps() t_phen_vals = phed.get_values(p_i) res = lm.emmax_perm_test(t_snps, t_phen_vals, k, p_dict['emmax_perm']) emmax_perm_threshold = res['threshold_05'][0] import pylab hist_res = pylab.hist(-sp.log10(res['min_ps']), alpha=0.6) threshold = -sp.log10(emmax_perm_threshold) b_threshold = -sp.log10(1.0 / (len(t_snps) * 20.0)) pylab.vlines(threshold, 0, max(hist_res[0]), color='g') 
pylab.vlines(b_threshold, 0, max(hist_res[0]), color='r') pylab.savefig(file_prefix + 'perm_%d_min_pval_hist.png' % (p_dict['emmax_perm']), format='png') if p_dict['with_replicates']: #Get values, with ecotypes, construct Z and do GWAM phen_vals = phed.get_values(p_i) Z = phed.get_incidence_matrix(p_i) res = lm.emmax(snps, phen_vals, k, Z=Z, with_betas=p_dict['with_betas'], emma_num=p_dict['emmax_emma_num']) else: res = lm.emmax(snps, phen_vals, k, with_betas=p_dict['with_betas'], emma_num=p_dict['emmax_emma_num']) elif mapping_method in ['emmax_step']: sd.filter_mac_snps(p_dict['mac_threshold']) local = False if p_dict['local_gwas']: local = True file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas'])) res = lm.emmax_step_wise(phen_vals, k, sd=sd, num_steps=p_dict['num_steps'], file_prefix=file_prefix, local=local, cand_gene_list=cand_genes, save_pvals=p_dict['save_stepw_pvals'], emma_num=p_dict['emmax_emma_num']) print 'Step-wise EMMAX finished!' return elif mapping_method in ['lm_step']: sd.filter_mac_snps(p_dict['mac_threshold']) local = False if p_dict['local_gwas']: local = True file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas'])) res = lm.lm_step_wise(phen_vals, sd=sd, num_steps=p_dict['num_steps'], file_prefix=file_prefix, local=local, cand_gene_list=cand_genes, save_pvals=p_dict['save_stepw_pvals']) print 'Step-wise LM finished!' return elif mapping_method in ['lm']: res = lm.linear_model(snps, phen_vals) elif mapping_method in ['emmax_anova']: res = lm.emmax_anova(snps, phen_vals, k) elif mapping_method in ['lm_anova']: res = lm.anova(snps, phen_vals) else: print "Mapping method", mapping_method, 'was not found.' 
return if mapping_method in ['lm', 'emma', 'emmax']: kwargs['genotype_var_perc'] = res['var_perc'] additional_columns.append('genotype_var_perc') if p_dict['with_betas'] or mapping_method in ['emma' ]: betas = map(list, zip(*res['betas'])) kwargs['beta0'] = betas[0] additional_columns.append('beta0') if len(betas) > 1: kwargs['beta1'] = betas[1] additional_columns.append('beta1') pvals = res['ps'] sys.stdout.write("Done!\n") sys.stdout.flush() if mapping_method in ['lm_anova', 'emmax_anova']: kwargs['genotype_var_perc'] = res['var_perc'] pvals = res['ps'] sys.stdout.write("Done!\n") sys.stdout.flush() # print 'Calculating SNP-phenotype correlations.' # kwargs['correlations'] = calc_correlations(snps, phen_vals) # additional_columns.append('correlations') print 'Writing result to file.' res = gwaResults.Result(scores=pvals.tolist(), snps_data=sd, name=result_name, **kwargs) if mapping_method in ["kw", "ft", "emma", 'lm', "emmax", 'emmax_anova', 'lm_anova']: result_file = file_prefix + ".pvals" else: result_file = file_prefix + ".scores" res.write_to_file(result_file, additional_columns, max_fraction=p_dict['pvalue_filter']) #add results to DB.. if p_dict['add_to_db']: print 'Adding results to DB.' if p_dict['with_db_ids']: db_pid = p_i else: db_pid = phed.get_db_pid(p_i) import results_2_db as rdb short_name = 'cm%d_pid%d_%s_%s_%s_%d_%s' % (p_dict['call_method_id'], db_pid, phenotype_name, mapping_method, trans_method, p_dict['remove_outliers'], str(p_dict['with_replicates'])) tm_id = transformation_method_dict[trans_method] try: rdb.add_results_to_db(result_file, short_name, p_dict['call_method_id'], db_pid, analysis_methods_dict[mapping_method], tm_id, remove_outliers=p_dict['remove_outliers']) except Exception, err_str: print 'Failed inserting results into DB!' print err_str
def run_gwas(file_prefix, phen_file, start_i, stop_i, temperature, mac_threshold=15, filter_threshold=0.02, call_method_id=79, data_format='diploid_int', debug_filter=1.0, near_const_filter=20): """ GWAS """ phed = pd.parse_phenotype_file(phen_file, with_db_ids=False) #load phenotype file phed.filter_near_const_phens(near_const_filter) phed.convert_to_averages() num_traits = phed.num_traits() pids = phed.phen_ids[start_i :stop_i] sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=debug_filter) indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False) #All phenotypes are ordered the same way, so we pick the first one. phed.filter_ecotypes(indices_to_keep, pids=pids) print len(sd.accessions) K = sd.get_ibs_kinship_matrix() #K = dp.load_kinship(call_method_id=call_method_id, data_format=data_format, sd=sd, method='ibs') sd.filter_mac_snps(mac_threshold) snps = sd.getSnps() positions = sd.getPositions() chromosomes = sd.get_chr_list() r = sd.get_mafs() macs = r['mafs'] mafs = r['marfs'] print 'In total there are %d SNPs to be mapped.' 
% len(snps) gene_dict = dp.parse_tair_gff_file()#_load_genes_list_('rna_seq_031311_%sC' % temperature) for i, pid in enumerate(pids): if not pid in phed.phen_ids: continue gene_tair_id = phed.get_name(pid) # exons = [] # for isoform in d: # for exon in isoform['exons']: # exons.append((d['chromosome'], exon['start_pos'], exon['end_pos'])) d = gene_dict[gene_tair_id] gene_strand = d['strand'] try: chrom = int(d['chromosome']) except Exception: raise gene = gwaResults.Gene(chromosome=int(d['chromosome']), startPos=d['start_pos'], endPos=d['end_pos'], name=gene_tair_id, description=None, dbRef=gene_tair_id, tairID=gene_tair_id) print i, pid, gene curr_file_prefix = '%s_mac%d_pid%d_%s' % (file_prefix, mac_threshold, pid, gene_tair_id) trans_type, shapiro_pval = phed.most_normal_transformation(pid) print 'Most normal transformation was: %s' % trans_type #trans_type = 'None' summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':shapiro_pval} #summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':0} print'Applying Kruskal-Wallis' phen_vals = phed.get_values(pid) res = util.kruskal_wallis(snps, phen_vals) pvals = res['ps'].tolist() kw_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes) print 'Summarizing KW' summary_dict['KW'] = kw_res.get_gene_analysis(gene) summary_dict['KW']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps']) summary_dict['KW']['pval_median'] = agr.calc_median(res['ps']) print 'Applying LM' res = lm.linear_model(snps, phen_vals) pvals = res['ps'].tolist() perc_var_expl = res['var_perc'].tolist() lm_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes, perc_var_expl=perc_var_expl) print 'Summarizing LM' summary_dict['LM'] = lm_res.get_gene_analysis(gene) summary_dict['LM']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps']) summary_dict['LM']['pval_median'] = agr.calc_median(res['ps']) print 'Applying EX 
Stepwise' snp_priors = sd.get_cand_genes_snp_priors([gene]) ex_sw_res = lm.emmax_step_wise(phen_vals, K, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes, snps=snps, num_steps=5, cand_gene_list=[gene], with_qq_plots=False, log_qq_max_val=6.0, save_pvals=True, snp_priors=snp_priors) print 'Summarizing the step-wise mixed model' pvals = ex_sw_res['first_emmax_res']['ps'].tolist() perc_var_expl = ex_sw_res['first_emmax_res']['var_perc'].tolist() ex_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes, perc_var_expl=perc_var_expl) summary_dict['EX'] = ex_res.get_gene_analysis(gene) summary_dict['pseudo_heritability'] = ex_sw_res['step_info_list'][0]['pseudo_heritability'] summary_dict['EX']['kolmogorov_smirnov'] = agr.calc_ks_stats(ex_sw_res['first_emmax_res']['ps']) summary_dict['EX']['pval_median'] = agr.calc_median(ex_sw_res['first_emmax_res']['ps']) #Does the linear mixed model fit the data better? summary_dict['MM_LRT'] = lm.mm_lrt_test(phen_vals, K) #FINISH summarizing the stepwise!!! summarize_stepwise(summary_dict, gene, ex_sw_res['step_info_list'], ex_sw_res['opt_dict']) cvt_dict = {'radius':{}, 'tss_upstream':{}} print 'Comparing cis vs. 
trans kinship' #Check 1 mb, 200kb, 100kb, 50kb, 20kb, 10kb, 2kb, 0kb for radius in [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0]: print radius r_start_pos = max(gene.startPos - radius, 0) r_end_pos = gene.endPos + radius d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)], kinship_method='ibs', global_kinship=K) reg_k = d['regional_k'] glob_k = d['global_k'] if reg_k != None: cvt_dict['radius'][radius] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K) else: cvt_dict['radius'][radius] = None print cvt_dict['radius'][radius] #Check TSS, 100kb, 50kb,25kb, 10kb,5kb,0kb, (all upstream) for dist in [200000, 100000, 50000, 25000, 10000, 5000, 1000]: print dist, gene_strand if gene_strand == '+': r_start_pos = max(gene.startPos - dist, 0) r_end_pos = gene.startPos else: r_start_pos = gene.endPos r_end_pos = gene.endPos + dist d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)], kinship_method='ibs', global_kinship=K) reg_k = d['regional_k'] glob_k = d['global_k'] if reg_k != None: cvt_dict['tss_upstream'][dist] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K) else: cvt_dict['tss_upstream'][dist] = None print cvt_dict['tss_upstream'][dist] summary_dict['CVT'] = cvt_dict #Write info to file.. cPickle.dump(summary_dict, open(curr_file_prefix + '_info.pickled', 'w'), protocol=2) f_prefix = curr_file_prefix + '_hist' phed.plot_histogram(pid, title='Gene expressions for %s' % gene_tair_id, png_file=f_prefix + '.png', p_her=summary_dict['pseudo_heritability'], x_label='RNA seq expression levels (%s transformed)' % trans_type) #Plot GWAs... 
for res, method_name in [(kw_res, 'KW'), (lm_res, 'LM'), (ex_res, 'EX')]: res.filter_percentile(filter_threshold, reversed=True) res.write_to_file('%s_%s_.pvals' % (curr_file_prefix, method_name), only_pickled=True) if ex_res.min_score() < 10e-10: #print [cg.tairID for cg in cgs] f_prefix = '%s_%s_manhattan' % (curr_file_prefix, method_name) res.plot_manhattan(png_file=f_prefix + '.png', percentile=0, cand_genes=[gene], plot_bonferroni=True, neg_log_transform=True)
from util import kruskal_wallis

if __name__ == '__main__':
    #Call kruskal_wallis once per timer name, in this order, passing int as second argument.
    for timer_name in ('timercontentloaded', 'timerwindowload', 'timerfirstinteraction', 'timerfirstpaint'):
        kruskal_wallis(timer_name, int)
def _perform_gwas_(phen_id,phenData,analysis_method,transformation,genotype,kinship_type,kinshipFile=None,messenger=None,outputfile=None): additional_columns = {} messenger.update_status(progress=0.0, task_status='Loading genotype data') genotypeData = dataParsers.load_snps_call_method(genotype) #genotypeData = dataParsers.load_hdf5_snps_call_method(genotype) K = None messenger.update_status(step=0.05, task_status='Preparing data') n_filtered_snps = _prepare_data_(genotypeData,phenData,phen_id) phen_vals = phenData.get_values(phen_id) if analysis_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm','amm']: #Load genotype file (in binary format) sys.stdout.write("Retrieving the Kinship matrix K.\n") sys.stdout.flush() if kinshipFile: #Kinship file was supplied.. messenger.update_status(progress=0.15, task_status='Loading supplied kinship file: %s' % kinshipFile) print 'Loading supplied kinship file: %s' % kinshipFile K = kinship.load_kinship_from_file(kinshipFile, genotypeData.accessions) else: messenger.update_status(progress=0.15, task_status='Loading kinship file') print 'Loading kinship file.' 
K = kinship.get_kinship(call_method_id=genotype, method=kinship_type, n_removed_snps=n_filtered_snps, remain_accessions=genotypeData.accessions) sys.stdout.flush() sys.stdout.write("Done!\n") snps = genotypeData.getSnps() positions = genotypeData.getPositions() chromosomes = [] for i, (s, c) in enumerate(itertools.izip(genotypeData.snpsDataList, genotypeData.chromosomes)): chromosomes.extend([c] * len(s.snps)) maf_dict = genotypeData.get_mafs() if analysis_method in ['kw']: messenger.update_status(progress=0.7, task_status='Performing KW') res = util.kruskal_wallis(snps, phen_vals) elif analysis_method in ['loc_glob_mm']: raise NotImplementedError elif analysis_method in ['emma']: res = lm.emma(snps, phen_vals, K) elif analysis_method in ['emmax','amm']: d = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100) res = d['res'] #additional_columns['stats'] = d['stats'] elif analysis_method in ['lm']: d = lm.lin_reg_step(phen_vals, genotypeData, []) res = d['res'] #additional_columns['stats'] = d['stats'] else: raise Exception('analysis method %s not supported' % analysis_method) pvals = res['ps'] #Calculate Benjamini-Hochberg threshold bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05) #Calculate Median p-value med_pval = agr.calc_median(res['ps']) #Calculate the Kolmogorov-Smirnov statistic ks_res = agr.calc_ks_stats(res['ps']) quantiles_dict = _calculate_qqplot_data_(pvals) scores = map(lambda x:-math.log10(x), pvals) if analysis_method in ['lm', 'emma', 'emmax','amm']: additional_columns['genotype_var_perc'] = res['var_perc'] if 'betas' in res: betas = map(list, zip(*res['betas'])) additional_columns['beta0'] = betas[0] if len(betas) > 1: additional_columns['beta1'] = betas[1] #calculate ld if outputfile is None: outputfile = "%s.hdf5" % phen_id messenger.update_status(progress=0.8, task_status='Processing and saving results') _save_hdf5_pval_file(outputfile, analysis_method, transformation,chromosomes, positions, scores, maf_dict['marfs'], 
maf_dict['mafs'], quantiles_dict,ks_res,bh_thres_d['thes_pval'],med_pval,additional_columns)
def perform_gwas(self, phen_name, dataset,transformation='raw', analysis_method='kw', call_method_id=75, kinship_method='ibs', progress_file_writer=None): """ Performs GWAS and updates the datastructure. """ import bisect import gwa step_wise = False if analysis_method not in ['lm', 'emmax', 'kw']: raise Exception('analysis method %s not supported' % analysis_method) progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data') phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}}) phend.convert_to_averages() progress_file_writer.update_progress_bar(task_status='Loading genotype data') sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data') sd.coordinate_w_phenotype_data(phend, 1) progress_file_writer.update_progress_bar(progress=0.1,task_status='Filtering monomorphic SNPs') sd.filter_monomorphic_snps() phen_vals = phend.get_values(1) snps = sd.getSnps() positions = sd.getPositions() chromosomes = [] progress_file_writer.set_step(0.03) for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)): progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes))) chromosomes.extend([c] * len(s.snps)) maf_dict = sd.get_mafs() kwargs = {} if analysis_method == 'emmax': progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix') k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions, scaled=True, min_mac=5, sd=sd) progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing EMMAX') d = lm.emmax_step(phen_vals, sd, k, [], 
progress_file_writer=progress_file_writer) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') res = d['res'] stats_dict = d['stats'] elif analysis_method == 'lm': progress_file_writer.update_progress_bar(progress=0.3, task_status='Performing LM') res = lm.linear_model(snps, phen_vals) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') elif analysis_method == 'kw': progress_file_writer.update_progress_bar(progress=0.7, task_status='Performing KW') kw_res = util.kruskal_wallis(snps, phen_vals) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') scores = map(lambda x:-math.log10(x), kw_res['ps']) self.add_results(phen_name, dataset,analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, statistics=kw_res['ds']) else: raise Exception('analysis method %s not supported' % analysis_method) if analysis_method in ['lm', 'emmax']: if 'betas' in res: betas = map(list, zip(*res['betas'])) else: betas = [None, None] scores = map(lambda x:-math.log10(x), res['ps']) stats_dict['step'] = 0 cofactors = [stats_dict] self.add_results(phen_name, dataset, analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1], cofactors=cofactors) progress_file_writer.update_progress_bar(progress=1.0, task_status='Done') print 'Done!' return analysis_method
def _robustness_test_(all_snps,phenVals,outputFile,filter=0.1,test_type = "KW",): """ Leave one out test.. """ new_all_snps = [] for snp in all_snps: if snp.count(0)>1 and snp.count(1)>1: new_all_snps.append(snp) print "Filtered",len(all_snps)-len(new_all_snps)," with minor allele count <2." all_snps = new_all_snps if filter <1.0: snps = random.sample(all_snps,int(len(all_snps)*filter)) print "Number of SNPs:",len(snps) else: snps = all_snps if test_type=="KW": print "running KW" t1 = time.time() true_pvals = util.kruskal_wallis(snps,phenVals)["ps"] t2 = time.time() print "Took",t2-t1,"seconds." elif test_type=="Fisher": print "running Fisher's exact test" t1 = time.time() true_pvals = run_fet(snps,phenVals) t2 = time.time() print "Took",t2-t1,"seconds." log_true_pvals = [] for pval in true_pvals: log_true_pvals.append(-math.log(pval,10)) perm_pvalues_list = [] for i in range(0,len(phenVals)): newPhenvals = phenVals[:] newPhenvals.pop(i) newSNPs = [] for snp in snps: newSNP = snp[:] newSNP.pop(i) newSNPs.append(newSNP) print i if test_type=="KW": print "running KW" t1 = time.time() pvals = util.kruskal_wallis(newSNPs,newPhenvals)["ps"] t2 = time.time() print "Took",t2-t1,"seconds." elif test_type=="Fisher": print "running Fisher's exact test" t1 = time.time() pvals = run_fet(newSNPs,newPhenvals) t2 = time.time() print "Took",t2-t1,"seconds." perm_pvalues_list.append(pvals) delta_pvals_list = [] delta_log_pvals_list = [] for perm_pvals in perm_pvalues_list: log_pvals = [] delta_pvals = [] delta_log_pvals = [] for i in range(0,len(true_pvals)): pval = perm_pvals[i] true_pval = true_pvals[i] delta_pvals.append(true_pval-pval) log_true_pval = log_true_pvals[i] if pval > 0.0: log_pval = -math.log(pval,10) else: print "Damn those random 0 prob. 
events: event #", i log_pval = -math.log(true_pval,10) log_pvals.append(log_pval) delta_log_pvals.append(log_true_pval-log_pval) delta_pvals_list.append(delta_pvals) delta_log_pvals_list.append(delta_log_pvals) sd_log_pvals = [] sd_pvals = [] t_delta_log_pvals_list = map(list,zip(*delta_log_pvals_list)) t_delta_pvals_list = map(list,zip(*delta_pvals_list)) for i in range(0,len(true_pvals)): sd_log_pvals.append(util.calcSD(t_delta_log_pvals_list[i])) sd_pvals.append(util.calcSD(t_delta_pvals_list[i])) #Write SDs out to file, to be able to replot, or plot together with other methods... etc import csv sd_log_pval_file = outputFile+".rob.log_pvals_sd" f = open(sd_log_pval_file,"w") w = csv.writer(f) w.writerow(["log_true_pval","sd_log_pvals"]) l = zip(log_true_pvals,sd_log_pvals) w.writerows(l) f.close() #Plot things.... pngFile_log_pvals = outputFile+".rob.log_pval.png" pngFile_pval = outputFile+".rob.pval.png" pngFile_sd_log_pval = outputFile+".rob.sd_log_pval.png" pngFile_sd_pval = outputFile+".rob.sd_pval.png" min_val = min(true_pvals) max_val = max(true_pvals) val_range = max_val-min_val min_log_val = min(log_true_pvals) max_log_val = max(log_true_pvals) log_val_range = max_val-min_val import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt plt.figure(figsize=(10,7)) max_perm_val = 0 min_perm_val = 0 for i in range(0,len(perm_pvalues_list)): delta_log_pvals = delta_log_pvals_list[i] plt.plot(log_true_pvals,delta_log_pvals,"b.") max_perm_val = max(max_perm_val,max(delta_log_pvals)) min_perm_val = min(min_perm_val,min(delta_log_pvals)) perm_val_range = max_perm_val - min_perm_val plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range]) plt.savefig(pngFile_log_pvals, format = "png") plt.figure(figsize=(10,7)) max_perm_val = 0 min_perm_val = 0 for i in range(0,len(perm_pvalues_list)): delta_pvals = delta_pvals_list[i] plt.plot(true_pvals,delta_pvals,"b.") max_perm_val 
= max(max_perm_val,max(delta_pvals)) min_perm_val = min(min_perm_val,min(delta_pvals)) perm_val_range = max_perm_val - min_perm_val plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range]) plt.savefig(pngFile_pval, format = "png") plt.figure(figsize=(10,7)) max_sd_log_pval = max(sd_log_pvals) min_sd_log_pval = min(sd_log_pvals) sd_val_range = max_sd_log_pval-min_sd_log_pval plt.plot(log_true_pvals,sd_log_pvals,"b.") plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_sd_log_pval-0.02*sd_val_range, max_sd_log_pval+0.02*sd_val_range]) plt.savefig(pngFile_sd_log_pval, format = "png") plt.figure(figsize=(10,7)) max_sd_pval = max(sd_pvals) min_sd_pval = min(sd_pvals) sd_val_range = max_sd_pval-min_sd_pval plt.plot(true_pvals,sd_pvals,"b.") plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_sd_pval-0.02*sd_val_range, max_sd_pval+0.02*sd_val_range]) plt.savefig(pngFile_sd_pval, format = "png")
def _perm_test_(all_snps,phenVals,numPerm,outputFile,filter=0.1,test_type = "KW",savePermutations=False,useSameSnps=False):
    """
    Permutation test on summary statistics of the p-value distribution.

    P-values are computed for the observed phenotype and for numPerm
    shuffled versions of it.  All permuted p-values are pooled, and the
    pooled set (and its 1000 quantiles) serves as the expected distribution
    against which four statistics are computed via analyzePhenotype:
    median (m), area between quantile curves (a), KS statistic (ks) and
    log slope minus one (s).  For each statistic, the fraction of
    permutations that are at least as extreme as the observed value is
    reported as an estimated p-value.

    Parameters:
        all_snps -- sequence of SNPs (it is passed to random.sample).
        phenVals -- phenotype values.  A copy is shuffled; the argument
                    itself is left untouched.
        numPerm -- number of permutations, must be >= 1.
        outputFile -- file name prefix for everything written (see below).
        filter -- if < 1.0, this fraction of all_snps is randomly sampled,
                  once for the observed run and again for every permutation.
        test_type -- "KW" (util.kruskal_wallis) or "Fisher" (run_fet).
        savePermutations -- if true, all permuted p-values are written to
                            outputFile+".perm.pvals", one permutation per line.
        useSameSnps -- NOTE(review): accepted but never read; SNPs are
                       resampled for every permutation whenever filter < 1.0.

    Files written:
        outputFile+".perm.pvals" (only if savePermutations),
        outputFile+".perm.stat.txt",
        and four histograms named <stat file>+".perm.{m,a,ks,s}.png".

    Raises ValueError for an unknown test_type or numPerm < 1.
    Returns None.
    """
    # Fail early; otherwise an unknown test type only surfaces as an unbound
    # local inside the permutation loop, and numPerm == 0 as a late crash.
    if test_type not in ("KW", "Fisher"):
        raise ValueError("Unknown test_type: %r (expected 'KW' or 'Fisher')" % (test_type,))
    if numPerm < 1:
        raise ValueError("numPerm must be at least 1, got %r" % (numPerm,))

    def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
        # Summary statistics of pvals relative to the expected distribution.
        m = analyzePhenotype._calcMedian_(pvals,exp_median)
        ks_res = analyzePhenotype._calcKS_(pvals,exp_pvals)
        s = analyzePhenotype._estLogSlope_(pvals,exp_pvals)-1.0
        ks_stat = ks_res["D"]
        ks_pvalue = ks_res["p.value"]
        quantiles = analyzePhenotype._getQuantiles_(pvals, 1000)
        a = analyzePhenotype._estAreaBetweenCurves_(quantiles,exp_quantiles)
        return (m,a,ks_stat,ks_pvalue,s)

    def _sample_snps_():
        # Random subset of all_snps, sized by the filter fraction.
        sampled = random.sample(all_snps,int(len(all_snps)*filter))
        print("Number of SNPs: %s" % len(sampled))
        return sampled

    def _run_test_(test_snps,phen_vals):
        # Run the selected association test, report the time taken, and
        # return the list of p-values.
        if test_type=="KW":
            print("running KW")
            t1 = time.time()
            res = util.kruskal_wallis(test_snps,phen_vals)["ps"]
        else:
            print("running Fisher's exact test")
            t1 = time.time()
            res = run_fet(test_snps,phen_vals)
        t2 = time.time()
        print("Took %s seconds." % (t2-t1))
        return res

    def _tail_fraction_(observed,permuted,key=None):
        # Fraction of permuted values that are >= the observed one (after
        # applying key to both).  Accumulated in steps of 1/len(permuted).
        if key is not None:
            observed = key(observed)
        step = 1.0/len(permuted)
        fraction = 0.0
        for value in permuted:
            if key is not None:
                value = key(value)
            if observed <= value:
                fraction += step
        return fraction

    def _getBinning_(n_bins,min_val,max_val):
        # n_bins+2 bin edges, starting half a bin below min_val.
        delta = (max_val-min_val)/n_bins
        start_val = min_val-delta*0.5
        bins = [start_val+delta*i for i in range(0,n_bins+2)]
        return (bins,delta)

    if filter <1.0:
        snps = _sample_snps_()
    else:
        snps = all_snps

    # P-values for the observed (unpermuted) phenotype.
    true_pvals = _run_test_(snps,phenVals)

    # Shuffle a private copy so the caller's phenotype order is preserved.
    perm_phen_vals = list(phenVals)
    perm_pvalues_list = []
    for i in range(0,numPerm):  #For every perm
        if filter <1.0:
            snps = _sample_snps_()
        print(i)
        random.shuffle(perm_phen_vals)  #Permute phenotype
        perm_pvalues_list.append(_run_test_(snps,perm_phen_vals))

    print("Combining p-values")
    all_pvals = []
    for pvals in perm_pvalues_list:
        all_pvals.extend(pvals)
    print("%s permuted pvals in all" % len(all_pvals))
    quantiles = analyzePhenotype._getQuantiles_(all_pvals, 1000)
    print("len(quantiles): %s" % len(quantiles))
    # Median of the pooled permuted p-values, taken from the two middle
    # entries of the 1000 quantiles.
    exp_median = (quantiles[499]+quantiles[500])/2.0

    (true_m,true_a,true_ks_stat,true_ks_pvalue,true_s) = _calc_statistics_(true_pvals,quantiles,exp_median,all_pvals)

    m_list = []
    a_list = []
    ks_stat_list = []
    s_list = []
    for pvals in perm_pvalues_list:
        (m,a,ks_stat,ks_pvalue,s) = _calc_statistics_(pvals,quantiles,exp_median,all_pvals)
        m_list.append(m)
        a_list.append(a)
        ks_stat_list.append(ks_stat)
        s_list.append(s)
    del all_pvals,quantiles  # potentially large; no longer needed

    if savePermutations:
        permOutputFile = outputFile+".perm.pvals"
        print("Writing to %s" % permOutputFile)
        f = open(permOutputFile,"w")
        try:
            for pvals in perm_pvalues_list:
                f.write(",".join(map(str,pvals))+"\n")
        finally:
            f.close()
        print("Done writing to %s" % permOutputFile)

    #Output results
    outputFile = outputFile+".perm.stat.txt"
    f = open(outputFile,"w")
    try:
        f.write("Perm_nr, median, area, ks_stat, s_stat \n")
        for i, row in enumerate(zip(m_list,a_list,ks_stat_list,s_list)):
            f.write(", ".join(map(str,[i]+list(row)))+"\n")
        f.write("\n"+"Observed values: "+str((true_m,true_a,true_ks_stat,true_s))+"\n")

        pvals = [
            #M stat p-value (two sided, assuming symm. dist.)
            _tail_fraction_(true_m,m_list,key=abs),
            #A stat p-value (one tailed)
            _tail_fraction_(true_a,a_list),
            #KS stat p-value (one tailed)
            _tail_fraction_(true_ks_stat,ks_stat_list),
            #S stat p-value, compared on |log(s+1)|
            _tail_fraction_(true_s,s_list,key=lambda s: abs(math.log(s+1.0))),
        ]
        # No permutation was as extreme: report half of the smallest
        # resolvable p-value rather than zero.
        for i in range(0,len(pvals)):
            if pvals[i] == 0.0:
                pvals[i] = 0.5*(1.0/numPerm)
        f.write("\n"+"Estimated p-values: "+",".join(map(str,pvals))+"\n")
    finally:
        f.close()

    #Plot results
    # NOTE(review): outputFile already carries the ".perm.stat.txt" suffix
    # here, so the PNG names contain it too.  Kept as is, since existing
    # consumers may rely on these names -- confirm whether it is intended.
    histograms = [
        (m_list,true_m,outputFile+".perm.m.png"),
        (a_list,true_a,outputFile+".perm.a.png"),
        (ks_stat_list,true_ks_stat,outputFile+".perm.ks.png"),
        (s_list,true_s,outputFile+".perm.s.png"),
    ]

    n_bins = 20+int(4*(math.log(numPerm)))
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    for (perm_values,observed,png_file) in histograms:
        min_val = min(min(perm_values),observed)
        max_val = max(max(perm_values),observed)
        (bins,delta) = _getBinning_(n_bins,min_val,max_val)
        print(str((bins,delta)))
        plt.figure(figsize=(10,7))
        # Permuted values plus the observed one, then the observed value
        # alone drawn on top with the same bins.
        plt.hist(perm_values+[observed], bins = bins)
        plt.hist([observed], bins = bins)
        plt.savefig(png_file, format = "png")
        plt.clf()
        plt.close()  # release the figure; pyplot keeps it alive otherwise