Exemplos de kruskal_wallis em Python, exemplos de util.kruskal_wallis em Python

Exemplo n.º 1

0

Exibir arquivo

def run_kw(snpsd,phend,phen_i,chromosome,with_missing_vals=True):
	pvals = []
	print "Running KW on",len(snpsd.snps),"snps."
	if with_missing_vals: 
		snpsd_indices = []
		for i in range(len(snpsd.snps)):			
			(snp,phen_vals) = snpsd.get_snp_phen_pair(i,phend,phen_i,missingVal='NA')
#			print "Phen NA count:",phen_vals.count("NA")
#			print "SNP NA count:",snp.count("NA")
#			print "Running KW on",len(phen_vals),"phenotype values."
			if len(set(snp))>1 and len(set(phen_vals))>1:
				res = util.kruskal_wallis([snp],phen_vals)
				pvals.append(res["ps"][0])
				snpsd_indices.append(i)
			else:
				pvals.append(1)
		#snpsd.filter_snp_indices(snpsd_indices)
	else:
		res = util.kruskal_wallis(snpsd.snps,phen_vals)
		pvals = res["ps"]

	#print pvals
	gwas_result = gwaResults.Result(snpsds=[snpsd],name="KW_"+str(phen_i),phenotypeID=phen_i,scores=pvals,chromosomes=[chromosome]) 
	#gwas_result = gwaResults.Result(name="KW_"+str(phen_i),phenotypeID=phen_i,scores=pvals,chromosomes=[chromosome]) 
	#return {'ps':pvals,'positions':positions,'snpsd_indices':snpsd_indices}
	return gwas_result, snpsd

Exemplo n.º 2

0

Exibir arquivo

def get_perm_pvals(snps, phen_vals, mapping_method='kw', num_perm=100, snps_filter=0.05):
	"""
	Collect p-values from `num_perm` random permutations of the phenotype.

	Parameters:
	  snps           -- sequence of SNPs handed to the association test.
	  phen_vals      -- phenotype values; the caller's sequence is left untouched.
	  mapping_method -- 'kw' (util.kruskal_wallis) or 'ft' (run_fet).
	  num_perm       -- number of permutations to run.
	  snps_filter    -- when < 1.0, the fraction of SNPs drawn at random (once,
	                    before the permutations) to run the tests on.

	Returns a flat list holding the p-values of all permutations.  Any other
	`mapping_method` runs no test and yields an empty list.
	"""
	import random
	if snps_filter < 1.0:
		snps = random.sample(snps, int(snps_filter * len(snps)))
	# random.shuffle works in place: permute a private copy so that the
	# caller's phenotype values keep their pairing with the genotypes.
	permuted_vals = list(phen_vals)
	pvals = []
	if mapping_method == 'kw':
		for _ in range(num_perm):
			random.shuffle(permuted_vals)
			kw_res = util.kruskal_wallis(snps, permuted_vals, verbose=False)
			pvals.extend(kw_res['ps'])
	elif mapping_method == 'ft':
		for _ in range(num_perm):
			random.shuffle(permuted_vals)
			pvals.extend(run_fet(snps, permuted_vals))
	return pvals

Exemplo n.º 3

0

Exibir arquivo

def _robustness_test_(all_snps,phenVals,outputFile,filter=0.1,test_type = "KW",):
	"""
	Leave one out test..
	"""
	
	new_all_snps = []
	for snp in all_snps:
		if snp.count(0)>1 and snp.count(1)>1:
			new_all_snps.append(snp)
	print "Filtered",len(all_snps)-len(new_all_snps)," with minor allele count <2."
	all_snps = new_all_snps

	if filter <1.0:
		snps = random.sample(all_snps,int(len(all_snps)*filter))
		print "Number of SNPs:",len(snps)
	else:
		snps = all_snps 

	if test_type=="KW":
		print "running KW"
		t1 = time.time()
		true_pvals = util.kruskal_wallis(snps,phenVals)["ps"]
		t2 = time.time()
		print "Took",t2-t1,"seconds."
	elif test_type=="Fisher":
		print "running Fisher's exact test"
		t1 = time.time()
		true_pvals = run_fet(snps,phenVals)
		t2 = time.time()
		print "Took",t2-t1,"seconds."
	
	log_true_pvals = []
	for pval in true_pvals:
		log_true_pvals.append(-math.log(pval,10))
	
	perm_pvalues_list = []
	for i in range(0,len(phenVals)):
		newPhenvals = phenVals[:]
		newPhenvals.pop(i)
				
		newSNPs = []
		for snp in snps:
			newSNP = snp[:]
			newSNP.pop(i)
			newSNPs.append(newSNP)
		
		print i
		if test_type=="KW":
			print "running KW"
			t1 = time.time()
			pvals = util.kruskal_wallis(newSNPs,newPhenvals)["ps"]
			t2 = time.time()
			print "Took",t2-t1,"seconds."
		elif test_type=="Fisher":
			print "running Fisher's exact test"
			t1 = time.time()
			pvals = run_fet(newSNPs,newPhenvals)
			t2 = time.time()
			print "Took",t2-t1,"seconds."
		perm_pvalues_list.append(pvals)
		
		
	
	delta_pvals_list = []
	delta_log_pvals_list = []
	for perm_pvals in perm_pvalues_list:
		log_pvals = []
		delta_pvals = []
		delta_log_pvals = []
		for i in range(0,len(true_pvals)):
			pval = perm_pvals[i]
			true_pval = true_pvals[i]
			delta_pvals.append(true_pval-pval)

			log_true_pval = log_true_pvals[i]
			if pval > 0.0:
				log_pval = -math.log(pval,10)
			else:
				print "Damn those random 0 prob. events: event #", i
				log_pval = -math.log(true_pval,10)
				
			log_pvals.append(log_pval)
			delta_log_pvals.append(log_true_pval-log_pval)
		
		delta_pvals_list.append(delta_pvals)
		delta_log_pvals_list.append(delta_log_pvals)
	
	sd_log_pvals = []
	sd_pvals = []
	t_delta_log_pvals_list = map(list,zip(*delta_log_pvals_list))
	t_delta_pvals_list = map(list,zip(*delta_pvals_list))
	for i in range(0,len(true_pvals)):
		sd_log_pvals.append(util.calcSD(t_delta_log_pvals_list[i]))
		sd_pvals.append(util.calcSD(t_delta_pvals_list[i]))
	
	
	#Write SDs out to file, to be able to replot, or plot together with other methods... etc
	import csv
	sd_log_pval_file = outputFile+".rob.log_pvals_sd"
	f = open(sd_log_pval_file,"w")
	w = csv.writer(f)
	w.writerow(["log_true_pval","sd_log_pvals"])
	l = zip(log_true_pvals,sd_log_pvals)
	w.writerows(l)
	f.close()

	
	#Plot things....
	pngFile_log_pvals = outputFile+".rob.log_pval.png"
	pngFile_pval = outputFile+".rob.pval.png"
	pngFile_sd_log_pval = outputFile+".rob.sd_log_pval.png"
	pngFile_sd_pval = outputFile+".rob.sd_pval.png"


	min_val = min(true_pvals)
	max_val = max(true_pvals)
	val_range = max_val-min_val

	min_log_val = min(log_true_pvals)
	max_log_val = max(log_true_pvals)
	log_val_range = max_val-min_val


	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	plt.figure(figsize=(10,7))
	max_perm_val = 0
	min_perm_val = 0
	for i in range(0,len(perm_pvalues_list)):
		delta_log_pvals = delta_log_pvals_list[i]
		plt.plot(log_true_pvals,delta_log_pvals,"b.")
		max_perm_val = max(max_perm_val,max(delta_log_pvals))
		min_perm_val = min(min_perm_val,min(delta_log_pvals))
	perm_val_range = max_perm_val - min_perm_val
	plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range])
	plt.savefig(pngFile_log_pvals, format = "png")

	plt.figure(figsize=(10,7))
	max_perm_val = 0
	min_perm_val = 0
	for i in range(0,len(perm_pvalues_list)):
		delta_pvals = delta_pvals_list[i]
		plt.plot(true_pvals,delta_pvals,"b.")
		max_perm_val = max(max_perm_val,max(delta_pvals))
		min_perm_val = min(min_perm_val,min(delta_pvals))
	perm_val_range = max_perm_val - min_perm_val
	plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range])
	plt.savefig(pngFile_pval, format = "png")

	plt.figure(figsize=(10,7))
	max_sd_log_pval = max(sd_log_pvals)
	min_sd_log_pval = min(sd_log_pvals)
	sd_val_range = max_sd_log_pval-min_sd_log_pval
	plt.plot(log_true_pvals,sd_log_pvals,"b.")
	plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_sd_log_pval-0.02*sd_val_range, max_sd_log_pval+0.02*sd_val_range])
	plt.savefig(pngFile_sd_log_pval, format = "png")

	plt.figure(figsize=(10,7))
	max_sd_pval = max(sd_pvals)
	min_sd_pval = min(sd_pvals)
	sd_val_range = max_sd_pval-min_sd_pval
	plt.plot(true_pvals,sd_pvals,"b.")
	plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_sd_pval-0.02*sd_val_range, max_sd_pval+0.02*sd_val_range])
	plt.savefig(pngFile_sd_pval, format = "png")

Exemplo n.º 4

0

Exibir arquivo

def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness",
					"memReq=","walltimeReq=",]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	phenotypeFileType=1
	outputFile=None
	delim=","
	missingVal="NA"
	help=0
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	memReq = "5g"
	walltimeReq = "100:00:00"

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--memReq"):
			memReq=arg
		elif opt in ("--walltimeReq"):
			walltimeReq=arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "complement:",complement
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	print "walltimeReq:",walltimeReq
	print "memReq:",memReq

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		print phenName
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = "#!/bin/csh\n"
		shstr += "#PBS -l walltime="+walltimeReq+"\n"
		shstr += "#PBS -l mem="+memReq+"\n"
		shstr +="#PBS -q cmb\n"
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		#phed and phenotype
		sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
		
		if phed.isBinary(phenotypeIndex):
			pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex))	
		else:
			snps = sd.getSnps()
			phen_vals = phed.getPhenVals(phenotypeIndex)
			try:
				kw_res = util.kruskal_wallis(snps,phen_vals)
				pvals = kw_res['ps']
			except:
				print snps
				print phen_vals
				print len(snps),len(snps[0]),len(phen_vals)
				raise Exception
							
		res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False)
		pvalFile=outputFile+".pvals"
		res.writeToFile(pvalFile)

		print "Generating a GW plot."
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)

Exemplo n.º 5

0

Exibir arquivo

def _perm_test_(all_snps,phenVals,numPerm,outputFile,filter=0.1,test_type = "KW",savePermutations=False,useSameSnps=False):

	def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
		m = analyzePhenotype._calcMedian_(pvals,exp_median)
		ks_res = analyzePhenotype._calcKS_(pvals,exp_pvals)
		s = analyzePhenotype._estLogSlope_(pvals,exp_pvals)-1.0
		ks_stat = ks_res["D"]
		ks_pvalue = ks_res["p.value"]
		quantiles = analyzePhenotype._getQuantiles_(pvals, 1000)
		#exp_quantiles = analyzePhenotype.__getExpectedPvalueQuantiles__(1000)
		a = analyzePhenotype._estAreaBetweenCurves_(quantiles,exp_quantiles)
		
		return (m,a,ks_stat,ks_pvalue,s)
	
	if filter <1.0:
		snps = random.sample(all_snps,int(len(all_snps)*filter))
		print "Number of SNPs:",len(snps)
	else:
		snps = all_snps 

	#Calc norm stats, and est. p-value 
#	print "running old KW"
#	t1 = time.time()
#	pvals = analyzeHaplotype._run_kw_(snps,phenVals)
#	t2 = time.time()
#	print "Took",t2-t1,"seconds."
	if test_type=="KW":
		print "running KW"
		t1 = time.time()
		true_pvals = util.kruskal_wallis(snps,phenVals)["ps"]
		t2 = time.time()
		print "Took",t2-t1,"seconds."
	elif test_type=="Fisher":
		print "running Fisher's exact test"
		t1 = time.time()
		true_pvals = run_fet(snps,phenVals)
		t2 = time.time()
		print "Took",t2-t1,"seconds."
		

	
	
	perm_pvalues_list = []
	for i in range(0,numPerm):#For every perm
		if filter <1.0:
			snps = random.sample(all_snps,int(len(all_snps)*filter))
			print "Number of SNPs:",len(snps)	
		print i
		random.shuffle(phenVals) #Permute phenotype
		#pvals = analyzeHaplotype._run_kw_(snps,phenVals)	#Run KW
		if test_type=="KW":
			print "running KW"
			t1 = time.time()
			pvals = util.kruskal_wallis(snps,phenVals)["ps"]
			t2 = time.time()
			print "Took",t2-t1,"seconds."
		elif test_type=="Fisher":
			print "running Fisher's exact test"
			t1 = time.time()
			pvals = run_fet(snps,phenVals)
			t2 = time.time()
			print "Took",t2-t1,"seconds."
		perm_pvalues_list.append(pvals)

	
	print "Combining p-values"
	quantiles = []
	all_pvals = []
	for pvals in perm_pvalues_list:
		for pval in pvals:
			all_pvals.append(pval)
	print len(all_pvals),"permuted pvals in all"
	quantiles = analyzePhenotype._getQuantiles_(all_pvals, 1000)
	print "len(quantiles):", len(quantiles)
	exp_median = (quantiles[499]+quantiles[500])/2.0

	(true_m,true_a,true_ks_stat,true_ks_pvalue,true_s) = _calc_statistics_(true_pvals,quantiles,exp_median,all_pvals)

	m_list = []
	a_list = []
	ks_stat_list = []
	ks_pvalue_list = []
	s_list = []
	for i in range(0,numPerm):
		pvals = perm_pvalues_list[i]
		(m,a,ks_stat,ks_pvalue,s) = _calc_statistics_(pvals,quantiles,exp_median,all_pvals) #Calc. statistic
		m_list.append(m)
		a_list.append(a)
		s_list.append(s)
		ks_stat_list.append(ks_stat)
		ks_pvalue_list.append(ks_pvalue)
	
	del all_pvals,quantiles

		
	if savePermutations:
		permOutputFile = outputFile+".perm.pvals"
		print "Writing to",permOutputFile
		f = open(permOutputFile,"w")
		i = 0
		for pvals in perm_pvalues_list:
			pvals_str = map(str,pvals)
			f.write(",".join(pvals_str)+"\n")
		print "Done writing to",permOutputFile
	
		f.close()


	#Output results
	outputFile = outputFile+".perm.stat.txt"
	f = open(outputFile,"w")
	f.write("Perm_nr, median, area, ks_stat, s_stat \n")
	for i in range(0,numPerm):
		str_l = map(str,[i, m_list[i],a_list[i],ks_stat_list[i],s_list[i]])
		f.write(", ".join(str_l)+"\n")
	
	f.write("\n"+"Observed values: "+str((true_m,true_a,true_ks_stat,true_s))+"\n")

	pvals = [0.0,0.0,0.0,0.0]
	
	#M stat p-value (two sided)
	#Assuming symm. dist.
	for i in range(0,numPerm):
		if abs(true_m) <= abs(m_list[i]):
			pvals[0]+=1.0/numPerm
	
	#A stat p-value (one tailed)
	for i in range(0,numPerm):
		if true_a <= a_list[i]:
			pvals[1]+=1.0/numPerm
		

	#KS stat p-value (one tailed)
	for i in range(0,numPerm):
		if true_ks_stat <= ks_stat_list[i]:
			pvals[2]+=1.0/numPerm
		
	#S stat p-value (one tailed)
	for i in range(0,numPerm):
		if abs(math.log(true_s+1.0)) <= abs(math.log(s_list[i]+1.0)):
			pvals[3]+=1.0/numPerm
		


	for i in range(0,len(pvals)):
		if pvals[i] == 0.0:
			pvals[i] = 0.5*(1.0/numPerm)
			
	str_pvals = map(str,pvals)
	f.write("\n"+"Estimated p-values: "+",".join(str_pvals)+"\n")
	f.close()
	
	#Plot results
	pngFile_median = outputFile+".perm.m.png"
	pngFile_area = outputFile+".perm.a.png"
	pngFile_ks = outputFile+".perm.ks.png"
	pngFile_s = outputFile+".perm.s.png"
	
	def _getBinning_(n_bins,min_val,max_val):
		bins = []
		delta = (max_val-min_val)/n_bins
		start_val = min_val-delta*0.5
		for i in range(0,n_bins+2):
			bins.append(start_val+delta*i)
		return (bins,delta)
		
	n_bins = 20+int(4*(math.log(numPerm)))
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	min_val = min(min(m_list),true_m)
	max_val = max(max(m_list),true_m)
	(bins,delta) = _getBinning_(n_bins,min_val,max_val)
	print (bins,delta)

	plt.figure(figsize=(10,7))
	plt.hist(m_list+[true_m], bins = bins)#, range=[start_val,end_val])
	plt.hist([true_m], bins = bins)#, range=[start_val,end_val])
	plt.savefig(pngFile_median, format = "png")
	plt.legend()
	plt.clf()

	min_val = min(min(a_list),true_a)
	max_val = max(max(a_list),true_a)
	(bins,delta) = _getBinning_(n_bins,min_val,max_val)
	print (bins,delta)
	plt.figure(figsize=(10,7))
	plt.hist(a_list+[true_a], bins = bins)
	plt.hist([true_a], bins = bins)
	plt.savefig(pngFile_area, format = "png")
	plt.clf()

	min_val = min(min(ks_stat_list),true_ks_stat)
	max_val = max(max(ks_stat_list),true_ks_stat)
	(bins,delta) = _getBinning_(n_bins,min_val,max_val)
	print (bins,delta)
	plt.figure(figsize=(10,7))
	plt.hist(ks_stat_list+[true_ks_stat], bins = bins)
	plt.hist([true_ks_stat], bins = bins)
	plt.savefig(pngFile_ks, format = "png")
	plt.clf()

	min_val = min(min(s_list),true_s)
	max_val = max(max(s_list),true_s)
	(bins,delta) = _getBinning_(n_bins,min_val,max_val)
	print (bins,delta)
	plt.figure(figsize=(10,7))
	plt.hist(s_list+[true_s], bins = bins)
	plt.hist([true_s], bins = bins)
	plt.savefig(pngFile_s, format = "png")
	plt.clf()

Exemplo n.º 6

0

Exibir arquivo

from util import kruskal_wallis

if __name__ == '__main__':
    # Run kruskal_wallis once per name, in the original order, each with `int`
    # as the second argument.
    for first_arg in ('opentabs', 'openwindows'):
        kruskal_wallis(first_arg, int)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: atpipeline.py Projeto: Gregor-Mendel-Institute/GWAPP

def _perform_gwas_(phen_id,
                   phenData,
                   analysis_method,
                   transformation,
                   genotype,
                   kinship_type,
                   kinshipFile=None,
                   messenger=None,
                   outputfile=None):
    additional_columns = {}
    messenger.update_status(progress=0.0, task_status='Loading genotype data')
    genotypeData = dataParsers.load_snps_call_method(genotype)
    #genotypeData = dataParsers.load_hdf5_snps_call_method(genotype)
    K = None
    messenger.update_status(step=0.05, task_status='Preparing data')
    n_filtered_snps = _prepare_data_(genotypeData, phenData, phen_id)
    phen_vals = phenData.get_values(phen_id)
    if analysis_method in [
            'emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm', 'amm'
    ]:
        #Load genotype file (in binary format)
        sys.stdout.write("Retrieving the Kinship matrix K.\n")
        sys.stdout.flush()
        if kinshipFile:  #Kinship file was supplied..
            messenger.update_status(
                progress=0.15,
                task_status='Loading supplied kinship file: %s' % kinshipFile)
            print 'Loading supplied kinship file: %s' % kinshipFile
            K = kinship.load_kinship_from_file(kinshipFile,
                                               genotypeData.accessions)
        else:
            messenger.update_status(progress=0.15,
                                    task_status='Loading kinship file')
            print 'Loading kinship file.'
            K = kinship.get_kinship(call_method_id=genotype,
                                    method=kinship_type,
                                    n_removed_snps=n_filtered_snps,
                                    remain_accessions=genotypeData.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

    snps = genotypeData.getSnps()
    positions = genotypeData.getPositions()
    chromosomes = []
    for i, (s, c) in enumerate(
            itertools.izip(genotypeData.snpsDataList,
                           genotypeData.chromosomes)):
        chromosomes.extend([c] * len(s.snps))
        maf_dict = genotypeData.get_mafs()

    if analysis_method in ['kw']:
        messenger.update_status(progress=0.7, task_status='Performing KW')
        res = util.kruskal_wallis(snps, phen_vals)

    elif analysis_method in ['loc_glob_mm']:
        raise NotImplementedError
    elif analysis_method in ['emma']:
        res = lm.emma(snps, phen_vals, K)
    elif analysis_method in ['emmax', 'amm']:
        d = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100)
        res = d['res']
        #additional_columns['stats'] = d['stats']
    elif analysis_method in ['lm']:
        d = lm.lin_reg_step(phen_vals, genotypeData, [])
        res = d['res']
        #additional_columns['stats'] = d['stats']
    else:
        raise Exception('analysis method %s not supported' % analysis_method)

    pvals = res['ps']

    #Calculate Benjamini-Hochberg threshold
    bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05)
    #Calculate Median p-value
    med_pval = agr.calc_median(res['ps'])
    #Calculate the Kolmogorov-Smirnov statistic
    ks_res = agr.calc_ks_stats(res['ps'])

    quantiles_dict = _calculate_qqplot_data_(pvals)
    scores = map(lambda x: -math.log10(x), pvals)

    if analysis_method in ['lm', 'emma', 'emmax', 'amm']:
        additional_columns['genotype_var_perc'] = res['var_perc']
        if 'betas' in res:
            betas = map(list, zip(*res['betas']))
            additional_columns['beta0'] = betas[0]
            if len(betas) > 1:
                additional_columns['beta1'] = betas[1]

    #calculate ld
    if outputfile is None:
        outputfile = "%s.hdf5" % phen_id
    messenger.update_status(progress=0.8,
                            task_status='Processing and saving results')
    _save_hdf5_pval_file(outputfile, analysis_method, transformation,
                         chromosomes, positions, scores, maf_dict['marfs'],
                         maf_dict['mafs'], quantiles_dict, ks_res,
                         bh_thres_d['thes_pval'], med_pval, additional_columns)

Exemplo n.º 8

0

Exibir arquivo

def map_phenotype(p_i, phed, mapping_method, trans_method, p_dict):
    import copy
    phed = copy.deepcopy(phed)
    phenotype_name = phed.get_name(p_i)
    phen_is_binary = phed.is_binary(p_i)
    if trans_method == 'most_normal':
        trans_method, shapiro_pval = phed.most_normal_transformation(p_i, perform_trans=False)
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.get_name(p_i),
                mapping_method, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'],
                p_dict['call_method_id'])
    result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)
    emmax_perm_threshold = None
    k = None

    res = None
    #Check whether result already exists.
    if p_dict['use_existing_results']:
        if p_dict['region_plots']:
            sd = _get_genotype_data_(p_dict)
            num_outliers = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'])
            if p_dict['remove_outliers']:
                assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

            snps = sd.getSnps()
        else:
            snps = None

        print "\nChecking for existing results."
        result_file = file_prefix + ".pvals"
        if os.path.isfile(result_file):
            res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
            pvals = True
        else:
            result_file = file_prefix + ".scores"
            if os.path.isfile(result_file):
                res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
                pvals = False
        if res:
            print "Found existing results.. (%s)" % (result_file)

        sys.stdout.flush()


    #Loading candidate genes
    cand_genes = None
    if p_dict['cand_genes_file']:
        cand_genes, tair_ids = gwaResults.load_cand_genes_file(p_dict['cand_genes_file'])
    else:
        cand_genes = None
        tair_ids = None

    if not res: #If results weren't found in a file... then do GWA.
        #Loading data
        sd = _get_genotype_data_(p_dict)
        num_outliers, n_filtered_snps = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'],
                                                     p_dict['with_replicates'])

        #Do we need to calculate the K-matrix?
        if mapping_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm']:
            #Load genotype file (in binary format)
            sys.stdout.write("Retrieving the Kinship matrix K.\n")
            sys.stdout.flush()
            if p_dict['kinship_file']:   #Kinship file was supplied..
                print 'Loading supplied kinship file: %s' % p_dict['kinship_file']
                k = kinship.load_kinship_from_file(p_dict['kinship_file'], sd.accessions)
            else:
                print 'Loading kinship file.'
                if p_dict['data_file'] != None:
                    if p_dict['kinship_type'] == 'ibs':
                        k = sd.get_ibs_kinship_matrix()
                    elif p_dict['kinship_type'] == 'ibd':
                        k = sd.get_ibd_kinship_matrix()
                else:
                    k = kinship.get_kinship(call_method_id=p_dict['call_method_id'], data_format=p_dict['data_format'],
                                            method=p_dict['kinship_type'], n_removed_snps=n_filtered_snps,
                                            remain_accessions=sd.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

        if p_dict['remove_outliers']:
            if num_outliers == 0: print "No outliers were removed!"

        phen_vals = phed.get_values(p_i)

        if p_dict['local_gwas']: #Filter SNPs, etc..
            sd = snpsdata.SNPsDataSet([sd.get_region_snpsd(*p_dict['local_gwas'])],
                        [p_dict['local_gwas'][0]], data_format=sd.data_format)
        snps = sd.getSnps()


        sys.stdout.write("Finished loading and handling data!\n")

        print "Plotting a histogram"
        p_her = None
        hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method,
                        p_dict['remove_outliers'], p_dict['with_replicates'],
                        p_dict['call_method_id'])
        hist_png_file = hist_file_prefix + "_hist.png"
        if k is not None:
            p_her = phed.get_pseudo_heritability(p_i, k)['pseudo_heritability']
            p_her_pval = phed.get_pseudo_heritability(p_i, k)['pval']
            phed.plot_histogram(p_i, png_file=hist_png_file, p_her=p_her, p_her_pval=p_her_pval)
        else:
            phed.plot_histogram(p_i, png_file=hist_png_file)


        print "Applying %s to data." % (mapping_method)
        sys.stdout.flush()
        kwargs = {}
        additional_columns = []
        if "kw" == mapping_method:

            if phen_is_binary:
                warnings.warn("Warning, applying KW to a binary phenotype")

            kw_res = util.kruskal_wallis(snps, phen_vals)
            pvals = kw_res['ps']
            kwargs['statistics'] = kw_res['ds']
            additional_columns.append('statistics')


        elif "ft" == mapping_method:
            raise NotImplementedError
#            pvals, or_est = run_fet(snps, phen_vals)
#            kwargs['odds_ratio_est'] = or_est
#            additional_columns.append('odds_ratio_est')

        else:  #Parametric tests below:        

            if mapping_method in ['emma', 'emmax', 'emmax_perm', 'emmax_step', 'emmax_anova', 'loc_glob_mm']:
                r = lm.mm_lrt_test(phen_vals, k)
                if r['pval'] > 0.05:
                    print "Performing EMMA, even though a mixed model does not fit the data significantly better"
                    print 'p-value: %0.3f' % r['pval']
                else:
                    print 'The mixed model fits the data significantly better than the simple linear model.'
                    print 'p-value: %f' % r['pval']

            if mapping_method in ['loc_glob_mm']:
                res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix=file_prefix,
                            global_k=k, window_size=p_dict['loc_glob_ws'],
                            jump_size=p_dict['loc_glob_ws'] / 2,
                            kinship_method=p_dict['kinship_type'])
                res_file_name = file_prefix + '.csv'
                _write_res_dict_to_file_(res_file_name, res_dict)
                return
            elif mapping_method in ['emma']:
                res = lm.emma(snps, phen_vals, k)
            elif mapping_method in ['emmax']:
                if p_dict['emmax_perm']:
                    perm_sd = _get_genotype_data_(p_dict)
                    num_outliers = prepare_data(perm_sd, phed, p_i, 'none', 0, p_dict['with_replicates'])
                    perm_sd.filter_mac_snps(p_dict['mac_threshold'])
                    t_snps = perm_sd.getSnps()
                    t_phen_vals = phed.get_values(p_i)
                    res = lm.emmax_perm_test(t_snps, t_phen_vals, k, p_dict['emmax_perm'])
                    emmax_perm_threshold = res['threshold_05'][0]
                    import pylab
                    hist_res = pylab.hist(-sp.log10(res['min_ps']), alpha=0.6)
                    threshold = -sp.log10(emmax_perm_threshold)
                    b_threshold = -sp.log10(1.0 / (len(t_snps) * 20.0))
                    pylab.vlines(threshold, 0, max(hist_res[0]), color='g')
                    pylab.vlines(b_threshold, 0, max(hist_res[0]), color='r')
                    pylab.savefig(file_prefix + 'perm_%d_min_pval_hist.png' % (p_dict['emmax_perm']),
                        format='png')
                if p_dict['with_replicates']:
                    #Get values, with ecotypes, construct Z and do GWAM
                    phen_vals = phed.get_values(p_i)
                    Z = phed.get_incidence_matrix(p_i)
                    res = lm.emmax(snps, phen_vals, k, Z=Z, with_betas=p_dict['with_betas'],
                            emma_num=p_dict['emmax_emma_num'])
                else:
                    res = lm.emmax(snps, phen_vals, k, with_betas=p_dict['with_betas'],
                            emma_num=p_dict['emmax_emma_num'])

            elif mapping_method in ['emmax_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.emmax_step_wise(phen_vals, k, sd=sd, num_steps=p_dict['num_steps'],
                            file_prefix=file_prefix, local=local, cand_gene_list=cand_genes,
                            save_pvals=p_dict['save_stepw_pvals'],
                            emma_num=p_dict['emmax_emma_num'])
                print 'Step-wise EMMAX finished!'
                return
            elif mapping_method in ['lm_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.lm_step_wise(phen_vals, sd=sd, num_steps=p_dict['num_steps'],
                            file_prefix=file_prefix, local=local, cand_gene_list=cand_genes,
                            save_pvals=p_dict['save_stepw_pvals'])
                print 'Step-wise LM finished!'
                return
            elif mapping_method in ['lm']:
                res = lm.linear_model(snps, phen_vals)
            elif mapping_method in ['emmax_anova']:
                res = lm.emmax_anova(snps, phen_vals, k)
            elif mapping_method in ['lm_anova']:
                res = lm.anova(snps, phen_vals)
            else:
                print "Mapping method", mapping_method, 'was not found.'
                return

            if mapping_method in ['lm', 'emma', 'emmax']:
                kwargs['genotype_var_perc'] = res['var_perc']
                additional_columns.append('genotype_var_perc')
                if p_dict['with_betas'] or mapping_method in ['emma' ]:
                    betas = map(list, zip(*res['betas']))
                    kwargs['beta0'] = betas[0]
                    additional_columns.append('beta0')
                    if len(betas) > 1:
                        kwargs['beta1'] = betas[1]
                        additional_columns.append('beta1')
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()

            if mapping_method in ['lm_anova', 'emmax_anova']:
                kwargs['genotype_var_perc'] = res['var_perc']
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()


#        print 'Calculating SNP-phenotype correlations.'
#        kwargs['correlations'] = calc_correlations(snps, phen_vals)
#        additional_columns.append('correlations')
        print 'Writing result to file.'
        res = gwaResults.Result(scores=pvals.tolist(), snps_data=sd, name=result_name, **kwargs)
        if mapping_method in ["kw", "ft", "emma", 'lm', "emmax", 'emmax_anova', 'lm_anova']:
            result_file = file_prefix + ".pvals"
        else:
            result_file = file_prefix + ".scores"
        res.write_to_file(result_file, additional_columns, max_fraction=p_dict['pvalue_filter'])

    #add results to DB..

    if p_dict['add_to_db']:
        print 'Adding results to DB.'
        if p_dict['with_db_ids']:
            db_pid = p_i
        else:
            db_pid = phed.get_db_pid(p_i)

        import results_2_db as rdb

        short_name = 'cm%d_pid%d_%s_%s_%s_%d_%s' % (p_dict['call_method_id'], db_pid, phenotype_name,
                            mapping_method, trans_method, p_dict['remove_outliers'],
                            str(p_dict['with_replicates']))
        tm_id = transformation_method_dict[trans_method]
        try:
            rdb.add_results_to_db(result_file, short_name, p_dict['call_method_id'], db_pid,
                        analysis_methods_dict[mapping_method],
                        tm_id, remove_outliers=p_dict['remove_outliers'])
        except Exception, err_str:
            print 'Failed inserting results into DB!'
            print err_str

Exemplo n.º 9

0

Exibir arquivo

Arquivo: analyze_rna_seq.py Projeto: bopopescu/gwasmodules

def run_gwas(file_prefix, phen_file, start_i, stop_i, temperature, mac_threshold=15, filter_threshold=0.02,
		call_method_id=79, data_format='diploid_int', debug_filter=1.0, near_const_filter=20):
	"""
	GWAS
	"""
	phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
	phed.filter_near_const_phens(near_const_filter)
	phed.convert_to_averages()
	num_traits = phed.num_traits()
	pids = phed.phen_ids[start_i :stop_i]
	sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=debug_filter)
	indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)  #All phenotypes are ordered the same way, so we pick the first one.
	phed.filter_ecotypes(indices_to_keep, pids=pids)
	print len(sd.accessions)
	K = sd.get_ibs_kinship_matrix()
	#K = dp.load_kinship(call_method_id=call_method_id, data_format=data_format, sd=sd, method='ibs')

	sd.filter_mac_snps(mac_threshold)
	snps = sd.getSnps()
	positions = sd.getPositions()
	chromosomes = sd.get_chr_list()
	r = sd.get_mafs()
	macs = r['mafs']
	mafs = r['marfs']

	print 'In total there are %d SNPs to be mapped.' % len(snps)
	gene_dict = dp.parse_tair_gff_file()#_load_genes_list_('rna_seq_031311_%sC' % temperature)
	for i, pid in enumerate(pids):
		if not pid in phed.phen_ids: continue
		gene_tair_id = phed.get_name(pid)
#		exons = []
#		for isoform in d:
#			for exon in isoform['exons']:
#				exons.append((d['chromosome'], exon['start_pos'], exon['end_pos']))


		d = gene_dict[gene_tair_id]
		gene_strand = d['strand']
		try:
			chrom = int(d['chromosome'])
		except Exception:
			raise
		gene = gwaResults.Gene(chromosome=int(d['chromosome']), startPos=d['start_pos'],
				endPos=d['end_pos'], name=gene_tair_id, description=None, dbRef=gene_tair_id,
				tairID=gene_tair_id)
		print i, pid, gene
		curr_file_prefix = '%s_mac%d_pid%d_%s' % (file_prefix, mac_threshold, pid, gene_tair_id)

		trans_type, shapiro_pval = phed.most_normal_transformation(pid)
		print 'Most normal transformation was: %s' % trans_type
		#trans_type = 'None'
		summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':shapiro_pval}
		#summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':0}


		print'Applying Kruskal-Wallis'
		phen_vals = phed.get_values(pid)
		res = util.kruskal_wallis(snps, phen_vals)
		pvals = res['ps'].tolist()
		kw_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes)
		print 'Summarizing KW'
		summary_dict['KW'] = kw_res.get_gene_analysis(gene)
		summary_dict['KW']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps'])
		summary_dict['KW']['pval_median'] = agr.calc_median(res['ps'])


		print 'Applying LM'
		res = lm.linear_model(snps, phen_vals)
		pvals = res['ps'].tolist()
		perc_var_expl = res['var_perc'].tolist()
		lm_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes,
				perc_var_expl=perc_var_expl)
		print 'Summarizing LM'
		summary_dict['LM'] = lm_res.get_gene_analysis(gene)
		summary_dict['LM']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps'])
		summary_dict['LM']['pval_median'] = agr.calc_median(res['ps'])


		print 'Applying EX Stepwise'
		snp_priors = sd.get_cand_genes_snp_priors([gene])
		ex_sw_res = lm.emmax_step_wise(phen_vals, K, macs=macs, mafs=mafs, positions=positions,
					chromosomes=chromosomes, snps=snps, num_steps=5, cand_gene_list=[gene],
					with_qq_plots=False, log_qq_max_val=6.0, save_pvals=True, snp_priors=snp_priors)
		print 'Summarizing the step-wise mixed model'
		pvals = ex_sw_res['first_emmax_res']['ps'].tolist()
		perc_var_expl = ex_sw_res['first_emmax_res']['var_perc'].tolist()
		ex_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes,
				perc_var_expl=perc_var_expl)
		summary_dict['EX'] = ex_res.get_gene_analysis(gene)
		summary_dict['pseudo_heritability'] = ex_sw_res['step_info_list'][0]['pseudo_heritability']
		summary_dict['EX']['kolmogorov_smirnov'] = agr.calc_ks_stats(ex_sw_res['first_emmax_res']['ps'])
		summary_dict['EX']['pval_median'] = agr.calc_median(ex_sw_res['first_emmax_res']['ps'])

		#Does the linear mixed model fit the data better?
		summary_dict['MM_LRT'] = lm.mm_lrt_test(phen_vals, K)

		#FINISH summarizing the stepwise!!!
		summarize_stepwise(summary_dict, gene, ex_sw_res['step_info_list'], ex_sw_res['opt_dict'])

		cvt_dict = {'radius':{}, 'tss_upstream':{}}
		print 'Comparing cis vs. trans kinship'
		#Check 1 mb, 200kb, 100kb, 50kb, 20kb, 10kb, 2kb, 0kb
		for radius in [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0]:
			print radius
			r_start_pos = max(gene.startPos - radius, 0)
			r_end_pos = gene.endPos + radius
			d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)],
							kinship_method='ibs', global_kinship=K)
			reg_k = d['regional_k']
			glob_k = d['global_k']
			if reg_k != None:
				cvt_dict['radius'][radius] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K)
			else:
				cvt_dict['radius'][radius] = None
			print cvt_dict['radius'][radius]

		#Check TSS, 100kb, 50kb,25kb, 10kb,5kb,0kb, (all upstream)
		for dist in [200000, 100000, 50000, 25000, 10000, 5000, 1000]:
			print dist, gene_strand
			if gene_strand == '+':
				r_start_pos = max(gene.startPos - dist, 0)
				r_end_pos = gene.startPos
			else:
				r_start_pos = gene.endPos
				r_end_pos = gene.endPos + dist
			d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)],
							kinship_method='ibs', global_kinship=K)
			reg_k = d['regional_k']
			glob_k = d['global_k']
			if reg_k != None:
				cvt_dict['tss_upstream'][dist] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K)
			else:
				cvt_dict['tss_upstream'][dist] = None
			print cvt_dict['tss_upstream'][dist]

		summary_dict['CVT'] = cvt_dict

		#Write info to file..
		cPickle.dump(summary_dict, open(curr_file_prefix + '_info.pickled', 'w'), protocol=2)

		f_prefix = curr_file_prefix + '_hist'
		phed.plot_histogram(pid, title='Gene expressions for %s' % gene_tair_id,
				png_file=f_prefix + '.png', p_her=summary_dict['pseudo_heritability'],
				x_label='RNA seq expression levels (%s transformed)' % trans_type)
		#Plot GWAs...
		for res, method_name in [(kw_res, 'KW'), (lm_res, 'LM'), (ex_res, 'EX')]:
			res.filter_percentile(filter_threshold, reversed=True)
			res.write_to_file('%s_%s_.pvals' % (curr_file_prefix, method_name), only_pickled=True)
			if ex_res.min_score() < 10e-10:
				#print [cg.tairID for cg in cgs]
				f_prefix = '%s_%s_manhattan' % (curr_file_prefix, method_name)
				res.plot_manhattan(png_file=f_prefix + '.png', percentile=0, cand_genes=[gene],
						plot_bonferroni=True, neg_log_transform=True)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: timers.py Projeto: db48x/pulse

from util import kruskal_wallis

if __name__ == '__main__':
    # Run the test once per timer name, in this fixed order.
    for timer_name in ('timercontentloaded',
                       'timerwindowload',
                       'timerfirstinteraction',
                       'timerfirstpaint'):
        kruskal_wallis(timer_name, int)

Exemplo n.º 11

0

Exibir arquivo

Arquivo: atpipeline.py Projeto: timeu/ATpipeline

def _perform_gwas_(phen_id,phenData,analysis_method,transformation,genotype,kinship_type,kinshipFile=None,messenger=None,outputfile=None):
    additional_columns = {}
    messenger.update_status(progress=0.0, task_status='Loading genotype data')
    genotypeData = dataParsers.load_snps_call_method(genotype)
    #genotypeData = dataParsers.load_hdf5_snps_call_method(genotype)
    K = None
    messenger.update_status(step=0.05, task_status='Preparing data')
    n_filtered_snps = _prepare_data_(genotypeData,phenData,phen_id)
    phen_vals = phenData.get_values(phen_id)
    if analysis_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm','amm']:
        #Load genotype file (in binary format)
        sys.stdout.write("Retrieving the Kinship matrix K.\n")
        sys.stdout.flush()
        if kinshipFile:   #Kinship file was supplied..
            messenger.update_status(progress=0.15, task_status='Loading supplied kinship file: %s' % kinshipFile)
            print 'Loading supplied kinship file: %s' % kinshipFile
            K = kinship.load_kinship_from_file(kinshipFile, genotypeData.accessions)
        else:
            messenger.update_status(progress=0.15, task_status='Loading kinship file')
            print 'Loading kinship file.'
            K = kinship.get_kinship(call_method_id=genotype,
                                            method=kinship_type, n_removed_snps=n_filtered_snps,
                                            remain_accessions=genotypeData.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

    snps = genotypeData.getSnps()
    positions = genotypeData.getPositions()
    chromosomes = []
    for i, (s, c) in enumerate(itertools.izip(genotypeData.snpsDataList, genotypeData.chromosomes)):
        chromosomes.extend([c] * len(s.snps))
        maf_dict = genotypeData.get_mafs()
    
    if analysis_method in ['kw']:
        messenger.update_status(progress=0.7, task_status='Performing KW')
        res = util.kruskal_wallis(snps, phen_vals)
        
    elif analysis_method in ['loc_glob_mm']:
        raise NotImplementedError
    elif analysis_method in ['emma']:
        res = lm.emma(snps, phen_vals, K)
    elif analysis_method in ['emmax','amm']:
        d = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100)
        res = d['res']
        #additional_columns['stats'] = d['stats']
    elif analysis_method in ['lm']:
        d = lm.lin_reg_step(phen_vals, genotypeData, [])
        res = d['res']
        #additional_columns['stats'] = d['stats']
    else:
        raise Exception('analysis method %s not supported' % analysis_method)
    
    pvals = res['ps']
    
    #Calculate Benjamini-Hochberg threshold
    bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05)
    #Calculate Median p-value
    med_pval = agr.calc_median(res['ps'])
    #Calculate the Kolmogorov-Smirnov statistic
    ks_res = agr.calc_ks_stats(res['ps'])
    
    quantiles_dict = _calculate_qqplot_data_(pvals)
    scores = map(lambda x:-math.log10(x), pvals)
    
    if analysis_method in ['lm', 'emma', 'emmax','amm']:
        additional_columns['genotype_var_perc'] = res['var_perc']
        if 'betas' in res:
            betas = map(list, zip(*res['betas']))
            additional_columns['beta0'] = betas[0]
            if len(betas) > 1:
                additional_columns['beta1'] = betas[1]
    
    #calculate ld
    if outputfile is None:
         outputfile = "%s.hdf5" % phen_id
    messenger.update_status(progress=0.8, task_status='Processing and saving results')
    _save_hdf5_pval_file(outputfile, analysis_method, transformation,chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], 
                         quantiles_dict,ks_res,bh_thres_d['thes_pval'],med_pval,additional_columns)

Exemplo n.º 12

0

Exibir arquivo

    def perform_gwas(self, phen_name, dataset,transformation='raw', analysis_method='kw', call_method_id=75,
                     kinship_method='ibs', progress_file_writer=None):

        """
        Performs GWAS and updates the datastructure.
        """

        import bisect
        import gwa
        step_wise = False
        if analysis_method not in ['lm', 'emmax', 'kw']:
            raise Exception('analysis method %s not supported' % analysis_method)

        progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data')
        phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype
        phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}})
        phend.convert_to_averages()
        progress_file_writer.update_progress_bar(task_status='Loading genotype data')
        sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data
        progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data')
        sd.coordinate_w_phenotype_data(phend, 1)
        progress_file_writer.update_progress_bar(progress=0.1,task_status='Filtering monomorphic SNPs')
        sd.filter_monomorphic_snps()
        phen_vals = phend.get_values(1)
        snps = sd.getSnps()
        positions = sd.getPositions()
        chromosomes = []
        progress_file_writer.set_step(0.03)
        for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)):
            progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes)))
            chromosomes.extend([c] * len(s.snps))
        maf_dict = sd.get_mafs()
		

        kwargs = {}
        if analysis_method == 'emmax':
            progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix')
            k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions,
                                scaled=True, min_mac=5, sd=sd)
            progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing EMMAX')
            d = lm.emmax_step(phen_vals, sd, k, [], progress_file_writer=progress_file_writer)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
            res = d['res']
            stats_dict = d['stats']
        elif analysis_method == 'lm':
            progress_file_writer.update_progress_bar(progress=0.3, task_status='Performing LM')
            res = lm.linear_model(snps, phen_vals)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
        elif analysis_method == 'kw':
            progress_file_writer.update_progress_bar(progress=0.7, task_status='Performing KW')
            kw_res = util.kruskal_wallis(snps, phen_vals)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
            scores = map(lambda x:-math.log10(x), kw_res['ps'])
            self.add_results(phen_name, dataset,analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'],
                    maf_dict['mafs'], transformation=transformation, statistics=kw_res['ds'])
        else:
            raise Exception('analysis method %s not supported' % analysis_method)

        if analysis_method in ['lm', 'emmax']:
            if 'betas' in res:
                betas = map(list, zip(*res['betas']))
            else:
                betas = [None, None]
            scores = map(lambda x:-math.log10(x), res['ps'])
            stats_dict['step'] = 0
            cofactors = [stats_dict]
            self.add_results(phen_name, dataset, analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'],
                             maf_dict['mafs'], transformation=transformation,
                             genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1],
                              cofactors=cofactors)
        progress_file_writer.update_progress_bar(progress=1.0, task_status='Done')
        print 'Done!'
        return analysis_method

Exemplo n.º 13

0

Exibir arquivo

def _robustness_test_(all_snps,phenVals,outputFile,filter=0.1,test_type = "KW",):
	"""
	Leave one out test..
	"""
	
	new_all_snps = []
	for snp in all_snps:
		if snp.count(0)>1 and snp.count(1)>1:
			new_all_snps.append(snp)
	print "Filtered",len(all_snps)-len(new_all_snps)," with minor allele count <2."
	all_snps = new_all_snps

	if filter <1.0:
		snps = random.sample(all_snps,int(len(all_snps)*filter))
		print "Number of SNPs:",len(snps)
	else:
		snps = all_snps 

	if test_type=="KW":
		print "running KW"
		t1 = time.time()
		true_pvals = util.kruskal_wallis(snps,phenVals)["ps"]
		t2 = time.time()
		print "Took",t2-t1,"seconds."
	elif test_type=="Fisher":
		print "running Fisher's exact test"
		t1 = time.time()
		true_pvals = run_fet(snps,phenVals)
		t2 = time.time()
		print "Took",t2-t1,"seconds."
	
	log_true_pvals = []
	for pval in true_pvals:
		log_true_pvals.append(-math.log(pval,10))
	
	perm_pvalues_list = []
	for i in range(0,len(phenVals)):
		newPhenvals = phenVals[:]
		newPhenvals.pop(i)
				
		newSNPs = []
		for snp in snps:
			newSNP = snp[:]
			newSNP.pop(i)
			newSNPs.append(newSNP)
		
		print i
		if test_type=="KW":
			print "running KW"
			t1 = time.time()
			pvals = util.kruskal_wallis(newSNPs,newPhenvals)["ps"]
			t2 = time.time()
			print "Took",t2-t1,"seconds."
		elif test_type=="Fisher":
			print "running Fisher's exact test"
			t1 = time.time()
			pvals = run_fet(newSNPs,newPhenvals)
			t2 = time.time()
			print "Took",t2-t1,"seconds."
		perm_pvalues_list.append(pvals)
		
		
	
	delta_pvals_list = []
	delta_log_pvals_list = []
	for perm_pvals in perm_pvalues_list:
		log_pvals = []
		delta_pvals = []
		delta_log_pvals = []
		for i in range(0,len(true_pvals)):
			pval = perm_pvals[i]
			true_pval = true_pvals[i]
			delta_pvals.append(true_pval-pval)

			log_true_pval = log_true_pvals[i]
			if pval > 0.0:
				log_pval = -math.log(pval,10)
			else:
				print "Damn those random 0 prob. events: event #", i
				log_pval = -math.log(true_pval,10)
				
			log_pvals.append(log_pval)
			delta_log_pvals.append(log_true_pval-log_pval)
		
		delta_pvals_list.append(delta_pvals)
		delta_log_pvals_list.append(delta_log_pvals)
	
	sd_log_pvals = []
	sd_pvals = []
	t_delta_log_pvals_list = map(list,zip(*delta_log_pvals_list))
	t_delta_pvals_list = map(list,zip(*delta_pvals_list))
	for i in range(0,len(true_pvals)):
		sd_log_pvals.append(util.calcSD(t_delta_log_pvals_list[i]))
		sd_pvals.append(util.calcSD(t_delta_pvals_list[i]))
	
	
	#Write SDs out to file, to be able to replot, or plot together with other methods... etc
	import csv
	sd_log_pval_file = outputFile+".rob.log_pvals_sd"
	f = open(sd_log_pval_file,"w")
	w = csv.writer(f)
	w.writerow(["log_true_pval","sd_log_pvals"])
	l = zip(log_true_pvals,sd_log_pvals)
	w.writerows(l)
	f.close()

	
	#Plot things....
	pngFile_log_pvals = outputFile+".rob.log_pval.png"
	pngFile_pval = outputFile+".rob.pval.png"
	pngFile_sd_log_pval = outputFile+".rob.sd_log_pval.png"
	pngFile_sd_pval = outputFile+".rob.sd_pval.png"


	min_val = min(true_pvals)
	max_val = max(true_pvals)
	val_range = max_val-min_val

	min_log_val = min(log_true_pvals)
	max_log_val = max(log_true_pvals)
	log_val_range = max_val-min_val


	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	plt.figure(figsize=(10,7))
	max_perm_val = 0
	min_perm_val = 0
	for i in range(0,len(perm_pvalues_list)):
		delta_log_pvals = delta_log_pvals_list[i]
		plt.plot(log_true_pvals,delta_log_pvals,"b.")
		max_perm_val = max(max_perm_val,max(delta_log_pvals))
		min_perm_val = min(min_perm_val,min(delta_log_pvals))
	perm_val_range = max_perm_val - min_perm_val
	plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range])
	plt.savefig(pngFile_log_pvals, format = "png")

	plt.figure(figsize=(10,7))
	max_perm_val = 0
	min_perm_val = 0
	for i in range(0,len(perm_pvalues_list)):
		delta_pvals = delta_pvals_list[i]
		plt.plot(true_pvals,delta_pvals,"b.")
		max_perm_val = max(max_perm_val,max(delta_pvals))
		min_perm_val = min(min_perm_val,min(delta_pvals))
	perm_val_range = max_perm_val - min_perm_val
	plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_perm_val-0.02*perm_val_range, max_perm_val+0.02*perm_val_range])
	plt.savefig(pngFile_pval, format = "png")

	plt.figure(figsize=(10,7))
	max_sd_log_pval = max(sd_log_pvals)
	min_sd_log_pval = min(sd_log_pvals)
	sd_val_range = max_sd_log_pval-min_sd_log_pval
	plt.plot(log_true_pvals,sd_log_pvals,"b.")
	plt.axis([min_log_val-0.02*log_val_range, max_log_val+0.02*log_val_range, min_sd_log_pval-0.02*sd_val_range, max_sd_log_pval+0.02*sd_val_range])
	plt.savefig(pngFile_sd_log_pval, format = "png")

	plt.figure(figsize=(10,7))
	max_sd_pval = max(sd_pvals)
	min_sd_pval = min(sd_pvals)
	sd_val_range = max_sd_pval-min_sd_pval
	plt.plot(true_pvals,sd_pvals,"b.")
	plt.axis([min_val-0.02*val_range, max_val+0.02*val_range, min_sd_pval-0.02*sd_val_range, max_sd_pval+0.02*sd_val_range])
	plt.savefig(pngFile_sd_pval, format = "png")

Exemplo n.º 14

0

Exibir arquivo

def _perm_test_(all_snps,phenVals,numPerm,outputFile,filter=0.1,test_type = "KW",savePermutations=False,useSameSnps=False):
	"""
	Permutation test on the p-value distribution of a scan.

	The selected test is run once on the observed phenotype values and then
	numPerm times on shuffled phenotype values.  All permuted p-values are
	pooled; 1000 quantiles of the pool (and the midpoint of quantiles 499
	and 500, used as the expected median) serve as the reference against
	which the summary statistics of the observed run and of every
	permutation are computed.

	Files written:
	  outputFile+".perm.pvals"     one comma-separated line of p-values per
	                               permutation (only if savePermutations)
	  outputFile+".perm.stat.txt"  per-permutation statistics, the observed
	                               statistics and the estimated p-values
	  outputFile+".perm.stat.txt.perm.{m,a,ks,s}.png"
	                               histograms of the permuted statistics
	                               together with the observed one

	Arguments:
	  all_snps          sequence of SNPs handed to the test function
	  phenVals          phenotype values; left untouched, the permutations
	                    are applied to a private copy
	  numPerm           number of permutations, must be at least 1
	  outputFile        prefix of all output file names
	  filter            if < 1.0, the fraction of all_snps that is randomly
	                    sampled (a fresh sample is drawn for the observed
	                    run and for every permutation)
	  test_type         "KW" (util.kruskal_wallis) or "Fisher" (run_fet)
	  savePermutations  also write the permuted p-values to disk
	  useSameSnps       NOTE(review): accepted but never consulted; SNPs are
	                    resampled for every permutation whenever filter < 1.0

	Raises ValueError for numPerm < 1 or an unsupported test_type.
	"""

	#Fail early: both conditions would otherwise surface much later as an
	#unrelated NameError / ZeroDivisionError / math domain error.
	if numPerm < 1:
		raise ValueError("numPerm must be at least 1, got "+str(numPerm))
	if test_type not in ("KW","Fisher"):
		raise ValueError("Unsupported test_type: "+str(test_type))

	def _calc_statistics_(pvals,exp_quantiles,exp_median=0.5,exp_pvals=None):
		#Returns (median stat, area between quantile curves, KS "D", KS "p.value", log-slope estimate minus 1.0).
		m = analyzePhenotype._calcMedian_(pvals,exp_median)
		ks_res = analyzePhenotype._calcKS_(pvals,exp_pvals)
		s = analyzePhenotype._estLogSlope_(pvals,exp_pvals)-1.0
		quantiles = analyzePhenotype._getQuantiles_(pvals, 1000)
		a = analyzePhenotype._estAreaBetweenCurves_(quantiles,exp_quantiles)
		return (m,a,ks_res["D"],ks_res["p.value"],s)

	def _sample_snps_():
		#Draw a random subset of the SNPs when filtering is requested.
		if filter <1.0:
			sampled = random.sample(all_snps,int(len(all_snps)*filter))
			print("Number of SNPs: %d" % len(sampled))
			return sampled
		return all_snps

	def _run_test_(snps,phen_vals):
		#Run the selected test and report how long it took.
		if test_type=="KW":
			print("running KW")
			t1 = time.time()
			res = util.kruskal_wallis(snps,phen_vals)["ps"]
		else:
			print("running Fisher's exact test")
			t1 = time.time()
			res = run_fet(snps,phen_vals)
		t2 = time.time()
		print("Took %s seconds." % (t2-t1))
		return res

	def _perm_pvalue_(obs,perm_stats):
		#Fraction of permutations whose statistic is at least as large as the observed one.
		p = 0.0
		for stat in perm_stats:
			if obs <= stat:
				p += 1.0/numPerm
		return p

	#Observed p-values
	snps = _sample_snps_()
	true_pvals = _run_test_(snps,phenVals)

	#Permuted p-values. A copy is shuffled so the caller's list keeps its order.
	perm_phen_vals = list(phenVals)
	perm_pvalues_list = []
	for i in range(0,numPerm):#For every perm
		snps = _sample_snps_()
		print(i)
		random.shuffle(perm_phen_vals) #Permute phenotype
		perm_pvalues_list.append(_run_test_(snps,perm_phen_vals))

	print("Combining p-values")
	all_pvals = []
	for pvals in perm_pvalues_list:
		all_pvals.extend(pvals)
	print("%d permuted pvals in all" % len(all_pvals))
	quantiles = analyzePhenotype._getQuantiles_(all_pvals, 1000)
	print("len(quantiles): %d" % len(quantiles))
	exp_median = (quantiles[499]+quantiles[500])/2.0

	(true_m,true_a,true_ks_stat,true_ks_pvalue,true_s) = _calc_statistics_(true_pvals,quantiles,exp_median,all_pvals)

	m_list = []
	a_list = []
	ks_stat_list = []
	ks_pvalue_list = [] #collected for completeness, not reported below
	s_list = []
	for pvals in perm_pvalues_list:
		(m,a,ks_stat,ks_pvalue,s) = _calc_statistics_(pvals,quantiles,exp_median,all_pvals) #Calc. statistic
		m_list.append(m)
		a_list.append(a)
		s_list.append(s)
		ks_stat_list.append(ks_stat)
		ks_pvalue_list.append(ks_pvalue)

	#The pooled p-values can be large; release them before plotting.
	del all_pvals,quantiles

	if savePermutations:
		permOutputFile = outputFile+".perm.pvals"
		print("Writing to %s" % permOutputFile)
		f = open(permOutputFile,"w")
		try:
			for pvals in perm_pvalues_list:
				f.write(",".join(map(str,pvals))+"\n")
		finally:
			#Closed only here: no file is opened when savePermutations is False.
			f.close()
		print("Done writing to %s" % permOutputFile)

	#Output results
	outputFile = outputFile+".perm.stat.txt"
	f = open(outputFile,"w")
	try:
		f.write("Perm_nr, median, area, ks_stat, s_stat \n")
		for i in range(0,numPerm):
			str_l = map(str,[i, m_list[i],a_list[i],ks_stat_list[i],s_list[i]])
			f.write(", ".join(str_l)+"\n")

		f.write("\n"+"Observed values: "+str((true_m,true_a,true_ks_stat,true_s))+"\n")

		pvals = [
			#M stat p-value (two sided), assuming symm. dist.
			_perm_pvalue_(abs(true_m),[abs(v) for v in m_list]),
			#A stat p-value (one tailed)
			_perm_pvalue_(true_a,a_list),
			#KS stat p-value (one tailed)
			_perm_pvalue_(true_ks_stat,ks_stat_list),
			#S stat p-value, compared on the absolute log of (s+1)
			_perm_pvalue_(abs(math.log(true_s+1.0)),[abs(math.log(v+1.0)) for v in s_list]),
		]

		#A zero estimate only means "less than one in numPerm"; report half that step instead.
		for i in range(0,len(pvals)):
			if pvals[i] == 0.0:
				pvals[i] = 0.5*(1.0/numPerm)

		str_pvals = map(str,pvals)
		f.write("\n"+"Estimated p-values: "+",".join(str_pvals)+"\n")
	finally:
		f.close()

	#Plot results
	#NOTE(review): outputFile already carries the ".perm.stat.txt" suffix at this point,
	#so the images are named "<prefix>.perm.stat.txt.perm.*.png"; kept as is.
	pngFile_median = outputFile+".perm.m.png"
	pngFile_area = outputFile+".perm.a.png"
	pngFile_ks = outputFile+".perm.ks.png"
	pngFile_s = outputFile+".perm.s.png"

	def _getBinning_(n_bins,min_val,max_val):
		#n_bins+2 equally spaced edges, starting half a bin width below min_val.
		delta = (max_val-min_val)/n_bins
		start_val = min_val-delta*0.5
		bins = [start_val+delta*i for i in range(0,n_bins+2)]
		return (bins,delta)

	n_bins = 20+int(4*(math.log(numPerm)))
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt

	def _plot_hist_(perm_stats,true_stat,png_file):
		#Histogram of permuted + observed statistics, with the observed one drawn again on top.
		min_val = min(min(perm_stats),true_stat)
		max_val = max(max(perm_stats),true_stat)
		(bins,delta) = _getBinning_(n_bins,min_val,max_val)
		print(str((bins,delta)))
		plt.figure(figsize=(10,7))
		plt.hist(perm_stats+[true_stat], bins = bins)
		plt.hist([true_stat], bins = bins)
		plt.savefig(png_file, format = "png")
		#Close rather than clear, so figures do not pile up across calls.
		plt.close()

	_plot_hist_(m_list,true_m,pngFile_median)
	_plot_hist_(a_list,true_a,pngFile_area)
	_plot_hist_(ks_stat_list,true_ks_stat,pngFile_ks)
	_plot_hist_(s_list,true_s,pngFile_s)