def _plotKinshipDiffs_():
    """
    Plot the getEmmaDiffs output along one chromosome for two window sizes.

    Loads the SNP and phenotype files from hard-coded paths, filters every
    SNP dataset with filterMinMAF(0.1) and filterMonoMorphicSnps(), and
    computes a global kinship from a random subsample of all SNPs (each SNP
    is kept with probability ``filterProb``).  For ``snpsds[4]`` only, the
    result of getEmmaDiffs is drawn for windowSize=300000 (red) and
    windowSize=500000 (blue) on one figure, which is saved as a PDF under
    ``res_dir``.

    Takes no arguments and returns nothing.
    """
    filterProb = 0.2  # probability with which each SNP is kept for the global kinship
    p_i = 1  # phenotype index handed to every phenotype-related call below
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    # Replace every dataset by its getSnpsData() form and pool all SNPs.
    totalSNPs = []
    for idx, raw_snpsd in enumerate(snpsds):
        snpsds[idx] = raw_snpsd.getSnpsData()
        totalSNPs += snpsds[idx].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]

    print("Calculating the global kinship...")
    globalKinship = calcKinship(totalSNPs)
    print("done.")
    # NOTE: the mean-normalised copy of the global kinship was only consumed by
    # a disabled getKinshipDiffs plot, so it is no longer computed here.
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # Only index 4 is processed; a commented-out `len(snpsds)` upper bound was
    # left next to this range in the original code.
    for i in range(4, 5):
        chrom = i + 1  # 1-based number used in the title and file name
        snpsd = snpsds[i]

        pylab.figure(figsize=(18, 4))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000)
        pylab.plot(binPos, emmaDiffs, "r", label="ws$=300000$")
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000)
        pylab.plot(binPos, emmaDiffs, "b", label="ws$=500000$")

        # A single title for the shared axes: an earlier title call was always
        # overwritten by this one before the figure was saved.
        pylab.title("Emma avg. p-value diff. on chr. " + str(chrom))
        # NOTE(review): `handlelen` looks like the legacy matplotlib spelling of
        # `handlelength` - confirm against the installed matplotlib version.
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chrom) + ".pdf", format="pdf")
        pylab.clf()
        gc.collect()  # Calling garbage collector, in an attempt to clean up memory..
def _neg_log10_pvals_(pvals):
    """Return a list holding ``-log10(p)`` for each p-value in *pvals*."""
    return [-math.log10(pval) for pval in pvals]


def _plotKW_():
    """
    Analyze how population structure affects KW.

    Loads the SNP and phenotype files from hard-coded paths, takes SNPs
    200..1399 of ``snpsds[3]`` and overlays four -log10(p-value) series on
    one pylab figure:

    * "Emma (local)"     - runEmma with a kinship computed from that SNP
                           window itself,
    * "Emma (global)"    - runEmma with a kinship computed from a random
                           subsample of all SNPs (each kept with
                           probability ``filterProb``),
    * "KW (full data)"   - _run_kw_ on the window,
    * "KW (merged data)" - get_KW_pvals with kinshipThreshold=0.95.

    The accession groups returned by get_KW_pvals are printed, and the
    figure is displayed with pylab.show().

    Takes no arguments and returns nothing.
    """
    filterProb = 0.1  # probability with which each SNP is kept for the global kinship
    p_i = 1  # phenotype index handed to every phenotype-related call below

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    # Replace every dataset by its getSnpsData() form and pool all SNPs.
    totalSNPs = []
    for idx, raw_snpsd in enumerate(snpsds):
        snpsds[idx] = raw_snpsd.getSnpsData()
        totalSNPs += snpsds[idx].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # Every series below is computed on the same window of one dataset, so the
    # slices are taken once instead of being rebuilt for each call.
    snpsd = snpsds[3]
    window_snps = snpsd.snps[200:1400]
    window_positions = snpsd.positions[200:1400]

    # Emma with a kinship estimated from the window's own SNPs.
    kinship = calcKinship(window_snps)
    res = runEmma(phed, p_i, kinship, window_snps)
    pylab.plot(window_positions, _neg_log10_pvals_(res["ps"]), "c.", label="Emma (local)")

    # Emma with a kinship estimated from the subsampled pool of all SNPs.
    kinship = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, kinship, window_snps)
    pylab.plot(window_positions, _neg_log10_pvals_(res["ps"]), "g.", label="Emma (global)")

    # KW on the unmodified data.
    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(window_snps, phenVals)
    pylab.plot(window_positions, _neg_log10_pvals_(pvals), "r.", label="KW (full data)")

    # KW via get_KW_pvals, which also returns its own positions and the
    # accession groups it formed.
    (pvals, new_positions, acc_groups) = get_KW_pvals(
        window_snps, window_positions, phed, p_i, kinshipThreshold=0.95, method="KW"
    )
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()
    for group_idx, acc_group in enumerate(acc_groups):
        # Report each group by the first element of its accessions' map entries.
        acc_list = [ecot_map[int(snpsd.accessions[a_i])][0] for a_i in acc_group]
        print("group %s : %s" % (group_idx, acc_list))
    pylab.plot(new_positions, _neg_log10_pvals_(pvals), "b.", label="KW (merged data)")

    # NOTE(review): `handlelen` looks like the legacy matplotlib spelling of
    # `handlelength` - confirm against the installed matplotlib version.
    pylab.legend(numpoints=2, handlelen=0.005)
    pylab.show()
def _neg_log10_pvals_(pvals):
    """Return a list holding ``-log10(p)`` for each p-value in *pvals*."""
    return [-math.log10(pval) for pval in pvals]


def _plotKW_():
    """
    Analyze how population structure affects KW.

    Loads the SNP and phenotype files from hard-coded paths, takes SNPs
    200..1399 of ``snpsds[3]`` and overlays four -log10(p-value) series on
    one pylab figure:

    * "Emma (local)"     - runEmma with a kinship computed from that SNP
                           window itself,
    * "Emma (global)"    - runEmma with a kinship computed from a random
                           subsample of all SNPs (each kept with
                           probability ``filterProb``),
    * "KW (full data)"   - _run_kw_ on the window,
    * "KW (merged data)" - get_KW_pvals with kinshipThreshold=0.95.

    The accession groups returned by get_KW_pvals are printed, and the
    figure is displayed with pylab.show().

    Takes no arguments and returns nothing.
    """
    filterProb = 0.1  # probability with which each SNP is kept for the global kinship
    p_i = 1  # phenotype index handed to every phenotype-related call below

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    # Replace every dataset by its getSnpsData() form and pool all SNPs.
    totalSNPs = []
    for idx, raw_snpsd in enumerate(snpsds):
        snpsds[idx] = raw_snpsd.getSnpsData()
        totalSNPs += snpsds[idx].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # Every series below is computed on the same window of one dataset, so the
    # slices are taken once instead of being rebuilt for each call.
    snpsd = snpsds[3]
    window_snps = snpsd.snps[200:1400]
    window_positions = snpsd.positions[200:1400]

    # Emma with a kinship estimated from the window's own SNPs.
    kinship = calcKinship(window_snps)
    res = runEmma(phed, p_i, kinship, window_snps)
    pylab.plot(window_positions, _neg_log10_pvals_(res["ps"]), "c.", label="Emma (local)")

    # Emma with a kinship estimated from the subsampled pool of all SNPs.
    kinship = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, kinship, window_snps)
    pylab.plot(window_positions, _neg_log10_pvals_(res["ps"]), "g.", label="Emma (global)")

    # KW on the unmodified data.
    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(window_snps, phenVals)
    pylab.plot(window_positions, _neg_log10_pvals_(pvals), "r.", label="KW (full data)")

    # KW via get_KW_pvals, which also returns its own positions and the
    # accession groups it formed.
    (pvals, new_positions, acc_groups) = get_KW_pvals(
        window_snps, window_positions, phed, p_i, kinshipThreshold=0.95, method="KW"
    )
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()
    for group_idx, acc_group in enumerate(acc_groups):
        # Report each group by the first element of its accessions' map entries.
        acc_list = [ecot_map[int(snpsd.accessions[a_i])][0] for a_i in acc_group]
        print("group %s : %s" % (group_idx, acc_list))
    pylab.plot(new_positions, _neg_log10_pvals_(pvals), "b.", label="KW (merged data)")

    # NOTE(review): `handlelen` looks like the legacy matplotlib spelling of
    # `handlelength` - confirm against the installed matplotlib version.
    pylab.legend(numpoints=2, handlelen=0.005)
    pylab.show()
def _plotKinshipDiffs_():
    """
    Plot the getEmmaDiffs output along one chromosome for two window sizes.

    Loads the SNP and phenotype files from hard-coded paths, filters every
    SNP dataset with filterMinMAF(0.1) and filterMonoMorphicSnps(), and
    computes a global kinship from a random subsample of all SNPs (each SNP
    is kept with probability ``filterProb``).  For ``snpsds[4]`` only, the
    result of getEmmaDiffs is drawn for windowSize=300000 (red) and
    windowSize=500000 (blue) on one figure, which is saved as a PDF under
    ``res_dir``.

    Takes no arguments and returns nothing.
    """
    filterProb = 0.2  # probability with which each SNP is kept for the global kinship
    p_i = 1  # phenotype index handed to every phenotype-related call below
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    # Replace every dataset by its getSnpsData() form and pool all SNPs.
    totalSNPs = []
    for idx, raw_snpsd in enumerate(snpsds):
        snpsds[idx] = raw_snpsd.getSnpsData()
        totalSNPs += snpsds[idx].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]

    print("Calculating the global kinship...")
    globalKinship = calcKinship(totalSNPs)
    print("done.")
    # NOTE: the mean-normalised copy of the global kinship was only consumed by
    # a disabled getKinshipDiffs plot, so it is no longer computed here.
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # Only index 4 is processed; a commented-out `len(snpsds)` upper bound was
    # left next to this range in the original code.
    for i in range(4, 5):
        chrom = i + 1  # 1-based number used in the title and file name
        snpsd = snpsds[i]

        pylab.figure(figsize=(18, 4))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000)
        pylab.plot(binPos, emmaDiffs, "r", label='ws$=300000$')
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000)
        pylab.plot(binPos, emmaDiffs, "b", label='ws$=500000$')

        # A single title for the shared axes: an earlier title call was always
        # overwritten by this one before the figure was saved.
        pylab.title("Emma avg. p-value diff. on chr. " + str(chrom))
        # NOTE(review): `handlelen` looks like the legacy matplotlib spelling of
        # `handlelength` - confirm against the installed matplotlib version.
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chrom) + ".pdf", format="pdf")
        pylab.clf()
        gc.collect()  # Calling garbage collector, in an attempt to clean up memory..