def getKinshipMatrix(snpsDataFile="/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv"):
    """
    Print the lower triangle (diagonal included) of a kinship matrix.

    The SNP file is parsed with dataParsers.parseCSVData, the SNPs of every
    parsed data set are pooled, and the pooled list is handed to calcKinship.
    One line is then printed per accession:

        <ecotype id>, <info[0]>, <info[1]>:<K[i][0]>, ..., <K[i][i]>

    where info is the entry for that ecotype id in the dictionary returned by
    phenotypeData._getEcotypeIdToStockParentDict_().

    Args:
        snpsDataFile: Path of the comma-separated SNP file to load.  The
            default is the path that used to be hard-coded here.  An
            alternative noted by the original author is
            /Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv

    Raises:
        ValueError: If the parser returns no data sets.  The original code
            failed later with a NameError in that case.
        KeyError: If an accession id is missing from the ecotype dictionary
            (assuming that object is a plain dict - TODO confirm).

    NOTE(review): this file defines getKinshipMatrix more than once; the
    definition that appears last is the one that stays bound.
    """
    import dataParsers
    import snpsdata  # not used below; kept because the original imported it here

    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    if not snpsds:
        raise ValueError("No SNP data sets were parsed from " + str(snpsDataFile))

    # Pool the SNPs of all data sets, printing one progress dot per data set.
    snps = []
    sys.stdout.write("Converting format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snps += snpsd.getSnpsData(missingVal="NA").snps
    print("")

    # Optional subsampling, disabled by the original author:
    # snps = _sampleSNPs_(snps, 100)
    print("Calculating kinship")
    K = calcKinship(snps)

    eDict = phenotypeData._getEcotypeIdToStockParentDict_()
    # The accession list is taken from the last data set, which is what the
    # original read through the loop variable left over after the loop.
    accessions = [int(acc) for acc in snpsds[-1].accessions]

    for i, et in enumerate(accessions):
        info = eDict[et]
        header = str(et) + ", " + str(info[0]) + ", " + str(info[1]) + ":"
        # Row i only goes up to column i, i.e. the lower triangle.
        row = ", ".join([str(K[i][j]) for j in range(i + 1)])
        print(header + row)
def getKinshipMatrix(snpsDataFile="/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv"):
    """
    Print the lower triangle (diagonal included) of a kinship matrix.

    The SNP file is parsed with dataParsers.parseCSVData, the SNPs of every
    parsed data set are pooled, and the pooled list is handed to calcKinship.
    One line is then printed per accession:

        <ecotype id>, <info[0]>, <info[1]>:<K[i][0]>, ..., <K[i][i]>

    where info is the entry for that ecotype id in the dictionary returned by
    phenotypeData._getEcotypeIdToStockParentDict_().

    Args:
        snpsDataFile: Path of the comma-separated SNP file to load.  The
            default is the path that used to be hard-coded here.  An
            alternative noted by the original author is
            /Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv

    Raises:
        ValueError: If the parser returns no data sets.  The original code
            failed later with a NameError in that case.
        KeyError: If an accession id is missing from the ecotype dictionary
            (assuming that object is a plain dict - TODO confirm).

    NOTE(review): this file defines getKinshipMatrix more than once; the
    definition that appears last is the one that stays bound.
    """
    import dataParsers
    import snpsdata  # not used below; kept because the original imported it here

    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    if not snpsds:
        raise ValueError("No SNP data sets were parsed from " + str(snpsDataFile))

    # Pool the SNPs of all data sets, printing one progress dot per data set.
    snps = []
    sys.stdout.write("Converting format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snps += snpsd.getSnpsData(missingVal="NA").snps
    print("")

    # Optional subsampling, disabled by the original author:
    # snps = _sampleSNPs_(snps, 100)
    print("Calculating kinship")
    K = calcKinship(snps)

    eDict = phenotypeData._getEcotypeIdToStockParentDict_()
    # The accession list is taken from the last data set, which is what the
    # original read through the loop variable left over after the loop.
    accessions = [int(acc) for acc in snpsds[-1].accessions]

    for i, et in enumerate(accessions):
        info = eDict[et]
        header = str(et) + ", " + str(info[0]) + ", " + str(info[1]) + ":"
        # Row i only goes up to column i, i.e. the lower triangle.
        row = ", ".join([str(K[i][j]) for j in range(i + 1)])
        print(header + row)
def plot_250k_Tree(chr=None, startPos=None, endPos=None):
    """
    Load a hard-coded 250K SNP file, subsample its SNPs and build one label
    per accession.

    The SNPs of all parsed data sets are pooled and passed through
    sampleSNPs(snps, 100000, False).  Each accession of the first data set is
    then labelled with element 0 of its entry in the dictionary returned by
    phenotypeData._getEcotypeIdToStockParentDict_(), decoded as iso-8859-1;
    if that fails, the error is printed and the raw accession is used.

    The chr, startPos and endPos arguments are not used by the code below.

    NOTE(review): nothing is plotted or returned here, and several of the
    local imports are never used - the function looks unfinished; confirm.
    """
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    import Emma
    import pylab
    import phenotypeData

    ecotype_info = phenotypeData._getEcotypeIdToStockParentDict_()
    data_sets = dataParsers.parseCSVData(
        "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")

    # Pool the SNPs of every data set, then subsample the pooled list.
    pooled_snps = []
    for data_set in data_sets:
        pooled_snps.extend(data_set.getSnpsData().snps)
    pooled_snps = sampleSNPs(pooled_snps, 100000, False)

    labels = []
    for accession in data_sets[0].accessions:
        try:
            label = unicode(ecotype_info[int(accession)][0], 'iso-8859-1')
        except Exception as error:
            # Report the problem and fall back to the raw accession value.
            print(error)
            print(ecotype_info[int(accession)][0])
            label = accession
        labels.append(label)
def _plotKW_():
    """
    Analyze how population structure affects KW.

    For SNPs 200..1399 of the data set at index 3, four series of -log10
    p-values are plotted against SNP position:

      * Emma with a kinship matrix computed from those same SNPs ("local"),
      * Emma with a kinship matrix computed from a random subsample of all
        SNPs, each SNP being kept with probability filterProb ("global"),
      * _run_kw_ on the phenotype values ("KW (full data)"),
      * get_KW_pvals with kinshipThreshold=0.95 ("KW (merged data)").

    The accession groups returned by get_KW_pvals are printed, and the figure
    is shown with pylab.show().  Takes no arguments and returns nothing.

    NOTE(review): this file defines _plotKW_ more than once; the definition
    that appears last is the one that stays bound.
    """
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"  # not used below
    runId = "_full_quick_"  # not used below

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"

    def neg_log10(values):
        # -log10 of every p-value, in the order given.
        return [-math.log10(value) for value in values]

    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    # Convert every data set in place and pool all of their SNPs.
    totalSNPs = []
    for idx, raw_data in enumerate(snpsds):
        converted = raw_data.getSnpsData()
        snpsds[idx] = converted
        totalSNPs += converted.snps

    # For memory, remove random SNPs
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # Only the data set at index 3 is analysed; a loop over all data sets is
    # commented out in the original.
    snpsd = snpsds[3]

    # Emma, kinship from the plotted SNPs only.
    # runEmma signature per the original comment: runEmma(phed, p_i, k, snps)
    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    log_pvals = neg_log10(res["ps"])
    pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)")

    # Emma, kinship from the random subsample of all SNPs.
    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    log_pvals = neg_log10(res["ps"])
    pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)")

    # KW on the phenotype values of all accessions.
    phenVals = phed.getPhenVals(p_i)
    log_pvals = neg_log10(_run_kw_(snpsd.snps[200:1400], phenVals))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)")

    # KW after get_KW_pvals has grouped accessions.
    (pvals, new_positions, acc_groups) = get_KW_pvals(
        snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i,
        kinshipThreshold=0.95, method="KW"
    )
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()
    for group_idx, group in enumerate(acc_groups):
        acc_list = [ecot_map[int(snpsd.accessions[a_i])][0] for a_i in group]
        print(" ".join(["group", str(group_idx), ":", str(acc_list)]))

    log_pvals = neg_log10(pvals)
    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")
    pylab.legend(numpoints=2, handlelen=0.005)
    pylab.show()
def _plotKW_():
    """
    Analyze how population structure affects KW.

    For SNPs 200..1399 of the data set at index 3, four series of -log10
    p-values are plotted against SNP position:

      * Emma with a kinship matrix computed from those same SNPs ("local"),
      * Emma with a kinship matrix computed from a random subsample of all
        SNPs, each SNP being kept with probability filterProb ("global"),
      * _run_kw_ on the phenotype values ("KW (full data)"),
      * get_KW_pvals with kinshipThreshold=0.95 ("KW (merged data)").

    The accession groups returned by get_KW_pvals are printed, and the figure
    is shown with pylab.show().  Takes no arguments and returns nothing.

    NOTE(review): this file defines _plotKW_ more than once; the definition
    that appears last is the one that stays bound.
    """
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"  # not used below
    runId = "_full_quick_"  # not used below

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"

    def to_neg_log10(p_values):
        # -log10 of every p-value, in the order given.
        return [-math.log10(p_value) for p_value in p_values]

    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    # Convert every data set in place and pool all of their SNPs.
    totalSNPs = []
    for position, raw_data in enumerate(snpsds):
        snps_data = raw_data.getSnpsData()
        snpsds[position] = snps_data
        totalSNPs += snps_data.snps

    #For memory, remove random SNPs
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]
    gc.collect()  #Calling garbage collector, in an attempt to clean up memory..

    # Only the data set at index 3 is analysed; a loop over all data sets is
    # commented out in the original.
    snpsd = snpsds[3]

    # Emma, kinship from the plotted SNPs only.
    # runEmma signature per the original comment: runEmma(phed,p_i,k,snps)
    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    log_pvals = to_neg_log10(res["ps"])
    pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)")

    # Emma, kinship from the random subsample of all SNPs.
    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    log_pvals = to_neg_log10(res["ps"])
    pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)")

    # KW on the phenotype values of all accessions.
    phenVals = phed.getPhenVals(p_i)
    log_pvals = to_neg_log10(_run_kw_(snpsd.snps[200:1400], phenVals))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)")

    # KW after get_KW_pvals has grouped accessions.
    (pvals, new_positions, acc_groups) = get_KW_pvals(
        snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i,
        kinshipThreshold=0.95, method="KW")
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()
    for group_no, members in enumerate(acc_groups):
        acc_list = [ecot_map[int(snpsd.accessions[a_i])][0] for a_i in members]
        print(" ".join(["group", str(group_no), ":", str(acc_list)]))

    log_pvals = to_neg_log10(pvals)
    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")
    pylab.legend(numpoints=2, handlelen=0.005)
    pylab.show()