예제 #1
0
def _plotKinshipDiffs_():
    """
    Plot EMMA p-value difference curves for two window sizes and save a PDF.

    Steps, all driven by the hard-coded settings at the top of the body:

    1. Parse the SNP CSV file and the phenotype file, then coordinate them
       for phenotype index ``p_i``.
    2. Call ``filterMinMAF(0.1)`` and ``filterMonoMorphicSnps()`` on every
       entry of the parsed SNP data.
    3. Pool the SNPs of all entries, keep each one with probability
       ``filterProb``, and compute a global kinship from that subsample.
    4. For every index in the plotting range (currently only index 4,
       labelled chromosome 5), overlay the ``getEmmaDiffs`` curves obtained
       with ``windowSize=300000`` and ``windowSize=500000`` and write the
       figure to ``res_dir``.

    Takes no arguments and returns nothing; its results are the printed
    progress messages and the saved PDF file(s).
    """
    filterProb = 0.2  # probability with which each pooled SNP is kept
    p_i = 1  # phenotype index handed to the project helpers
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    # Replace every parsed entry by its getSnpsData() form and pool the SNPs.
    totalSNPs = []
    for i, parsed in enumerate(snpsds):
        snpsds[i] = parsed.getSnpsData()
        totalSNPs += snpsds[i].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]

    print("Calculating the global kinship...")
    globalKinship = calcKinship(totalSNPs)
    print("done.")
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # A disabled variant of this routine plotted getKinshipDiffs() against
    # globalKinship / mean(globalKinship). That normalised matrix was still
    # being computed although nothing used it, so the computation was dropped
    # together with the commented-out plotting code.

    # (window size, line style, legend label) for each curve in the figure.
    windowSettings = (
        (300000, "r", "ws$=300000$"),
        (500000, "b", "ws$=500000$"),
    )
    outputPrefix = res_dir + runId + "EmmaPvalDiffs_500_300kb_chr"

    for i in range(4, 5):  # widen to range(len(snpsds)) to plot every entry
        chrom = i + 1  # 1-based label; avoids shadowing the builtin chr()
        snpsd = snpsds[i]

        pylab.figure(figsize=(18, 4))
        for windowSize, lineStyle, legendLabel in windowSettings:
            (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=windowSize)
            pylab.plot(binPos, emmaDiffs, lineStyle, label=legendLabel)
        # Only one title survives on the axes, so it is set once.
        pylab.title("Emma avg. p-value diff. on chr. " + str(chrom))
        # NOTE(review): `handlelen` is the legacy legend keyword; newer
        # matplotlib releases document `handlelength` instead - confirm
        # against the installed version before upgrading.
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(outputPrefix + str(chrom) + ".pdf", format="pdf")
        # Close (not just clear) the figure so figures do not pile up when
        # the loop covers more than one entry.
        pylab.close()
        gc.collect()  # Calling garbage collector, in an attempt to clean up memory..
예제 #2
0
def _plotKW_():
    """
    Analyze how population structure affects KW.

    Parses the hard-coded SNP and phenotype files, coordinates them for
    phenotype index ``p_i`` and then, for SNPs 200..1399 of ``snpsds[3]``,
    plots -log10(p-value) against position for four analyses:

    * ``runEmma`` with a kinship computed from that same SNP slice
      ("Emma (local)"),
    * ``runEmma`` with a kinship computed from a random subsample of the
      SNPs pooled over all entries ("Emma (global)"),
    * ``_run_kw_`` on the phenotype values ("KW (full data)"),
    * ``get_KW_pvals`` with ``kinshipThreshold=0.95`` ("KW (merged data)").

    The accession groups returned by ``get_KW_pvals`` are printed, with each
    accession translated through the ecotype-id lookup from
    ``phenotypeData``. The figure is finally shown with ``pylab.show()``.

    Takes no arguments and returns nothing.
    """
    filterProb = 0.1  # probability with which each pooled SNP is kept
    p_i = 1  # phenotype index handed to the project helpers

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    # Replace every parsed entry by its getSnpsData() form and pool the SNPs.
    totalSNPs = []
    for i, parsed in enumerate(snpsds):
        snpsds[i] = parsed.getSnpsData()
        totalSNPs += snpsds[i].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]

    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    snpsd = snpsds[3]  # presumably the fourth chromosome - TODO confirm

    def plotNegLogPvals(positions, pvals, style, label):
        # Shared by all four series: convert p-values to -log10 and plot them.
        pylab.plot(positions, [-math.log10(pval) for pval in pvals], style, label=label)

    # The slices are re-taken for every call on purpose: the helpers then
    # never share one list object, exactly as in the original call pattern.

    # EMMA with a kinship estimated from the analysed SNPs themselves.
    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    plotNegLogPvals(snpsd.positions[200:1400], res["ps"], "c.", "Emma (local)")

    # EMMA with a kinship estimated from the pooled random subsample.
    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    plotNegLogPvals(snpsd.positions[200:1400], res["ps"], "g.", "Emma (global)")

    # KW on the same SNP slice.
    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    plotNegLogPvals(snpsd.positions[200:1400], pvals, "r.", "KW (full data)")

    # KW via get_KW_pvals, which also reports accession groups and positions.
    (pvals, new_positions, acc_groups) = get_KW_pvals(
        snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i, kinshipThreshold=0.95, method="KW"
    )
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for i in range(len(acc_groups)):
        acc_list = [ecot_map[int(snpsd.accessions[a_i])][0] for a_i in acc_groups[i]]
        # One pre-formatted string yields the same text as the former
        # multi-argument print statement and also works on Python 3.
        print("group %s : %s" % (i, acc_list))

    plotNegLogPvals(new_positions, pvals, "b.", "KW (merged data)")

    # NOTE(review): `handlelen` is the legacy legend keyword; newer matplotlib
    # releases document `handlelength` instead - confirm against the
    # installed version before upgrading.
    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()
def _plotKW_():
    """
    Analyze how population structure affects KW.

    Parses the hard-coded SNP and phenotype files, coordinates them for
    phenotype index ``p_i`` and then, for SNPs 200..1399 of ``snpsds[3]``,
    plots -log10(p-value) against position for four analyses:

    * ``runEmma`` with a kinship computed from that same SNP slice
      ("Emma (local)"),
    * ``runEmma`` with a kinship computed from a random subsample of the
      SNPs pooled over all entries ("Emma (global)"),
    * ``_run_kw_`` on the phenotype values ("KW (full data)"),
    * ``get_KW_pvals`` with ``kinshipThreshold=0.95`` ("KW (merged data)").

    The accession groups returned by ``get_KW_pvals`` are printed, with each
    accession translated through the ecotype-id lookup from
    ``phenotypeData``. The figure is finally shown with ``pylab.show()``.

    Takes no arguments and returns nothing.
    """
    filterProb = 0.1  # probability with which each pooled SNP is kept
    p_i = 1  # phenotype index handed to the project helpers

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    # Replace every parsed entry by its getSnpsData() form and pool the SNPs.
    totalSNPs = []
    for i, parsed in enumerate(snpsds):
        snpsds[i] = parsed.getSnpsData()
        totalSNPs += snpsds[i].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]

    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    snpsd = snpsds[3]  # presumably the fourth chromosome - TODO confirm

    def plotNegLogPvals(positions, pvals, style, label):
        # Shared by all four series: convert p-values to -log10 and plot them.
        pylab.plot(positions,
                   [-math.log10(pval) for pval in pvals],
                   style,
                   label=label)

    # The slices are re-taken for every call on purpose: the helpers then
    # never share one list object, exactly as in the original call pattern.

    # EMMA with a kinship estimated from the analysed SNPs themselves.
    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    plotNegLogPvals(snpsd.positions[200:1400], res["ps"], "c.",
                    "Emma (local)")

    # EMMA with a kinship estimated from the pooled random subsample.
    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    plotNegLogPvals(snpsd.positions[200:1400], res["ps"], "g.",
                    "Emma (global)")

    # KW on the same SNP slice.
    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    plotNegLogPvals(snpsd.positions[200:1400], pvals, "r.", "KW (full data)")

    # KW via get_KW_pvals, which also reports accession groups and positions.
    (pvals, new_positions,
     acc_groups) = get_KW_pvals(snpsd.snps[200:1400],
                                snpsd.positions[200:1400],
                                phed,
                                p_i,
                                kinshipThreshold=0.95,
                                method="KW")
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for i in range(len(acc_groups)):
        acc_list = [
            ecot_map[int(snpsd.accessions[a_i])][0] for a_i in acc_groups[i]
        ]
        # One pre-formatted string yields the same text as the former
        # multi-argument print statement and also works on Python 3.
        print("group %s : %s" % (i, acc_list))

    plotNegLogPvals(new_positions, pvals, "b.", "KW (merged data)")

    # NOTE(review): `handlelen` is the legacy legend keyword; newer matplotlib
    # releases document `handlelength` instead - confirm against the
    # installed version before upgrading.
    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()
def _plotKinshipDiffs_():
    """
    Plot EMMA p-value difference curves for two window sizes and save a PDF.

    Steps, all driven by the hard-coded settings at the top of the body:

    1. Parse the SNP CSV file and the phenotype file, then coordinate them
       for phenotype index ``p_i``.
    2. Call ``filterMinMAF(0.1)`` and ``filterMonoMorphicSnps()`` on every
       entry of the parsed SNP data.
    3. Pool the SNPs of all entries, keep each one with probability
       ``filterProb``, and compute a global kinship from that subsample.
    4. For every index in the plotting range (currently only index 4,
       labelled chromosome 5), overlay the ``getEmmaDiffs`` curves obtained
       with ``windowSize=300000`` and ``windowSize=500000`` and write the
       figure to ``res_dir``.

    Takes no arguments and returns nothing; its results are the printed
    progress messages and the saved PDF file(s).
    """
    filterProb = 0.2  # probability with which each pooled SNP is kept
    p_i = 1  # phenotype index handed to the project helpers
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    # Replace every parsed entry by its getSnpsData() form and pool the SNPs.
    totalSNPs = []
    for i, parsed in enumerate(snpsds):
        snpsds[i] = parsed.getSnpsData()
        totalSNPs += snpsds[i].snps

    # For memory, keep only a random subset of the pooled SNPs.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]

    print("Calculating the global kinship...")
    globalKinship = calcKinship(totalSNPs)
    print("done.")
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # A disabled variant of this routine plotted getKinshipDiffs() against
    # globalKinship / mean(globalKinship). That normalised matrix was still
    # being computed although nothing used it, so the computation was dropped
    # together with the commented-out plotting code.

    # (window size, line style, legend label) for each curve in the figure.
    windowSettings = (
        (300000, "r", 'ws$=300000$'),
        (500000, "b", 'ws$=500000$'),
    )
    outputPrefix = res_dir + runId + "EmmaPvalDiffs_500_300kb_chr"

    for i in range(4, 5):  # widen to range(len(snpsds)) to plot every entry
        chrom = i + 1  # 1-based label; avoids shadowing the builtin chr()
        snpsd = snpsds[i]

        pylab.figure(figsize=(18, 4))
        for windowSize, lineStyle, legendLabel in windowSettings:
            (emmaDiffs, binPos) = getEmmaDiffs(snpsd,
                                               phed,
                                               p_i,
                                               globalKinship,
                                               windowSize=windowSize)
            pylab.plot(binPos, emmaDiffs, lineStyle, label=legendLabel)
        # Only one title survives on the axes, so it is set once.
        pylab.title("Emma avg. p-value diff. on chr. " + str(chrom))
        # NOTE(review): `handlelen` is the legacy legend keyword; newer
        # matplotlib releases document `handlelength` instead - confirm
        # against the installed version before upgrading.
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(outputPrefix + str(chrom) + ".pdf", format="pdf")
        # Close (not just clear) the figure so figures do not pile up when
        # the loop covers more than one entry.
        pylab.close()
        gc.collect()  # Calling garbage collector, in an attempt to clean up memory..