Пример #1
0
def plot_flc_haplotype():
    import analyzeHaplotype as ah
    #res = readFastaFile("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/FLC_muscle_072109.aln")
    #seqs = res["sequences"]
    #seq_names = res["names"]
    (positions, aln_snps, seq_names) = getSNPsFromSequences(
        "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/FLC_muscle_072109.aln",
        ref_start_pos=3170001,
        ref_seq="cut_3184000_3170000",
        reversed=True)
    #aln_snps = map(list,zip(*seqs))
    #seqs = reverse_sequences(seqs)
    i = seq_names.index("cut_3184000_3170000")
    seq_names[i] = "Col_TAIR8"
    (flc_250k_snps, flc_snps, flc_250K_positions, accessions,
     flc_data_acc_map) = get_overlapping_snps_in_region()
    print flc_data_acc_map
    import phenotypeData as pd
    a_dict = pd._getEcotypeIdInfoDict_()
    new_accessions = []
    for acc in accessions:
        new_accessions.append(unicode(a_dict[int(acc)][0], 'iso-8859-1'))
    accessions = new_accessions
    ah.plot_flc_haplotypes(
        aln_snps,
        positions=positions,
        accessions=seq_names,
        haplotypeFile="/Users/bjarnivilhjalmsson/tmp/aln_haplotype.pdf",
        treeFile="/Users/bjarnivilhjalmsson/tmp/aln_tree.pdf",
        acc_250k=flc_data_acc_map,
        flc_250K_positions=flc_250K_positions)
Пример #2
0
def _get192Ecotypes_():
	resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4]
	
	
	total_accessions = set()
	for p_i in phenotypeIndices:
		if not p_i in [5,6,7]:
			accessions = phed.getAccessionsWithValues(p_i)
			total_accessions = total_accessions.union(accessions)

	ecotypes_192 = phenotypeData._getFirst192Ecotypes_()
	ecotypes_192 = [str(e) for e in ecotypes_192]
	print "len(ecotypes_192):",len(ecotypes_192)
	#print ecotypes_192
	phed.filterAccessions(ecotypes_192)

        for p_i in [5,6,7]:
		accessions = phed.getAccessionsWithValues(p_i)
		total_accessions = total_accessions.union(accessions)
		
	total_accessions = list(total_accessions)
	print len(total_accessions)
	total_accessions.sort()
	print total_accessions
	
	ecotype_info_dict = phenotypeData._getEcotypeIdInfoDict_()
	ets = []
	
	i = 0
	for et in total_accessions:
		et = int(et)
		if ecotype_info_dict.has_key(et):
			print str(et)+", "+str(ecotype_info_dict[et][0])+", "+str(ecotype_info_dict[et][1])
			i += 1
			ets.append(et)
		else:
			print et,"is missing in genotype data."
	print i
	return ets
Пример #3
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "rFile=",
        "chr=",
        "delim=",
        "missingval=",
        "BoundaryStart=",
        "removeOutliers=",
        "addConstant=",
        "logTransform",
        "BoundaryEnd=",
        "phenotypeFileType=",
        "help",
        "parallel=",
        "parallelAll",
        "LRT",
        "minMAF=",
        "kinshipDatafile=",
        "phenotypeRanks",
        "onlyMissing",
        "onlyOriginal96",
        "onlyOriginal192",
        "onlyBelowLatidue=",
        "complement",
        "negate",
        "srInput=",
        "sr",
        "srOutput=",
        "srPar=",
        "srSkipFirstRun",
        "testRobustness",
        "permutationFilter=",
        "useLinearRegress",
        "regressionCofactors=",
        "FriLerAsCofactor",
        "FriColAsCofactor",
        "memReq=",
        "walltimeReq=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    minMAF = 0.0
    boundaries = [-1, -1]
    chr = None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False

    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000

    testRobustness = False
    permutationFilter = 0.002

    useLinearRegress = False
    regressionCofactors = None
    FriLerAsCofactor = False
    FriColAsCofactor = False

    memReq = "5g"
    walltimeReq = "150:00:00"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--addConstant"):
            addConstant = float(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--minMAF"):
            minMAF = float(arg)
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--onlyMissing"):
            onlyMissing = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--negate"):
            negate = True
        elif opt in ("--removeOutliers"):
            removeOutliers = float(arg)
        elif opt in ("--LRT"):
            lrt = True
        elif opt in ("-c", "--chr"):
            chr = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--kinshipDatafile"):
            kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"):
            phenotypeRanks = True
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--FriLerAsCofactor"):
            FriLerAsCofactor = True
        elif opt in ("--FriColAsCofactor"):
            FriColAsCofactor = True
        elif opt in ("--useLinearRegress"):
            useLinearRegress = True
        elif opt in ("--regressionCofactors"):
            regressionCofactors = arg
        elif opt in ("--memReq"):
            memReq = arg
        elif opt in ("--walltimeReq"):
            walltimeReq = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    print "Emma is being set up with the following parameters:"
    print "output:", rFile
    print "phenotypeRanks:", phenotypeRanks
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "minMAF:", minMAF
    print "LRT:", lrt
    print "delim:", delim
    print "missingval:", missingVal
    print "kinshipDatafile:", kinshipDatafile
    print "chr:", chr
    print "boundaries:", boundaries
    print "onlyMissing:", onlyMissing
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "complement:", complement
    print "negate:", negate
    print "logTransform:", logTransform
    print "addConstant:", addConstant
    print "removeOutliers:", removeOutliers
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "testRobustness:", testRobustness
    print "permutationFilter:", permutationFilter
    print "useLinearRegress:", useLinearRegress
    print "regressionCofactors:", regressionCofactors
    print "FriLerAsCofactor:", FriLerAsCofactor
    print "FriColAsCofactor:", FriColAsCofactor
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, phed):
        #Cluster specific parameters
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir + "Emma_" + parallel + "_" + phenName

        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"

        shstr += "#PBS -N E" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        if useLinearRegress:
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " "
        if useLinearRegress:
            shstr += " --useLinearRegress "

        if regressionCofactors:
            shstr += " --regressionCofactors=" + str(regressionCofactors) + " "
        if FriLerAsCofactor:
            shstr += " --FriLerAsCofactor "
        if FriColAsCofactor:
            shstr += " --FriColAsCofactor "
        if onlyOriginal96:
            shstr += " --onlyOriginal96 "
        elif onlyOriginal192:
            shstr += " --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        if logTransform:
            shstr += " --logTransform "
        if negate:
            shstr += " --negate "
        if removeOutliers:
            shstr += " --removeOutliers=" + str(removeOutliers) + " "
        if phenotypeRanks:
            shstr += " --phenotypeRanks "
        if testRobustness:
            shstr += " --testRobustness "

        shstr += " --permutationFilter=" + str(permutationFilter) + " "

        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
            shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(
                srWindowSize) + " "

        if kinshipDatafile:
            shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " "
        shstr += " --addConstant=" + str(addConstant) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)

                    except Exception:
                        print "File", pvalFile, "does not exist."
                    if res and res.st_size > 0:
                        print "File", pvalFile, "already exists, and is non-empty."
                        if sr:
                            srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File", srInput, "does not exist."
                            if srRes and srRes.st_size > 0:
                                print "File", srInput, "already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex, phed)

                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex, phed)

                else:
                    runParallel(phenotypeIndex, phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex, phed)
        return
    else:
        phenotypeIndex = int(args[2])

    print "phenotypeIndex:", phenotypeIndex
    print "\nStarting program now!\n"

    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)

    #Removing outliers
    if removeOutliers:
        print "Remoing outliers"
        phed.naOutliers(phenotypeIndex, removeOutliers)

    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][
                    2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))

        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    phenotype = phed.getPhenIndex(phenotypeIndex)

    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove .
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    print "\nFiltering accessions in genotype data:"
    #Filter accessions which do not have the phenotype value (from the genotype data).
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(
        accIndicesToKeep
    ), "accessions removed from genotype data, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "\nNow filtering accessions in phenotype data:"
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len(
        phed.accessions) == len(snpsds[0].accessions)
    if len(phed.accessions) != len(snpsds[0].accessions):
        raise Exception

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Removing SNPs which are outside of boundaries.
    if chr:
        print "\nRemoving SNPs which are outside of boundaries."
        snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1])
        snpsds = [snpsds[chr - 1]]

    #Ordering accessions in genotype data to fit phenotype data.
    print "Ordering genotype data accessions."
    accessionMapping = []
    i = 0
    for acc in phed.accessions:
        if acc in snpsds[0].accessions:
            accessionMapping.append((snpsds[0].accessions.index(acc), i))
            i += 1

    #print zip(accessionMapping,snpsds[0].accessions)
    print "len(snpsds[0].snps)", len(snpsds[0].snps)

    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "\nGenotype data has been ordered."

    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print ""

    print "Checking kinshipfile:", kinshipDatafile

    if kinshipDatafile:  #Is there a special kinship file?
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile,
                                                 format=1,
                                                 deliminator=delim,
                                                 missingVal=missingVal)

        accIndicesToKeep = []
        #Checking which accessions to keep and which to remove (genotype data).
        sys.stdout.write(
            "Removing accessions which do not have a phenotype value for " +
            phed.phenotypeNames[phenotype] + ".")
        sys.stdout.flush()
        for i in range(0, len(kinshipSnpsds[0].accessions)):
            acc1 = kinshipSnpsds[0].accessions[i]
            for j in range(0, len(phed.accessions)):
                acc2 = phed.accessions[j]
                if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                    accIndicesToKeep.append(i)
                    break
        print accIndicesToKeep

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(
            accIndicesToKeep
        ), "accessions removed from kinship genotype data, leaving", len(
            accIndicesToKeep), "accessions in all."

        print "Ordering kinship data accessions."
        accessionMapping = []
        i = 0
        for acc in snpsds[0].accessions:
            if acc in kinshipSnpsds[0].accessions:
                accessionMapping.append(
                    (kinshipSnpsds[0].accessions.index(acc), i))
                i += 1

        print zip(accessionMapping, snpsds[0].accessions)
        print "len(snpsds[0].snps)", len(snpsds[0].snps)

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print "Kinship genotype data has been ordered."

        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newKinshipSnpsds.append(snpsd.getSnpsData(
                missingVal=missingVal))  #This data might have NAs
        print ""
        kinshipSnpsds = newKinshipSnpsds

    else:
        kinshipSnpsds = newSnpsds

    print "Found kinship data."

    #Ordering accessions according to the order of accessions in the genotype file
    #	accessionMapping = []
    #	i = 0
    #	for acc in snpsds[0].accessions:
    #		if acc in phed.accessions:
    #			accessionMapping.append((phed.accessions.index(acc),i))
    #			i += 1
    #	phed.orderAccessions(accessionMapping)

    #Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)

    if logTransform and not phed.isBinary(
            phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
        addConstant = 0

    #Adding a constant.
    if addConstant != -1:
        if addConstant == 0:
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
            addConstant = addConstant - phed.getMinValue(phenotypeIndex)

        print "Adding a constant to phenotype:", addConstant
        phed.addConstant(phenotypeIndex, addConstant)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotypeIndex)
    #Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)

    if not chr:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,
                                                  [1, 2, 3, 4, 5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr])

    phenotypeName = phed.getPhenotypeName(phenotypeIndex)

    sys.stdout.flush()

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
        sys.exit(0)

    if useLinearRegress:
        phenVals = phed.getPhenVals(phenotypeIndex)
        d0 = {}
        d0["phen"] = phenVals
        dh = {}
        dh["phen"] = phenVals
        import rpy, gc
        if regressionCofactors:  #Adds ler and col as cofactors
            import pickle
            f = open(regressionCofactors, "r")
            co_factors = pickle.load(f)
            f.close()
            #inserting co factors into model
            for factor in co_factors:
                d[factor] = co_factors[factor]
        import analyzeHaplotype as ah
        (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
        if FriColAsCofactor:
            d0["col"] = col_factor
            dh["col"] = col_factor
        if FriLerAsCofactor:
            d0["ler"] = ler_factor
            dh["ler"] = ler_factor
        chr_pos_pvals = []
        stats = []
        sys.stdout.write("Applying the linear model")
        sys.stdout.flush()
        for i in range(0, len(newSnpsds)):  #[3]:#
            snpsd = newSnpsds[i]
            sys.stdout.write("|")
            sys.stdout.flush()
            gc.collect(
            )  #Calling garbage collector, in an attempt to clean up memory..
            for j in range(0, len(snpsd.snps)):
                if j % 5000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                #if snpsd.positions[j]>1700000:
                #	break
                snp = snpsd.snps[j]
                d0["snp"] = snp
                try:
                    rpy.set_default_mode(rpy.NO_CONVERSION)
                    aov0 = rpy.r.aov(r("phen ~ ."), data=d0)
                    aovh = rpy.r.aov(r("phen ~ ."), data=dh)
                    rpy.set_default_mode(rpy.BASIC_CONVERSION)
                    s0 = rpy.r.summary(aov0)
                    sh = rpy.r.summary(aovh)
                    #print s0,sh
                    rss_0 = s0['Sum Sq'][-1]
                    if type(sh['Sum Sq']) != float:
                        rss_h = sh['Sum Sq'][-1]

                    else:
                        rss_h = sh['Sum Sq']
                    f = (rss_h - rss_0) / (rss_0 /
                                           (len(phenVals) - len(d0) + 1))
                    pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False)
                except Exception, err_str:
                    print "Calculating p-value failed"  #,err_str
                    pval = 1.0
                #print "dh:",dh
                #print "d0:",d0
                #print "rss_h,rss_0:",rss_h,rss_0
                #print "f,p:",f,pval
                chr_pos_pvals.append([i + 1, snpsd.positions[j], pval])
                mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))
                maf = mafc / float(len(snp))
                stats.append([maf, mafc])
        sys.stdout.write("\n")
        #Write out to a result file
        sys.stdout.write("Writing results to file\n")
        sys.stdout.flush()
        pvalFile = rFile + ".pvals"
        f = open(pvalFile, "w")
        f.write("Chromosome,position,p-value,marf,maf\n")
        for i in range(0, len(chr_pos_pvals)):
            chr_pos_pval = chr_pos_pvals[i]
            stat = stats[i]
            f.write(
                str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," +
                str(chr_pos_pval[2]) + "," + str(stat[0]) + "," +
                str(stat[1]) + "\n")
        f.close()

        #Plot results
        print "Generating a GW plot."
        phenotypeName = phed.getPhenotypeName(phenotypeIndex)
        res = gwaResults.Result(pvalFile,
                                name="LM_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res,
                               pngFile=pngFile,
                               percentile=90,
                               type="pvals",
                               ylab="$-$log$_{10}(p)$",
                               plotBonferroni=True,
                               usePylab=False)
Пример #4
0
def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness",
					"memReq=","walltimeReq=",]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	phenotypeFileType=1
	outputFile=None
	delim=","
	missingVal="NA"
	help=0
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	memReq = "5g"
	walltimeReq = "100:00:00"

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--memReq"):
			memReq=arg
		elif opt in ("--walltimeReq"):
			walltimeReq=arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "complement:",complement
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	print "walltimeReq:",walltimeReq
	print "memReq:",memReq

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		print phenName
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = "#!/bin/csh\n"
		shstr += "#PBS -l walltime="+walltimeReq+"\n"
		shstr += "#PBS -l mem="+memReq+"\n"
		shstr +="#PBS -q cmb\n"
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		#phed and phenotype
		sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
		
		if phed.isBinary(phenotypeIndex):
			pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex))	
		else:
			snps = sd.getSnps()
			phen_vals = phed.getPhenVals(phenotypeIndex)
			try:
				kw_res = util.kruskal_wallis(snps,phen_vals)
				pvals = kw_res['ps']
			except:
				print snps
				print phen_vals
				print len(snps),len(snps[0]),len(phen_vals)
				raise Exception
							
		res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False)
		pvalFile=outputFile+".pvals"
		res.writeToFile(pvalFile)

		print "Generating a GW plot."
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Пример #5
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=",
						"logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", 
						"kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", 
						"complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness",
						"permutationFilter="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	phenotypeRanks = False
	removeOutliers = None
	addConstant = -1
	phenotypeFileType = 1
	rFile = None
	delim = ","
	missingVal = "NA"
	help = 0
	minMAF=0.0
	withArrayIds = 1
	boundaries = [-1,-1]
	chr=None
	parallel = None
	logTransform = False
	negate = False
	parallelAll = False
	lrt = False
	kinshipDatafile = None 
	onlyMissing = False
	onlyOriginal96 = False
	onlyOriginal192 = False
	onlyBelowLatidue = None
	complement = False

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	testRobustness = False
	permutationFilter = 0.002

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-o","--rFile"):
			rFile = arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType = int(arg)
		elif opt in ("--BoundaryStart"):
			boundaries[0] = int(arg)
		elif opt in ("--BoundaryEnd"):
			boundaries[1] = int(arg)
		elif opt in ("--addConstant"):
			addConstant = float(arg)
		elif opt in ("--parallel"):
			parallel = arg
		elif opt in ("--minMAF"):
			minMAF = float(arg)
		elif opt in ("--parallelAll"):
			parallelAll = True
		elif opt in ("--onlyMissing"):
			onlyMissing = True
		elif opt in ("--onlyOriginal96"):
			onlyOriginal96 = True
		elif opt in ("--onlyOriginal192"):
			onlyOriginal192 = True
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue = float(arg)
		elif opt in ("--complement"):
			complement = True
		elif opt in ("--logTransform"):
			logTransform = True
		elif opt in ("--negate"):
			negate = True
		elif opt in ("--removeOutliers"):
			removeOutliers = float(arg)
		elif opt in ("--LRT"):
			lrt = True
		elif opt in ("-c","--chr"):
			chr = int(arg)
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("--kinshipDatafile"):
			kinshipDatafile = arg
		elif opt in ("--phenotypeRanks"):
			phenotypeRanks = True
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	print "Emma is being set up with the following parameters:"
	print "output:",rFile
	print "phenotypeRanks:",phenotypeRanks
	print "withArrayId:",withArrayIds
	print "phenotypeFileType:",phenotypeFileType
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "minMAF:",minMAF
	print "LRT:",lrt
	print "delim:",delim
	print "missingval:",missingVal
	print "kinshipDatafile:",kinshipDatafile
	print "chr:",chr
	print "boundaries:",boundaries
	print "onlyMissing:",onlyMissing
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "complement:",complement
	print "negate:",negate
	print "logTransform:",logTransform
	print "addConstant:",addConstant
	print "removeOutliers:",removeOutliers
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "testRobustness:",testRobustness
	print "permutationFilter:",permutationFilter


	def runParallel(phenotypeIndex,phed):
		#Cluster specific parameters
		print phenotypeIndex
		phenName = phed.getPhenotypeName(phenotypeIndex)
		outFileName = resultDir+"Emma_"+parallel+"_"+phenName

		shstr = """#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=8g 
#PBS -q cmb
"""

		shstr += "#PBS -N E"+phenName+"_"+parallel+"\n"
		shstr += "set phenotypeName="+parallel+"\n"
		shstr += "set phenotype="+str(phenotypeIndex)+"\n"
		shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" "
		if onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		if logTransform:
			shstr += " --logTransform "
		if negate:
			shstr += " --negate "
		if removeOutliers:
			shstr += " --removeOutliers="+str(removeOutliers)+" "
		if phenotypeRanks:
			shstr += " --phenotypeRanks "
		if testRobustness:
			shstr+=" --testRobustness "

		shstr+=" --permutationFilter="+str(permutationFilter)+" "

		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
			
		shstr += " -a "+str(withArrayIds)+" "			
		if kinshipDatafile:
			shstr += " --kinshipDatafile="+str(kinshipDatafile)+" "			
		shstr += " --addConstant="+str(addConstant)+" "			
		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

		f = open(parallel+".sh",'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
		if parallelAll:
			for phenotypeIndex in phed.phenIds:
				if onlyMissing:
					phenName = phed.getPhenotypeName(phenotypeIndex)
					pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
					res = None
					try:
						res = os.stat(pvalFile)

					except Exception:
						print "File",pvalFile,"does not exist."
					if res and res.st_size>0:
						print "File",pvalFile,"already exists, and is non-empty."
						if sr:
							srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"
							srRes = None
							try:
								srRes = os.stat(srInput)
							except Exception:
								print "File",srInput,"does not exist."
							if srRes and srRes.st_size>0:
								print "File",srInput,"already exists, and is non-empty."
							else:
								runParallel(phenotypeIndex,phed)
							
					else:
						print "Setting up the run."
						runParallel(phenotypeIndex,phed)
											
				else:
					runParallel(phenotypeIndex,phed)
		else:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex,phed)
		return
	else:
		phenotypeIndex = int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "\nStarting program now!\n"



	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

	#Load phenotype file
	phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
	numAcc = len(snpsds[0].accessions)

	#Removing outliers
	if removeOutliers:
		print "Remoing outliers"
		phed.naOutliers(phenotypeIndex,removeOutliers)
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()

	phenotype = phed.getPhenIndex(phenotypeIndex)

	accIndicesToKeep = []			
	phenAccIndicesToKeep = []
	#Checking which accessions to keep and which to remove .
	for i in range(0,len(snpsds[0].accessions)):
		acc1 = snpsds[0].accessions[i]
		for j in range(0,len(phed.accessions)):
			acc2 = phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	

	print "\nFiltering accessions in genotype data:"
	#Filter accessions which do not have the phenotype value (from the genotype data).
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all."
		

	print "\nNow filtering accessions in phenotype data:"
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values

	print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions)
	if len(phed.accessions)!=len(snpsds[0].accessions):
		raise Exception

	#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	#Remove minor allele frequencies
	if minMAF!=0:
		sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.filterMinMAF(minMAF)

	#Removing SNPs which are outside of boundaries.
	if chr:
		print "\nRemoving SNPs which are outside of boundaries."
		snpsds[chr-1].filterRegion(boundaries[0],boundaries[1])
		snpsds = [snpsds[chr-1]]
	
	#Ordering accessions in genotype data to fit phenotype data.
	print "Ordering genotype data accessions."
	accessionMapping = []
	i = 0
	for acc in phed.accessions:
		if acc in snpsds[0].accessions:
			accessionMapping.append((snpsds[0].accessions.index(acc),i))
			i += 1

	#print zip(accessionMapping,snpsds[0].accessions)
	print "len(snpsds[0].snps)",len(snpsds[0].snps)

	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.orderAccessions(accessionMapping)
	print "\nGenotype data has been ordered."
		
	#Converting format to 01
	newSnpsds = []
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
	print ""


	
	print "Checking kinshipfile:",kinshipDatafile
	
	if kinshipDatafile:  #Is there a special kinship file?
		kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

		accIndicesToKeep = []			
		#Checking which accessions to keep and which to remove (genotype data).
		sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
		sys.stdout.flush()
		for i in range(0,len(kinshipSnpsds[0].accessions)):
			acc1 = kinshipSnpsds[0].accessions[i]
			for j in range(0,len(phed.accessions)):
				acc2 = phed.accessions[j]
				if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
					accIndicesToKeep.append(i)
					break	
		print accIndicesToKeep
	
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.removeAccessionIndices(accIndicesToKeep)
		print ""
		print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all."
	
		print "Ordering kinship data accessions."
		accessionMapping = []
		i = 0
		for acc in snpsds[0].accessions:
			if acc in kinshipSnpsds[0].accessions:
				accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i))
				i += 1

		print zip(accessionMapping,snpsds[0].accessions)
		print "len(snpsds[0].snps)",len(snpsds[0].snps)
		
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.orderAccessions(accessionMapping)
		print "Kinship genotype data has been ordered."

		newKinshipSnpsds = []
		sys.stdout.write("Converting data format")
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
		print ""
		kinshipSnpsds = newKinshipSnpsds

	else:
		kinshipSnpsds = newSnpsds
		

	print "Found kinship data."

	#Ordering accessions according to the order of accessions in the genotype file
#	accessionMapping = []
#	i = 0
#	for acc in snpsds[0].accessions:
#		if acc in phed.accessions:
#			accessionMapping.append((phed.accessions.index(acc),i))
#			i += 1
#	phed.orderAccessions(accessionMapping)

	
	#Negating phenotypic values
	if negate: 
		phed.negateValues(phenotypeIndex)

	#Adding a constant.
	if addConstant!=-1:
		if addConstant==0:
			addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10
			addConstant = addConstant - phed.getMinValue(phenotypeIndex)
			
		print "Adding a constant to phenotype:",addConstant
		phed.addConstant(phenotypeIndex,addConstant)
	
		
	
	#Log-transforming
	if logTransform:
		print "Log transforming phenotype"
		phed.logTransform(phenotypeIndex)
	#Converting phenotypes to Ranks
	elif phenotypeRanks:
		phed.transformToRanks(phenotypeIndex)
	
	if not chr:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5])
	else:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr])
		
	
	phenotypeName = phed.getPhenotypeName(phenotypeIndex)

	sys.stdout.flush()
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in snpsDataset.snpsDataList:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		_robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter)
		sys.exit(0)

	if (not sr) or (sr and not srSkipFirstRun):
		sys.stdout.write("Running Primary Emma.\n")
		sys.stdout.flush()
		pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt)
		res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.filterMAF()
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Пример #6
0
def _run_():
    l_192 = [
        6909, 6977, 100000, 6906, 8266, 6897, 6898, 5837, 6907, 7438, 6910,
        6913, 6914, 6918, 6919, 8214, 6924, 8424, 6926, 6928, 6933, 7520, 7521,
        6936, 7522, 6937, 6939, 6900, 6901, 6908, 6009, 6915, 6917, 6920, 6922,
        6923, 6927, 6929, 6930, 6940, 6942, 6943, 7518, 6946, 8213, 6951, 6958,
        6959, 7525, 6961, 6967, 6973, 6974, 6976, 7516, 6979, 6980, 6982, 6983,
        6985, 6931, 6043, 6945, 7519, 7526, 7523, 6956, 6960, 7524, 6963, 6964,
        6965, 6966, 6969, 6971, 6975, 7517, 6978, 6981, 6984, 6899, 6903, 6904,
        6905, 6911, 6916, 8215, 6921, 6932, 6046, 6944, 7515, 7514, 6962, 6968,
        6972, 6970, 8329, 7163, 8258, 8259, 8290, 7461, 7323, 8254, 8270, 8233,
        8285, 6016, 8423, 8237, 6040, 6064, 6957, 8369, 8247, 8426, 9058, 8249,
        9057, 6709, 7000, 7062, 7460, 7123, 7147, 7255, 7275, 8241, 6988, 8256,
        8264, 8265, 8231, 8271, 8274, 8275, 8420, 8283, 8284, 6008, 8422, 8296,
        8297, 8300, 8235, 8306, 8310, 8236, 8311, 8314, 8239, 8240, 8323, 8242,
        8325, 8326, 8222, 8430, 6042, 8335, 8343, 6074, 8351, 8353, 8354, 7296,
        8365, 8374, 8376, 8378, 8412, 8387, 8389, 6243, 7306, 7418, 8312, 8313,
        8334, 8337, 8357, 8366, 8411, 8388, 8395, 7014, 7081, 8243, 8245, 7033,
        7064, 7094, 7424, 7231, 7282, 7477, 7346, 8230
    ]
    print len(l_192)

    acc_info = _read_seeds_files_()
    et_list = acc_info["ecotypes"]
    spi = pd._get_stock_parent_info_dict_()
    ei_dict = pd._getEcotypeIdInfoDict_()
    usc_ecotypes = _read_seeds_files_(
        file_names=["USC_seeds_info_from Joy.csv"],
        file_dir="/Users/bjarnivilhjalmsson/Projects/Solexa_sequencing/"
    )["ecotypes"]
    diff_set = set(et_list).difference(
        set(et_list).intersection(set(usc_ecotypes)))
    for e in diff_set:
        print ei_dict[e]
    ms_acc_info = _read_MPI_SALK_sequences_()
    ms_accessions = ms_acc_info["accessions"]
    ms_stock_parents = ms_acc_info["stock_parents"]
    print len(spi)
    a2e = pd._getAccessionToEcotypeIdDict_(ms_accessions, )
    len(a2e)
    ecotypes = []
    for acc, sp in zip(ms_accessions, ms_stock_parents):
        if acc in a2e:
            ecotypes.append(int(a2e[acc]))
        elif sp in spi:
            ecotypes.append(int(spi[sp][0]))
        else:
            ecotypes.append(None)
            print acc, ",", sp, ": weren't found!!"
    print len(ecotypes) - ecotypes.count(None), len(ecotypes)
    tg_e_dict = pd._getEcotype2TgEcotypeDict_()
    ms_e_set = set(ecotypes)
    print len(ms_e_set)
    elist = []
    for e in et_list:
        if e:
            elist.append(tg_e_dict[e])
        else:
            elist.append(None)

    e_set = set(elist)
    i_set = ms_e_set.intersection(e_set)
    print e_set
    print ms_e_set
    print i_set
    print ei_dict[list(i_set)[0]]

    ecotypes_192 = set(l_192)
    i_set1 = ecotypes_192.intersection(ms_e_set)
    i_set2 = ecotypes_192.intersection(e_set)
    s = ecotypes_192.intersection(ms_e_set.union(e_set))
    ds = ecotypes_192.difference(s)
    print len(i_set1), len(i_set2), len(s), len(ds)
    for e in ds:
        print ei_dict[e]

    f = open("/tmp/missing_gwas.csv", "w")
    f.write("ecotype_id,accession_name,stock_parent,country_of_origin\n")
    for e in ds:
        f.write(
            str(e) + "," + ei_dict[e][0] + "," + ei_dict[e][1] + "," +
            ei_dict[e][4] + "\n")
    f.close()
Пример #7
0
def load_phentoype_file_bergelsson():
	import env
	filename = "/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/bergelsson_rosette_glucs.csv"
	f = open(filename, "r")
	reader = csv.reader(f)
	phenotype_names = reader.next()[2:]
	for i in range(len(phenotype_names)):
		phenotype_names[i] = phenotype_names[i].replace(" ", "_")
		phenotype_names[i] = 'jb_' + phenotype_names[i]
	print phenotype_names
	accession_names = []
	accession_ID = []
	for row in reader:
		accession_names.append(row[0].split()[0].lower())
		accession_ID.append(row[1])
	f.close()
	print accession_names
	#acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)#+["n13","kno-10","kno-10","shahdara","nd-1"])
	e_info_dict = pd._getEcotypeIdInfoDict_()
	ei_2_tgei = pd._getEcotype2TgEcotypeDict_()
	#print len(acc_dict),acc_dict
	ecotypes = []
        uncertain_list = []
	for acc, acc_id in zip(accession_names, accession_ID):
		#if not acc in acc_dict:
		if not int(acc_id) in ei_2_tgei:
			print "(%s, %s) is missing in dictionary" % (acc, acc_id)
			a_id = int(acc_id)
			if a_id in e_info_dict:
				e_info = e_info_dict[a_id]
				print "Guessing that it's:", e_info
			else:
				print "No good guess for it.  Look it up!!\n"
			#acc_dict[acc] = acc_id
			ecotypes.append(acc_id)
		else:
			#ecotype = acc_dict[acc]
			ecotype = ei_2_tgei[int(acc_id)]
			ecotypes.append(ecotype)
	phenotype_indices = range(2, len(phenotype_names) + 2)
	phenotypes = []	#[acc_id][phenotype_name]
	f = open(filename, "r")
	reader = csv.reader(f)
	reader.next()

	print len(set(accession_ID)), len(set(ecotypes))

	for row in reader:
		#print row
		#if row[0].split()[0].lower() in acc_dict:
			phen_vals = []
			for pv in row[2:]:
				if pv == "" or pv == 'NA':
					pv = 'NA'
				else:
					pv = float(pv)
				phen_vals.append(pv)
			if len(phen_vals) != len(phenotype_names):
				import pdb;
				pdb.set_trace()
			phenotypes.append(phen_vals)
		#else:
		#	print "Missing:",row[0]


	phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.tsv", delimiter='\t')
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.csv", delimiter=',')
def plot_local_haplotypes(filename,
                          marker_data,
                          focal_start,
                          focal_end,
                          error_tolerance=0,
                          phenotypeData=None,
                          phen_id=None):
    """
	Plots certain types of haplotype plots...
	"""
    haplotype_ids = range(1000, 0, -1)
    #Fill color matrix up..

    start_i = 0
    cur_pos = 0
    while start_i < len(marker_data.positions) and cur_pos < focal_start:
        cur_pos = marker_data.positions[start_i]
        start_i += 1
    if start_i == len(marker_data.positions):
        raise Exception("Region is not covered by markers.")
    end_i = start_i
    while end_i < len(marker_data.positions) and cur_pos < focal_end:
        cur_pos = marker_data.positions[end_i]
        end_i += 1

    center_haplotypes = []
    for a_i in range(len(marker_data.accessions)):
        haplotype = []
        for snp in marker_data.snps[start_i:end_i]:
            haplotype.append(snp[a_i])
        center_haplotypes.append(haplotype)
    haplotype_dict = {}
    hap_pos = (marker_data.positions[end_i - 1] +
               marker_data.positions[start_i]) / 2
    for a_i, c_h in enumerate(center_haplotypes):
        ch = tuple(c_h)
        if not ch in haplotype_dict:
            haplotype_dict[ch] = [
                0, 0, [a_i], hap_pos
            ]  #haplotype id number, haplotype frequency count, list of accessions indices, and position.
        else:
            haplotype_dict[ch][2].append(a_i)

    freq_hapl_list = []
    for ch in haplotype_dict:
        hi = haplotype_dict[ch]
        haplotype_dict[ch][1] = len(hi[2])
        freq_hapl_list.append((len(hi[2]), ch))
    freq_hapl_list.sort(reverse=True)
    for (hc, haplotype) in freq_hapl_list:
        if hc == 1:
            haplotype_dict[haplotype][0] = 0
        else:
            haplotype_dict[haplotype][0] = haplotype_ids.pop()

    center_haplotype_dict = haplotype_dict
    left_haplotypes = []
    right_haplotypes = []
    left_haplotypes.append(center_haplotype_dict)
    right_haplotypes = []
    left_positions = [hap_pos]
    right_positions = []

    #Starting with the haplotype structure to the left!
    some_haplotype = True
    i = start_i - 1
    old_hap_dict = center_haplotype_dict
    while old_hap_dict and i >= 0:
        #print i
        #l1  = [len(old_hap_dict[h][2]) for h in old_hap_dict]
        #l2  = [old_hap_dict[h][0] for h in old_hap_dict]
        #print l1,l2, sum(l1)
        haplotype_dict = {}
        hap_pos = marker_data.positions[i]
        left_positions.append(hap_pos)
        for hap in old_hap_dict:
            (h_id, h_count, acc_indices,
             pos) = old_hap_dict[hap]  #info on the old haplotype
            #print h_id

            temp_hap_dict = {}
            for a_i in acc_indices:
                new_hap = tuple([marker_data.snps[i][a_i]] + list(hap))
                if not new_hap in temp_hap_dict:
                    temp_hap_dict[new_hap] = [
                        0, 0, [a_i], hap_pos
                    ]  #haplotype id number, haplotype frequency count, list of accessions indices, and position.
                else:
                    temp_hap_dict[new_hap][2].append(a_i)

            freq_hapl_list = []
            for h in temp_hap_dict:
                hi = temp_hap_dict[h]
                temp_hap_dict[h][1] = len(hi[2])
                freq_hapl_list.append((len(hi[2]), h))
            freq_hapl_list.sort()

            #print freq_hapl_list
            (hc, h) = freq_hapl_list.pop(
            )  #the most frequent haplotype gets colored like the last one.
            if hc == 1:
                del temp_hap_dict[h]
            else:
                temp_hap_dict[h][0] = h_id

            freq_hapl_list.reverse()
            for (hc, h) in freq_hapl_list:
                if hc == 1:
                    del temp_hap_dict[h]
                else:
                    temp_hap_dict[h][0] = haplotype_ids.pop()
            for h in temp_hap_dict:
                haplotype_dict[h] = temp_hap_dict[h]

        if haplotype_dict:
            left_haplotypes.append(haplotype_dict)
        old_hap_dict = haplotype_dict
        i -= 1

    #Now the same with the haplotype structure to the right!
    i = end_i
    old_hap_dict = center_haplotype_dict
    while old_hap_dict and i < len(marker_data.snps):
        #print i
        #l1  = [len(old_hap_dict[h][2]) for h in old_hap_dict]
        #l2  = [old_hap_dict[h][0] for h in old_hap_dict]
        #print l1,l2, sum(l1)
        haplotype_dict = {}
        hap_pos = marker_data.positions[i]
        right_positions.append(hap_pos)
        for hap in old_hap_dict:
            (h_id, h_count, acc_indices,
             pos) = old_hap_dict[hap]  #info on the old haplotype

            temp_hap_dict = {}
            for a_i in acc_indices:
                nt = marker_data.snps[i][a_i]
                new_hap = list(hap)
                new_hap.append(nt)
                new_hap = tuple(new_hap)
                #print new_hap
                if not new_hap in temp_hap_dict:
                    temp_hap_dict[new_hap] = [
                        0, 0, [a_i], hap_pos
                    ]  #haplotype id number, haplotype frequency count, list of accessions indices, and position.
                else:
                    temp_hap_dict[new_hap][2].append(a_i)

            freq_hapl_list = []
            for h in temp_hap_dict:
                hi = temp_hap_dict[h]
                temp_hap_dict[h][1] = len(hi[2])
                freq_hapl_list.append((len(hi[2]), h))

            freq_hapl_list.sort()
            (hc, h) = freq_hapl_list.pop(
            )  #the most frequent haplotype gets colored like the last one.
            if hc == 1:
                del temp_hap_dict[h]
            else:
                temp_hap_dict[h][0] = h_id

            freq_hapl_list.reverse()
            for (hc, h) in freq_hapl_list:
                if hc == 1:
                    del temp_hap_dict[h]
                else:
                    temp_hap_dict[h][0] = haplotype_ids.pop()
            for h in temp_hap_dict:
                haplotype_dict[h] = temp_hap_dict[h]

        if haplotype_dict:
            right_haplotypes.append(haplotype_dict)
        old_hap_dict = haplotype_dict
        i += 1

    #Clustering...
    dm = calc_local_dist(marker_data,
                         focal_start,
                         focal_end,
                         error_tolerance=error_tolerance)
    print dm
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    Z = hc.average(dm)  #Performing clustering using the dist. matr.
    print Z
    import pylab
    dend_dict = hc.dendrogram(Z, labels=marker_data.accessions)
    new_acc_order = dend_dict['ivl']
    print new_acc_order
    ai_map = [new_acc_order.index(acc) for acc in marker_data.accessions]

    import numpy as np
    #Updating the positions in the figure.
    left_positions.reverse()
    positions = left_positions + right_positions
    x_s = np.zeros((len(positions) + 1, len(marker_data.accessions) + 1))
    start_pos = positions[0] - (0.5 * (positions[1] - positions[0]))
    print len(x_s), len(x_s[0, ])
    for j in range(0, len(x_s[0, ])):
        x_s[0, j] = start_pos
    for j in range(1, len(x_s) - 1):  # number of SNPs
        x = positions[j - 1] + 0.5 * (positions[j] - positions[j - 1])
        for k in range(0, len(x_s[j, ])):  # number of NTs
            x_s[j, k] = x
    for j in range(0, len(x_s[0, ])):
        x_s[-1, j] = positions[-1] + (0.5 * (positions[-1] - positions[-2]))

    y_s = np.zeros((len(positions) + 1, len(marker_data.accessions) + 1))
    for j in range(0, len(y_s)):  # number of SNPs
        for k in range(0, len(y_s[j, ])):  # number of NTs
            y_s[j, k] = k - 0.5

    #Updating the colors in the figure.
    color_matrix = np.ones((len(positions), len(marker_data.accessions)))
    left_haplotypes.reverse()
    haplotypes = left_haplotypes + right_haplotypes

    max_color = float(haplotype_ids.pop())
    for i, hap_dict in enumerate(haplotypes):
        for h in hap_dict:
            (h_id, h_count, acc_indices, pos) = hap_dict[h]
            for a_i in acc_indices:
                m_ai = ai_map[a_i]
                if h_id == 0:
                    color_matrix[i, m_ai] = 1.0
                else:
                    color_matrix[i, m_ai] = h_id / max_color

    import phenotypeData as pd
    e_dict = pd._getEcotypeIdInfoDict_()
    accessions = [
        unicode(e_dict[int(e)][0], 'iso-8859-1') for e in new_acc_order
    ]
    #Plot figure..
    import pylab
    pylab.figure(figsize=(18, 8))
    pylab.axes([0.08, 0.06, 0.9, 0.88])
    pylab.pcolor(x_s, y_s, color_matrix, cmap=pylab.cm.hot)
    #Dealing with the phenotype data
    phenotypeData.removeAccessionsNotInSNPsData(marker_data)
    et_mapping = []
    for i, et in enumerate(new_acc_order):
        et_mapping.append((marker_data.accessions.index(et), i))
    phenotypeData.orderAccessions(et_mapping)
    phen_vals = phenotypeData.getPhenVals(phen_id, noNAs=False)
    acc_strings1 = [
        accessions[i] + ", " + str(phen_vals[i])
        for i in range(len(accessions))
    ]
    acc_strings = [
        accessions[i] + ", " + str(phen_vals[i])
        for i in range(len(accessions))
    ]

    pylab.yticks(range(0, len(marker_data.accessions)),
                 acc_strings,
                 size="small")
    x_range = (x_s[-1, 0] - x_s[0, 0])

    #Retreiving and drawing the genes
    import regionPlotter as rp
    import gwaResults as gr
    genes = gr.get_gene_list(start_pos=x_s[0, 0], end_pos=x_s[-1, 0], chr=5)
    rp.drawGenes(genes, y_shift=-3, rangeVal=40)

    pylab.axis((x_s[0, 0] - 0.05 * x_range, x_s[-1, 0] + 0.05 * x_range,
                -0.1 * len(marker_data.accessions) - 1,
                1.02 * len(marker_data.accessions)))

    pylab.savefig(filename, format='pdf')
Пример #9
0
def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness"]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
		phenotypeFileType=1
		outputFile=None
	delim=","
	missingVal="NA"
	help=0
	withArrayIds=1
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-a", "--withArrayId"):
			withArrayIds=int(arg)
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		phenName=phenName.replace("/", "_div_")
		phenName=phenName.replace("*", "_star_")
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr="""#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=4g 
#PBS -q cmb
"""
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		shstr+=" -a "+str(withArrayIds)+" "			
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal, withArrayIds = withArrayIds)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		if env.user=="bjarni":
			tempfile.tempdir='/tmp'
		(fId, phenotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		(fId, genotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		
		phed.writeToFile(phenotypeTempFile, [phenotype])	
		sys.stdout.write("Phenotype file written\n")
		sys.stdout.flush()
		snpsDataset=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		decoder={1:1, 0:0,-1:'NA'}	
		snpsDataset.writeToFile(genotypeTempFile, deliminator = delim, missingVal = missingVal, withArrayIds = 0, decoder = decoder)
		sys.stdout.write("Genotype file written\n")
		sys.stdout.flush()
	
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
	
		rDataFile=outputFile+".rData"
		pvalFile=outputFile+".pvals"
		#Is the phenotype binary?
		binary=phed.isBinary(phenotypeIndex)
		rstr=_generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name = phenotypeName, binary = binary)
		rFileName=outputFile+".r"
		f=open(rFileName, 'w')
		f.write(rstr)
		f.close()
		outRfile=rFileName+".out"
		errRfile=rFileName+".err"
		print "Running R file:"
		cmdStr="(R --vanilla < "+rFileName+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()	
		gc.collect() 
		os.system(cmdStr)
		#print "Emma output saved in R format in", rDataFile
		print "Generating a GW plot."
		res = gwaResults.Result(pvalFile,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)