def raxTree(seqBlock, indNames, model, raxml, outgroup = None, uniqueTag = "", test = False, log="/dev/null"):
    """Run RAxML on one alignment and return the first line of its best-tree file.

    The alignment is written to ``temp.<uniqueTag>.phy`` in the current working
    directory, RAxML is launched through the shell with its standard output
    appended to *log*, and ``RAxML_bestTree.<uniqueTag>`` is then read back.

    Args:
        seqBlock: sequences, handed to ``genomics.makeAlnString`` with *indNames*.
        indNames: sequence names, handed to ``genomics.makeAlnString``.
        model: value for RAxML's ``-m`` option.
        raxml: RAxML command exactly as it should appear on the shell command line.
        outgroup: optional iterable of names, comma-joined into the ``-o`` option.
        uniqueTag: string embedded in every file name used by this call. With the
            default empty string the cleanup patterns are ``temp..phy*`` and
            ``RAxML*``, so callers should pass a tag that is unique in the
            working directory.
        test: if true, echo the command to stderr and keep all files.
        log: path that the command's standard output is appended to (shell ``>>``).

    Returns:
        The first line of the best-tree file as read (trailing newline included),
        or the string "NA" followed by a newline if that file could not be opened.
    """
    import glob

    alnPath = "temp." + uniqueTag + ".phy"
    #write file
    with open(alnPath, "w") as tempAln:
        tempAln.write(genomics.makeAlnString(indNames,seqBlock))
    if outgroup is not None:
        og = " -o " + ",".join(outgroup)
    else:
        og = ""
    # The command is still run through the shell because *log* is attached with a
    # shell redirect and *raxml* may itself carry extra arguments.
    raxCommand = raxml + " -s " + alnPath + " -n " + uniqueTag + " -m " + model + og + " -V -f d -p 12345 --silent >>" + log
    if test: sys.stderr.write("raxml command:\n" + raxCommand + "\n")
    os.system(raxCommand)
    #try retrieve the result
    try:
        with open("RAxML_bestTree." + uniqueTag, "r") as treeFile:
            tree = treeFile.readline()
    except (IOError, OSError):
        # RAxML produced no tree (failed run or unusable alignment)
        tree = "NA\n"
    #remove files
    if not test:
        for pattern in (alnPath + "*", "RAxML*" + uniqueTag):
            for path in glob.glob(pattern):
                try:
                    os.remove(path)
                except OSError:
                    # best-effort cleanup, like the shell "rm" it replaces
                    pass
    return tree
def phymlTree(seqArray, seqNames, model, opt, phyml, prefix = "", tmpDir = None, test = False, log="/dev/null"):
    """Run PhyML on one alignment and return the tree and its log-likelihood.

    The alignment is written to a named temporary ``.phy`` file, PhyML is launched
    through the shell with its standard output appended to *log*, and the tree and
    stats files written next to the alignment are read back. Both output naming
    schemes are tried: ``<aln>_phyml_tree.txt`` / ``<aln>_phyml_stats.txt`` first,
    then the same names without the ``.txt`` suffix.

    Args:
        seqArray: sequences, handed to ``genomics.makeAlnString`` with *seqNames*.
        seqNames: sequence names, handed to ``genomics.makeAlnString``.
        model: value for PhyML's ``--model`` option.
        opt: value for PhyML's ``-o`` option.
        phyml: PhyML command exactly as it should appear on the shell command line.
        prefix: prefix for the temporary file name.
        tmpDir: directory for the temporary file (``None`` uses tempfile's default).
        test: if true, echo the command to stderr and keep all files.
        log: path that the command's standard output is appended to (shell ``>>``).

    Returns:
        Tuple ``(tree, lnL)`` of strings: the stripped first line of the tree file
        and the token following "Log-likelihood:" in the stats file. Each is "NA"
        when it could not be obtained.
    """
    import glob

    #write file
    tempAln = tempfile.NamedTemporaryFile(mode="w",prefix=prefix,suffix=".phy",dir=tmpDir,delete=False)
    with tempAln as tA: tA.write(genomics.makeAlnString(seqNames,seqArray))
    phymlCommand = " ".join([phyml,"--input", tempAln.name,"--model", model, "-o", opt, "-b 0", ">>", log])
    if test: sys.stderr.write("phyml command:\n" + phymlCommand + "\n")
    os.system(phymlCommand)

    # output files are looked for with and then without the ".txt" suffix
    outSuffixes = (".txt", "")

    #try retrieve the result
    tree = "NA"
    for suffix in outSuffixes:
        try:
            with open(tempAln.name + "_phyml_tree" + suffix, "r") as treeFile:
                tree = treeFile.readline().strip()
            break
        except (IOError, OSError):
            pass
    else:
        # "verbose" is not a parameter of this function: honour a module-level flag
        # of that name if one exists, otherwise fall back to the test switch
        # instead of raising NameError.
        if globals().get("verbose", test):
            sys.stderr.write("Tree not found at " + tempAln.name + "_phyml_tree.txt\n")

    lnL = "NA"
    for suffix in outSuffixes:
        try:
            with open(tempAln.name + "_phyml_stats" + suffix, "r") as statsFile:
                stats = statsFile.read().split()
            lnL = stats[stats.index("Log-likelihood:")+1]
            break
        except (IOError, OSError, ValueError, IndexError):
            # file missing, or no value after the "Log-likelihood:" token
            pass

    #remove files
    if not test:
        for path in glob.glob(tempAln.name + "*"):
            try:
                os.remove(path)
            except OSError:
                # best-effort cleanup, like the shell "rm" it replaces
                pass
    return (tree,lnL,)
def phymlCrossVal(seqArray0, seqArray1, indNames, model, opt, phyml, prefix = "",tmpDir=None, test = False, log="/dev/null"):
    """Two-way cross-validation of two alignments with PhyML.

    Each alignment is written to its own temporary ``.phy`` file. In each
    direction PhyML is first run on one alignment with ``-o opt``, and then run on
    the other alignment with ``-o n`` and the first run's
    ``<aln>_phyml_tree.txt`` supplied through ``-u``. The value following
    "Log-likelihood:" in the second alignment's ``_phyml_stats.txt`` is taken as
    that direction's score.

    Args:
        seqArray0: first set of sequences.
        seqArray1: second set of sequences.
        indNames: sequence names, used for both alignments.
        model: value for PhyML's ``--model`` option.
        opt: value for PhyML's ``-o`` option in the tree-building runs.
        phyml: PhyML command exactly as it should appear on the shell command line.
        prefix: prefix for the temporary file names.
        tmpDir: directory for the temporary files (``None`` uses tempfile's default).
        test: if true, keep all files instead of deleting them.
        log: path that the commands' standard output is appended to (shell ``>>``).

    Returns:
        ``str`` of the sum of the two log-likelihoods; this is "nan" if either
        value could not be read.
    """
    #write file
    tempAln0 = tempfile.NamedTemporaryFile(mode="w",prefix=prefix,suffix=".0.phy",dir=tmpDir,delete=False)
    tempAln1 = tempfile.NamedTemporaryFile(mode="w",prefix=prefix,suffix=".1.phy",dir=tmpDir,delete=False)
    # the names parameter is indNames (the original referred to an undefined seqNames)
    with tempAln0 as tA: tA.write(genomics.makeAlnString(indNames,seqArray0))
    with tempAln1 as tA: tA.write(genomics.makeAlnString(indNames,seqArray1))

    lnLs = []
    #first way: tree from alignment 0 validated on alignment 1; second way: the reverse
    for treeAln, valAln in ((tempAln0.name, tempAln1.name), (tempAln1.name, tempAln0.name)):
        #tree
        phymlCommand = " ".join([phyml,"--input", treeAln,"--model", model, "-o", opt, ">>", log])
        os.system(phymlCommand)
        #validation
        phymlCommand = " ".join([phyml,"--input", valAln,"--model", model, "-o", "n", "-u", treeAln + "_phyml_tree.txt", ">>", log])
        os.system(phymlCommand)
        #retrieve
        # NOTE(review): in the second direction a stats file for valAln already
        # exists from the first direction's tree run; confirm PhyML replaces it.
        try:
            with open(valAln + "_phyml_stats.txt", "r") as statsFile:
                stats = statsFile.read().split()
            lnLs.append(float(stats[stats.index("Log-likelihood:")+1]))
        except (IOError, OSError, ValueError, IndexError):
            # float("nan") instead of np.NaN, which newer NumPy no longer provides
            lnLs.append(float("nan"))

    #remove files
    if not test:
        # Use the directory the files were really created in, so that cleanup also
        # works when tmpDir is None.
        outDir = os.path.dirname(tempAln0.name)
        localNames = [os.path.basename(tempAln0.name), os.path.basename(tempAln1.name)]
        for f in os.listdir(outDir):
            if localNames[0] in f or localNames[1] in f:
                os.remove(os.path.join(outDir, f))
    return str(lnLs[1]+lnLs[0])
Пример #4
0
def raxTree(seqArray,
            seqNames,
            model,
            raxml,
            outgroup=None,
            prefix="",
            test=False,
            log="/dev/null"):
    """Run RAxML on one alignment and return the first line of its best-tree file.

    The alignment is written to a named temporary ``.phy`` file in the current
    working directory; that file's base name is also used as the RAxML run name
    (``-n``). RAxML is launched through the shell with its standard output
    appended to *log*, and ``RAxML_bestTree.<run name>`` is read back.

    Args:
        seqArray: sequences, handed to ``genomics.makeAlnString`` with *seqNames*.
        seqNames: sequence names, handed to ``genomics.makeAlnString``.
        model: value for RAxML's ``-m`` option.
        raxml: RAxML command exactly as it should appear on the shell command line.
        outgroup: optional iterable of names, comma-joined into the ``-o`` option.
        prefix: prefix for the temporary file name.
        test: if true, echo the command to stderr and keep all files.
        log: path that the command's standard output is appended to (shell ``>>``).

    Returns:
        The first line of the best-tree file as read (trailing newline included),
        or the string "NA" followed by a newline if that file could not be opened.
    """
    #temp file
    tempAln = tempfile.NamedTemporaryFile(mode="w",
                                          prefix=prefix,
                                          suffix=".phy",
                                          dir=".",
                                          delete=False)
    localName = os.path.basename(tempAln.name)
    with tempAln as tA:
        tA.write(genomics.makeAlnString(seqNames, seqArray))
    if outgroup is not None:
        og = " -o " + ",".join(outgroup)
    else:
        og = ""
    raxCommand = raxml + " -s " + tempAln.name + " -n " + localName + " -m " + model + og + " -V -f d -p 12345 --silent >>" + log
    if test: sys.stderr.write("raxml command:\n" + raxCommand + "\n")
    os.system(raxCommand)
    #try retrieve the result
    try:
        with open("RAxML_bestTree." + localName, "r") as treeFile:
            tree = treeFile.readline()
    except (IOError, OSError):
        # RAxML produced no tree (failed run or unusable alignment)
        tree = "NA\n"
    #remove files
    if not test:
        # every file in the working directory whose name contains the run name
        for f in [f for f in os.listdir(".") if localName in f]:
            os.remove(f)
    return tree
Пример #5
0
        sys.stderr.write("Getting region " + region + " from geno file...\n")
        genoStream = subprocess.Popen(['tabix', '-h', args.genoFile, region],
                                      stdout=subprocess.PIPE)
        window = genomics.parseGenoFile(genoStream.stdout,
                                        names=args.samples,
                                        includePositions=True,
                                        splitPhased=args.split)
        seqDict = window.seqDict()
        seqNames = seqDict.keys()
        sys.stderr.write("Extracting CDS...\n")
        CDSseqs = [
            genomics.CDS(seqDict[name], window.positions,
                         geneData[scaffold][mRNA]['cdsStarts'],
                         geneData[scaffold][mRNA]['cdsEnds'],
                         geneData[scaffold][mRNA]['strand'])
            for name in seqNames
        ]

        outputNames = [
            name + "_" + mRNA + " " + scaffold + " " +
            str(geneData[scaffold][mRNA]['start']) + "-" +
            str(geneData[scaffold][mRNA]['end']) for name in seqNames
        ]
        fastaString = genomics.makeAlnString(outputNames,
                                             CDSseqs,
                                             outFormat="fasta",
                                             lineLen=None)
        outFile.write(fastaString + "\n")

outFile.close()
Пример #6
0
# Regions listed in the regions file (one per line, whitespace-separated fields)
# are appended to those already collected.
if args.regionsFile:
    with open(args.regionsFile, "r") as rf:
        for line in rf: regions.append(parseRegionList(line.split()))

#only filter and chop sequences if necessary
if len(regions) >= 1:
    outNames = []
    outSeqs = []
    for seqName,start,end,ori in regions:
        i = names.index(seqName)
        outNames.append(seqName)
        if start is not None or end is not None or ori == "-":
            seqLen = len(seqs[i])
            # missing bounds default to the full sequence (1-based, inclusive)
            if start is None: start = 1
            if end is None: end = seqLen
            # extend the interval, clamped to the sequence limits
            start = max(1,start-args.extendLeft)
            end = min(seqLen,end+args.extendRight)
            subSeq = seqs[i][start-1:end]
            if ori == "-": outSeqs.append(genomics.revTrans(subSeq))
            else: outSeqs.append(subSeq)
            # unless names are preserved, record the extracted interval in the name
            if not args.preserveNames: outNames[-1] = outNames[-1] + ":" + str(start) + "-" + str(end) + ":" + ori
        else: outSeqs.append(seqs[i])
else:
    # no regions requested: output everything unchanged
    outNames = names
    outSeqs = seqs

sys.stderr.write("\nWriting %i sequences.\n" %len(outNames))

sys.stdout.write(genomics.makeAlnString(names=outNames,seqs=outSeqs,outFormat=outFormat,lineLen=l))


Пример #7
0
            #now update if there is data for this window
            for siteData in reader.siteBySite(
                    asDict=True if args.samples else False):
                GTs = [siteData["GTs"][name] for name in args.samples
                       ] if args.samples else siteData["GTs"]
                siteGTdict[siteData["position"]] = [
                    genomics.complement(gt) for gt in GTs
                ] if strand == "-" else GTs

        cdsPositions = genomics.CDSpositions(cdsStarts, cdsEnds, strand)

        cdsSeqs = [[siteGTdict[position][i] for position in cdsPositions]
                   for i in range(nSeqs)]

        if args.outFormat == "fasta":
            outputNames = [
                name + "_" + mRNA + " " + scaffold + " " +
                str(geneData[scaffold][mRNA]['start']) + "-" +
                str(geneData[scaffold][mRNA]['end']) for name in reader.names
            ]
        else:
            outputNames = [name + "_" + mRNA for name in reader.names]

        alnString = genomics.makeAlnString(outputNames,
                                           cdsSeqs,
                                           outFormat=args.outFormat,
                                           lineLen=None)
        outFile.write(alnString + "\n")

outFile.close()
Пример #8
0
# Input and output formats are fasta unless the corresponding phylip flag is set
inFormat = "phylip" if args.phylipIn else "fasta"
outFormat = "phylip" if args.phylipOut else "fasta"

l = args.lineLen

########################################################################

allText = sys.stdin.read()

if inFormat == "fasta": names, seqs = genomics.parseFasta(allText)
else: names, seqs = genomics.parsePhylip(allText)

# keep only the first whitespace-delimited word of each name
if args.truncateNames: names = [name.split()[0] for name in names]

# subset and reorder the sequences to match the requested comma-separated names
if args.seqNames:
    outNames = args.seqNames.split(",")
    keep = [names.index(name) for name in outNames]
    names = [names[k] for k in keep]
    seqs = [seqs[k] for k in keep]

if args.start or args.end:
    # Convert the 1-based start to a slice index. When only an end is supplied
    # and start is None, slice from the beginning instead of failing on None - 1.
    startIdx = args.start - 1 if args.start is not None else None
    seqs = [s[startIdx:args.end] for s in seqs]

sys.stdout.write(
    genomics.makeAlnString(names=names,
                           seqs=seqs,
                           outFormat=outFormat,
                           lineLen=l))
Пример #9
0
#############################

# Optional comma-separated list of sample names; None when no list was given
if args.samples:
    samples = args.samples.split(",")
else:
    samples = None

# "cat" mode: parse the whole geno file in one go, write a single alignment, stop
if args.mode == "cat":
    # the entire file is read into one window-like object
    window = genomics.parseGenoFile(genoFile,
                                    names=samples,
                                    splitPhased=args.splitPhased)
    seqDict = window.seqDict()
    # sequences in the same order as the window's names
    orderedSeqs = [seqDict[name] for name in window.names]
    alnString = genomics.makeAlnString(window.names,
                                       orderedSeqs,
                                       outFormat=args.format)
    seqFile.write(alnString)
    genoFile.close()
    seqFile.close()
    exit()

if args.mode == "windows" or args.mode == "contigs":
    if args.mode == "windows":
        windType = args.windType
        windSize = args.windSize
        minSites = args.minSites
        stepSize = args.stepSize
        overlap = args.overlap
        maxDist = args.maxDist
    else:
        #to get contigs, we just use very lare non-overlapping coordinate windows