def raxTree(seqBlock, indNames, model, raxml, outgroup = None, uniqueTag = "", test = False, log="/dev/null"):
    """Run RAxML on one alignment block and return the resulting tree line.

    The alignment is written to ``temp.<uniqueTag>.phy`` in the current
    working directory, RAxML is launched through the shell with its output
    appended to *log*, and the first line of ``RAxML_bestTree.<uniqueTag>``
    is read back.

    Parameters:
        seqBlock: sequences, handed to ``genomics.makeAlnString``.
        indNames: sequence names, handed to ``genomics.makeAlnString``.
        model: value passed to RAxML's ``-m`` option.
        raxml: RAxML command (string placed at the start of the command line).
        outgroup: optional iterable of names, comma-joined for ``-o``.
        uniqueTag: tag used in the temporary file name and as the ``-n`` run name.
        test: when true, echo the command to stderr and keep all files.
        log: path the command's standard output is appended to.

    Returns:
        The first line of the best-tree file exactly as read (trailing
        newline included), or ``"NA\\n"`` when that file cannot be opened.
    """
    import glob  # local import: only needed for the cleanup step below

    alnName = "temp." + uniqueTag + ".phy"
    #write file
    with open(alnName, "w") as tempAln:
        tempAln.write(genomics.makeAlnString(indNames,seqBlock))
    if outgroup is not None:
        og = " -o " + ",".join(outgroup)
    else:
        og = ""
    raxCommand = raxml + " -s " + alnName + " -n " + uniqueTag + " -m " + model + og + " -V -f d -p 12345 --silent >>" + log
    if test:
        sys.stderr.write("raxml command:\n" + raxCommand + "\n")
    os.system(raxCommand)
    #try retrieve the result
    try:
        with open("RAxML_bestTree." + uniqueTag, "r") as treeFile:
            tree = treeFile.readline()
    except (IOError, OSError):
        # RAxML produced no tree (or it is unreadable): report a placeholder
        tree = "NA\n"
    #remove files
    if not test:
        # same patterns the shell "rm" calls used: temp.<tag>.phy* and RAxML*<tag>
        for path in glob.glob(alnName + "*") + glob.glob("RAxML*" + uniqueTag):
            try:
                os.remove(path)
            except OSError:
                # best-effort cleanup, as with the previous "rm"
                pass
    return tree
def _phymlFirstLine(paths):
    """Return the stripped first line of the first openable file in *paths*, else None."""
    for path in paths:
        try:
            with open(path, "r") as handle:
                return handle.readline().strip()
        except (IOError, OSError):
            continue
    return None


def _phymlLogLikelihood(paths):
    """Return the token after "Log-likelihood:" from the first file in *paths* that has one, else None."""
    for path in paths:
        try:
            with open(path, "r") as statsFile:
                stats = statsFile.read().split()
            return stats[stats.index("Log-likelihood:")+1]
        except (IOError, OSError, ValueError, IndexError):
            # missing file, missing label, or label with nothing after it
            continue
    return None


def phymlTree(seqArray, seqNames, model, opt, phyml, prefix = "", tmpDir = None, test = False, log="/dev/null"):
    """Run PhyML on one alignment and return ``(tree, lnL)``.

    The alignment is written to a named temporary ``.phy`` file (created with
    *prefix* inside *tmpDir*), PhyML is launched through the shell with its
    output appended to *log*, and the tree and log-likelihood are read from
    the ``<alignment>_phyml_tree[.txt]`` and ``<alignment>_phyml_stats[.txt]``
    files; the ``.txt`` name is tried first.

    Parameters:
        seqArray: sequences, handed to ``genomics.makeAlnString``.
        seqNames: sequence names, handed to ``genomics.makeAlnString``.
        model: value passed to PhyML's ``--model`` option.
        opt: value passed to PhyML's ``-o`` option.
        phyml: PhyML command (string placed at the start of the command line).
        prefix: prefix for the temporary alignment file name.
        tmpDir: directory for the temporary file (None lets tempfile choose).
        test: when true, echo the command to stderr and keep all files.
        log: path the command's standard output is appended to.

    Returns:
        A tuple ``(tree, lnL)`` of strings; either element is ``"NA"`` when
        it could not be read.
    """
    import glob  # local import: only needed for the cleanup step below

    #write file
    tempAln = tempfile.NamedTemporaryFile(mode="w",prefix=prefix,suffix=".phy",dir=tmpDir,delete=False)
    with tempAln as tA:
        tA.write(genomics.makeAlnString(seqNames,seqArray))
    phymlCommand = " ".join([phyml,"--input", tempAln.name,"--model", model, "-o", opt, "-b 0", ">>", log])
    if test:
        sys.stderr.write("phyml command:\n" + phymlCommand + "\n")
    os.system(phymlCommand)
    #try retrieve the result
    treeBase = tempAln.name + "_phyml_tree"
    tree = _phymlFirstLine([treeBase + ".txt", treeBase])
    if tree is None:
        # NOTE(review): `verbose` is not a parameter; it is read as a
        # module-level name that must be defined elsewhere - confirm.
        if verbose:
            sys.stderr.write("Tree not found at " + tempAln.name + "_phyml_tree.txt\n")
        tree = "NA"
    statsBase = tempAln.name + "_phyml_stats"
    lnL = _phymlLogLikelihood([statsBase + ".txt", statsBase])
    if lnL is None:
        lnL = "NA"
    #remove files
    if not test:
        # same pattern the shell "rm" used: the alignment and every PhyML output
        for path in glob.glob(tempAln.name + "*"):
            try:
                os.remove(path)
            except OSError:
                # best-effort cleanup, as with the previous "rm"
                pass
    return (tree,lnL,)
def _phymlCrossValLnL(phyml, model, opt, log, trainAln, testAln):
    """Build a tree from *trainAln*, evaluate *testAln* on it, and return the log-likelihood (nan on failure)."""
    #tree
    phymlCommand = " ".join([phyml,"--input", trainAln,"--model", model, "-o", opt, ">>", log])
    os.system(phymlCommand)
    #validation: "-o n" together with "-u <tree>" (per the PhyML manual: no
    #parameter optimisation, user-supplied tree)
    phymlCommand = " ".join([phyml,"--input", testAln,"--model", model, "-o", "n", "-u", trainAln + "_phyml_tree.txt", ">>", log])
    os.system(phymlCommand)
    #retrieve
    # NOTE(review): if the validation run fails, a stats file left by an
    # earlier run on the same alignment would be read here - confirm whether
    # that can happen in practice.
    try:
        with open(testAln + "_phyml_stats.txt", "r") as statsFile:
            stats = statsFile.read().split()
        return float(stats[stats.index("Log-likelihood:")+1])
    except (IOError, OSError, ValueError, IndexError):
        return float("nan")


def phymlCrossVal(seqArray0, seqArray1, indNames, model, opt, phyml, prefix = "",tmpDir=None, test = False, log="/dev/null"):
    """Two-way cross-validation of two alignments with PhyML.

    Each alignment is written to its own named temporary file. A tree is
    built from the first alignment and the second alignment's likelihood is
    evaluated on that tree, then the roles are swapped. The two
    log-likelihoods are summed.

    Parameters:
        seqArray0, seqArray1: the two sets of sequences, each handed to
            ``genomics.makeAlnString`` together with *indNames*.
        indNames: sequence names shared by both alignments.
        model: value passed to PhyML's ``--model`` option.
        opt: value passed to PhyML's ``-o`` option for the tree-building runs.
        phyml: PhyML command (string placed at the start of the command line).
        prefix: prefix for the temporary alignment file names.
        tmpDir: directory for the temporary files (None lets tempfile choose).
        test: when true, temporary and output files are kept.
        log: path the commands' standard output is appended to.

    Returns:
        ``str(lnL0 + lnL1)``; this is ``"nan"`` if either likelihood could
        not be read.
    """
    #write file
    tempAln0 = tempfile.NamedTemporaryFile(mode="w",prefix=prefix,suffix=".0.phy",dir=tmpDir,delete=False)
    tempAln1 = tempfile.NamedTemporaryFile(mode="w",prefix=prefix,suffix=".1.phy",dir=tmpDir,delete=False)
    localName0 = os.path.basename(tempAln0.name)
    localName1 = os.path.basename(tempAln1.name)
    # the names come from the indNames parameter (the body previously read an
    # undefined `seqNames`)
    with tempAln0 as aln0:
        aln0.write(genomics.makeAlnString(indNames,seqArray0))
    with tempAln1 as aln1:
        aln1.write(genomics.makeAlnString(indNames,seqArray1))
    #first way validation
    lnL1 = _phymlCrossValLnL(phyml, model, opt, log, tempAln0.name, tempAln1.name)
    #second way validation
    lnL0 = _phymlCrossValLnL(phyml, model, opt, log, tempAln1.name, tempAln0.name)
    #remove files
    if not test:
        # take the directory from the temp file itself so that the default
        # tmpDir=None (tempfile's own choice of directory) is cleaned too
        outDir = os.path.dirname(tempAln0.name)
        for f in os.listdir(outDir):
            if localName0 in f or localName1 in f:
                os.remove(os.path.join(outDir, f))
    return str(lnL0+lnL1)
def raxTree(seqArray, seqNames, model, raxml, outgroup=None, prefix="", test=False, log="/dev/null"):
    """Run RAxML on one alignment and return the resulting tree line.

    The alignment is written to a named temporary ``.phy`` file in the
    current working directory; the file's base name doubles as the RAxML
    run name (``-n``). RAxML is launched through the shell with its output
    appended to *log*, and the first line of ``RAxML_bestTree.<run name>``
    is read back.

    Parameters:
        seqArray: sequences, handed to ``genomics.makeAlnString``.
        seqNames: sequence names, handed to ``genomics.makeAlnString``.
        model: value passed to RAxML's ``-m`` option.
        raxml: RAxML command (string placed at the start of the command line).
        outgroup: optional iterable of names, comma-joined for ``-o``.
        prefix: prefix for the temporary alignment file name.
        test: when true, echo the command to stderr and keep all files.
        log: path the command's standard output is appended to.

    Returns:
        The first line of the best-tree file exactly as read (trailing
        newline included), or ``"NA\\n"`` when that file cannot be opened.
    """
    #temp file
    tempAln = tempfile.NamedTemporaryFile(mode="w", prefix=prefix, suffix=".phy", dir=".", delete=False)
    localName = os.path.basename(tempAln.name)
    with tempAln as tA:
        tA.write(genomics.makeAlnString(seqNames, seqArray))
    if outgroup is not None:
        og = " -o " + ",".join(outgroup)
    else:
        og = ""
    raxCommand = raxml + " -s " + tempAln.name + " -n " + localName + " -m " + model + og + " -V -f d -p 12345 --silent >>" + log
    if test:
        sys.stderr.write("raxml command:\n" + raxCommand + "\n")
    os.system(raxCommand)
    #try retrieve the result
    try:
        with open("RAxML_bestTree." + localName, "r") as treeFile:
            tree = treeFile.readline()
    except (IOError, OSError):
        # RAxML produced no tree (or it is unreadable): report a placeholder
        tree = "NA\n"
    #remove files
    if not test:
        # every file in the working directory whose name contains the run name
        for f in os.listdir("."):
            if localName in f:
                os.remove(f)
    return tree
# Stream the requested region out of the geno file with "tabix -h" and parse it.
sys.stderr.write("Getting region " + region + " from geno file...\n")
genoStream = subprocess.Popen(['tabix', '-h', args.genoFile, region],
                              stdout=subprocess.PIPE)
window = genomics.parseGenoFile(genoStream.stdout,
                                names=args.samples,
                                includePositions=True,
                                splitPhased=args.split)
seqDict = window.seqDict()
seqNames = seqDict.keys()

# Build one CDS per sequence from the gene's CDS start/end lists and strand.
sys.stderr.write("Extracting CDS...\n")
CDSseqs = []
for name in seqNames:
    CDSseqs.append(genomics.CDS(seqDict[name],
                                window.positions,
                                geneData[scaffold][mRNA]['cdsStarts'],
                                geneData[scaffold][mRNA]['cdsEnds'],
                                geneData[scaffold][mRNA]['strand']))

# Output names have the form "<name>_<mRNA> <scaffold> <start>-<end>".
outputNames = []
for name in seqNames:
    label = name + "_" + mRNA + " " + scaffold + " "
    label = label + str(geneData[scaffold][mRNA]['start']) + "-" + str(geneData[scaffold][mRNA]['end'])
    outputNames.append(label)

# Write the sequences as fasta (lineLen=None is passed through unchanged).
fastaString = genomics.makeAlnString(outputNames, CDSseqs, outFormat="fasta", lineLen=None)
outFile.write(fastaString + "\n")
outFile.close()
# Add one region per line of the regions file, if one was given.
if args.regionsFile:
    with open(args.regionsFile, "r") as rf:
        for line in rf:
            regions.append(parseRegionList(line.split()))

#only filter and chop sequences if necessary
if len(regions) >= 1:
    outNames = []
    outSeqs = []
    for seqName,start,end,ori in regions:
        i = names.index(seqName)
        outNames.append(seqName)
        if start is not None or end is not None or ori == "-":
            seqLen = len(seqs[i])
            # a missing bound defaults to the corresponding end of the sequence
            if start is None:
                start = 1
            if end is None:
                end = seqLen
            # extend by the requested flanks, clamped to the sequence;
            # coordinates are 1-based and inclusive (sliced as start-1:end)
            start = max(1,start-args.extendLeft)
            end = min(seqLen,end+args.extendRight)
            subSeq = seqs[i][start-1:end]
            if ori == "-":
                # minus-orientation regions go through genomics.revTrans
                outSeqs.append(genomics.revTrans(subSeq))
            else:
                outSeqs.append(subSeq)
            if not args.preserveNames:
                # tag the name with the coordinates actually used and the orientation
                outNames[-1] = outNames[-1] + ":" + str(start) + "-" + str(end) + ":" + ori
        else:
            # whole sequence requested: pass it through untouched
            outSeqs.append(seqs[i])
else:
    outNames = names
    outSeqs = seqs

sys.stderr.write("\nWriting %i sequences.\n" %len(outNames))
sys.stdout.write(genomics.makeAlnString(names=outNames,seqs=outSeqs,outFormat=outFormat,lineLen=l))
#now update if there is data for this window
# Sites are requested as dicts only when specific samples were named, so that
# genotypes can then be looked up by sample name.
for siteData in reader.siteBySite(asDict=bool(args.samples)):
    if args.samples:
        GTs = [siteData["GTs"][name] for name in args.samples]
    else:
        GTs = siteData["GTs"]
    if strand == "-":
        # minus-strand genes store the complemented genotypes
        siteGTdict[siteData["position"]] = [genomics.complement(gt) for gt in GTs]
    else:
        siteGTdict[siteData["position"]] = GTs

# Assemble, for each sequence index, its genotype at every CDS position.
cdsPositions = genomics.CDSpositions(cdsStarts, cdsEnds, strand)
cdsSeqs = []
for i in range(nSeqs):
    cdsSeqs.append([siteGTdict[position][i] for position in cdsPositions])

# Fasta names also carry the scaffold and gene coordinates; other formats
# get only "<name>_<mRNA>".
if args.outFormat == "fasta":
    outputNames = []
    for name in reader.names:
        label = name + "_" + mRNA + " " + scaffold + " "
        label = label + str(geneData[scaffold][mRNA]['start']) + "-" + str(geneData[scaffold][mRNA]['end'])
        outputNames.append(label)
else:
    outputNames = [name + "_" + mRNA for name in reader.names]

alnString = genomics.makeAlnString(outputNames, cdsSeqs, outFormat=args.outFormat, lineLen=None)
outFile.write(alnString + "\n")
outFile.close()
# Input and output formats default to fasta unless the phylip flags are set.
if args.phylipIn:
    inFormat = "phylip"
else:
    inFormat = "fasta"

if args.phylipOut:
    outFormat = "phylip"
else:
    outFormat = "fasta"

l = args.lineLen

########################################################################

# The whole alignment is read from stdin in one go.
allText = sys.stdin.read()

if inFormat == "fasta":
    names, seqs = genomics.parseFasta(allText)
else:
    names, seqs = genomics.parsePhylip(allText)

if args.truncateNames:
    # keep only the first whitespace-delimited word of each name
    names = [name.split()[0] for name in names]

if args.seqNames:
    # keep only the requested sequences, in the order they were requested
    outNames = args.seqNames.split(",")
    keep = [names.index(name) for name in outNames]
    names = [names[k] for k in keep]
    seqs = [seqs[k] for k in keep]

if args.start or args.end:
    # start is 1-based; when only an end is supplied (start unset or 0) slice
    # from the beginning rather than evaluating "None - 1" or using index -1
    startIndex = args.start - 1 if args.start else 0
    seqs = [s[startIndex:args.end] for s in seqs]

sys.stdout.write(genomics.makeAlnString(names=names, seqs=seqs, outFormat=outFormat, lineLen=l))
# Mode dispatch for the geno-to-alignment script (statements collapsed onto one line below):
#   * samples: args.samples split on commas, or None when no samples were given.
#   * mode "cat": parse the whole geno file into a single window-like object, write it as
#     one alignment in args.format, close both files and exit.
#   * mode "windows": copy the window settings (windType, windSize, minSites, stepSize,
#     overlap, maxDist) from args.
#   * mode "contigs": handled by the final else branch, whose body is not part of this line.
############################# samples = args.samples.split(",") if args.samples else None # if concatenating all contigs, just parse file and write if args.mode == "cat": #read file into window like object window = genomics.parseGenoFile(genoFile, names=samples, splitPhased=args.splitPhased) #write seqDict = window.seqDict() seqFile.write( genomics.makeAlnString(window.names, [seqDict[name] for name in window.names], outFormat=args.format)) genoFile.close() seqFile.close() exit() if args.mode == "windows" or args.mode == "contigs": if args.mode == "windows": windType = args.windType windSize = args.windSize minSites = args.minSites stepSize = args.stepSize overlap = args.overlap maxDist = args.maxDist else: #to get contigs, we just use very large non-overlapping coordinate windows