def fastaToPhylip(infile, outfile):
    """Convert a (multi-)FASTA file into a PHYLIP-style alignment file.

    The output starts with a header line "<number of sequences> <length of
    the first sequence>", followed by one line per record: the sanitized
    name, padding, and the sequence.  Names are padded so that all sequences
    start in the same column (four spaces after the longest name).

    Characters in names that are replaced: space, tab, "(", ")", "[", "]",
    ":" and ";" become "_", and "," becomes "|".

    Args:
        infile: path of the FASTA file, read through readFasta().
        outfile: path of the file to write (overwritten if it exists).

    Raises:
        IndexError: if infile contains no sequences.

    NOTE: the sequence length in the header is taken from the first record
    only; records are not checked for equal length.
    """
    marks = string.maketrans(" \t()[]:;,", "________|")

    # Read the whole input before touching the output, so that a failure
    # while reading does not leave an empty/truncated output file behind.
    orgs = [(name, seq) for name, seq in readFasta(infile)]
    if not orgs:
        # IndexError is what the unguarded indexing used to raise here;
        # the type is kept for callers, only the message is made explicit.
        raise IndexError("no sequences found in: " + str(infile))

    maxlen = max([len(name) for name, _ in orgs])
    seqlen = len(orgs[0][1])

    # 'with' guarantees the handle is closed even if a write fails.
    with open(outfile, "w") as f:
        f.write(str(len(orgs)) + " " + str(seqlen) + "\n")
        for name, seq in orgs:
            # translate() maps characters one-to-one, so the padding can be
            # computed from the raw name length.
            space = " " * (maxlen - len(name) + 4)
            f.write(name.translate(marks) + space + seq + "\n")
def nucTree(prog, opts, f, refSeq, wd, reftree, organelle):
    """Build one phylogenetic tree file per insertion sequence.

    The work is done in three phases, each completed for all insertions
    before the next one starts:

    1. For every record in FASTA file `f`, write "<wd>/<name>.mfa" holding
       the insertion sequence followed by the matching subsequence of every
       entry of `refSeq` (gap characters "-" removed, empty subsequences
       skipped).
    2. Align each .mfa file with exeMafft() and convert the result to a
       ".msa" file via fastaToPhylip().
    3. Run exeRaxml() on each .msa file, producing a ".ptr" file.

    Args:
        prog: program name used as prefix of progress messages.
        opts: options object; reads `verbose`, `mafft` and `raxml`.
        f: path of the FASTA file with insertion sequences.  Record names
            are ":"-separated; fields 4 and 5 are comma-separated integer
            lists and a last field of "-" marks the reverse strand.
        refSeq: sequence of (name, sequence) pairs; refSeq[0][1] is handed
            to findRefIndex() to translate coordinates
            (presumably an aligned reference -- TODO confirm).
        wd: working directory for all generated files.
        reftree, organelle: passed through to exeRaxml().

    Returns:
        List of ".ptr" file paths, one per insertion, in input order.
    """
    # Local import: the file's import block is not visible from here.
    import os

    files = []
    # Time-of-day based tag used for the temporary alignment file and
    # passed on to exeRaxml().
    runId = strftime("%H%M%S_nuc", gmtime())
    revcmp = string.maketrans("ACGTNSWRYKMBDHVacgtnswrykmbdhv",
                              "TGCANSWYRMKVHDBtgcanswyrmkvhdb")

    # generate multi-fasta sequence files
    if opts.verbose:
        print(prog + ": create fasta file(s): insertions and corresponding organelle subsequences")
    for name, seq in readFasta(f):
        coord = name.split(":")
        beg = [int(x) for x in coord[4].split(",")]
        end = [int(x) for x in coord[5].split(",")]
        refBeg, refEnd = findRefIndex(beg, end, refSeq[0][1])
        if coord[-1] == "-":
            # reverse strand: store the reverse complement
            seq = seq[::-1].translate(revcmp)
        mfa = wd + "/" + name + ".mfa"
        chunk = len(refBeg)
        # 'with' closes the file even if writing one of the records fails.
        with open(mfa, "w") as fp:
            writeFasta(">" + name, seq, fp)
            for ref in refSeq:
                subseq = "".join([ref[1][refBeg[j]:refEnd[j]]
                                  for j in range(chunk)])
                if subseq:
                    writeFasta(">" + ref[0], subseq.replace("-", ""), fp)
        files.append(mfa)

    # compute multiple sequence alignments
    tmp = wd + "/" + runId
    if opts.verbose:
        print(prog + ": compute multiple sequence alignments of insertion sequences")
    msaFiles = []
    for mfa in files:
        # splitext() strips only the trailing ".mfa"; splitting at the first
        # "." would truncate the path whenever the working directory or the
        # sequence name contains a dot, making different outputs collide.
        msa = os.path.splitext(mfa)[0] + ".msa"
        exeMafft(opts.mafft, "--localpair", mfa, tmp, seqLen(mfa))
        fastaToPhylip(tmp, msa)
        msaFiles.append(msa)
        # Best-effort cleanup of the temporary alignment (no shell needed,
        # so paths containing spaces are handled as well).
        try:
            os.remove(tmp)
        except OSError:
            pass

    # calculate phylogenetic trees
    if opts.verbose:
        print(prog + ": calculate phylogenetic trees of insertion sequences")
    trees = []
    for msa in msaFiles:
        ptr = os.path.splitext(msa)[0] + ".ptr"
        exeRaxml(opts.raxml, msa, ptr, runId, wd, reftree, organelle)
        trees.append(ptr)
    return trees
def extractBaitSeq(baitSeq, baitRev, genome, coord):
    """Write bait sequences and their reversed copies to two FASTA files.

    NOTE(review): a second, identical definition of extractBaitSeq follows
    this one in the file and replaces it when the module is loaded; one of
    the two should be removed.

    For every record of `genome` that has coordinates in `coord`, the
    coordinates are adjusted by editBaitCoord(..., 200) and the pieces
    returned by extractSeq() are written to `baitSeq`; the same pieces,
    reversed (not complemented), are written to `baitRev`.

    Args:
        baitSeq: output path for the extracted sequences.
        baitRev: output path for the reversed sequences.
        genome: path of the FASTA file to extract from.
        coord: dict mapping sequence name to its coordinate list; names
            without an entry are skipped.
    """
    # Nested 'with' blocks make sure both handles are closed even when
    # reading or writing raises part-way through.
    with open(baitSeq, "w") as fs:
        with open(baitRev, "w") as fr:
            for name, seq in readFasta(genome):
                c = editBaitCoord(coord.get(name, []), 200)
                if not c:
                    continue
                for n, s in extractSeq(name, c, seq, True):
                    # label is expected to look like "<name>:<begs>:<ends>"
                    beg, end = n.split(":")[1:]
                    # drop the first and last entry of each coordinate list
                    inner_beg = ",".join(beg.split(",")[1:-1])
                    inner_end = ",".join(end.split(",")[1:-1])
                    n = ">" + name + ":" + inner_beg + ":" + inner_end
                    writeFasta(n, s, fs)
                    writeFasta(n, s[::-1], fr)
def extractBaitSeq(baitSeq, baitRev, genome, coord):
    """Write bait sequences and their reversed copies to two FASTA files.

    NOTE(review): an identical definition of this function appears earlier
    in the file; this later one is the definition in effect.

    For every record of `genome` that has coordinates in `coord`, the
    coordinates are adjusted by editBaitCoord(..., 200) and the pieces
    returned by extractSeq() are written to `baitSeq`; the same pieces,
    reversed (not complemented), are written to `baitRev`.

    Args:
        baitSeq: output path for the extracted sequences.
        baitRev: output path for the reversed sequences.
        genome: path of the FASTA file to extract from.
        coord: dict mapping sequence name to its coordinate list; names
            without an entry are skipped.
    """
    # Nested 'with' blocks make sure both handles are closed even when
    # reading or writing raises part-way through.
    with open(baitSeq, "w") as fs:
        with open(baitRev, "w") as fr:
            for name, seq in readFasta(genome):
                c = editBaitCoord(coord.get(name, []), 200)
                if not c:
                    continue
                for n, s in extractSeq(name, c, seq, True):
                    # label is expected to look like "<name>:<begs>:<ends>"
                    beg, end = n.split(":")[1:]
                    # drop the first and last entry of each coordinate list
                    n = ">" + name + ":" + \
                        ",".join(beg.split(",")[1:-1]) + ":" + \
                        ",".join(end.split(",")[1:-1])
                    writeFasta(n, s, fs)
                    writeFasta(n, s[::-1], fr)
def extractFlankSeq(flankSeq, flankRev, flanklen, hitDict, nucleus):
    """Write flanking sequences and their reversed copies to FASTA files.

    For every record of `nucleus` that has hits in `hitDict`, flank
    coordinates are derived with makeFlankCoord(hits, flanklen) and the
    pieces returned by extractSeq() are written to `flankSeq`; the same
    pieces, reversed (not complemented), are written to `flankRev`.
    Empty pieces are skipped.

    Each header has the form ">" + name + ":" + last begin + ":" + first
    end, where ":" inside the sequence name is replaced by "^" so the
    header keeps exactly three ":"-separated fields.

    Args:
        flankSeq: output path for the extracted sequences.
        flankRev: output path for the reversed sequences.
        flanklen: flank length handed to makeFlankCoord().
        hitDict: dict mapping sequence name to its hit list; names without
            an entry are skipped.
        nucleus: path of the FASTA file to extract from.

    Returns:
        Number of non-empty pieces written (each is written to both files
        but counted once).
    """
    wrote = 0
    mark = string.maketrans(":", "^")
    # Nested 'with' blocks make sure both handles are closed even when
    # reading or writing raises part-way through.
    with open(flankSeq, "w") as fs:
        with open(flankRev, "w") as fr:
            for name, seq in readFasta(nucleus):
                coord = makeFlankCoord(hitDict.get(name, []), flanklen)
                if not coord:
                    continue
                safeName = name.translate(mark)
                for n, s in extractSeq(name, coord, seq, True):
                    # Take the last two ":"-separated fields so that a ":"
                    # inside the sequence name cannot break the unpacking
                    # (assumes labels look like "<name>:<begs>:<ends>" --
                    # TODO confirm against extractSeq).
                    beg, end = n.rsplit(":", 2)[1:]
                    n = ">" + safeName + ":" + \
                        beg.split(",")[-1] + ":" + end.split(",")[0]
                    if s:
                        writeFasta(n, s, fs)
                        writeFasta(n, s[::-1], fr)
                        wrote += 1
    return wrote
def align(prog, opts, args, wd):
    """Align an organelle genome against a nuclear genome with LAST.

    Steps, all executed through shell commands in `wd`:
    copy (and, for circular DNA, double) the organelle sequence, optionally
    soft-mask either genome with tantan, index both with lastdb, derive a
    score threshold with lastex (optionally refined by a simulation against
    the reversed organelle sequence), run lastal, and finally merge
    overlapping hits per sequence.

    Args:
        prog: program name used as prefix of progress messages.
        opts: options object.  Reads `tantan` and `last` (command prefixes),
            `dnaform`, `verbose`, `rmsk_organelle`, `rmsk_nucleus`, `evalue`
            and `greedy`.  NOTE: this function modifies it -- `greedy` is
            set to False when `evalue` > 0, otherwise `evalue` is set to
            0.01.
        args: args[0] is the organelle FASTA path (must contain exactly one
            sequence), args[1] the nuclear FASTA path.
        wd: working directory for intermediate files.

    Returns:
        Tuple (hitDict, hitCounts, organelle, nucleus): the merged hits as
        returned by formatResult()/mergeOverlap(), the total number of
        merged hits, the path of the organelle sequence copy in `wd`, and
        the path of the nuclear sequence used (the masked copy in `wd`, or
        args[1] when the nucleus was not masked).

    Raises:
        Exception: if args[0] does not hold exactly one sequence, or if the
            alignment yields no output.

    NOTE(review): file paths are interpolated into shell command lines
    without quoting; paths containing spaces or shell metacharacters will
    break these commands.
    """
    # command line format strings
    tantan = opts.tantan + "tantan %s > %s"
    lastdb = opts.last + "lastdb %s %s %s"
    lastal = opts.last + "lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    lastex = opts.last + "lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # (soft-masked) sequence file names
    organelle = wd + "/orgSeq"
    nucleus = wd + "/nucSeq"
    # index file names
    nucBaseFreq = wd + "/nucBaseFreq"
    orgIndex = wd + "/orgIndex"
    # simulation file names
    orgRev = wd + "/orgRev"
    orgRevIndex = wd + "/orgRevIndex"

    # double circular organelle genome
    if int(commands.getoutput("grep -c '>' " + args[0])) != 1:
        raise Exception("there must be exactly 1 sequence: " + args[0])
    commands.getoutput("cp " + args[0] + " " + organelle)
    if opts.dnaform == 'circular':
        if opts.verbose:
            print(prog + ": double organelle sequence")
        # append the sequence lines (without header) a second time
        commands.getoutput("grep -v '>' " + args[0] + " >> " + organelle)

    # soft-mask genomes with 'tantan'
    if opts.rmsk_organelle:
        if opts.verbose:
            print(prog + ": soft-mask organelle genome")
        tmpfile = wd + "/tmp"
        commands.getoutput(tantan % (organelle, tmpfile))
        commands.getoutput("mv " + tmpfile + " " + organelle)
    if opts.rmsk_nucleus:
        if opts.verbose:
            print(prog + ": soft-mask nuclear genome")
        commands.getoutput(tantan % (args[1], nucleus))
    else:
        # no masked copy is made: use the original file directly
        nucleus = args[1]

    # index genomes with 'lastdb'
    if opts.rmsk_organelle or opts.rmsk_nucleus:
        lastdbOption = " "
    else:
        lastdbOption = "-c"
    commands.getoutput(lastdb % ("-x", nucBaseFreq, nucleus))
    commands.getoutput(lastdb % (lastdbOption, orgIndex, organelle))

    # calculate (and simulate) E-value
    if opts.evalue > 0:
        # an explicit E-value disables the simulation
        opts.greedy = False
    else:
        opts.evalue = 0.01
    evalue = "%.2e" % opts.evalue
    score = commands.getoutput(lastex % (evalue, orgIndex, nucBaseFreq))
    if opts.greedy:
        if opts.verbose:
            print(prog + ": simulate E-value threshold")
        # 'with' closes the reversed-sequence file even if writing fails,
        # and guarantees it is flushed before lastdb reads it.
        with open(orgRev, "w") as f:
            for name, seq in readFasta(organelle):
                writeFasta(">" + name, seq[::-1], f)
        commands.getoutput(lastdb % (lastdbOption, orgRevIndex, orgRev))
        score, evalue = evalueSimulation(lastex, lastal, orgRevIndex,
                                         nucBaseFreq, nucleus, score, evalue)
        commands.getoutput("rm " + orgRev + " " + orgRevIndex + "*")

    # align organelle and nuclear genomes
    if opts.verbose:
        print(prog + ": set: e-value=" + evalue + " / score=" + score)
        print(prog + ": start alignment")
    alignResult = commands.getoutput(lastal % (score, orgIndex, nucleus))
    if not alignResult:
        raise Exception("no hits are found")

    # format result
    hitDict = formatResult(alignResult)

    # merge overlapped hits
    if opts.verbose:
        print(prog + ": cull overlapped alignments")
    for chrom in hitDict.keys():
        # mergeOverlap() is given the hits in sorted order
        hitDict[chrom].sort()
        hitDict[chrom] = mergeOverlap(hitDict[chrom])
    hitCounts = sum([len(hits) for hits in hitDict.values()])

    return hitDict, hitCounts, organelle, nucleus