示例#1
0
文件: tree.py 项目: jnktsj/norg-suite
def fastaToPhylip(infile, outfile):
    """Convert a FASTA file into a one-line-per-record PHYLIP-style file.

    The output starts with a header line "<record count> <length>", where
    the length is taken from the first record only (records are presumably
    aligned and therefore equally long -- this is not checked here).  Each
    record follows as its sanitized name, padding, and its sequence.

    Args:
        infile: path handed to readFasta(), which yields (name, seq) pairs.
        outfile: path of the file to write (overwritten if it exists).

    Raises:
        ValueError: if infile yields no records.
    """
    # Read everything before touching the output, so that a failure while
    # parsing does not leave a truncated or empty outfile behind.
    orgs = [(name, seq) for name, seq in readFasta(infile)]
    if not orgs:
        raise ValueError("no sequences found in " + str(infile))

    # Space, tab and the characters "()[]:;" become "_"; "," becomes "|".
    marks = string.maketrans(" \t()[]:;,", "________|")
    # Every name is padded to the longest name plus four spaces, so that
    # all sequences start in the same column.
    width = max(len(name) for name, _ in orgs) + 4
    seqlen = len(orgs[0][1])

    with open(outfile, "w") as f:
        f.write(str(len(orgs)) + " " + str(seqlen) + "\n")
        for name, seq in orgs:
            f.write(name.translate(marks).ljust(width) + seq + "\n")
示例#2
0
def fastaToPhylip(infile, outfile):
    """Convert a FASTA file into a one-line-per-record PHYLIP-style file.

    The output starts with a header line "<record count> <length>", where
    the length is taken from the first record only (records are presumably
    aligned and therefore equally long -- this is not checked here).  Each
    record follows as its sanitized name, padding, and its sequence.

    Args:
        infile: path handed to readFasta(), which yields (name, seq) pairs.
        outfile: path of the file to write (overwritten if it exists).

    Raises:
        ValueError: if infile yields no records.
    """
    # Read everything before touching the output, so that a failure while
    # parsing does not leave a truncated or empty outfile behind.
    orgs = [(name, seq) for name, seq in readFasta(infile)]
    if not orgs:
        raise ValueError("no sequences found in " + str(infile))

    # Space, tab and the characters "()[]:;" become "_"; "," becomes "|".
    marks = string.maketrans(" \t()[]:;,", "________|")
    # Every name is padded to the longest name plus four spaces, so that
    # all sequences start in the same column.
    width = max(len(name) for name, _ in orgs) + 4
    seqlen = len(orgs[0][1])

    with open(outfile, "w") as f:
        f.write(str(len(orgs)) + " " + str(seqlen) + "\n")
        for name, seq in orgs:
            f.write(name.translate(marks).ljust(width) + seq + "\n")
示例#3
0
def nucTree(prog, opts, f, refSeq, wd, reftree, organelle):
    """Build one phylogenetic tree per insertion sequence read from *f*.

    The work is done in three passes over the records of *f*:

    1. write "<wd>/<name>.mfa", holding the insertion followed by the
       matching subsequence of every reference entry;
    2. align each .mfa with exeMafft() and convert the result to
       "<wd>/<name>.msa" with fastaToPhylip();
    3. run exeRaxml() on each .msa to produce "<wd>/<name>.ptr".

    Record names are split on ":"; fields 4 and 5 are comma-separated
    integer begin/end lists, and a last field of "-" means the sequence is
    reverse-complemented before it is written.

    Args:
        prog: program name used as prefix of the verbose messages.
        opts: options object; verbose, mafft and raxml are read here.
        f: FASTA input handed to readFasta().
        refSeq: indexable collection of entries whose [0] is a name and
            [1] a sequence; entry 0 is the one passed to findRefIndex().
        wd: working directory that receives all generated files.
        reftree: passed through to exeRaxml().
        organelle: passed through to exeRaxml().

    Returns:
        List of the generated ".ptr" paths, in input order.
    """
    import os  # local import: only needed for the temp-file cleanup below

    files = []
    runId = strftime("%H%M%S_nuc", gmtime())
    revcmp = string.maketrans("ACGTNSWRYKMBDHVacgtnswrykmbdhv",
                              "TGCANSWYRMKVHDBtgcanswyrmkvhdb")

    # generate multi-fasta sequence files
    if opts.verbose:
        print(prog + ": create fasta file(s): insertions and corresponding organelle subsequences")

    for name, seq in readFasta(f):
        coord = name.split(":")
        beg = [int(x) for x in coord[4].split(",")]
        end = [int(x) for x in coord[5].split(",")]
        refBeg, refEnd = findRefIndex(beg, end, refSeq[0][1])
        if coord[-1] == "-":
            seq = seq[::-1].translate(revcmp)
        chunk = len(refBeg)
        path = wd + "/" + name + ".mfa"
        with open(path, "w") as fp:
            writeFasta(">" + name, seq, fp)
            for ref in refSeq:
                subseq = "".join(ref[1][refBeg[j]:refEnd[j]]
                                 for j in range(chunk))
                # entries with nothing in the selected ranges are skipped;
                # "-" characters are stripped before writing
                if subseq:
                    writeFasta(">" + ref[0], subseq.replace("-", ""), fp)
        files.append(path)

    # compute multiple sequence alignments
    tmp = wd + "/" + runId
    if opts.verbose:
        print(prog + ": compute multiple sequence alignments of insertion sequences")
    for i, mfa in enumerate(files):
        # Strip only the trailing extension: splitting on the first "."
        # would cut paths such as "./work/x.mfa" or names containing ".".
        msa = mfa.rsplit(".", 1)[0] + ".msa"
        exeMafft(opts.mafft, "--localpair", mfa, tmp, seqLen(mfa))
        fastaToPhylip(tmp, msa)
        files[i] = msa
    try:
        os.remove(tmp)
    except OSError:
        # best-effort cleanup, as with the former "rm": the file does not
        # exist when there were no input records
        pass

    # calculate phylogenetic trees
    if opts.verbose:
        print(prog + ": calculate phylogenetic trees of insertion sequences")
    for i, msa in enumerate(files):
        ptr = msa.rsplit(".", 1)[0] + ".ptr"
        exeRaxml(opts.raxml, msa, ptr, runId, wd, reftree, organelle)
        files[i] = ptr
    return files
示例#4
0
文件: tree.py 项目: jnktsj/norg-suite
def nucTree(prog, opts, f, refSeq, wd, reftree, organelle):
    """Build one phylogenetic tree per insertion sequence read from *f*.

    The work is done in three passes over the records of *f*:

    1. write "<wd>/<name>.mfa", holding the insertion followed by the
       matching subsequence of every reference entry;
    2. align each .mfa with exeMafft() and convert the result to
       "<wd>/<name>.msa" with fastaToPhylip();
    3. run exeRaxml() on each .msa to produce "<wd>/<name>.ptr".

    Record names are split on ":"; fields 4 and 5 are comma-separated
    integer begin/end lists, and a last field of "-" means the sequence is
    reverse-complemented before it is written.

    Args:
        prog: program name used as prefix of the verbose messages.
        opts: options object; verbose, mafft and raxml are read here.
        f: FASTA input handed to readFasta().
        refSeq: indexable collection of entries whose [0] is a name and
            [1] a sequence; entry 0 is the one passed to findRefIndex().
        wd: working directory that receives all generated files.
        reftree: passed through to exeRaxml().
        organelle: passed through to exeRaxml().

    Returns:
        List of the generated ".ptr" paths, in input order.
    """
    import os  # local import: only needed for the temp-file cleanup below

    files = []
    runId = strftime("%H%M%S_nuc", gmtime())
    revcmp = string.maketrans("ACGTNSWRYKMBDHVacgtnswrykmbdhv",
                              "TGCANSWYRMKVHDBtgcanswyrmkvhdb")

    # generate multi-fasta sequence files
    if opts.verbose:
        print(prog + ": create fasta file(s): insertions and corresponding organelle subsequences")

    for name, seq in readFasta(f):
        coord = name.split(":")
        beg = [int(x) for x in coord[4].split(",")]
        end = [int(x) for x in coord[5].split(",")]
        refBeg, refEnd = findRefIndex(beg, end, refSeq[0][1])
        if coord[-1] == "-":
            seq = seq[::-1].translate(revcmp)
        chunk = len(refBeg)
        path = wd + "/" + name + ".mfa"
        with open(path, "w") as fp:
            writeFasta(">" + name, seq, fp)
            for ref in refSeq:
                subseq = "".join(ref[1][refBeg[j]:refEnd[j]]
                                 for j in range(chunk))
                # entries with nothing in the selected ranges are skipped;
                # "-" characters are stripped before writing
                if subseq:
                    writeFasta(">" + ref[0], subseq.replace("-", ""), fp)
        files.append(path)

    # compute multiple sequence alignments
    tmp = wd + "/" + runId
    if opts.verbose:
        print(prog + ": compute multiple sequence alignments of insertion sequences")
    for i, mfa in enumerate(files):
        # Strip only the trailing extension: splitting on the first "."
        # would cut paths such as "./work/x.mfa" or names containing ".".
        msa = mfa.rsplit(".", 1)[0] + ".msa"
        exeMafft(opts.mafft, "--localpair", mfa, tmp, seqLen(mfa))
        fastaToPhylip(tmp, msa)
        files[i] = msa
    try:
        os.remove(tmp)
    except OSError:
        # best-effort cleanup, as with the former "rm": the file does not
        # exist when there were no input records
        pass

    # calculate phylogenetic trees
    if opts.verbose:
        print(prog + ": calculate phylogenetic trees of insertion sequences")
    for i, msa in enumerate(files):
        ptr = msa.rsplit(".", 1)[0] + ".ptr"
        exeRaxml(opts.raxml, msa, ptr, runId, wd, reftree, organelle)
        files[i] = ptr
    return files
示例#5
0
def extractBaitSeq(baitSeq, baitRev, genome, coord):
    """Write bait sequences and their reversed copies to two FASTA files.

    For every record of *genome* that has coordinates in *coord*, the
    pieces returned by extractSeq() are written to *baitSeq* as-is and to
    *baitRev* reversed (reversed only, not complemented), under the same
    header.

    Args:
        baitSeq: output path for the forward sequences.
        baitRev: output path for the reversed sequences.
        genome: FASTA input handed to readFasta().
        coord: dict mapping a record name to its coordinates; records
            without an entry are skipped.
    """
    # "with" guarantees both outputs are closed even if a helper raises.
    with open(baitSeq, "w") as fs, open(baitRev, "w") as fr:
        for name, seq in readFasta(genome):
            # 200 is passed straight to editBaitCoord(); its meaning is
            # defined there
            c = editBaitCoord(coord.get(name, []), 200)
            if not c:
                continue
            for n, s in extractSeq(name, c, seq, True):
                # n is expected to hold exactly three ":"-separated fields
                beg, end = n.split(":")[1:]
                # the first and last comma-separated entries of each
                # coordinate list are left out of the header
                header = (">" + name + ":"
                          + ",".join(beg.split(",")[1:-1]) + ":"
                          + ",".join(end.split(",")[1:-1]))
                writeFasta(header, s, fs)
                writeFasta(header, s[::-1], fr)
示例#6
0
def extractBaitSeq(baitSeq, baitRev, genome, coord):
    """Write bait sequences and their reversed copies to two FASTA files.

    For every record of *genome* that has coordinates in *coord*, the
    pieces returned by extractSeq() are written to *baitSeq* as-is and to
    *baitRev* reversed (reversed only, not complemented), under the same
    header.

    Args:
        baitSeq: output path for the forward sequences.
        baitRev: output path for the reversed sequences.
        genome: FASTA input handed to readFasta().
        coord: dict mapping a record name to its coordinates; records
            without an entry are skipped.
    """
    # "with" guarantees both outputs are closed even if a helper raises.
    with open(baitSeq, "w") as fs, open(baitRev, "w") as fr:
        for name, seq in readFasta(genome):
            # 200 is passed straight to editBaitCoord(); its meaning is
            # defined there
            c = editBaitCoord(coord.get(name, []), 200)
            if not c:
                continue
            for n, s in extractSeq(name, c, seq, True):
                # n is expected to hold exactly three ":"-separated fields
                beg, end = n.split(":")[1:]
                # the first and last comma-separated entries of each
                # coordinate list are left out of the header
                header = (">" + name + ":"
                          + ",".join(beg.split(",")[1:-1]) + ":"
                          + ",".join(end.split(",")[1:-1]))
                writeFasta(header, s, fs)
                writeFasta(header, s[::-1], fr)
示例#7
0
def extractFlankSeq(flankSeq, flankRev, flanklen, hitDict, nucleus):
    """Write flanking sequences and their reversed copies to FASTA files.

    For every record of *nucleus* that has hits in *hitDict*, the pieces
    returned by extractSeq() are written to *flankSeq* as-is and to
    *flankRev* reversed (reversed only, not complemented).  Empty pieces
    are skipped.

    Args:
        flankSeq: output path for the forward sequences.
        flankRev: output path for the reversed sequences.
        flanklen: passed to makeFlankCoord() together with the hits.
        hitDict: dict mapping a record name to its hits; records without
            an entry are skipped.
        nucleus: FASTA input handed to readFasta().

    Returns:
        Number of non-empty pieces written (counted once per piece, not
        once per file).
    """
    wrote = 0
    mark = string.maketrans(":", "^")
    # "with" guarantees both outputs are closed even if a helper raises.
    with open(flankSeq, "w") as fs, open(flankRev, "w") as fr:
        for name, seq in readFasta(nucleus):
            coord = makeFlankCoord(hitDict.get(name, []), flanklen)
            if not coord:
                continue
            # ":" is the header field separator, so it is replaced by "^"
            # inside the record name
            safeName = name.translate(mark)
            for n, s in extractSeq(name, coord, seq, True):
                # n is expected to hold exactly three ":"-separated fields
                beg, end = n.split(":")[1:]
                # header keeps the last begin and the first end entry
                header = (">" + safeName + ":"
                          + beg.split(",")[-1] + ":" + end.split(",")[0])
                if s:
                    writeFasta(header, s, fs)
                    writeFasta(header, s[::-1], fr)
                    wrote += 1
    return wrote
示例#8
0
def extractFlankSeq(flankSeq, flankRev, flanklen, hitDict, nucleus):
    """Write flanking sequences and their reversed copies to FASTA files.

    For every record of *nucleus* that has hits in *hitDict*, the pieces
    returned by extractSeq() are written to *flankSeq* as-is and to
    *flankRev* reversed (reversed only, not complemented).  Empty pieces
    are skipped.

    Args:
        flankSeq: output path for the forward sequences.
        flankRev: output path for the reversed sequences.
        flanklen: passed to makeFlankCoord() together with the hits.
        hitDict: dict mapping a record name to its hits; records without
            an entry are skipped.
        nucleus: FASTA input handed to readFasta().

    Returns:
        Number of non-empty pieces written (counted once per piece, not
        once per file).
    """
    wrote = 0
    mark = string.maketrans(":", "^")
    # "with" guarantees both outputs are closed even if a helper raises.
    with open(flankSeq, "w") as fs, open(flankRev, "w") as fr:
        for name, seq in readFasta(nucleus):
            coord = makeFlankCoord(hitDict.get(name, []), flanklen)
            if not coord:
                continue
            # ":" is the header field separator, so it is replaced by "^"
            # inside the record name
            safeName = name.translate(mark)
            for n, s in extractSeq(name, coord, seq, True):
                # n is expected to hold exactly three ":"-separated fields
                beg, end = n.split(":")[1:]
                # header keeps the last begin and the first end entry
                header = (">" + safeName + ":"
                          + beg.split(",")[-1] + ":" + end.split(",")[0])
                if s:
                    writeFasta(header, s, fs)
                    writeFasta(header, s[::-1], fr)
                    wrote += 1
    return wrote
示例#9
0
def align(prog, opts, args, wd):
    """Align the organelle genome to the nuclear genome and collect hits.

    Runs the external tools tantan, lastdb, lastex and lastal through the
    shell, writing all intermediate files into *wd*.

    Args:
        prog: program name used as prefix of the verbose messages.
        opts: options object.  Read: tantan and last (command prefixes),
            dnaform, rmsk_organelle, rmsk_nucleus, evalue, greedy, verbose.
            Modified: greedy is forced to False when evalue > 0, otherwise
            evalue is set to 0.01.
        args: args[0] is the organelle FASTA file (must hold exactly one
            sequence), args[1] the nuclear genome FASTA file.
        wd: working directory for all generated files.

    Returns:
        Tuple (hitDict, hitCounts, organelle, nucleus): the merged hits
        keyed by the keys formatResult() produced, the total number of
        merged hits, and the paths of the organelle and nuclear sequence
        files that were actually aligned.

    Raises:
        Exception: if args[0] does not hold exactly one sequence, or if
            lastal reports no hits.
    """
    # command line format strings
    tantan = opts.tantan + "tantan %s > %s"
    lastdb = opts.last + "lastdb %s %s %s"
    lastal = opts.last + "lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    lastex = opts.last + "lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # (soft-masked) sequence file names
    organelle = wd + "/orgSeq"
    nucleus = wd + "/nucSeq"

    # index file names
    nucBaseFreq = wd + "/nucBaseFreq"
    orgIndex = wd + "/orgIndex"

    # simulation file names
    orgRev = wd + "/orgRev"
    orgRevIndex = wd + "/orgRevIndex"

    # double circular organelle genome
    if int(commands.getoutput("grep -c '>' " + args[0])) != 1:
        raise Exception("there must be exactly 1 sequence: " + args[0])
    commands.getoutput("cp " + args[0] + " " + organelle)
    if opts.dnaform == 'circular':
        if opts.verbose:
            print(prog + ": double organelle sequence")
        # appending the sequence lines a second time doubles the genome
        commands.getoutput("grep -v '>' " + args[0] + " >> " + organelle)

    # soft-mask genomes with 'tantan'
    if opts.rmsk_organelle:
        if opts.verbose:
            print(prog + ": soft-mask organelle genome")
        tmpfile = wd + "/tmp"
        commands.getoutput(tantan % (organelle, tmpfile))
        commands.getoutput("mv " + tmpfile + " " + organelle)
    if opts.rmsk_nucleus:
        if opts.verbose:
            print(prog + ": soft-mask nuclear genome")
        commands.getoutput(tantan % (args[1], nucleus))
    else:
        # without masking the original file is used directly
        nucleus = args[1]

    # index genomes with 'lastdb'
    # NOTE(review): "-c" is passed only when neither genome was masked by
    # tantan above -- confirm against the lastdb documentation that this
    # is the intended direction.
    if opts.rmsk_organelle or opts.rmsk_nucleus:
        lastdbOption = " "
    else:
        lastdbOption = "-c"
    commands.getoutput(lastdb % ("-x", nucBaseFreq, nucleus))
    commands.getoutput(lastdb % (lastdbOption, orgIndex, organelle))

    # calculate (and simulate) E-value
    # an explicit positive E-value disables the simulation
    if opts.evalue > 0:
        opts.greedy = False
    else:
        opts.evalue = 0.01
    evalue = "%.2e" % opts.evalue
    score = commands.getoutput(lastex % (evalue, orgIndex, nucBaseFreq))
    if opts.greedy:
        if opts.verbose:
            print(prog + ": simulate E-value threshold")
        # reversed (not complemented) organelle sequence for the simulation
        with open(orgRev, "w") as f:
            for name, seq in readFasta(organelle):
                writeFasta(">" + name, seq[::-1], f)
        commands.getoutput(lastdb % (lastdbOption, orgRevIndex, orgRev))
        score, evalue = evalueSimulation(lastex, lastal, orgRevIndex,
                                         nucBaseFreq, nucleus, score, evalue)
        commands.getoutput("rm " + orgRev + " " + orgRevIndex + "*")

    # align organelle and nuclear genomes
    if opts.verbose:
        print(prog + ": set: e-value=" + evalue + " / score=" + score)
        print(prog + ": start alignment")
    alignResult = commands.getoutput(lastal % (score, orgIndex, nucleus))
    if not alignResult:
        raise Exception("no hits are found")

    # format result
    hitDict = formatResult(alignResult)

    # merge overlapped hits
    if opts.verbose:
        print(prog + ": cull overlapped alignments")
    # only existing keys are reassigned, so the dict size never changes
    for chrom in list(hitDict):
        hitDict[chrom].sort()
        hitDict[chrom] = mergeOverlap(hitDict[chrom])
    hitCounts = sum(len(hits) for hits in hitDict.values())

    return hitDict, hitCounts, organelle, nucleus