Example #1
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the given folders. The folders can be
    generated by the split() process, where reads for each sample may sit in
    separate fastq files. This program merges them into one fastq per sample.
    """
    p = OptionParser(merge.__doc__)
    p.set_outdir(outdir="outdir")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    files = flatten(glob("{0}/*.*.fastq".format(x)) for x in folders)
    files = list(files)
    key = lambda x: op.basename(x).split(".")[0]
    files.sort(key=key)
    for id, fns in groupby(files, key=key):
        fns = list(fns)
        outfile = op.join(outdir, "{0}.fastq".format(id))
        FileMerger(fns, outfile=outfile).merge(checkexists=True)
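
A minimal invocation sketch (folder names are hypothetical; each of these
functions receives the remaining command-line tokens as a list, so it can be
called directly in Python):

    # merge per-sample fastq pieces collected from two split() output folders
    merge(["deconv1", "deconv2", "--outdir=merged"])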
Example #2
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap the minimap2 aligner to run query against reference sequences. When
    query and ref are the same, we are in "self-scan" mode (e.g. useful for
    finding internal duplications resulting from mis-assemblies).
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))
    f = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in f.itersizes():
        start = 0
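        # NOTE: range(chunks, size, chunks) stops at the last full multiple of
        # `chunks`, so a trailing piece shorter than `chunks` (and any sequence
        # shorter than `chunks`) is not scanned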
        for end in range(chunks, size, chunks):
            fafile = op.join(outdir,
                             "{}_{}_{}.fa".format(name, start + 1, end))
            cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, start + 1, end, fafile)
            mm.add(ref, fafile, cmd)

            paffile = fafile.rsplit(".", 1)[0] + ".paf"
            cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, cmd)

            epsfile = fafile.rsplit(".", 1)[0] + ".eps"
            cmd = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, cmd)
            start += chunks

    mm.write()
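
A minimal self-scan sketch (file name hypothetical; passing different ref and
query currently raises NotImplementedError):

    # emits a makefile of samtools faidx / minimap2 / minidot jobs,
    # intended to be executed with `make`
    minimap(["genome.fasta", "genome.fasta", "--chunks=1000000", "--outdir=scan"])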
Example #3
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength
    fastafile, qualfile = fasta(fastqfiles + [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile, minlength, pctid,
                         cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile,
                  userfile,
                  notmatchedfile,
                  clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
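
A usage sketch (file names hypothetical; the identity cutoff defaults to 95
via set_align(pctid=95)):

    cluster(["sample1", "sample1_R1.fastq", "sample1_R2.fastq",
             "--outdir=clust", "--cpus=8"])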
Example #4
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    p = OptionParser(fasta.__doc__)
    p.add_option("--seqtk",
                 default=False,
                 action="store_true",
                 help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    pf, sf = pf.rsplit(".", 1)
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)
    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

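    # NOTE: SeqIO.convert() truncates its output on each call, so with several
    # input fastq files only the last one survives in fastafile/qualfile, and
    # outdir is not applied on this code path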
    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
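
A usage sketch of the seqtk path (file name hypothetical); it returns the
fasta path and None in place of a qual file:

    fastafile, qualfile = fasta(["reads.fastq", "--seqtk", "--outdir=out"])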
Example #5
def fromsra(args):
    """
    %prog fromsra srafile

    Convert an SRA file to fastq using the sratoolkit `fastq-dump`.
    """
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files",
    )
    p.set_outdir()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile,) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        sys.exit()

    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
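
A usage sketch (accession hypothetical; `fastq-dump` must be on the PATH):

    fromsra(["SRR000001.sra", "--paired", "--compress=gzip", "--outdir=fastq"])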
Example #6
def fromsra(args):
    """
    %prog fromsra srafile

    Convert an SRA file to fastq using the sratoolkit `fastq-dump`.
    """
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end " + "[default: %default]",
    )
    p.add_option(
        "--compress", default=None, choices=["gzip", "bzip2"], help="Compress output fastq files [default: %default]"
    )
    p.set_outdir()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile,) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        sys.exit()

    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-3")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
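
The only functional difference from the previous example is the flag used for
paired-end layout: `--split-files` writes each mate to its own file, while
`--split-3` additionally diverts reads whose mate is missing into a third
file, keeping the _1/_2 outputs strictly paired.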
Example #7
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                    cutoff, qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".\
                    format(len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print("Remove {0}".format(c), file=sys.stderr)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that have a modified CLR
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
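
A usage sketch (file names hypothetical; the first run shells out to `faSplit`
from the UCSC tools, which must be on the PATH):

    annealedfasta = anneal(["scaffolds.agp", "contigs.fasta", "--outdir=outdir"])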
Example #8
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.add_option("--rnaseq",
                 default=False,
                 action="store_true",
                 help="Input is RNA-seq reads, turn splicing on")
    p.add_option("--native",
                 default=False,
                 action="store_true",
                 help="Convert GSNAP output to NATIVE format")
    p.set_home("eddyyeh")
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) == 2:
        logging.debug("Single-end alignment")
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    dbfile, readfile = args[:2]
    outdir = opts.outdir
    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile = op.join(outdir, prefix + ".log")
    gsnapfile = op.join(outdir, prefix + ".gsnap")
    nativefile = gsnapfile.rsplit(".", 1)[0] + ".unique.native"

    if not need_update((dbfile, readfile), gsnapfile):
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))
    else:
        dbdir, dbname = check_index(dbfile)
        cmd = "gsnap -D {0} -d {1}".format(dbdir, dbname)
        cmd += " -B 5 -m 0.1 -i 2 -n 3"  # memory, mismatch, indel penalty, nhits
        if opts.rnaseq:
            cmd += " -N 1"
        cmd += " -t {0}".format(opts.cpus)
        cmd += " --gmap-mode none --nofails"
        if readfile.endswith(".gz"):
            cmd += " --gunzip"
        try:
            offset = "sanger" if guessoffset([readfile]) == 33 else "illumina"
            cmd += " --quality-protocol {0}".format(offset)
        except AssertionError:
            pass
        cmd += " " + " ".join(args[1:])
        sh(cmd, outfile=gsnapfile, errfile=logfile)

    if opts.native:
        EYHOME = opts.eddyyeh_home
        if need_update(gsnapfile, nativefile):
            cmd = op.join(EYHOME, "convert2native.pl")
            cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
            cmd += " -proc {0}".format(opts.cpus)
            sh(cmd)

    return gsnapfile, logfile
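
A usage sketch for paired-end RNA-seq reads (file names hypothetical):

    gsnapfile, logfile = align(["genome.fasta", "reads_1.fq.gz", "reads_2.fq.gz",
                                "--rnaseq", "--cpus=16"])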
Example #9
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search, or is itself a single term.
    If the results are small in size, e.g. "--format=acc", use "--batchsize=100"
    to speed up the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out", help="output file name prefix")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must be >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(
        list_of_terms,
        retmax=opts.retmax,
        rettype=fmt,
        db=database,
        batchsize=batchsize,
        email=opts.email,
    ):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        printf(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
        )

    return outfile
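
A usage sketch (file name hypothetical; one accession per line in
accessions.txt):

    outfile = entrez(["accessions.txt", "--format=fasta", "--database=nuccore",
                      "--batchsize=100"])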
Example #10
def last(args, dbtype=None):
    """
    %prog last database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option(
        "--dbtype",
        default="nucl",
        choices=("nucl", "prot"),
        help="Molecule type of subject database",
    )
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask",
                 default=False,
                 action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option(
        "--format",
        default="BlastTab",
        choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
        help="Output format",
    )
    p.add_option(
        "--minlen",
        default=0,
        type="int",
        help="Filter alignments by how many bases match",
    )
    p.add_option("--minid",
                 default=0,
                 type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_outdir()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    if not dbtype:
        dbtype = opts.dbtype
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(
        infile=subject,
        outfile=subjectdb + ".prj",
        mask=opts.mask,
        lastdb_bin=lastdb_bin,
        dbtype=dbtype,
    )

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    mm = minid / (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
    sh(cmd, outfile=lastfile)
    return lastfile
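
A usage sketch (file names hypothetical; `lastdb` and `lastal` must be on the
PATH, or supplied via --path):

    lastfile = last(["genome.fasta", "query.fasta", "--format=BlastTab", "--cpus=8"])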
Example #11
def split(args):
    """
    %prog split barcodefile fastqfile1 ..

    Deconvolute fastq files into subsets of fastq reads, based on the barcodes
    in the barcodefile, which is a two-column file like:
    ID01	AGTCCAG

    Input fastqfiles can be several files. Output files are ID01.fastq,
    ID02.fastq, one file per line in barcodefile.

    When --paired is set, the number of input fastqfiles must be two. Output
    file (the deconvoluted reads) will be in interleaved format.
    """
    p = OptionParser(split.__doc__)
    p.set_outdir(outdir="deconv")
    p.add_option("--nocheckprefix", default=False, action="store_true",
                 help="Don't check shared prefix [default: %default]")
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end data [default: %default]")
    p.add_option("--append", default=False, action="store_true",
                 help="Append barcode to 2nd read [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    barcodefile = args[0]
    fastqfile = args[1:]
    paired = opts.paired
    append = opts.append
    if append:
        assert paired, "--append only works with --paired"

    nfiles = len(fastqfile)

    barcodes = []
    fp = open(barcodefile)
    for row in fp:
        id, seq = row.split()
        for s in unpack_ambiguous(seq):
            barcodes.append(BarcodeLine._make((id, s)))

    nbc = len(barcodes)
    logging.debug("Imported {0} barcodes (ambiguous codes expanded).".format(nbc))
    checkprefix = not opts.nocheckprefix

    if checkprefix:
        # Sanity check of shared prefix
        excludebarcodes = []
        for bc in barcodes:
            exclude = []
            for s in barcodes:
                if bc.id == s.id:
                    continue

                assert bc.seq != s.seq
                if s.seq.startswith(bc.seq) and len(s.seq) > len(bc.seq):
                    logging.error("{0} shares same prefix as {1}.".format(s, bc))
                    exclude.append(s)
            excludebarcodes.append(exclude)
    else:
        excludebarcodes = nbc * [[]]

    outdir = opts.outdir
    mkdir(outdir)

    cpus = opts.cpus
    logging.debug("Create a pool of {0} workers.".format(cpus))
    pool = Pool(cpus)

    if paired:
        assert nfiles == 2, \
            "You asked for --paired, but sent in {0} files".format(nfiles)
        split_fun = append_barcode_paired if append else split_barcode_paired
        mode = "paired"
    else:
        split_fun = split_barcode
        mode = "single"

    logging.debug("Mode: {0}".format(mode))

    pool.map(split_fun,
             zip(barcodes, excludebarcodes, nbc * [outdir], nbc * [fastqfile]))
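
A usage sketch (file names hypothetical; barcodes.txt follows the two-column
ID<tab>barcode format from the docstring):

    split(["barcodes.txt", "lane1_R1.fastq", "lane1_R2.fastq",
           "--paired", "--outdir=deconv"])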
Example #12
def prepare(args):
    """
    %prog prepare mcscanfile cdsfile [options]

    Pick sequences from cdsfile to form fasta files, according to multiple
    alignment in the mcscanfile.
    The fasta sequences can then be used to construct phylogenetic tree.

    Use --addtandem=tandemfile to collapse tandems of anchors into single row.
    The tandemfile must be provided with *ALL* genomes involved, otherwise the
    result will be incomplete and redundant.
    """
    from jcvi.graphics.base import discrete_rainbow

    p = OptionParser(prepare.__doc__)
    p.add_option("--addtandem", help="path to tandemfile [default: %default]")
    p.add_option("--writecolors", default=False, action="store_true", \
        help="generate a gene_name to color mapping file which will be taken " \
        "by jcvi.apps.phylo.draw [default: %default]")
    p.set_outdir(outdir="sequences")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mcscanfile, cdsfile = args

    if opts.addtandem:
        tandemfile = opts.addtandem
        mcscanfile_with_tandems = add_tandems(mcscanfile, tandemfile)
        mcscanfile = mcscanfile_with_tandems

    seqdir = opts.outdir
    mkdir(seqdir)
    f = Fasta(cdsfile)
    fp = must_open(mcscanfile)
    if opts.writecolors:
        fc = must_open("leafcolors.txt", "w")

    n = 0
    for i, row in enumerate(fp):
        row = row.strip().split("\t")
        if i == 0:
            ncols = len(row)
            if ncols <= 20:
                colors = discrete_rainbow(ncols, shuffle=False)[1]
            else:
                colors = discrete_rainbow(ncols, usepreset=False, shuffle=False)[1]
                warnings.warn("*** WARNING ***\n"
                              "Too many columns. Colors may not be all distinctive.")

        assert len(row) == ncols, "All rows should have same number of fields."

        anchors = set()
        for j, atom in enumerate(row):
            color = "%s,%s,%s" % colors[j]
            if atom == ".":
                continue
            # `fc` is only open when --writecolors is set
            for a in atom.split(","):
                if opts.writecolors:
                    fc.write("{0}\t{1}\n".format(a, color))
                anchors.add(a)

        if len(anchors) <= 3:
            print("Not enough seqs to build trees for {0}".format(anchors), file=sys.stderr)
            continue

        pivot = row[0]
        fw = must_open("%s/%s.cds" % (seqdir, pivot), "w")
        for a in anchors:
            if a not in f:
                print(a)
                a = find_first_isoform(a, f)
                assert a, a
            arec = f[a]
            SeqIO.write((arec), fw, "fasta")
        fw.close()
        n += 1

    if opts.writecolors:
        fc.close()
        logging.debug("leaf colors written to `{0}`".format(fc.name))

    logging.debug("cds of {0} syntelog groups written to {1}/".format(n, seqdir))

    return seqdir
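
A usage sketch (file names hypothetical):

    seqdir = prepare(["blocks.mcscan", "cds.fasta", "--writecolors",
                      "--outdir=sequences"])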
Example #13
def draw(args):
    """
    %prog draw --input newicktrees [options]

    Draw phylogenetic trees into single or combined plots.
    Input trees should be one of the following:
    1.  single Newick format tree file
    2.  a dir containing *ONLY* the tree files to be drawn

    Newick format:
    http://evolution.genetics.washington.edu/phylip/newicktree.html

    This function wraps jcvi.graphics.tree and is best used for trees generated
    by jcvi.apps.phylo (rooted if possible). For drawing general Newick trees
    from external sources, invoke jcvi.graphics.tree directly, which also gives
    more drawing options.
    """
    trunc_name_options = ['headn', 'oheadn', 'tailn', 'otailn']
    p = OptionParser(draw.__doc__)
    p.add_option("--input", help="path to single input tree file or a dir "\
                 "containing ONLY the input tree files")
    p.add_option("--combine", type="string", default="1x1", \
                 help="combine multiple trees into one plot in nrowxncol")
    p.add_option("--trunc_name", default=None, help="Options are: {0}. " \
                 "truncate first n chars, retains only first n chars, " \
                 "truncate last n chars, retain only last chars. " \
                 "n=1~99. [default: %default]".format(trunc_name_options))
    p.add_option("--SH", default=None,
                 help="path to a file containing SH test p-values in format:" \
                 "tree_file_name<tab>p-values " \
                 "This file can be generated with jcvi.apps.phylo build [default: %default]")
    p.add_option("--scutoff", default=50, type="int",
                 help="cutoff for displaying node support, 0-100 [default: %default]")
    p.add_option("--barcode", default=None,
                 help="path to seq/taxon name barcode mapping file: " \
                 "barcode<tab>new_name " \
                 "This option is downstream of `--trunc_name` [default: %default]")
    p.add_option("--leafcolorfile", default=None,
                 help="path to a mapping file containing font colors " \
                 "for the OTUs: leafname<tab>color [default: %default]")
    p.set_outdir()
    opts, args, iopts = p.set_image_options(args, figsize="8x6")
    input = opts.input
    outdir = opts.outdir
    combine = opts.combine.split("x")
    trunc_name = opts.trunc_name
    SH = opts.SH

    mkdir(outdir)
    if not input:
        sys.exit(not p.print_help())
    elif op.isfile(input):
        trees_file = input
        treenames = [op.basename(input)]
    elif op.isdir(input):
        trees_file = op.join(outdir, "alltrees.dnd")
        treenames = []
        for f in sorted(os.listdir(input)):
            sh("cat {0}/{1} >> {2}".format(input, f, trees_file), log=False)
            treenames.append(f)
    else:
        sys.exit(not p.print_help())

    trees = OrderedDict()
    tree = ""
    i = 0
    for row in LineFile(trees_file, comment="#", load=True).lines:
        if i == len(treenames):
            break
        if not len(row):
            continue

        if ";" in row:
            # sanity check
            if row.index(";") != len(row)-1:
                ts = row.split(";")
                for ii in range(len(ts) - 1):
                    ts[ii] += ";"
            else:
                ts = [row]
            for t in ts:
                if ";" in t:
                    tree += t
                    if tree:
                        trees[treenames[i]] = tree
                        tree = ""
                        i += 1
                else:
                    tree += t
        else:
            tree += row

    logging.debug("A total of {0} trees imported.".format(len(trees)))
    sh("rm {0}".format(op.join(outdir, "alltrees.dnd")))

    _draw_trees(trees, nrow=int(combine[0]), ncol=int(combine[1]), rmargin=.3,
                iopts=iopts, outdir=outdir, shfile=SH, trunc_name=trunc_name,
                scutoff=opts.scutoff, barcodefile=opts.barcode,
                leafcolorfile=opts.leafcolorfile)
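
A usage sketch (paths hypothetical; --input may be a single tree file or a
directory containing only tree files):

    draw(["--input=trees/", "--combine=2x3", "--scutoff=70", "--outdir=plots"])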
Example #14
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps the following steps:
    1. msa using ClustalW2 or MUSCLE (default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML (default) or PHYML, use keywords raxml or phyml;
       *WARNING* may be slow with large datasets

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail-truncated names can be provided as long as they are unique among the
    seqs. If not unique, the first occurrence will be used. For example, if you
    have two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are
    monophyletic, in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "\
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH", help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt", \
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree", help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: " \
                    "gene_name_pattern<tab>species_name [default: %default]")
    p.set_outdir()

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    if opts.treefix:
        stree = opts.stree
        smap = opts.smap
        assert stree and smap, "TreeFix requires stree and smap files."
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    p_recs = list(SeqIO.parse(open(protein_file), "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    n_recs = list(SeqIO.parse(open(dna_file), "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. " \
            "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta

    else:
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")

        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))
    if neighbor:
        out_file = op.join(treedir, op.basename(dna_file).rsplit(".", 1)[0] +
                           ".NJ.unrooted.dnd")
        try:
            outfile, phy_file = build_nj_phylip(
                alignment, outfile=out_file, outgroup=outgroup, work_dir=treedir)
        except Exception:
            print("NJ tree cannot be built for {0}".format(dna_file))

        if opts.SH:
            reftree = opts.SH
            querytree = outfile
            SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, op.basename(dna_file).rsplit(".", 1)[0] +
                           ".ML.unrooted.dnd")

        if opts.ml == "phyml":
            try:
                outfile, phy_file = build_ml_phyml(
                    alignment, outfile=out_file, work_dir=treedir)
            except Exception:
                print("ML tree cannot be built for {0}".format(dna_file))

        elif opts.ml == "raxml":
            try:
                outfile, phy_file = build_ml_raxml(
                    alignment, outfile=out_file, work_dir=treedir)
            except Exception:
                print("ML tree cannot be built for {0}".format(dna_file))

        if outgroup:
            new_out_file = out_file.replace(".unrooted", "")
            t = smart_reroot(treefile=out_file, outgroupfile=outgroup,
                             outfile=new_out_file)
            if t == new_out_file:
                sh("rm %s" % out_file)
                outfile = new_out_file

        if opts.SH:
            reftree = opts.SH
            querytree = outfile
            SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

        if opts.treefix:
            treefix_dir = op.join(treedir, "treefix")
            assert mkdir(treefix_dir, overwrite=True)

            sh("cp {0} {1}/".format(outfile, treefix_dir))
            input = op.join(treefix_dir, op.basename(outfile))
            aln_file = input.rsplit(".", 1)[0] + ".fasta"
            SeqIO.write(alignment, aln_file, "fasta")

            outfile = run_treefix(input=input, stree_file=stree, smap_file=smap,
                                  a_ext=".fasta", o_ext=".dnd", n_ext=".treefix.dnd")

    return outfile
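
A usage sketch (file names hypothetical; with a single argument the CDS file
is translated internally to obtain the protein sequences):

    outfile = build(["cds.fasta", "--ml=raxml", "--outgroup=outgroup.txt",
                     "--outdir=phylo"])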