Пример #1
0
def bcf(args):
    """
    %prog bcf fastafile bamfiles > bcffile

    Run mpileup on bam files.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import Jobs

    p = OptionParser(bcf.__doc__)
    set_outfile(p)
    opts, args = p.parse_args(args)

    # Need the reference FASTA plus at least one BAM file.
    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile, bamfiles = args[0], args[1:]

    # Run `index --unique` in parallel on every BAM whose name lacks ".sorted."
    # (presumably this yields the matching *.sorted.bam -- confirm in index()).
    pending = [bam for bam in bamfiles if ".sorted." not in bam]
    sorter = Jobs(index, args=[[[bam, "--unique"]] for bam in pending])
    sorter.run()

    # Rewrite every name to its ".sorted.bam" form; stripping first avoids
    # producing ".sorted.sorted.bam" for inputs that were already sorted.
    sorted_bams = []
    for bam in bamfiles:
        plain = bam.replace(".sorted.bam", ".bam")
        sorted_bams.append(plain.replace(".bam", ".sorted.bam"))

    pipeline = [
        "samtools mpileup -P ILLUMINA -E -ugDf",
        fastafile,
        " ".join(sorted_bams),
        "| bcftools view -bcvg -",
    ]
    sh(" ".join(pipeline), outfile=opts.outfile)
Пример #2
0
def bcf(args):
    """
    %prog bcf fastafile bamfiles > bcffile

    Run mpileup on bam files.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import Jobs

    p = OptionParser(bcf.__doc__)
    set_outfile(p)
    opts, args = p.parse_args(args)

    # Need the reference FASTA plus at least one BAM file.
    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile, bamfiles = args[0], args[1:]

    # One `index --unique` job per BAM whose name does not contain ".sorted."
    # (presumably this yields the matching *.sorted.bam -- confirm in index()).
    job_args = []
    for bam in bamfiles:
        if ".sorted." not in bam:
            job_args.append([[bam, "--unique"]])
    Jobs(index, args=job_args).run()

    def as_sorted(name):
        # Strip an existing ".sorted" first so it is never doubled.
        return name.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")

    bamlist = " ".join(as_sorted(bam) for bam in bamfiles)
    cmd = "samtools mpileup -P ILLUMINA -E -ugDf {0} {1}".format(fastafile, bamlist)
    cmd = cmd + " | bcftools view -bcvg -"
    sh(cmd, outfile=opts.outfile)
Пример #3
0
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(augustus.__doc__)
    p.add_option("--species", default="maize",
                 help="Use species model for prediction")
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option("--nogff3", default=False, action="store_true",
                 help="Turn --gff3=off")
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile = args[0]
    want_gff3 = not opts.nogff3
    if want_gff3:
        suffix = ".gff3"
    else:
        suffix = ".out"
    cfgfile = op.join(opts.augustus_home,
                      "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    # Split the input into one chunk per CPU inside a scratch directory.
    workdir = mkdtemp(dir=".")
    chunks = split([fastafile, workdir, str(opts.cpus)])

    # Bind the shared settings once; each job only receives its chunk name.
    worker = partial(augustuswrap,
                     species=opts.species,
                     gff3=want_gff3,
                     cfgfile=cfgfile,
                     hintsfile=opts.hintsfile)
    Jobs(worker, chunks.names).run()

    def with_suffix(path):
        # Swap the last extension of `path` for the chosen output suffix.
        return path.rsplit(".", 1)[0] + suffix

    # Merge the per-chunk results next to the input, then drop the scratch dir.
    partfiles = [with_suffix(name) for name in chunks.names]
    outfile = with_suffix(fastafile)
    FileMerger(partfiles, outfile=outfile).merge()
    shutil.rmtree(workdir)

    if not want_gff3:
        return

    from jcvi.annotation.reformat import augustus as reformat_augustus

    reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
    reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
Пример #4
0
def vcf(args):
    """
    %prog vcf fastafile bamfiles > out.vcf.gz

    Call SNPs on bam files.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import Jobs

    valid_callers = ("mpileup", "freebayes")
    p = OptionParser(vcf.__doc__)
    p.set_outfile(outfile="out.vcf.gz")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort the BAM files")
    p.add_option("--caller", default="mpileup", choices=valid_callers,
                 help="Use variant caller")
    opts, args = p.parse_args(args)

    # Need the reference FASTA plus at least one BAM file.
    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile, bamfiles = args[0], args[1:]
    outfile = opts.outfile

    unsorted = [bam for bam in bamfiles if ".sorted." not in bam]
    if opts.nosort:
        # NOTE(review): only the inputs lacking ".sorted." are kept here, so
        # inputs that are already sorted get dropped -- confirm this is intended.
        bamfiles = unsorted
    else:
        # One `index --unique` job per BAM that is not yet marked as sorted.
        Jobs(index, args=[[[bam, "--unique"]] for bam in unsorted]).run()
        # Point every entry at its ".sorted.bam" name without doubling ".sorted".
        bamfiles = [
            bam.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
            for bam in bamfiles
        ]

    bamlist = " ".join(bamfiles)
    caller = opts.caller
    if caller == "mpileup":
        cmd = "bcftools mpileup -Ou -f {} {} | bcftools call -mv -Oz -o {}".format(
            fastafile, bamlist, outfile)
    elif caller == "freebayes":
        cmd = "freebayes -f {} {} > {}".format(fastafile, bamlist, outfile)
    sh(cmd)

    sh("bcftools index {}".format(outfile))
Пример #5
0
def split(args):
    """
    %prog split pairs.fastq

    Split shuffled pairs into `.1.fastq` and `.2.fastq`, using `sed`. Can work
    on gzipped file.

    <http://seqanswers.com/forums/showthread.php?t=13776>
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import Jobs

    p = OptionParser(split.__doc__)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq = args[0]
    gz = pairsfastq.endswith(".gz")
    # Output prefix: input name without ".gz" and without its last extension.
    pf = pairsfastq.replace(".gz", "").rsplit(".", 1)[0]
    reader = "zcat" if gz else "cat"

    def build(mate, first_line):
        # Return (shell command, output file) for one mate. Each record pair
        # spans 8 lines; sed prints 4 lines starting at `first_line` of each 8.
        target = pf + ".{0}.fastq".format(mate)
        stages = [
            reader + " " + pairsfastq,
            "sed -ne '{0}~8{{N;N;N;p}}'".format(first_line),
        ]
        if gz:
            # Keep the outputs compressed when the input was compressed.
            stages.append("gzip")
            target += ".gz"
        return " | ".join(stages) + " > " + target, target

    p1cmd, p1 = build(1, 1)
    p2cmd, p2 = build(2, 5)

    if opts.grid:
        for gridcmd in (p1cmd, p2cmd):
            sh(gridcmd, grid=True)
    else:
        # Run both extractions side by side locally, then check the sizes.
        Jobs(target=sh, args=[(p1cmd, ), (p2cmd, )]).run()
        checkShuffleSizes(p1, p2, pairsfastq)
Пример #6
0
def parallel_musclewrap(clustfile, cpus, minsamp=0):
    """Run `musclewrap` on `clustfile`, split across `cpus` parallel jobs.

    With `cpus == 1` the call is made directly and its result is returned.
    Otherwise the input is split into a temporary directory, every piece is
    processed by its own job, the `.clustS` pieces are merged into a file
    named after `clustfile`, and nothing is returned.
    """
    worker = partial(musclewrap, minsamp=minsamp)
    if cpus == 1:
        return worker(clustfile)

    from jcvi.apps.grid import Jobs

    def to_clustS(name):
        # Map an input name onto the name of its aligned counterpart.
        return name.replace(".clust", ".clustS")

    workdir = mkdtemp(dir=".")
    pieces = split([clustfile, workdir, str(cpus), "--format=clust"])
    Jobs(worker, pieces.names).run()

    aligned = [to_clustS(name) for name in pieces.names]
    FileMerger(aligned, outfile=to_clustS(clustfile)).merge()
    # The scratch pieces are no longer needed once merged.
    shutil.rmtree(workdir)
Пример #7
0
def mapped(args):
    """
    %prog mapped sam/bamfile

    Given an input sam/bam file, output a sam/bam file containing only the mapped reads.
    Optionally, extract the unmapped reads into a separate file
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import pysam
    from jcvi.apps.grid import Jobs

    p = OptionParser(mapped.__doc__)
    p.set_sam_options(extra=False)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        # Negate print_help()'s return value so that a usage error exits with
        # a failure status, as the other commands in this file do.
        sys.exit(not p.print_help())

    samfile, = args

    # Input flavour: SAM input needs -S; `oext` is the extension to swap out.
    if samfile.endswith(".sam"):
        oext, mopts = ".sam", ["-S"]
    else:
        oext, mopts = ".bam", []

    # Output flavour: BAM (-b) or SAM with header (-h).
    flag, ext = ("-b", ".bam") if opts.bam else ("-h", ".sam")
    mopts.append(flag)

    if opts.uniq:
        mopts.append("-q1")
        ext = ".uniq{0}".format(ext)

    # Each entry is the full argument list of one `samtools view` call.
    view_opts = []
    if opts.unmapped:
        # -f4 keeps only reads that have the "unmapped" flag set.
        uoutfile = samfile.replace(oext, ".unmapped{0}".format(ext))
        view_opts.append(mopts + ["-f4", samfile, "-o{0}".format(uoutfile)])

    # -F4 drops reads that have the "unmapped" flag set.
    outfile = samfile.replace(oext, ".mapped{0}".format(ext))
    view_opts.append(mopts + ["-F4", samfile, "-o{0}".format(outfile)])

    for vo in view_opts:
        logging.debug('samtools view {0}'.format(" ".join(vo)))

    # Hand each job a concrete tuple of arguments rather than a one-shot
    # generator, so the arguments survive being read more than once.
    jobs = Jobs(pysam.view, [tuple(vo) for vo in view_opts])
    jobs.run()
Пример #8
0
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(augustus.__doc__)
    p.add_option(
        "--species",
        default="maize",
        help="Use species model for prediction",
    )
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option(
        "--nogff3",
        default=False,
        action="store_true",
        help="Turn --gff3=off",
    )
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile = args[0]
    want_gff3 = not opts.nogff3
    if want_gff3:
        suffix = ".gff3"
    else:
        suffix = ".out"
    cfgfile = op.join(opts.augustus_home,
                      "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    # Split the input into one chunk per CPU inside a scratch directory.
    workdir = mkdtemp(dir=".")
    chunks = split([fastafile, workdir, str(opts.cpus)])

    # Bind the shared settings once; each job only receives its chunk name.
    worker = partial(
        augustuswrap,
        species=opts.species,
        gff3=want_gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    Jobs(worker, chunks.names).run()

    def with_suffix(path):
        # Swap the last extension of `path` for the chosen output suffix.
        return path.rsplit(".", 1)[0] + suffix

    # Merge the per-chunk results next to the input, then drop the scratch dir.
    partfiles = [with_suffix(name) for name in chunks.names]
    outfile = with_suffix(fastafile)
    FileMerger(partfiles, outfile=outfile).merge()
    shutil.rmtree(workdir)

    if not want_gff3:
        return

    from jcvi.annotation.reformat import augustus as reformat_augustus
    reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
    reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
Пример #9
0
def mapped(args):
    """
    %prog mapped sam/bamfile

    Given an input sam/bam file, output a sam/bam file containing only the mapped reads.
    Optionally, extract the unmapped reads into a separate file
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import pysam
    from jcvi.apps.grid import Jobs

    p = OptionParser(mapped.__doc__)
    p.set_sam_options(extra=False)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        # Negate print_help()'s return value so that a usage error exits with
        # a failure status, as the other commands in this file do.
        sys.exit(not p.print_help())

    samfile, = args

    # Input flavour: SAM input needs -S; `oext` is the extension to swap out.
    if samfile.endswith(".sam"):
        oext, mopts = ".sam", ["-S"]
    else:
        oext, mopts = ".bam", []

    # Output flavour: BAM (-b) or SAM with header (-h).
    flag, ext = ("-b", ".bam") if opts.bam else ("-h", ".sam")
    mopts.append(flag)

    if opts.uniq:
        mopts.append("-q1")
        ext = ".uniq{0}".format(ext)

    # Each entry is the full argument list of one `samtools view` call.
    view_opts = []
    if opts.unmapped:
        # -f4 keeps only reads that have the "unmapped" flag set.
        uoutfile = samfile.replace(oext, ".unmapped{0}".format(ext))
        view_opts.append(mopts + ["-f4", samfile, "-o{0}".format(uoutfile)])

    # -F4 drops reads that have the "unmapped" flag set.
    outfile = samfile.replace(oext, ".mapped{0}".format(ext))
    view_opts.append(mopts + ["-F4", samfile, "-o{0}".format(outfile)])

    for vo in view_opts:
        logging.debug('samtools view {0}'.format(" ".join(vo)))

    # Hand each job a concrete tuple of arguments rather than a one-shot
    # generator, so the arguments survive being read more than once.
    jobs = Jobs(pysam.view, [tuple(vo) for vo in view_opts])
    jobs.run()
Пример #10
0
def mdownload(args):
    """
    %prog mdownload links.txt

    Multiple download a list of files. Use formats.html.links() to extract the
    links file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import Jobs

    p = OptionParser(mdownload.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    linksfile, = args
    # Read every link up front so the file is closed before jobs start.
    # Blank lines carry no URL, so they are skipped instead of becoming jobs.
    with open(linksfile) as fp:
        links = [(row.strip(), ) for row in fp if row.strip()]

    # One download job per link; each job receives a 1-tuple of arguments.
    j = Jobs(download, links)
    j.run()
Пример #11
0
def mdownload(args):
    """
    %prog mdownload links.txt

    Multiple download a list of files. Use formats.html.links() to extract the
    links file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import Jobs

    p = OptionParser(mdownload.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    linksfile, = args
    # Read every link up front so the file is closed before jobs start.
    # Blank lines carry no URL, so they are skipped instead of becoming jobs.
    with open(linksfile) as fp:
        links = [(row.strip(),) for row in fp if row.strip()]

    # One download job per link; each job receives a 1-tuple of arguments.
    j = Jobs(download, links)
    j.run()
Пример #12
0
def vcf(args):
    """
    %prog vcf fastafile bamfiles > out.vcf.gz

    Call SNPs on bam files.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import Jobs

    valid_callers = ("mpileup", "freebayes")
    p = OptionParser(vcf.__doc__)
    p.set_outfile(outfile="out.vcf.gz")
    p.add_option(
        "--nosort",
        default=False,
        action="store_true",
        help="Do not sort the BAM files",
    )
    p.add_option(
        "--caller",
        default="mpileup",
        choices=valid_callers,
        help="Use variant caller [default: %default]",
    )
    opts, args = p.parse_args(args)

    # Need the reference FASTA plus at least one BAM file.
    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile, bamfiles = args[0], args[1:]

    unsorted = [bam for bam in bamfiles if ".sorted." not in bam]
    if opts.nosort:
        # NOTE(review): only the inputs lacking ".sorted." are kept here, so
        # inputs that are already sorted get dropped -- confirm this is intended.
        bamfiles = unsorted
    else:
        # One `index --unique` job per BAM that is not yet marked as sorted.
        Jobs(index, args=[[[bam, "--unique"]] for bam in unsorted]).run()
        # Point every entry at its ".sorted.bam" name without doubling ".sorted".
        bamfiles = [
            bam.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
            for bam in bamfiles
        ]

    inputs = "{0} {1}".format(fastafile, " ".join(bamfiles))
    caller = opts.caller
    if caller == "mpileup":
        cmd = " ".join(("samtools mpileup -E -uf", inputs,
                        "| bcftools call -vmO v"))
    elif caller == "freebayes":
        cmd = "freebayes -f " + inputs
    sh(cmd, outfile=opts.outfile)
Пример #13
0
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(main.__doc__)

    p.add_option(
        "--format",
        default=" '6 qseqid sseqid pident length "
                "mismatch gapopen qstart qend sstart send evalue bitscore' ",
        help='0-11, learn more with "blastp -help". [default: %default]')
    p.add_option(
        "--path", dest="blast_path", default=None,
        help="specify BLAST+ path including the program name")
    p.add_option(
        "--prog", dest="blast_program", default="blastp",
        help="specify BLAST+ program to use. See complete list here: "
             "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
             " [default: %default]")
    p.set_align(evalue=0.01)
    p.add_option(
        "--best", default=1, type="int",
        help="Only look for best N hits [default: %default]")
    p.set_cpus()
    p.add_option(
        "--nprocs", default=1, type="int",
        help="number of BLAST processes to run in parallel. "
             "split query.fa into `nprocs` chunks, "
             "each chunk uses -num_threads=`cpus`")
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not p.print_help())

    # Positional order is database first, query second.
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    # --path may name the program itself or just the directory holding it.
    blast_program = opts.blast_program
    blast_bin = opts.blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs, cpus = opts.nprocs, opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        queries = split([afasta_fn, outdir, str(nprocs)]).names
    else:
        queries = [afasta_fn]

    if op.basename(blast_bin) in ("blastp", "blastx"):
        dbtype = "prot"
    else:
        dbtype = "nucl"

    # Choose the index file whose presence marks the database as formatted.
    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        # Prefer the ".00.nin" index file when it exists.
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    pieces = [
        "{0} -db {1} -outfmt {2}".format(blast_bin, bfasta_fn, opts.format),
        "-evalue {0} -max_target_seqs {1}".format(opts.evalue, opts.best),
        "-num_threads {0}".format(cpus),
    ]
    extra = opts.extra
    if extra:
        pieces.append(extra.strip())
    blast_cmd = " ".join(pieces)

    # One job per query chunk; all jobs share the output handle and the lock.
    jobs = Jobs(target=blastplus,
                args=[(out_fh, blast_cmd, query, lock) for query in queries])
    jobs.run()
Пример #14
0
def main(args):
    """
    %prog database.fasta query.fasta


    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above doubles as the command-line usage text.

    # The position in this tuple is the numeric value passed to `lastal -f`.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--path", help="specify LAST path")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")

    set_params(p)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(name):
        # Prefix a LAST executable with --path when one was supplied.
        return op.join(path, name) if path else name

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    if opts.format == "maf":
        # Emit the MAF header once, before any alignment output.
        cmd = 'echo "##maf version=1"'
        sh(cmd)

    cmd = "{0} -u 0".format(lastal_bin)
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Use the same lastdb binary as for the subject so --path is honoured.
        run_lastdb(infile=query, outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w")
    lock = Lock()

    # One job per cpu, numbered from 1; range() works on Python 2 and 3.
    args = [(k + 1, cpus, out_fh, cmd, query, lock) for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
Пример #15
0
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="invoke -c in lastdb [default: %default]")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format [default: %default]")
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(name):
        # Prefix a LAST executable with --path when one was supplied.
        return op.join(path, name) if path else name

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    oappend = False
    if opts.format == "maf":
        # Write the MAF header first; the results are then appended after it.
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    # The position in supported_formats is the value passed to `lastal -f`.
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Use the same lastdb binary as for the subject so --path is honoured.
        run_lastdb(infile=query, outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)

    # must_open() handed back nothing to write to, so there is nothing to do.
    if out_fh is None:
        return

    lock = Lock()
    # One job per cpu, numbered from 1; range() works on Python 2 and 3.
    args = [(k + 1, cpus, out_fh, cmd, query, lock) for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
Пример #16
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
            help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2:
        # Negate print_help()'s return value so that a usage error exits with
        # a failure status, as the other commands in this file do.
        sys.exit(not p.print_help())

    # Positional order is database first, query second.
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # One task per database record; tuple layout is
        # (bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format).
        tasks = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            tasks.append((bfasta, afasta_fn, outfile,
                          lastz_bin, extra, mask, fmt))

        # Always release the worker processes, even when a task fails.
        pool = Pool(cpus)
        try:
            pool.map(lastz_2bit, tasks)
        finally:
            pool.close()
            pool.join()

        return

    lock = Lock()

    # One job per cpu, numbered from 1; range() works on Python 2 and 3.
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
            lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
Пример #17
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
            help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2:
        # Negate print_help()'s return value so that a usage error exits with
        # a failure status, as the other commands in this file do.
        sys.exit(not p.print_help())

    # Positional order is database first, query second.
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # One task per database record; tuple layout is
        # (bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format).
        tasks = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            tasks.append((bfasta, afasta_fn, outfile,
                          lastz_bin, extra, mask, fmt))

        # Always release the worker processes, even when a task fails.
        pool = Pool(cpus)
        try:
            pool.map(lastz_2bit, tasks)
        finally:
            pool.close()
            pool.join()

        return

    lock = Lock()

    # One job per cpu, numbered from 1; range() works on Python 2 and 3.
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
            lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
Пример #18
0
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above doubles as the command-line usage text.

    # The position in this tuple is the numeric value passed to `lastal -f`.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask",
                 default=False,
                 action="store_true",
                 help="invoke -c in lastdb [default: %default]")
    p.add_option("--format",
                 default="blast",
                 choices=supported_formats,
                 help="Output format [default: %default]")
    p.add_option("--eval",
                 default=False,
                 action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(name):
        # Prefix a LAST executable with --path when one was supplied.
        return op.join(path, name) if path else name

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject,
               outfile=subjectdb + ".prj",
               mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    oappend = False
    if opts.format == "maf":
        # Write the MAF header first; the results are then appended after it.
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Use the same lastdb binary as for the subject so --path is honoured.
        run_lastdb(infile=query,
                   outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb,
                                                 querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)

    # must_open() handed back nothing to write to, so there is nothing to do.
    if out_fh is None:
        return

    lock = Lock()
    # One job per cpu, numbered from 1; range() works on Python 2 and 3.
    args = [(k + 1, cpus, out_fh, cmd, query, lock) for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
Пример #19
0
def dump(args):
    """
    %prog dump fastbfile

    Export ALLPATHS fastb file to fastq file. Use --dir to indicate a previously
    run allpaths folder.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(dump.__doc__)
    p.add_option("--dir",
                help="Working directory [default: %default]")
    p.add_option("--nosim", default=False, action="store_true",
                 help="Do not simulate qual to 50 [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastbfile, = args
    d = opts.dir
    if d:
        # With --dir the whole export is delegated to export_fastq().
        from jcvi.assembly.preprocess import export_fastq
        export_fastq(d, fastbfile)
        return

    sim = not opts.nosim
    # Output prefix: "j" when the file name contains "jump", otherwise "f".
    pf = "j" if "jump" in fastbfile else "f"

    # Remove a stale stats file so the one read below is from this run.
    statsfile = "{0}.lib_stats".format(pf)
    if op.exists(statsfile):
        os.remove(statsfile)

    cmd = "SplitReadsByLibrary READS_IN={0}".format(fastbfile)
    cmd += " READS_OUT={0} QUALS=True".format(pf)
    sh(cmd)

    # Collect library names from the first column of the stats file.
    libs = []
    with open(statsfile) as fp:
        # Skip the two leading rows; next() works on Python 2 and 3.
        next(fp)
        next(fp)
        for row in fp:
            if row.strip() == "":
                continue

            libname = row.split()[0]
            if libname == "Unpaired":
                continue

            libs.append(libname)

    logging.debug("Found libraries: {0}".format(",".join(libs)))

    # Build one conversion command per library and run them in parallel.
    cmds = []
    for libname in libs:
        cmd = "FastbQualbToFastq"
        cmd += " HEAD_IN={0}.{1}.AB HEAD_OUT={1}".format(pf, libname)
        cmd += " PAIRED=True PHRED_OFFSET=33"
        if sim:
            cmd += " SIMULATE_QUALS=True"
        if pf == 'j':
            cmd += " FLIP=True"

        cmds.append((cmd, ))

    m = Jobs(target=sh, args=cmds)
    m.run()

    # Rename the A/B outputs to the .1/.2 naming.
    for libname in libs:
        cmd = "mv {0}.A.fastq {0}.1.fastq".format(libname)
        sh(cmd)
        cmd = "mv {0}.B.fastq {0}.2.fastq".format(libname)
        sh(cmd)
Пример #20
0
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(movie.__doc__)
    p.add_option("--frames", default=500, type="int",
                 help="Only plot every N frames")
    p.add_option("--engine", default="ffmpeg",
                 choices=("ffmpeg", "gifsicle"),
                 help="Movie engine, output MP4 or GIF")
    p.set_beds()
    opts, args, iopts = p.set_image_options(
        args, figsize="16x8", style="white", cmap="coolwarm",
        format="png", dpi=300)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Resolve the inputs to absolute paths and remember the starting directory,
    # which is restored before the movie is assembled.
    tourfile, clmfile, lastfile = [op.abspath(x) for x in args]
    cwd = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = prepare_synteny(
        tourfile, lastfile, odir, p, opts)

    def serialized(path, serial):
        # Insert "_<serial>" before the first dot of the file's base name.
        stem, rest = op.basename(path).split(".", 1)
        return stem + "_" + serial + "." + rest

    frame_args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        padi = "{:06d}".format(i)
        # Make sure the anchorsfile and bedfile has the serial number in,
        # otherwise parallelization may fail
        ianchorsfile = serialized(anchorsfile, padi)
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order: contigs follow the tour, and a contig
        # oriented '-' contributes its lines in reverse.
        qb = Bed()
        for contig, orientation in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            bedlines = contig_to_beds[contig][:]
            if orientation == '-':
                bedlines.reverse()
            for line in bedlines:
                qb.append(line)

        ibedfile = serialized(qbedfile, padi)
        qb.print_to_file(ibedfile)
        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = padi + "." + iopts.format

        frame_args.append([[
            ",".join(tour), clmfile, ianchorsfile,
            "--outfile", image_name,
            "--label", label,
        ]])

    # Render all frames in parallel.
    Jobs(movieframe, frame_args).run()

    os.chdir(cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)