Example #1
def dedup(args):
    """
    %prog dedup assembly.assembly.blast assembly.fasta

    Remove duplicate contigs within assembly.
    """
    from jcvi.formats.blast import BlastLine

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=0, pctcov=98)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    cov = opts.pctcov / 100.
    sizes = Sizes(fastafile).mapping
    fp = open(blastfile)
    removed = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        qsize, ssize = sizes[query], sizes[subject]
        qspan = abs(b.qstop - b.qstart)
        if qspan < qsize * cov:
            continue
        if (qsize, query) < (ssize, subject):
            removed.add(query)

    print("\n".join(sorted(removed)))
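The tuple comparison at the end is what makes the dedup deterministic: of two contigs that align past the coverage cutoff, the one with the smaller `(size, name)` tuple is dropped, so exactly one member of each duplicate pair survives. A minimal, self-contained sketch of just that rule, with made-up sizes standing in for `Sizes(fastafile).mapping` (the coverage check is omitted):

# Hypothetical contig sizes standing in for Sizes(fastafile).mapping.
sizes = {"ctg1": 5000, "ctg2": 5000, "ctg3": 1200}
# Pairs of contigs assumed to align past the coverage cutoff.
hits = [("ctg1", "ctg2"), ("ctg3", "ctg1")]
removed = set()
for query, subject in hits:
    # The smaller (size, name) tuple loses; names break ties on equal sizes.
    if (sizes[query], query) < (sizes[subject], subject):
        removed.add(query)
print(sorted(removed))  # ['ctg1', 'ctg3'] -- ctg2 survives the 5kb pair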
Example #2
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filters the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast", \
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=0, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
            help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid, evalue=opts.evalue,
                  hitlen=None, best=opts.best, task=opts.task, cpus=opts.cpus)

    return blastfile
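A hypothetical invocation of the wrapper above. Judging from Example #10, which inlines the same output naming that `get_outfile` presumably performs, the result is named from the two file prefixes:

# Hypothetical call; file names are placeholders.
blastfile = blast(["ref.fasta", "query.fasta", "--best=5"])
print(blastfile)  # query.ref.blast, per the naming inlined in Example #10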
Example #3
def dedup(args):
    """
    %prog dedup assembly.assembly.blast assembly.fasta

    Remove duplicate contigs within assembly.
    """
    from jcvi.formats.blast import BlastLine

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=0, pctcov=98)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    cov = opts.pctcov / 100.0
    sizes = Sizes(fastafile).mapping
    fp = open(blastfile)
    removed = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        qsize, ssize = sizes[query], sizes[subject]
        qspan = abs(b.qstop - b.qstart)
        if qspan < qsize * cov:
            continue
        if (qsize, query) < (ssize, subject):
            removed.add(query)

    print("\n".join(sorted(removed)))
Example #4
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang", default=500, type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
Example #5
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted." + sf
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -output {1}".\
                    format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    consensusfastafile = pf + ".consensus.fasta"
    if need_update(sortedfastafile, consensusfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        #cmd += " -strand both"
        cmd += " -uc {0} -consout {1}".format(clstrfile, consensusfastafile)
        sh(cmd)
Example #6
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
Example #7
def blat(args):
    """
    %prog blat ref.fasta query.fasta

    Calls blat and filters BLAST hits.
    """
    p = OptionParser(blat.__doc__)
    p.set_align(pctid=95, hitlen=30)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta, suffix="blat")

    run_blat(infile=queryfasta,
             outfile=blastfile,
             db=reffasta,
             pctid=opts.pctid,
             hitlen=opts.hitlen,
             cpus=opts.cpus,
             overwrite=False)

    return blastfile
Example #8
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang",
                 default=500,
                 type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = must_open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
Example #9
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                    format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
Example #10
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filters the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast", \
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=None, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
            help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    q = op.basename(queryfasta).split(".")[0]
    r = op.basename(reffasta).split(".")[0]
    blastfile = "{0}.{1}.blast".format(q, r)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid, evalue=opts.evalue,
                  hitlen=None, best=opts.best, task=opts.task, cpus=opts.cpus)

    return blastfile
Example #11
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--fast",
                 default=False,
                 action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus",
                 default=False,
                 action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads",
                 default=False,
                 action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand",
                 default=False,
                 action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".\
                    format(clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd
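For the defaults (`--pctid=98`, neither `--fast` nor `--samestrand`), the command string assembled above is easy to trace. A sketch that replays the same string-building with hypothetical inputs:

# Replaying the command construction above for the defaults; the file name
# and CPU count are hypothetical.
identity = 98 / 100.
cmd = "cd-hit-est"
cmd += " -c {0}".format(identity)  # cluster at 98% identity
cmd += " -d 0"                     # keep complete deflines in the .clstr file
cmd += " -g 1"                     # accurate mode, since --fast is off
cmd += " -M 0 -T {0} -i {1} -o {2}".format(4, "reads.fasta", "reads.fasta.P98.cdhit")
print(cmd)
# cd-hit-est -c 0.98 -d 0 -g 1 -M 0 -T 4 -i reads.fasta -o reads.fasta.P98.cdhit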
Example #12
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment [default: %default]")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd)

    dd = fastafile + ".cdhit"
    return dd
Example #13
def blat(args):
    """
    %prog blat ref.fasta query.fasta

    Calls blat and filters BLAST hits.
    """
    p = OptionParser(blat.__doc__)
    p.set_align(pctid=95, hitlen=30)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta, suffix="blat")

    run_blat(
        infile=queryfasta,
        outfile=blastfile,
        db=reffasta,
        pctid=opts.pctid,
        hitlen=opts.hitlen,
        cpus=opts.cpus,
        overwrite=False,
    )

    return blastfile
Example #14
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file, filtered on id% and hit length.
    Uses `delta-filter` for .delta files.
    """
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_option("--overlap",
                 default=False,
                 action="store_true",
                 help="Print overlap status (e.g. terminal, contained)")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    pctid = opts.pctid
    hitlen = opts.hitlen

    filename, = args
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid),
                                               int(hitlen)), suffix))
    if not need_update(filename, outfile):
        return outfile

    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    fp = open(filename)
    fw = must_open(outfile, "w")
    for row in fp:
        try:
            c = CoordsLine(row)
        except AssertionError:
            continue

        if c.identity < pctid:
            continue
        if c.len2 < hitlen:
            continue
        if opts.overlap and not c.overlap:
            continue

        outrow = row.rstrip()
        if opts.overlap:
            ov = Overlap_types[c.overlap]
            outrow += "\t" + ov
        print(outrow, file=fw)

    return outfile
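A hypothetical invocation of the function above on a nucmer .delta file: the output name is derived from the cutoffs themselves, and the work is delegated to MUMmer's `delta-filter`, whose stdout is captured to the new file.

# Hypothetical input file; requires MUMmer's delta-filter on the PATH.
newdelta = filter(["out.delta", "--pctid=95", "--hitlen=100"])
# i.e. delta-filter -i 95 -l 100 out.delta, captured to out.P95L100.delta
print(newdelta)  # out.P95L100.delta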
Example #15
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    reps = set(x.split()[-1] for x in open(idsfile))
    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    fw = open(dedupagp, "w")

    ndropped = ndroppedbases = 0
    for a in agp:
        if not a.is_gap and a.component_id not in reps:
            span = a.component_span
            logging.debug("Drop component {0} ({1})".\
                          format(a.component_id, span))
            ndropped += 1
            ndroppedbases += span
            continue
        print(a, file=fw)
    fw.close()

    logging.debug("Dropped components: {0}, Dropped bases: {1}".\
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
Example #16
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps([scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    reps = set(x.split()[-1] for x in open(idsfile))
    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    fw = open(dedupagp, "w")

    ndropped = ndroppedbases = 0
    for a in agp:
        if not a.is_gap and a.component_id not in reps:
            span = a.component_span
            logging.debug("Drop component {0} ({1})".\
                          format(a.component_id, span))
            ndropped += 1
            ndroppedbases += span
            continue
        print(a, file=fw)
    fw.close()

    logging.debug("Dropped components: {0}, Dropped bases: {1}".\
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
Example #17
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength
    fastafile, qualfile = fasta(fastqfiles + [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile, minlength, pctid,
                         cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile,
                  userfile,
                  notmatchedfile,
                  clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
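Every stage above is guarded by `need_update(infiles, outfile)`, which makes the pipeline restartable: rerunning the command skips stages whose outputs are already current. A minimal sketch of such a staleness test, assuming "output missing or older than any input" semantics (jcvi's real helper may do more):

import os.path as op

def need_update_sketch(infiles, outfile):
    # True when outfile must be (re)built: it is absent, or any input
    # has been modified more recently than the output.
    if isinstance(infiles, str):
        infiles = [infiles]
    if not op.exists(outfile):
        return True
    return any(op.getmtime(f) > op.getmtime(outfile) for f in infiles)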
Example #18
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file, filtered on id% and hit length.
    Uses `delta-filter` for .delta files.
    """
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_option("--overlap", default=False, action="store_true",
            help="Print overlap status (e.g. terminal, contained)")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    pctid = opts.pctid
    hitlen = opts.hitlen

    filename, = args
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid), int(hitlen)), suffix))
    if not need_update(filename, outfile):
        return outfile

    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    fp = open(filename)
    fw = must_open(outfile, "w")
    for row in fp:
        try:
            c = CoordsLine(row)
        except AssertionError:
            continue

        if c.identity < pctid:
            continue
        if c.len2 < hitlen:
            continue
        if opts.overlap and not c.overlap:
            continue

        outrow = row.rstrip()
        if opts.overlap:
            ov = Overlap_types[c.overlap]
            outrow += "\t" + ov
        print(outrow, file=fw)

    return outfile
Example #19
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    p.add_option("--fast", default=False, action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus", default=False, action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.
    fastafile, qualfile = fasta([fastafile, "--seqtk"])

    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"
    if opts.pctcov != 0:
        cmd += " -aL {0} -aS {0}".format(opts.pctcov / 100.)

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".\
                    format(clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd
Example #20
def tandem(args):
    """
    %prog tandem blast_file cds_file bed_file [options]

    Find tandem gene clusters that are separated by at most N genes, based on
    the filtered blast_file, requiring alignments between any two genes to
    cover at least 50% (or a user-specified percentage) of either gene.

    A pep_file can also be used in the same manner.
    """
    p = OptionParser(tandem.__doc__)
    p.add_option("--tandem_Nmax",
                 dest="tandem_Nmax",
                 type="int",
                 default=3,
                 help="merge tandem genes within distance [default: %default]")
    p.add_option("--percent_overlap",
                 type="int",
                 default=50,
                 help="tandem genes have >=x% aligned sequence, x=0-100 \
               [default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--not_self",
                 default=False,
                 action="store_true",
                 help="provided is not self blast file [default: %default]")
    p.add_option("--strip_gene_name",
                 dest="sep",
                 type="string",
                 default=".",
                 help="strip alternative splicing. Use None for no stripping. \
               [default: %default]")
    p.add_option(
        "--genefamily",
        dest="genefam",
        action="store_true",
        help="compile gene families based on similarity [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blast_file, cds_file, bed_file = args
    N = opts.tandem_Nmax
    P = opts.percent_overlap
    is_self = not opts.not_self
    sep = opts.sep
    ofile = opts.outfile

    tandem_main(blast_file, cds_file, bed_file, N=N, P=P, is_self=is_self, \
        evalue=opts.evalue, strip_name=sep, ofile=ofile, genefam=opts.genefam)
Example #21
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength
    fastafile, qualfile = fasta(fastqfiles + ["--seqtk",
                                "--outdir={0}".format(opts.outdir),
                                "--outfile={0}".format(prefix + ".fasta")])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile, userfile, notmatchedfile, clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
Example #22
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    reffasta, queryfasta = open(deltafile).readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov,
                                        prefix=prefix, color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes,
                          deltafile, refcov,
                          prefix=prefix, color=color, layout=layout)
Example #23
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov",
                 default=.01,
                 type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option(
        "--all",
        default=False,
        action="store_true",
        help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([
        deltafile, "--pctid={0}".format(pctid), "--hitlen={0}".format(hitlen)
    ])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
Example #24
def tandem(args):
    """
    %prog tandem blast_file cds_file bed_file [options]

    Find tandem gene clusters that are separated by at most N genes, based on
    the filtered blast_file, requiring alignments between any two genes to
    cover at least 50% (or a user-specified percentage) of either gene.

    A pep_file can also be used in the same manner.
    """
    p = OptionParser(tandem.__doc__)
    p.add_option("--tandem_Nmax", dest="tandem_Nmax", type="int", default=3,
               help="merge tandem genes within distance [default: %default]")
    p.add_option("--percent_overlap", type="int", default=50,
               help="tandem genes have >=x% aligned sequence, x=0-100 \
               [default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--not_self", default=False, action="store_true",
                 help="provided is not self blast file [default: %default]")
    p.add_option("--strip_gene_name", dest="sep", type="string", default=".",
               help="strip alternative splicing. Use None for no stripping. \
               [default: %default]")
    p.add_option("--genefamily", dest="genefam", action="store_true",
                 help="compile gene families based on similarity [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blast_file, cds_file, bed_file = args
    N = opts.tandem_Nmax
    P = opts.percent_overlap
    is_self = not opts.not_self
    sep = opts.sep
    ofile = opts.outfile

    tandem_main(blast_file, cds_file, bed_file, N=N, P=P, is_self=is_self, \
        evalue=opts.evalue, strip_name=sep, ofile=ofile, genefam=opts.genefam)
Example #25
def screen(args):
    """
    %prog screen scaffolds.fasta library.fasta

    Screen sequences against FASTA library. Sequences that have 95% id and 50%
    cov will be removed by default.
    """
    from jcvi.apps.align import blast
    from jcvi.formats.blast import covfilter

    p = OptionParser(screen.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--best",
                 default=1,
                 type="int",
                 help="Get the best N hit [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    scaffolds, library = args
    pctidflag = "--pctid={0}".format(opts.pctid)
    blastfile = blast(
        [library, scaffolds, pctidflag, "--best={0}".format(opts.best)])

    idsfile = blastfile.rsplit(".", 1)[0] + ".ids"
    covfilter([
        blastfile, scaffolds, "--union", "--ids=" + idsfile, pctidflag,
        "--pctcov={0}".format(opts.pctcov)
    ])

    pf = scaffolds.rsplit(".", 1)[0]
    nf = pf + ".screen.fasta"
    cmd = "faSomeRecords {0} -exclude {1} {2}".format(scaffolds, idsfile, nf)
    sh(cmd)

    logging.debug("Screened FASTA written to `{0}`.".format(nf))

    return nf
Example #26
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads",
                 default=False,
                 action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand",
                 default=False,
                 action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd)

    dd = fastafile + ".cdhit"
    return dd
Example #27
def screen(args):
    """
    %prog screen scaffolds.fasta library.fasta

    Screen sequences against FASTA library. Sequences that have 95% id and 50%
    cov will be removed by default.
    """
    from jcvi.apps.align import blast
    from jcvi.formats.blast import covfilter

    p = OptionParser(screen.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--best", default=1, type="int",
            help="Get the best N hit [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    scaffolds, library = args
    pctidflag = "--pctid={0}".format(opts.pctid)
    blastfile = blast([library, scaffolds, pctidflag,
               "--best={0}".format(opts.best)])

    idsfile = blastfile.rsplit(".", 1)[0] + ".ids"
    covfilter([blastfile, scaffolds, "--ids=" + idsfile,
               pctidflag, "--pctcov={0}".format(opts.pctcov)])

    pf = scaffolds.rsplit(".", 1)[0]
    nf = pf + ".screen.fasta"
    cmd = "faSomeRecords {0} -exclude {1} {2}".format(scaffolds, idsfile, nf)
    sh(cmd)

    logging.debug("Screened FASTA written to `{0}`.".format(nf))

    return nf
Example #28
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                        cutoff, qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".\
                    format(len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print("Remove {0}".format(c), file=sys.stderr)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that have a modified CLR
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
Example #29
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option("--technology", choices=("illumina", "454", "iontorrent"),
                 default="iontorrent", help="Sequencing platform")
    p.add_option("--dedup", choices=("uclust", "cdhit"),
                 default="cdhit", help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit", default="/usr/local/bin/")
    p.set_home("fiona", default="/usr/local/bin/")
    p.set_home("jellyfish", default="/usr/local/bin/")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([diginormfile, "--prefix={0}".format(pf),
                    "--cpus={0}".format(cpus),
                    "--jellyfish_home={0}".format(opts.jellyfish_home)])

    genomesize = histogram([jf, pf, "23"])
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
                    format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([fiona, "--consensus", "--reads",
                         "--pctid={0}".format(pctid),
                         "--cdhit_home={0}".format(opts.cdhit_home)])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        cdhit_filter([cons, "--outfile={0}".format(covfile),
                      "--minsize={0}".format(depth // 5)])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([filteredfile, finalfile, "--sequential=replace",
                    "--prefix={0}_".format(pf)])
Example #30
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=95)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir = "uclust"
    acdir = "allele_counts"
    for d in (clustdir, acdir):
        mkdir(d)

    mm = MakeManager()
    clustfiles = []
    # Step 0 - clustering within sample
    for s in samples:
        flist = [x for x in reads if op.basename(x).split(".")[0] == s]
        outfile = s + ".P{0}.clustS".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust cluster --cpus=8"
        cmd += " {0} {1}".format(s, " ".join(flist))
        cmd += " --outdir={0}".format(clustdir)
        cmd += " --pctid={0}".format(pctid)
        mm.add(flist, outfile, cmd)
        clustfiles.append(outfile)

    # Step 1 - make consensus within sample
    allcons = []
    for s, clustfile in zip(samples, clustfiles):
        outfile = s + ".P{0}.consensus".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust consensus"
        cmd += " {0}".format(clustfile)
        mm.add(clustfile, outfile, cmd)
        allcons.append(outfile)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = "python -m jcvi.apps.uclust mcluster {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = "python -m jcvi.apps.uclust mconsensus {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()
Example #31
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option("--technology",
                 choices=("illumina", "454", "iontorrent"),
                 default="iontorrent",
                 help="Sequencing platform")
    p.add_option("--dedup",
                 choices=("uclust", "cdhit"),
                 default="cdhit",
                 help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit")
    p.set_home("fiona")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([
            diginormfile, "--prefix={0}".format(pf), "--cpus={0}".format(cpus)
        ])

    genomesize = histogram([jf, pf, "23"])
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "bin/fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
                    format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([
                fiona, "--consensus", "--reads", "--pctid={0}".format(pctid),
                "--cdhit_home={0}".format(opts.cdhit_home)
            ])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        cdhit_filter([
            cons, "--outfile={0}".format(covfile),
            "--minsize={0}".format(depth // 5)
        ])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([
            filteredfile, finalfile, "--sequential=replace",
            "--prefix={0}_".format(pf)
        ])
Example #32
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    p = OptionParser(main.__doc__)

    p.add_option("--format", default=" \'6 qseqid sseqid pident length " \
            "mismatch gapopen qstart qend sstart send evalue bitscore\' ",
            help="0-11, learn more with \"blastp -help\". [default: %default]")
    p.add_option("--path", dest="blast_path", default=None,
            help="specify BLAST+ path including the program name")
    p.add_option("--prog", dest="blast_program", default="blastp",
            help="specify BLAST+ program to use. See complete list here: " \
            "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
            " [default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.set_cpus()
    p.add_option("--nprocs", default=1, type="int",
            help="number of BLAST processes to run in parallel. " + \
            "split query.fa into `nprocs` chunks, " + \
            "each chunk uses -num_threads=`cpus`")
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_path = opts.blast_path
    blast_program = opts.blast_program

    blast_bin = blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs, cpus = opts.nprocs, opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        fs = split([afasta_fn, outdir, str(nprocs)])
        queries = fs.names
    else:
        queries = [afasta_fn]

    dbtype = "prot" if op.basename(blast_bin) in ("blastp", "blastx") \
        else "nucl"

    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        nin = db + ".nin"
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    blastplus_template = "{0} -db {1} -outfmt {2}"
    blast_cmd = blastplus_template.format(blast_bin, bfasta_fn, opts.format)
    blast_cmd += " -evalue {0} -max_target_seqs {1}".\
        format(opts.evalue, opts.best)
    blast_cmd += " -num_threads {0}".format(cpus)
    if extra:
        blast_cmd += " " + extra.strip()

    args = [(out_fh, blast_cmd, query, lock) for query in queries]
    g = Jobs(target=blastplus, args=args)
    g.run()
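The command prefix shared by all worker processes is built by plain string concatenation above; reconstructing it for the defaults makes the final invocation visible. The database name here is hypothetical, and the per-chunk -query/output plumbing happens inside `blastplus`, which is not shown:

# Reconstructing the shared command prefix for the defaults
# (blastp, evalue=0.01, best=1, 8 threads).
fmt = ("'6 qseqid sseqid pident length mismatch gapopen "
       "qstart qend sstart send evalue bitscore'")
blast_cmd = "{0} -db {1} -outfmt {2}".format("blastp", "database.fa", fmt)
blast_cmd += " -evalue {0} -max_target_seqs {1}".format(0.01, 1)
blast_cmd += " -num_threads {0}".format(8)
print(blast_cmd)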
Example #33
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - self: non-self
    - ids: valid ids

    use --inverse to obtain the complementary records
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score", dest="score", default=0, type="int",
            help="Score cutoff [default: %default]")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--self", default=None, choices=("strict", "loose"),
            help="Remove self hits. strict: matched names and spans; " \
            "loose: matched names [default: %default]")
    p.add_option("--ids", default=None,
            help="path to tab or comma delimited file containing ids to " \
            "retain [default: %default]")
    p.add_option("--inverse", action="store_true",
            help="Similar to grep -v, inverse [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, selfrule = \
            opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.self
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
                    outfile is None else outfile
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            c.is_self_hit(selfrule) or \
            noids

        if inverse:
            remove = not remove

        if not remove:
            print(row.rstrip(), file=fw)

    return newblastfile
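The heart of the function is the single `remove` disjunction. A self-contained sketch of that predicate with a stub record carrying just the fields the loop reads (the real `BlastLine` parses a 12-column BLAST tabular row; the --self and --ids checks are omitted):

from collections import namedtuple

# Stub standing in for BlastLine; field values are hypothetical.
Hit = namedtuple("Hit", "query subject pctid hitlen evalue score")
hits = [
    Hit("q1", "s1", 99.0, 500, 1e-50, 900),  # passes every cutoff
    Hit("q2", "s2", 90.0, 500, 1e-50, 900),  # fails pctid >= 95
    Hit("q3", "s3", 99.0,  50, 1e-50, 900),  # fails hitlen >= 100
]
score, pctid, hitlen, evalue = 0, 95, 100, .01  # the defaults above
for c in hits:
    remove = (c.score < score or c.pctid < pctid or
              c.hitlen < hitlen or c.evalue > evalue)
    if not remove:  # with --inverse the test would be flipped
        print(c.query)  # only q1 is printed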
Example #34
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov",
                 default=False,
                 action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap",
                 action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option(
        "--iterby",
        dest="iterby",
        default="query",
        choices=allowed_iterby,
        help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches +
                                    this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print("\t".join((id, "na", "0", "0")))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print("{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage))
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print("{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                     this_coverage))

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes)
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print(m, file=f)
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print(id, file=fw)
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print(b, file=fw)
    fw.close()
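
Union mode above relies on jcvi.utils.range.range_union to collapse
overlapping HSP intervals before computing coverage, so stacked hits are not
double-counted. A minimal self-contained sketch of that interval-union
computation (assuming the same inclusive (seqid, start, stop) tuples that the
ranges.append() calls build above; an illustration, not the jcvi
implementation):

from collections import defaultdict

def union_length(ranges):
    """Total positions covered by inclusive (seqid, start, stop) intervals,
    counting overlapping positions only once."""
    total = 0
    by_seq = defaultdict(list)
    for seqid, start, stop in ranges:
        by_seq[seqid].append((start, stop))
    for intervals in by_seq.values():
        intervals.sort()
        cur_start, cur_stop = intervals[0]
        for start, stop in intervals[1:]:
            if start <= cur_stop + 1:  # overlapping or book-ended: extend
                cur_stop = max(cur_stop, stop)
            else:  # disjoint: close off the current block
                total += cur_stop - cur_start + 1
                cur_start, cur_stop = start, stop
        total += cur_stop - cur_start + 1
    return total

# Two overlapping hits plus one disjoint hit cover 150 + 51 = 201 bp
assert union_length([("1", 1, 100), ("1", 51, 150), ("1", 200, 250)]) == 201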
Example #36
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=94)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir = "uclust"
    acdir ="allele_counts"
    for d in (clustdir, acdir):
        mkdir(d)

    mm = MakeManager()
    clustfiles = []
    # Step 0 - clustering within sample
    for s in samples:
        flist = [x for x in reads if op.basename(x).split(".")[0] == s]
        outfile = s + ".P{0}.clustS".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust cluster --cpus=8"
        cmd += " {0} {1}".format(s, " ".join(flist))
        cmd += " --outdir={0}".format(clustdir)
        cmd += " --pctid={0}".format(pctid)
        mm.add(flist, outfile, cmd)
        clustfiles.append(outfile)

    # Step 1 - make consensus within sample
    allcons = []
    for s, clustfile in zip(samples, clustfiles):
        outfile = s + ".P{0}.consensus".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust consensus"
        cmd += " {0}".format(clustfile)
        mm.add(clustfile, outfile, cmd)
        allcons.append(outfile)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = "python -m jcvi.apps.uclust mcluster {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = "python -m jcvi.apps.uclust mconsensus {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()
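
Note that novo2 executes nothing directly: each mm.add(sources, target, cmd)
call registers one build rule, and mm.write() emits a Makefile so that make
resolves ordering and skips up-to-date targets. A toy sketch of that pattern
(TinyMakeManager is a stand-in for illustration, not jcvi's MakeManager API):

class TinyMakeManager:
    def __init__(self):
        self.rules = []  # (target, sources, cmd) triples in add() order

    def add(self, sources, target, cmd):
        if isinstance(sources, str):
            sources = [sources]
        self.rules.append((target, sources, cmd))

    def write(self, filename="Makefile"):
        with open(filename, "w") as fw:
            fw.write("all: {0}\n\n".format(
                " ".join(t for t, _, _ in self.rules)))
            for target, sources, cmd in self.rules:
                fw.write("{0}: {1}\n\t{2}\n\n".format(
                    target, " ".join(sources), cmd))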
Example #37
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline.

    If two transcript fasta files (Trinity denovo and genome guided) are provided,
    the PASA Comprehensive Transcriptome protocol is followed
    <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands
    without executing the pipeline.
    """
    p = OptionParser(assemble.__doc__)
    p.set_home("pasa")
    p.set_align(pctid=95, pctcov=90, intron=2000, bpsplice=3, compreh_pctcov=30)
    p.add_option("--aligners", default="blat,gmap",
            help="Specify splice aligners to use for mapping [default: %default]")
    p.add_option("--clean", default=False, action="store_true",
            help="Clean transcripts using tgi seqclean [default: %default]")
    p.set_cpus()
    p.set_grid()
    p.set_grid_opts()
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = which("seqclean")
    if clean and not seqclean:
        logging.error("Cannot find tgi seqclean in PATH")
        sys.exit()

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

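    # NB: tfasta, tdn, aaconf and alignAssembly_conf used below are
    # module-level constants in the original source; they are not defined
    # within this excerpt.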
    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        write_file(runfile, accn_extract_cmd, append=True) \
                if prepare else sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db), pctcov,
                                    pctid, bpsplice), file=aafw)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    aacmd += " -t {0}.clean -T -u {0} ".format(transcripts) if clean else \
             " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1}".format(",".join(aligners), opts.intron)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += "--min_per_ID {0} --min_per_aligned {1}".format(pctid, pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
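
Every stage above repeats the same dispatch: with --prepare the command is
appended to run.sh, otherwise it runs immediately (optionally on the grid,
chained to the previous job via hold_jid). A hedged sketch of that pattern
factored into a helper (schedule() is hypothetical, not part of jcvi):

def schedule(cmd, prepare, runner, runfile="run.sh"):
    """Record cmd in the run script, or execute it now via runner.

    runner is any callable taking the command string (e.g. a wrapper
    around jcvi's sh()); returns its result, or None when preparing.
    """
    if prepare:
        with open(runfile, "a") as fw:
            fw.write(cmd + "\n")
        return None
    # A grid-aware runner would pass the previous job id along here
    # (e.g. SGE's -hold_jid) so the stages execute in order.
    return runner(cmd)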
Example #39
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be GenBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir",
                 default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix",
                 default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse",
                 default=False,
                 action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain",
                 default=False,
                 action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, ".".join((afasta, suffix)))
        if not op.exists(af):  # Check to avoid redownload
            entrez([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, ".".join((bfasta, suffix)))
        if not op.exists(bf):
            entrez([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print("No match found.", file=sys.stderr)
        return None

    besthsp = hsps[0]

    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print(str(o), file=fw)
        fw.close()

    return o
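
A typical invocation, for reference: the two arguments may be FASTA files or
GenBank accessions (accessions are fetched via entrez() into --dir first).
The accession names below are made up for illustration, and blastn must be
on the PATH:

def demo_overlap():
    o = overlap(["AC123456.1", "AC654321.1", "--dir=seqs",
                 "--outfile=overlap.txt"])
    if o:  # None is returned when no HSP passes the hitlen cutoff
        print(o.otype)  # numeric overlap type (dovetail, contained, ...)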
Example #40
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids

    Use --inverse to obtain the complementary records for the criteria above.

    - noself: remove self-self hits
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score",
                 dest="score",
                 default=0,
                 type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself",
                 default=False,
                 action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse",
                 default=False,
                 action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = \
            opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
                    outfile is None else outfile
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print(row.rstrip(), file=fw)

    fw.close()

    return newblastfile
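
The keep/drop decision above is a single predicate. Pulled out on its own,
the interaction of --inverse and --noself is easier to see: self-hits are
tested after inversion, so they are dropped in both modes. A sketch using
the same BlastLine fields as above (should_remove is illustrative, not part
of the module):

def should_remove(c, score, pctid, hitlen, evalue, ids, inverse, noself):
    # ids=None disables the id filter entirely
    noids = ids is not None and not (c.query in ids and c.subject in ids)
    remove = (c.score < score or c.pctid < pctid or
              c.hitlen < hitlen or c.evalue > evalue or noids)
    if inverse:
        remove = not remove
    return remove or (noself and c.query == c.subject)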
Example #43
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4       1       1608    1       W       ca-bacs.5638.frag11.22000-23608 1       1608    -
    scaffold4       1609    1771    2       N       163     scaffold        yes     paired-ends
    scaffold4       1772    3771    3       W       ca-bacs.5638.frag10.20000-22000 1       2000    -

    These are most likely shreds, which we look for based on names.
    """
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang",
                 default=GoodOverhang,
                 type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl,
                    a.component_span,
                    b.component_span,
                    cutoff,
                    qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".\
                    format(len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print >> sys.stderr, "Remove {0}".format(c)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that have modified CLRs
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
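
For context, anneal() trims components through CLR (clear range) objects:
each accepted overlap shrinks a component's usable interval, and a CLR whose
interval collapses is converted to a gap via convert_to_gap(). A toy sketch
of the idea (TinyCLR is a stand-in for illustration, not jcvi's CLR class):

class TinyCLR:
    def __init__(self, start, end):
        self.start, self.end = start, end  # 1-based, inclusive clear range

    @property
    def is_valid(self):
        # A clear range trimmed past zero length is invalid; the
        # corresponding component would be replaced by a gap.
        return self.start <= self.end

    def trim_left(self, bp):
        self.start += bp  # bases absorbed by the upstream overlap

    def trim_right(self, bp):
        self.end -= bp  # bases absorbed by the downstream overlap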