def dedup(args):
    """
    %prog dedup assembly.assembly.blast assembly.fasta

    Remove duplicate contigs within assembly.
    """
    from jcvi.formats.blast import BlastLine

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=0, pctcov=98)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    cov = opts.pctcov / 100.0
    sizes = Sizes(fastafile).mapping
    fp = open(blastfile)
    removed = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        qsize, ssize = sizes[query], sizes[subject]
        qspan = abs(b.qstop - b.qstart)
        if qspan < qsize * cov:
            continue
        # Drop the smaller contig of the pair; ties broken by name
        if (qsize, query) < (ssize, subject):
            removed.add(query)

    print("\n".join(sorted(removed)))
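# A minimal standalone sketch of the containment test dedup() applies above:
# a contig is dropped when one HSP spans at least `cov` of its length and it
# is the smaller record of the pair. Numbers below are illustrative only.
def _is_mostly_aligned(qstart, qstop, qsize, cov=0.98):
    """True if the HSP spans >= cov of the query length."""
    return abs(qstop - qstart) >= qsize * cov

assert _is_mostly_aligned(1, 990, 1000)      # ~99% of the query aligned
assert not _is_mostly_aligned(1, 500, 1000)  # only ~50% aligned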
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filters the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast",
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=0, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
                 help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
                 help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid, evalue=opts.evalue,
                  hitlen=None, best=opts.best, task=opts.task, cpus=opts.cpus)

    return blastfile
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang", default=500, type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = must_open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.
    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted." + sf
    if need_update(fastafile, sortedfastafile):
        # cluster_smallmem requires input sorted by decreasing length
        cmd = "usearch -sortbylength {0} -output {1}".format(
            fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    consensusfastafile = pf + ".consensus.fasta"
    if need_update(sortedfastafile, consensusfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        # cmd += " -strand both"
        cmd += " -uc {0} -consout {1}".format(clstrfile, consensusfastafile)
        sh(cmd)
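# The need_update(source, target) guard used throughout this code is a
# make-style staleness check. A minimal standalone sketch of its contract
# (an illustration of the idea, not the jcvi implementation):
import os.path as op

def _need_update(source, target):
    """True if target is missing or older than source (by mtime)."""
    return not op.exists(target) or op.getmtime(source) > op.getmtime(target)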
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
def blat(args):
    """
    %prog blat ref.fasta query.fasta

    Calls blat and filters BLAST hits.
    """
    p = OptionParser(blat.__doc__)
    p.set_align(pctid=95, hitlen=30)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta, suffix="blat")

    run_blat(infile=queryfasta, outfile=blastfile, db=reffasta,
             pctid=opts.pctid, hitlen=opts.hitlen, cpus=opts.cpus,
             overwrite=False)

    return blastfile
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--fast", default=False, action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus", default=False, action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"  # slower but more accurate clustering mode
    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".format(
            clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd
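# Usage sketch: like other entry points in this codebase, deduplicate() takes
# an argv-style list. The novo() pipeline below calls it this way (the file
# name here is hypothetical):
#
#   deduplicate(["reads.fasta", "--reads", "--consensus", "--pctid=96"])
#
# which shells out to roughly:
#
#   cd-hit-454 -c 0.96 -g 1 -M 0 -T <cpus> -i reads.fasta -o reads.fasta.P96.cdhit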
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file and filter based on id% or cov%.
    Use `delta-filter` for .delta file.
    """
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_option("--overlap", default=False, action="store_true",
                 help="Print overlap status (e.g. terminal, contained)")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pctid = opts.pctid
    hitlen = opts.hitlen
    filename, = args
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid), int(hitlen)), suffix))
    if not need_update(filename, outfile):
        return outfile

    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    fp = open(filename)
    fw = must_open(outfile, "w")
    for row in fp:
        try:
            c = CoordsLine(row)
        except AssertionError:
            continue

        if c.identity < pctid:
            continue
        if c.len2 < hitlen:
            continue
        if opts.overlap and not c.overlap:
            continue

        outrow = row.rstrip()
        if opts.overlap:
            ov = Overlap_types[c.overlap]
            outrow += "\t" + ov
        print(outrow, file=fw)

    return outfile
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps([scaffolds, "--split",
                                          "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    reps = set(x.split()[-1] for x in open(idsfile))
    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    fw = open(dedupagp, "w")

    ndropped = ndroppedbases = 0
    for a in agp:
        if not a.is_gap and a.component_id not in reps:
            span = a.component_span
            logging.debug("Drop component {0} ({1})".format(
                a.component_id, span))
            ndropped += 1
            ndroppedbases += span
            continue
        print(a, file=fw)
    fw.close()

    logging.debug("Dropped components: {0}, Dropped bases: {1}".format(
        ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength

    fastafile, qualfile = fasta(fastqfiles + [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile, userfile, notmatchedfile, clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
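# Dereplication (the derep step above) collapses identical reads and tracks
# multiplicity before clustering; vsearch does this natively. A minimal
# standalone sketch of the idea:
from collections import Counter

def _derep_counts(seqs):
    """Map each unique sequence to its read count, most abundant first."""
    return Counter(seqs).most_common()

assert _derep_counts(["ACGT", "ACGT", "GGGG"]) == [("ACGT", 2), ("GGGG", 1)]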
def tandem(args):
    """
    %prog tandem blast_file cds_file bed_file [options]

    Find tandem gene clusters that are separated by at most N genes. The
    blast_file is filtered to enforce alignments covering at least 50% (or a
    user-specified percentage) of either gene.

    A pep_file can also be used in the same manner.
    """
    p = OptionParser(tandem.__doc__)
    p.add_option("--tandem_Nmax", dest="tandem_Nmax", type="int", default=3,
                 help="merge tandem genes within distance [default: %default]")
    p.add_option("--percent_overlap", type="int", default=50,
                 help="tandem genes have >=x% aligned sequence, x=0-100 "
                      "[default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--not_self", default=False, action="store_true",
                 help="provided is not self blast file [default: %default]")
    p.add_option("--strip_gene_name", dest="sep", type="string", default=".",
                 help="strip alternative splicing. Use None for no stripping. "
                      "[default: %default]")
    p.add_option("--genefamily", dest="genefam", action="store_true",
                 help="compile gene families based on similarity [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blast_file, cds_file, bed_file = args
    N = opts.tandem_Nmax
    P = opts.percent_overlap
    is_self = not opts.not_self
    sep = opts.sep
    ofile = opts.outfile

    tandem_main(blast_file, cds_file, bed_file, N=N, P=P, is_self=is_self,
                evalue=opts.evalue, strip_name=sep, ofile=ofile,
                genefam=opts.genefam)
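# tandem_main() is defined elsewhere; below is a minimal sketch of the
# single-linkage grouping it performs conceptually, assuming genes are ranked
# by position on a chromosome and `pairs` holds qualified hit pairs (all
# names here are illustrative, not from the source):
def _tandem_groups(pairs, Nmax=3):
    """Union-find merge of gene ranks connected by hits within Nmax."""
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path compression
            x = parent[x]
        return x

    for a, b in pairs:
        if abs(a - b) <= Nmax:
            parent[find(a)] = find(b)

    groups = {}
    for x in list(parent):
        groups.setdefault(find(x), set()).add(x)
    return list(groups.values())

assert sorted(sorted(g) for g in _tandem_groups([(1, 2), (2, 4), (9, 10)])) == \
    [[1, 2, 4], [9, 10]]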
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    reffasta, queryfasta = open(deltafile).readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov,
                                        prefix=prefix, color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov,
                          prefix=prefix, color=color, layout=layout)
def screen(args):
    """
    %prog screen scaffolds.fasta library.fasta

    Screen sequences against FASTA library. Sequences that have 95% id and
    50% cov will be removed by default.
    """
    from jcvi.apps.align import blast
    from jcvi.formats.blast import covfilter

    p = OptionParser(screen.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--best", default=1, type="int",
                 help="Get the best N hit [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    scaffolds, library = args
    pctidflag = "--pctid={0}".format(opts.pctid)
    blastfile = blast([library, scaffolds, pctidflag,
                       "--best={0}".format(opts.best)])

    idsfile = blastfile.rsplit(".", 1)[0] + ".ids"
    # covfilter computes union coverage by default
    covfilter([blastfile, scaffolds, "--ids=" + idsfile,
               pctidflag, "--pctcov={0}".format(opts.pctcov)])

    pf = scaffolds.rsplit(".", 1)[0]
    nf = pf + ".screen.fasta"
    cmd = "faSomeRecords {0} -exclude {1} {2}".format(scaffolds, idsfile, nf)
    sh(cmd)

    logging.debug("Screened FASTA written to `{0}`.".format(nf))

    return nf
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4  1     1608  1  W  ca-bacs.5638.frag11.22000-23608  1  1608  -
    scaffold4  1609  1771  2  N  163  scaffold  yes  paired-ends
    scaffold4  1772  3771  3  W  ca-bacs.5638.frag10.20000-22000  1  2000  -

    These are most likely shreds, which we look for based on names.
    """
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    if not op.exists(outdir):
        mkdir(outdir)
        # Split contigs into one FASTA per sequence for pairwise overlaps
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                    cutoff, qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a, try the swapped orientation
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".format(
        len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print("Remove {0}".format(c), file=sys.stderr)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that have a modified CLR
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option("--technology", choices=("illumina", "454", "iontorrent"),
                 default="iontorrent", help="Sequencing platform")
    p.add_option("--dedup", choices=("uclust", "cdhit"),
                 default="cdhit", help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit", default="/usr/local/bin/")
    p.set_home("fiona", default="/usr/local/bin/")
    p.set_home("jellyfish", default="/usr/local/bin/")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([diginormfile, "--prefix={0}".format(pf),
                   "--cpus={0}".format(cpus),
                   "--jellyfish_home={0}".format(opts.jellyfish_home)])

    genomesize = histogram([jf, pf, "23"])
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".format(
            genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([fiona, "--consensus", "--reads",
                         "--pctid={0}".format(pctid),
                         "--cdhit_home={0}".format(opts.cdhit_home)])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        cdhit_filter([cons, "--outfile={0}".format(covfile),
                      "--minsize={0}".format(depth // 5)])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([filteredfile, finalfile, "--sequential=replace",
                "--prefix={0}_".format(pf)])
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=95)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir = "uclust"
    acdir = "allele_counts"
    for d in (clustdir, acdir):
        mkdir(d)

    mm = MakeManager()
    clustfiles = []
    # Step 0 - clustering within sample
    for s in samples:
        flist = [x for x in reads if op.basename(x).split(".")[0] == s]
        outfile = s + ".P{0}.clustS".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust cluster --cpus=8"
        cmd += " {0} {1}".format(s, " ".join(flist))
        cmd += " --outdir={0}".format(clustdir)
        cmd += " --pctid={0}".format(pctid)
        mm.add(flist, outfile, cmd)
        clustfiles.append(outfile)

    # Step 1 - make consensus within sample
    allcons = []
    for s, clustfile in zip(samples, clustfiles):
        outfile = s + ".P{0}.consensus".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust consensus"
        cmd += " {0}".format(clustfile)
        mm.add(clustfile, outfile, cmd)
        allcons.append(outfile)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = "python -m jcvi.apps.uclust mcluster {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = "python -m jcvi.apps.uclust mconsensus {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    p = OptionParser(main.__doc__)

    p.add_option("--format",
                 default=" '6 qseqid sseqid pident length mismatch gapopen "
                         "qstart qend sstart send evalue bitscore' ",
                 help="0-11, learn more with \"blastp -help\" [default: %default]")
    p.add_option("--path", dest="blast_path", default=None,
                 help="Specify BLAST+ path including the program name")
    p.add_option("--prog", dest="blast_program", default="blastp",
                 help="Specify BLAST+ program to use. See complete list here: "
                      "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
                      " [default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--best", default=1, type="int",
                 help="Only look for best N hits [default: %default]")
    p.set_cpus()
    p.add_option("--nprocs", default=1, type="int",
                 help="Number of BLAST processes to run in parallel. "
                      "Splits query.fa into `nprocs` chunks; "
                      "each chunk uses -num_threads=`cpus`")
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_path = opts.blast_path
    blast_program = opts.blast_program

    blast_bin = blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs, cpus = opts.nprocs, opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        fs = split([afasta_fn, outdir, str(nprocs)])
        queries = fs.names
    else:
        queries = [afasta_fn]

    dbtype = "prot" if op.basename(blast_bin) in ("blastp", "blastx") else "nucl"

    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        nin = db + ".nin"
        nin00 = db + ".00.nin"  # large databases are split into volumes
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    blastplus_template = "{0} -db {1} -outfmt {2}"
    blast_cmd = blastplus_template.format(blast_bin, bfasta_fn, opts.format)
    blast_cmd += " -evalue {0} -max_target_seqs {1}".format(
        opts.evalue, opts.best)
    blast_cmd += " -num_threads {0}".format(cpus)
    if extra:
        blast_cmd += " " + extra.strip()

    args = [(out_fh, blast_cmd, query, lock) for query in queries]
    g = Jobs(target=blastplus, args=args)
    g.run()
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query",
                 choices=allowed_iterby,
                 help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")
        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - \
                (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print("\t".join((id, "na", "0", "0")))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print("{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage))
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print("{0}\t{1:.1f}\t{2:.1f}".format(
                    query, this_identity, this_coverage))

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".format(
        mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".format(
        mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".format(
        cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".format(
        100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".format(
        covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print(m, file=f)
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print(id, file=fw)
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".format(
            cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print(b, file=fw)
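# A standalone sketch of the interval-union coverage used above (the default,
# non-supermap path), assuming 1-based inclusive coordinates as in BLAST
# tabular output; jcvi.utils.range.range_union is the real implementation:
def _union_length(ranges):
    """Total bases covered by a set of (start, stop) intervals."""
    total, prev_stop = 0, 0
    for start, stop in sorted(ranges):
        start = max(start, prev_stop + 1)  # skip bases already counted
        if stop >= start:
            total += stop - start + 1
            prev_stop = max(prev_stop, stop)
    return total

assert _union_length([(1, 100), (51, 150)]) == 150   # overlap counted once
assert _union_length([(1, 100), (20, 30)]) == 100    # contained interval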
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline.

    If two transcript fasta files (Trinity denovo and genome guided) are
    provided, the PASA Comprehensive Transcriptome protocol is followed
    <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands
    without executing the pipeline.
    """
    p = OptionParser(assemble.__doc__)
    p.set_home("pasa")
    p.set_align(pctid=95, pctcov=90, intron=2000, bpsplice=3,
                compreh_pctcov=30)
    p.add_option("--aligners", default="blat,gmap",
                 help="Specify splice aligners to use for mapping [default: %default]")
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean transcripts using tgi seqclean [default: %default]")
    p.set_cpus()
    p.set_grid()
    p.set_grid_opts()
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) +
                          "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = which("seqclean")
    if clean and not seqclean:
        logging.error("Cannot find tgi seqclean in PATH")
        sys.exit()

    accn_extract = which(op.join(PASA_HOME, "misc_utilities",
                                 "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts",
                                        "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        write_file(runfile, accn_extract_cmd, append=True) \
            if prepare else sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
                                    pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    aacmd += " -t {0}.clean -T -u {0} ".format(transcripts) if clean else \
             " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1}".format(",".join(aligners), opts.intron)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf,
                                                transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(
            pctid, pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be GenBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix", default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, ".".join((afasta, suffix)))
        if not op.exists(af):  # Check to avoid redownload
            entrez([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, ".".join((bfasta, suffix)))
        if not op.exists(bf):
            entrez([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print("No match found.", file=sys.stderr)
        return None

    besthsp = hsps[0]

    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print(str(o), file=fw)
        fw.close()

    return o
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids

    Use --inverse to obtain the complementary records for the criteria above.

    - noself: remove self-self hits
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score", dest="score", default=0, type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself", default=False, action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse", default=False, action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = (
        opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself)
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
        outfile is None else outfile
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print(row.rstrip(), file=fw)

    fw.close()
    return newblastfile
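# For reference, the 12 columns of BLAST tabular output (-outfmt 6) map to the
# BlastLine attributes used above. This mapping is inferred from the default
# outfmt string and the attribute accesses elsewhere in this code, not quoted
# from BlastLine's definition:
#
#   outfmt 6:  qseqid sseqid pident length mismatch  gapopen qstart qend  sstart send  evalue bitscore
#   BlastLine: query  subject pctid hitlen nmismatch ngaps   qstart qstop sstart sstop evalue score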