def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so maintainer-facing notes are kept in comments rather than in it.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the scaffolds FASTA file) must remain after option parsing.
    # Returns the name of the deduplicated FASTA, `<prefix>.dedup.fasta`.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    # gaps() hands back three file names; only the first and the last are
    # used below (oagpfile is unpacked but not needed here).
    splitfile, oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    # Each step is re-run only when need_update() reports it is required.
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    # The last whitespace-separated token of every line is kept as a
    # representative id. Blank lines are skipped so they cannot raise an
    # IndexError, and the handle is closed deterministically.
    with open(idsfile) as fp:
        reps = set(x.split()[-1] for x in fp if x.strip())

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    ndropped = ndroppedbases = 0
    # The context manager guarantees the AGP is flushed and closed even if
    # iterating over the records raises.
    with open(dedupagp, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".
                              format(a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            print(a, file=fw)

    logging.debug("Dropped components: {0}, Dropped bases: {1}".
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so maintainer-facing notes are kept in comments rather than in it.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the scaffolds FASTA file) must remain after option parsing.
    # Returns the name of the deduplicated FASTA, `<prefix>.dedup.fasta`.
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    # gaps() hands back three file names; only the first and the last are
    # used below (oagpfile is unpacked but not needed here).
    splitfile, oagpfile, cagpfile = gaps(
        [scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    # Each step is re-run only when need_update() reports it is required.
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    # The last whitespace-separated token of every line is kept as a
    # representative id. Blank lines are skipped so they cannot raise an
    # IndexError, and the handle is closed deterministically.
    with open(idsfile) as fp:
        reps = set(x.split()[-1] for x in fp if x.strip())

    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    ndropped = ndroppedbases = 0
    with open(dedupagp, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id not in reps:
                span = a.component_span
                logging.debug("Drop component {0} ({1})".
                              format(a.component_id, span))
                ndropped += 1
                ndroppedbases += span
                continue
            # print() as a function: the former `print >> fw, a` statement
            # is Python 2 only and raises TypeError on Python 3.
            print(a, file=fw)

    logging.debug("Dropped components: {0}, Dropped bases: {1}".
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
def build(args):
    """
    %prog build current.fasta Bacteria_Virus.fasta prefix

    Build assembly files after a set of clean-ups:
    1. Use cdhit (100%) to remove duplicate scaffolds
    2. Screen against the bacteria and virus database (remove scaffolds 95% id, 50% cov)
    3. Mask matches to UniVec_Core
    4. Sort by decreasing scaffold sizes
    5. Rename the scaffolds sequentially
    6. Build the contigs by splitting scaffolds at gaps
    7. Rename the contigs sequentially
    """
    # The docstring doubles as the OptionParser usage text, so implementation
    # notes are written as comments. Nothing is returned; the outputs are the
    # files `<prefix>.assembly.fasta`, `<prefix>.gapSplit.fasta` and
    # `<prefix>.contigs.fasta`.
    from jcvi.apps.cdhit import deduplicate
    from jcvi.apps.vecscreen import mask
    from jcvi.formats.fasta import sort

    parser = OptionParser(build.__doc__)
    parser.add_option(
        "--nodedup",
        default=False,
        action="store_true",
        help="Do not deduplicate [default: deduplicate]",
    )
    opts, positional = parser.parse_args(args)

    if len(positional) != 3:
        sys.exit(not parser.print_help())

    fastafile, bacteria, pf = positional

    # Step 1 (optional): collapse identical scaffolds.
    if opts.nodedup:
        nonredundant = fastafile
    else:
        nonredundant = deduplicate([fastafile, "--pctid=100"])

    # Steps 2-4: screen, mask, then sort with the "--sizes" flag.
    masked = mask([screen([nonredundant, bacteria])])
    by_size = sort([masked, "--sizes"])

    # Step 5. `format` is called with an argument list, so it is not the
    # builtin -- presumably the project's FASTA formatter brought in at
    # module level; confirm against the file's imports.
    scaffoldfasta = pf + ".assembly.fasta"
    format([by_size, scaffoldfasta, "--prefix=scaffold_", "--sequential"])

    # Step 6: external gapSplit tool.
    gapsplitfasta = pf + ".gapSplit.fasta"
    sh("gapSplit -minGap=10 {0} {1}".format(scaffoldfasta, gapsplitfasta))

    # Step 7.
    contigsfasta = pf + ".contigs.fasta"
    format([gapsplitfasta, contigsfasta, "--prefix=contig_", "--sequential"])
def build(args):
    """
    %prog build current.fasta Bacteria_Virus.fasta prefix

    Build assembly files after a set of clean-ups:
    1. Use cdhit (100%) to remove duplicate scaffolds
    2. Screen against the bacteria and virus database (remove scaffolds 95% id, 50% cov)
    3. Mask matches to UniVec_Core
    4. Sort by decreasing scaffold sizes
    5. Rename the scaffolds sequentially
    6. Build the contigs by splitting scaffolds at gaps
    7. Rename the contigs sequentially
    """
    # Usage text comes from the docstring (it is passed to OptionParser), so
    # extra notes are kept in comments. The function returns nothing; its
    # results are the FASTA files named after the `prefix` argument.
    from jcvi.apps.cdhit import deduplicate
    from jcvi.apps.vecscreen import mask
    from jcvi.formats.fasta import sort

    optparser = OptionParser(build.__doc__)
    optparser.add_option("--nodedup", default=False, action="store_true",
                         help="Do not deduplicate [default: deduplicate]")
    opts, rest = optparser.parse_args(args)

    if len(rest) != 3:
        sys.exit(not optparser.print_help())

    fastafile, bacteria, pf = rest

    # Deduplication runs unless --nodedup was given.
    source = fastafile
    if not opts.nodedup:
        source = deduplicate([fastafile, "--pctid=100"])

    screened = screen([source, bacteria])
    masked = mask([screened])
    ordered = sort([masked, "--sizes"])

    # `format` receives an argument list here, so it cannot be the builtin;
    # presumably a module-level import of the project's FASTA formatter --
    # verify against the top of the file.
    scaffoldfasta = pf + ".assembly.fasta"
    scaffold_args = [ordered, scaffoldfasta,
                     "--prefix=scaffold_", "--sequential"]
    format(scaffold_args)

    # Split scaffolds into contigs with the external gapSplit program.
    gapsplitfasta = pf + ".gapSplit.fasta"
    split_cmd = "gapSplit -minGap=10 {0} {1}".format(scaffoldfasta,
                                                     gapsplitfasta)
    sh(split_cmd)

    contigsfasta = pf + ".contigs.fasta"
    contig_args = [gapsplitfasta, contigsfasta,
                   "--prefix=contig_", "--sequential"]
    format(contig_args)
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    # NOTE: the docstring is used as the OptionParser usage text, so
    # maintainer notes are kept in comments.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the reads file, whose name must contain a "." extension) is
    #       expected. Nothing is returned; the last file produced is
    #       `<prefix>.final.fasta`.
    # Every stage is guarded by need_update(), so an interrupted run can be
    # restarted without redoing finished stages.
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option("--technology", choices=("illumina", "454", "iontorrent"),
                 default="iontorrent", help="Sequencing platform")
    p.add_option("--dedup", choices=("uclust", "cdhit"),
                 default="cdhit", help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit", default="/usr/local/bin/")
    p.set_home("fiona", default="/usr/local/bin/")
    p.set_home("jellyfish", default="/usr/local/bin/")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    # Stage 1: digital normalization; its output is symlinked (`cp -s`) to
    # the name the later stages look for.
    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    # Stage 2: k-mer histogram (K=23), used to obtain the genome size that
    # is handed to fiona below.
    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([diginormfile, "--prefix={0}".format(pf),
                   "--cpus={0}".format(cpus),
                   "--jellyfish_home={0}".format(opts.jellyfish_home)])

    genomesize = histogram([jf, pf, "23"])

    # Stage 3: run the external fiona program; stdout and stderr share one
    # log file.
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
            format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    # Stage 4: cluster into consensus sequences with the chosen tool.
    # `deduplicate` and `uclust` are not imported in this function; they
    # are presumably module-level imports -- confirm at the top of the file.
    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([fiona, "--consensus", "--reads",
                         "--pctid={0}".format(pctid),
                         "--cdhit_home={0}".format(opts.cdhit_home)])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    # Stage 5: filter the consensus sequences.
    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        # Floor division keeps --minsize an integer string ("10", not
        # "10.0") under Python 3, matching what `/` gave on Python 2.
        # Assumes opts.depth is an int -- TODO confirm in set_depth().
        cdhit_filter([cons, "--outfile={0}".format(covfile),
                      "--minsize={0}".format(depth // 5)])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    # Stage 6: rename records using the file prefix.
    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([filteredfile, finalfile, "--sequential=replace",
                "--prefix={0}_".format(pf)])
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    # NOTE: the docstring is used as the OptionParser usage text, so
    # maintainer notes are kept in comments.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the reads file, whose name must contain a "." extension) is
    #       expected. Nothing is returned; the last file produced is
    #       `<prefix>.final.fasta`.
    # Every stage is guarded by need_update(), so an interrupted run can be
    # restarted without redoing finished stages.
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format
    from jcvi.apps.cdhit import filter as cdhit_filter

    p = OptionParser(novo.__doc__)
    p.add_option("--technology", choices=("illumina", "454", "iontorrent"),
                 default="iontorrent", help="Sequencing platform")
    p.add_option("--dedup", choices=("uclust", "cdhit"),
                 default="cdhit", help="Dedup algorithm")
    p.set_depth(depth=50)
    p.set_align(pctid=96)
    p.set_home("cdhit")
    p.set_home("fiona")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    cpus = opts.cpus
    depth = opts.depth
    pf, sf = fastqfile.rsplit(".", 1)

    # Stage 1: digital normalization; its output is symlinked (`cp -s`) to
    # the name the later stages look for.
    diginormfile = pf + ".diginorm." + sf
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    # Stage 2: k-mer histogram (K=23), used to obtain the genome size that
    # is handed to fiona below.
    jf = pf + "-K23.histogram"
    if need_update(diginormfile, jf):
        jellyfish([
            diginormfile,
            "--prefix={0}".format(pf),
            "--cpus={0}".format(cpus),
        ])

    genomesize = histogram([jf, pf, "23"])

    # Stage 3: run the external fiona program, looked up under
    # `<fiona_home>/bin`; stdout and stderr share one log file.
    fiona = pf + ".fiona.fa"
    if need_update(diginormfile, fiona):
        cmd = op.join(opts.fiona_home, "bin/fiona")
        cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
            format(genomesize, cpus, opts.technology)
        cmd += " -vv {0} {1}".format(diginormfile, fiona)
        logfile = pf + ".fiona.log"
        sh(cmd, outfile=logfile, errfile=logfile)

    # Stage 4: cluster into consensus sequences with the chosen tool.
    # `deduplicate` and `uclust` are not imported in this function; they
    # are presumably module-level imports -- confirm at the top of the file.
    dedup = opts.dedup
    pctid = opts.pctid
    cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, cons):
        if dedup == "cdhit":
            deduplicate([
                fiona,
                "--consensus",
                "--reads",
                "--pctid={0}".format(pctid),
                "--cdhit_home={0}".format(opts.cdhit_home),
            ])
        else:
            uclust([fiona, "--pctid={0}".format(pctid)])

    # Stage 5: filter the consensus sequences.
    filteredfile = pf + ".filtered.fasta"
    if need_update(cons, filteredfile):
        covfile = pf + ".cov.fasta"
        # Floor division keeps --minsize an integer string ("10", not
        # "10.0") under Python 3, matching what `/` gave on Python 2.
        # Assumes opts.depth is an int -- TODO confirm in set_depth().
        cdhit_filter([
            cons,
            "--outfile={0}".format(covfile),
            "--minsize={0}".format(depth // 5),
        ])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    # Stage 6: rename records using the file prefix.
    finalfile = pf + ".final.fasta"
    if need_update(filteredfile, finalfile):
        format([
            filteredfile,
            finalfile,
            "--sequential=replace",
            "--prefix={0}_".format(pf),
        ])