def novo(args): """ %prog novo reads.fastq Reference-free tGBS pipeline. """ from jcvi.assembly.kmer import jellyfish, histogram from jcvi.assembly.preprocess import diginorm from jcvi.formats.fasta import filter as fasta_filter, format from jcvi.apps.cdhit import filter as cdhit_filter p = OptionParser(novo.__doc__) p.add_option("--technology", choices=("illumina", "454", "iontorrent"), default="iontorrent", help="Sequencing platform") p.add_option("--dedup", choices=("uclust", "cdhit"), default="cdhit", help="Dedup algorithm") p.set_depth(depth=50) p.set_align(pctid=96) p.set_home("cdhit", default="/usr/local/bin/") p.set_home("fiona", default="/usr/local/bin/") p.set_home("jellyfish", default="/usr/local/bin/") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastqfile, = args cpus = opts.cpus depth = opts.depth pf, sf = fastqfile.rsplit(".", 1) diginormfile = pf + ".diginorm." + sf if need_update(fastqfile, diginormfile): diginorm([fastqfile, "--single", "--depth={0}".format(depth)]) keepabund = fastqfile + ".keep.abundfilt" sh("cp -s {0} {1}".format(keepabund, diginormfile)) jf = pf + "-K23.histogram" if need_update(diginormfile, jf): jellyfish([diginormfile, "--prefix={0}".format(pf), "--cpus={0}".format(cpus), "--jellyfish_home={0}".format(opts.jellyfish_home)]) genomesize = histogram([jf, pf, "23"]) fiona = pf + ".fiona.fa" if need_update(diginormfile, fiona): cmd = op.join(opts.fiona_home, "fiona") cmd += " -g {0} -nt {1} --sequencing-technology {2}".\ format(genomesize, cpus, opts.technology) cmd += " -vv {0} {1}".format(diginormfile, fiona) logfile = pf + ".fiona.log" sh(cmd, outfile=logfile, errfile=logfile) dedup = opts.dedup pctid = opts.pctid cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup) if need_update(fiona, cons): if dedup == "cdhit": deduplicate([fiona, "--consensus", "--reads", "--pctid={0}".format(pctid), "--cdhit_home={0}".format(opts.cdhit_home)]) else: uclust([fiona, "--pctid={0}".format(pctid)]) filteredfile = pf + ".filtered.fasta" if need_update(cons, filteredfile): covfile = pf + ".cov.fasta" cdhit_filter([cons, "--outfile={0}".format(covfile), "--minsize={0}".format(depth / 5)]) fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)]) finalfile = pf + ".final.fasta" if need_update(filteredfile, finalfile): format([filteredfile, finalfile, "--sequential=replace", "--prefix={0}_".format(pf)])
def novo(args): """ %prog novo reads.fastq Reference-free tGBS pipeline. """ from jcvi.assembly.kmer import jellyfish, histogram from jcvi.assembly.preprocess import diginorm from jcvi.formats.fasta import filter as fasta_filter, format from jcvi.apps.cdhit import filter as cdhit_filter p = OptionParser(novo.__doc__) p.add_option("--technology", choices=("illumina", "454", "iontorrent"), default="iontorrent", help="Sequencing platform") p.add_option("--dedup", choices=("uclust", "cdhit"), default="cdhit", help="Dedup algorithm") p.set_depth(depth=50) p.set_align(pctid=96) p.set_home("cdhit") p.set_home("fiona") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastqfile, = args cpus = opts.cpus depth = opts.depth pf, sf = fastqfile.rsplit(".", 1) diginormfile = pf + ".diginorm." + sf if need_update(fastqfile, diginormfile): diginorm([fastqfile, "--single", "--depth={0}".format(depth)]) keepabund = fastqfile + ".keep.abundfilt" sh("cp -s {0} {1}".format(keepabund, diginormfile)) jf = pf + "-K23.histogram" if need_update(diginormfile, jf): jellyfish([ diginormfile, "--prefix={0}".format(pf), "--cpus={0}".format(cpus) ]) genomesize = histogram([jf, pf, "23"]) fiona = pf + ".fiona.fa" if need_update(diginormfile, fiona): cmd = op.join(opts.fiona_home, "bin/fiona") cmd += " -g {0} -nt {1} --sequencing-technology {2}".\ format(genomesize, cpus, opts.technology) cmd += " -vv {0} {1}".format(diginormfile, fiona) logfile = pf + ".fiona.log" sh(cmd, outfile=logfile, errfile=logfile) dedup = opts.dedup pctid = opts.pctid cons = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup) if need_update(fiona, cons): if dedup == "cdhit": deduplicate([ fiona, "--consensus", "--reads", "--pctid={0}".format(pctid), "--cdhit_home={0}".format(opts.cdhit_home) ]) else: uclust([fiona, "--pctid={0}".format(pctid)]) filteredfile = pf + ".filtered.fasta" if need_update(cons, filteredfile): covfile = pf + ".cov.fasta" cdhit_filter([ cons, "--outfile={0}".format(covfile), "--minsize={0}".format(depth / 5) ]) fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)]) finalfile = pf + ".final.fasta" if need_update(filteredfile, finalfile): format([ filteredfile, finalfile, "--sequential=replace", "--prefix={0}_".format(pf) ])