def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=/root/khmer/python
    """
    # NOTE: the docstring above doubles as the usage text handed to the
    # option parser, so it is kept verbatim; explanatory notes live in
    # comments instead.
    from jcvi.formats.fastq import shuffle, pairinplace, split

    parser = OptionParser(diginorm.__doc__)
    parser.set_depth()
    parser.set_home("khmer")
    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs not in (1, 2):
        sys.exit(not parser.print_help())

    # Two files are first merged by shuffle(); one file is used directly.
    if nargs == 1:
        fastq = args[0]
    else:
        fastq = shuffle(args + ["--tag"])

    khmer_home = opts.khmer_home
    cutoff = opts.depth
    sys.path.insert(0, op.join(khmer_home, "python"))

    # File names used by the successive stages.
    prefix = fastq.rsplit(".", 1)[0]
    hashfile = prefix + ".kh"
    keepfile = fastq + ".keep"
    abundfiltfile = keepfile + ".abundfilt"
    seckeepfile = abundfiltfile + ".keep"

    normalize = op.join(khmer_home, "scripts/normalize-by-median.py")
    filter_abund = op.join(khmer_home, "scripts/filter-abund.py")

    # Stage 1: first normalization pass on the paired input, saving the
    # k-mer hash for the abundance filter. Each stage is gated by
    # need_update() (defined elsewhere in the project).
    if need_update(fastq, (hashfile, keepfile)):
        sh("{0} -C {1} -k 20 -N 4 -x 2.5e8 -p --savehash {2} {3}".format(
            normalize, cutoff, hashfile, fastq))

    # Stage 2: abundance filtering of the kept reads using the saved hash.
    if need_update((hashfile, keepfile), abundfiltfile):
        sh("{0} {1} {2}".format(filter_abund, hashfile, keepfile))

    # Stage 3: second normalization pass with a cutoff lowered by 5.
    if need_update(abundfiltfile, seckeepfile):
        sh("{0} -C {1} -k 20 -N 4 -x 1e8 {2}".format(
            normalize, cutoff - 5, abundfiltfile))

    # Stage 4: re-pair the surviving reads, then split the pairs file.
    pair_args = [seckeepfile, "--base={0}".format(prefix + "_norm"), "--rclip=2"]
    pairsfile = pairinplace(pair_args)
    split([pairsfile])
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    # NOTE: the docstring above is passed to the option parser as the usage
    # text, so it is kept verbatim; implementation notes live in comments.
    #
    # args: list of command-line tokens (options plus one interleaved FASTQ
    # file, or two separate FASTQ files). Nothing is returned; the work is
    # done by shelling out to the khmer scripts and by the jcvi fastq helpers.
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    # Two files are first merged by shuffle(); one file is used directly.
    if len(args) == 2:
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"
    mints = 10000000
    # A user-supplied --tablesize arrives as a string and is passed through
    # unchanged to the first pass. Otherwise derive a size from the input
    # file size, rounded up to a multiple of `mints`. Floor division keeps
    # that rounding intact under Python 3 as well as Python 2.
    ts = opts.tablesize or ((getfilesize(fastq) // 16 // mints + 1) * mints)

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")

    # First normalization pass; also saves the k-mer hash for filtering.
    # Each stage is gated by need_update() (defined elsewhere in the project).
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    # Abundance filtering of the kept reads using the saved hash.
    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        # Optional second pass: lower cutoff, half the table size.
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            if opts.tablesize:
                # The option value is a string (e.g. "2.5e8"); convert it
                # before halving instead of dividing a str by an int.
                half_ts = float(ts) / 2
                if half_ts.is_integer():
                    half_ts = int(half_ts)
            else:
                half_ts = ts // 2
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, half_ts)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    # Paired-end only: re-pair the surviving reads, then split the pairs file.
    if PE:
        pairsfile = pairinplace([seckeepfile,
                                 "--base={0}".format(pf + "_norm"),
                                 "--rclip=2"])
        split([pairsfile])