def checkShuffleSizes(p1, p2, pairsfastq, extra=0):
    """
    Check that the size of `pairsfastq` equals the sizes of `p1` and `p2`
    plus `extra`, as reported by `getfilesize()`.

    Args:
        p1: path of the first input file.
        p2: path of the second input file.
        pairsfastq: path of the combined file built from `p1` and `p2`.
        extra: additional size expected in `pairsfastq` on top of
            `p1` + `p2` (default 0).

    Raises:
        AssertionError: if the sizes do not add up.
    """
    from jcvi.apps.base import getfilesize

    pairssize = getfilesize(pairsfastq)
    p1size = getfilesize(p1)
    p2size = getfilesize(p2)
    if pairssize != p1size + p2size + extra:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type stays the same.
        raise AssertionError(
            "The sizes do not add up: {0} + {1} + {2} != {3}".format(
                p1size, p2size, extra, pairssize
            )
        )
def checkShuffleSizes(p1, p2, pairsfastq, extra=0):
    """
    Check that the size of `pairsfastq` equals the sizes of `p1` and `p2`
    plus `extra`, as reported by `getfilesize()`.

    Args:
        p1: path of the first input file.
        p2: path of the second input file.
        pairsfastq: path of the combined file built from `p1` and `p2`.
        extra: additional size expected in `pairsfastq` on top of
            `p1` + `p2` (default 0).

    Raises:
        AssertionError: if the sizes do not add up.
    """
    from jcvi.apps.base import getfilesize

    pairssize = getfilesize(pairsfastq)
    p1size = getfilesize(p1)
    p2size = getfilesize(p2)
    if pairssize != p1size + p2size + extra:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type stays the same.
        raise AssertionError(
            "The sizes do not add up: {0} + {1} + {2} != {3}".format(
                p1size, p2size, extra, pairssize
            )
        )
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # Flow: parse options -> size the hash from total input size / coverage ->
    # run `jellyfish count` -> unless --nohist, run `jellyfish histo`.
    # Each step is skipped when need_update() reports its output as current.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is decided from the first input only.
    gzip = fq.endswith(".gz")

    # Floor division keeps the `-s` value an integer on Python 3 as well
    # (plain `/` would produce a float such as 1234.5 there).
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".format(
        human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    # Keep the individual paths for the freshness check; the space-joined
    # string below is only meaningful on the shell command line.
    inputs = tuple(fastqfiles)
    fastqfiles = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Decompress on the fly and feed jellyfish through stdin.
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    if need_update(inputs, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    # NOTE(review): thread count is hard-coded to 64 here, unlike the count
    # step above which honors opts.cpus.
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # Flow: parse options -> size the hash from total input size / coverage ->
    # run `jellyfish count` -> unless --nohist, run `jellyfish histo`.
    # Each step is skipped when need_update() reports its output as current.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.add_option(
        "--coverage",
        default=40,
        type="int",
        help="Expected sequence coverage",
    )
    p.add_option("--prefix", default="jf", help="Database prefix")
    p.add_option(
        "--nohist",
        default=False,
        action="store_true",
        help="Do not print histogram",
    )
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is decided from the first input only.
    gzip = fq.endswith(".gz")

    # Floor division keeps the `-s` value an integer on Python 3 as well
    # (plain `/` would produce a float such as 1234.5 there).
    hashsize = totalfilesize // coverage
    logging.debug(
        "Total file size: {0}, hashsize (-s): {1}".format(
            human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize
        )
    )

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    # Keep the individual paths for the freshness check; the space-joined
    # string below is only meaningful on the shell command line.
    inputs = tuple(fastqfiles)
    fastqfiles = " ".join(fastqfiles)

    # Executable is resolved under the directory given by the jellyfish
    # home option registered via set_home() above.
    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Decompress on the fly and feed jellyfish through stdin.
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    if need_update(inputs, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    # NOTE(review): thread count is hard-coded to 64 here, unlike the count
    # step above which honors opts.cpus.
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
def is_matching_gz(origfile, gzfile):
    """
    Return True when both `origfile` and `gzfile` exist and getfilesize()
    reports the same size for each; return False otherwise.
    """
    # Existence is tested in the same order as before (origfile, then gzfile);
    # sizes are only queried once both paths are known to exist.
    both_present = op.exists(origfile) and op.exists(gzfile)
    if both_present:
        return getfilesize(origfile) == getfilesize(gzfile)
    return False
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    # Pipeline: (optional shuffle of two inputs) -> normalize-by-median.py ->
    # filter-abund.py -> (optional second normalize pass) -> for paired-end
    # data, pairinplace() + split(). Steps are skipped when need_update()
    # reports their outputs as current.
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    if len(args) == 2:
        # Two separate files: combine them into one file first.
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"
    mints = 10000000
    # Default table size: filesize / 16, rounded up to the next multiple of
    # `mints`. Floor division keeps this an integer on Python 3 too (plain
    # `/` would yield an un-rounded float there). A user-supplied --tablesize
    # is passed through unchanged.
    ts = opts.tablesize or ((getfilesize(fastq) // 16 // mints + 1) * mints)

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            # --tablesize declares no type, so it presumably arrives as a
            # string; coerce before halving so `--tablesize N --npass 2`
            # does not fail on str / int.
            half_ts = int(float(ts)) // 2
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, half_ts)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    if PE:
        pairsfile = pairinplace(
            [seckeepfile, "--base={0}".format(pf + "_norm"), "--rclip=2"]
        )
        split([pairsfile])