def readlen(args):
    """
    %prog readlen fastqfile

    Calculate read length, will only try the first N reads. Output min, max,
    mean, and median for each file.
    """
    # NOTE: the docstring above is handed to OptionParser, so the `%prog`
    # line must stay intact.
    #
    # args: list of command-line tokens; exactly one positional FASTQ path is
    #       expected (otherwise the help is printed and the process exits).
    # Returns the maximum read length as an int, or 0 when the file does not
    # pass the `is_fastq` check.
    p = OptionParser(readlen.__doc__)
    p.set_firstN()
    p.add_option("--silent", default=False, action="store_true",
                 help="Do not print read length stats")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    f, = args
    if not is_fastq(f):
        # The placeholder used to be logged verbatim because `.format(f)` was
        # missing; fill in the offending file name.
        logging.debug("File `{0}` does not endswith .fastq or .fq".format(f))
        return 0

    s = calc_readlen(f, opts.firstN)
    if not opts.silent:
        # Single-argument print() behaves the same with or without
        # `print_function`, and on Python 3.
        print("\t".join(str(x) for x in (f, s.min, s.max, s.mean, s.median)))

    return int(s.max)
def readlen(args):
    """
    %prog readlen fastqfile

    Calculate read length, will only try the first N reads. Output min, max,
    mean, and median for each file.
    """
    # NOTE: the docstring above is handed to OptionParser, so the `%prog`
    # line must stay intact.
    #
    # args: list of command-line tokens; exactly one positional FASTQ path is
    #       expected (otherwise the help is printed and the process exits).
    # Returns the maximum read length as an int, or 0 when the suffix check is
    # enabled and the file does not pass `is_fastq`.
    p = OptionParser(readlen.__doc__)
    p.set_firstN()
    p.add_option("--silent", default=False, action="store_true",
                 help="Do not print read length stats")
    p.add_option("--nocheck", default=False, action="store_true",
                 help="Do not check file type suffix")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    f, = args
    # --nocheck bypasses the suffix test entirely
    if (not opts.nocheck) and (not is_fastq(f)):
        logging.debug("File `{}` does not endswith .fastq or .fq".format(f))
        return 0

    s = calc_readlen(f, opts.firstN)
    if not opts.silent:
        # Single-argument print() behaves the same with or without
        # `print_function`, and on Python 3 (the print statement does not).
        print("\t".join(str(x) for x in (f, s.min, s.max, s.mean, s.median)))

    return int(s.max)
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # The docstring is passed to OptionParser, so its wording is left alone.
    # Side effects: creates a `pairs-<aligner>` work folder, runs the aligner
    # inside it, and writes the proposed link names to `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # Pick the aligner wrapper and the matching pair-statistics routine
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps
        if aligner == "bowtie":
            from jcvi.apps.bowtie import align
        elif aligner == "bwa":
            from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    rclip_opt = "--rclip={0}".format(opts.rclip)

    lines = []
    for reads, prefix in iter_project(folder, 2):
        # Subsample the first N reads into the work folder
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + reads + ["-o", samplefq])

        # Align from within the work folder, then come back
        os.chdir(work)
        outfile, _logfile = align([ref, op.basename(samplefq)])
        _bedfile, stats = ps([outfile, rclip_opt])
        os.chdir(cwd)

        insert = stats.median
        if insert > 1000:
            tag = "MP"
        else:
            tag = "PE"

        # Keep the first two effective digits, bumping them up by one when
        # any of the remaining digits is non-zero
        digits = str(insert)
        head, tail = digits[:2], digits[2:]
        if tail and int(tail) != 0:
            head = str(int(head) + 1)
        lib = "{0}-{1}".format(tag, head + "0" * len(tail))

        clean_prefix = prefix.replace("-", "")
        for mate, xp in enumerate(reads, 1):
            if xp.endswith(".gz"):
                suffix = "fastq.gz"
            else:
                suffix = "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, clean_prefix, mate, suffix)
            lines.append("\t".join(str(x) for x in (xp, link)))

    write_file("f.meta", "\n".join(lines), tee=True)
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # The docstring is passed to OptionParser, so its wording is left alone.
    # Side effects: creates a `pairs-<aligner>` work folder, runs the aligner
    # inside it, and writes the proposed link names to `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    # Only these two aligner wrappers are wired up here
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    rclip_opt = "--rclip={0}".format(opts.rclip)

    lines = []
    for reads, prefix in iter_project(folder):
        # Subsample the first N reads of each of the two mates separately
        samplefq = []
        for idx in (0, 1):
            target = op.join(work, prefix + "_{0}.first.fastq".format(idx + 1))
            samplefq.append(target)
            first([str(opts.firstN)] + [reads[idx]] + ["-o", target])

        # Align from within the work folder, then come back
        os.chdir(work)
        outfile, _logfile = align([ref] + [op.basename(fq) for fq in samplefq])
        _bedfile, stats = ps([outfile, rclip_opt])
        os.chdir(cwd)

        insert = stats.median
        if insert > 1000:
            tag = "MP"
        else:
            tag = "PE"

        # Keep the first two effective digits, bumping them up by one when
        # any of the remaining digits is non-zero
        digits = str(insert)
        head, tail = digits[:2], digits[2:]
        if tail and int(tail) != 0:
            head = str(int(head) + 1)
        lib = "{0}-{1}".format(tag, head + "0" * len(tail))

        clean_prefix = prefix.replace("-", "")
        for mate, xp in enumerate(reads, 1):
            if xp.endswith(".gz"):
                suffix = "fastq.gz"
            else:
                suffix = "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, clean_prefix, mate, suffix)
            lines.append("\t".join(str(x) for x in (xp, link)))

    write_file("f.meta", "\n".join(lines), tee=True)
def readlen(args):
    """
    %prog readlen fastqfile

    Calculate read length, will only try the first N reads. Output min, max, and
    avg for each file.
    """
    # NOTE: the docstring above is handed to OptionParser, so the `%prog`
    # line must stay intact.
    #
    # args: list of command-line tokens; exactly one positional FASTQ path is
    #       expected (otherwise the help is printed and the process exits).
    # Returns the maximum read length as an int.
    p = OptionParser(readlen.__doc__)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    f, = args
    s = calc_readlen(f, opts.firstN)
    # Single-argument print() behaves the same with or without
    # `print_function`, and on Python 3 (the print statement does not).
    print("\t".join(str(x) for x in (f, s.min, s.max, s.mean)))

    return int(s.max)
def contamination(args):
    """
    %prog contamination Ecoli.fasta genome.fasta read.fastq

    Check read contamination on a folder of paired reads. Use bowtie2 to compare
    the reads against:
    1. Ecoli.fasta - this will tell us the lower bound of contamination
    2. genome.fasta - this will tell us the upper bound of contamination
    """
    # NOTE: the docstring above is handed to OptionParser, so the `%prog`
    # line must stay intact.
    #
    # Side effects: runs the bowtie `align` wrapper twice, writes a one-line
    # tab-separated summary to `<read.fastq>.Ecoli`, and reports the range on
    # stderr. Nothing is returned.
    from jcvi.apps.bowtie import BowtieLogFile, align

    p = OptionParser(contamination.__doc__)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ecoli, genome, fq = args
    firstN_opt = "--firstN={0}".format(opts.firstN)

    # Rate reported against E. coli is taken as the lower bound
    samfile, logfile = align([ecoli, fq, firstN_opt])
    lowerbound = BowtieLogFile(logfile).rate

    # Whatever does not hit the genome is taken as the upper bound
    samfile, logfile = align([genome, fq, firstN_opt])
    upperbound = 100 - BowtieLogFile(logfile).rate

    median = (lowerbound + upperbound) / 2

    lowerbound = "{0:.1f}".format(lowerbound)
    upperbound = "{0:.1f}".format(upperbound)
    median = "{0:.1f}".format(median)

    clogfile = fq + ".Ecoli"
    # Context manager guarantees the handle is closed even if the write fails;
    # explicit write() replaces the Python 2-only `print >>` form.
    with open(clogfile, "w") as fw:
        fw.write("\t".join((fq, lowerbound, median, upperbound)) + "\n")

    sys.stderr.write("{0}: Ecoli contamination rate {1}-{2}\n".
                     format(fq, lowerbound, upperbound))
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # NOTE: the docstring above is handed to OptionParser, so the `%prog`
    # line must stay intact.
    #
    # Returns the name of the annotated FASTA file that was written.
    # Intermediate files are handed to FileShredder at the end.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Template plus one read length of flank on either side
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    # Round up to the next multiple of 1000 reads
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
                              "--firstN={0}".format(opts.firstN)])
    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        # Sort key: position of the matched template in `bes`, then record id
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    # Open the output only once the records are ready, and let the context
    # manager close it even if the write raises.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    FileShredder([samfile, logfile, mapped, reads, fastafile,
                  qualfile, blastfile, pf])

    logging.debug("Annotated seqs (n={0}) written to `{1}`.".
                  format(len(recs), annotatedfasta))

    return annotatedfasta
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above is handed to OptionParser, so the `%prog`
    # line must stay intact.
    #
    # Returns a (samfile, logfile) tuple. When the output is already up to
    # date the command is not run and the same tuple is returned early.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true",
                 help="Keep the input read order [default: %default]")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)
    opts, args = p.parse_args(args)

    extra = opts.extra
    mo = opts.mateorientation
    if mo == "+-":
        orientation_flag = ""
    elif mo == "-+":
        orientation_flag = "--rf"
    else:
        orientation_flag = "--ff"
    if orientation_flag:
        # Separate the flag from any user-supplied extra options; plain
        # concatenation used to glue them together (e.g. "-k 2--rf").
        extra = " ".join((extra, orientation_flag)).strip()

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(
        readfile, dbfile, bowtie=True,
        mapped=mapped, unmapped=unmapped, bam=opts.bam
    )
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the bowtie2 log to stderr; the context manager closes the handle,
    # and write() replaces the Python 2-only `print >>` form.
    with open(logfile) as fp:
        sys.stderr.write(fp.read() + "\n")

    return samfile, logfile
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above is handed to OptionParser, so the `%prog`
    # line must stay intact.
    #
    # Returns a (samfile, logfile) tuple; with --null the returned samfile is
    # "/dev/null". When the output is already up to date the command is not
    # run and the tuple is returned early.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true",
                 help="Keep the input read order [default: %default]")
    p.add_option("--null", default=False, action="store_true",
                 help="Do not write to SAM/BAM output")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Query reads are FASTA")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)
    opts, args = p.parse_args(args)

    extra = opts.extra
    mo = opts.mateorientation
    if mo == '+-':
        orientation_flag = ""
    elif mo == '-+':
        orientation_flag = "--rf"
    else:
        orientation_flag = "--ff"
    if orientation_flag:
        # Separate the flag from any user-supplied extra options; plain
        # concatenation used to glue them together (e.g. "-k 2--rf").
        extra = " ".join((extra, orientation_flag)).strip()

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    fasta = opts.fasta
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(readfile, dbfile, bowtie=True,
                                            mapped=mapped, unmapped=unmapped,
                                            bam=opts.bam)
    logfile = prefix + ".log"
    # Quality offset is only needed (and only guessed) for FASTQ input
    if not fasta:
        offset = guessoffset([readfile])

    if not need_update(dbfile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    if fasta:
        cmd += " -f"
    else:
        cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"
    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the bowtie2 log to stderr; the context manager closes the handle
    with open(logfile) as fp:
        print(fp.read(), file=sys.stderr)

    return samfile, logfile