def bcf(args):
    """
    %prog bcf fastafile bamfiles > bcffile

    Run mpileup on bam files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is part of the program's output.
    #
    # args: list of command-line tokens: the reference FASTA followed by one
    # or more BAM files. Returns None; the pipeline output goes to the
    # --outfile target via `sh`.
    from jcvi.apps.grid import Jobs

    parser = OptionParser(bcf.__doc__)
    set_outfile(parser)
    opts, args = parser.parse_args(args)

    # Need the reference plus at least one BAM file.
    if len(args) < 2:
        sys.exit(not parser.print_help())

    fastafile, bamfiles = args[0], args[1:]

    # Every BAM whose name lacks ".sorted." is handed to `index --unique`,
    # one job per file (presumably this produces the ".sorted.bam" sibling;
    # confirm against `index`).
    pending = [bam for bam in bamfiles if ".sorted." not in bam]
    sorter = Jobs(index, args=[[[bam, "--unique"]] for bam in pending])
    sorter.run()

    # Normalise every name to end in ".sorted.bam": strip an existing
    # ".sorted" first so it is never doubled.
    bamfiles = [
        bam.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
        for bam in bamfiles
    ]

    pileup = "samtools mpileup -P ILLUMINA -E -ugDf"
    pileup += " {0} {1}".format(fastafile, " ".join(bamfiles))
    cmd = pileup + " | bcftools view -bcvg -"
    sh(cmd, outfile=opts.outfile)
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one positional FASTA file.
    # Returns None. Writes "<fasta prefix>.gff3" (or ".out" with --nogff3)
    # and, in gff3 mode, a "<prefix>.reformat.gff3" companion.
    parser = OptionParser(augustus.__doc__)
    parser.add_option("--species", default="maize",
                      help="Use species model for prediction")
    parser.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    parser.add_option("--nogff3", default=False, action="store_true",
                      help="Turn --gff3=off")
    parser.set_home("augustus")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    want_gff3 = not opts.nogff3
    if want_gff3:
        suffix = ".gff3"
    else:
        suffix = ".out"
    cfgfile = op.join(opts.augustus_home,
                      "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    # Split the input into one chunk per CPU inside a scratch directory.
    workdir = mkdtemp(dir=".")
    chunks = split([fastafile, workdir, str(opts.cpus)])

    # Bind the shared settings so each job only receives its chunk name.
    worker = partial(
        augustuswrap,
        species=opts.species,
        gff3=want_gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    runner = Jobs(worker, chunks.names)
    runner.run()

    # Per-chunk outputs share the chunk's prefix with the chosen suffix.
    chunk_outputs = [name.rsplit(".", 1)[0] + suffix for name in chunks.names]
    outfile = fastafile.rsplit(".", 1)[0] + suffix
    FileMerger(chunk_outputs, outfile=outfile).merge()
    shutil.rmtree(workdir)

    if not want_gff3:
        return

    from jcvi.annotation.reformat import augustus as reformat_augustus

    reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
    reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
def vcf(args):
    """
    %prog vcf fastafile bamfiles > out.vcf.gz

    Call SNPs on bam files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens: the reference FASTA followed by one
    # or more BAM files. Returns None. Writes a compressed VCF to --outfile
    # (default "out.vcf.gz") and then runs `bcftools index` on it.
    from jcvi.apps.grid import Jobs

    valid_callers = ("mpileup", "freebayes")
    p = OptionParser(vcf.__doc__)
    p.set_outfile(outfile="out.vcf.gz")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort the BAM files")
    p.add_option("--caller", default="mpileup", choices=valid_callers,
                 help="Use variant caller")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    caller = opts.caller

    unsorted = [x for x in bamfiles if ".sorted." not in x]
    if opts.nosort:
        # NOTE(review): this keeps only the inputs whose name lacks
        # ".sorted." and drops any that were already named as sorted.
        # Confirm that is intended.
        bamfiles = unsorted
    else:
        jargs = [[[x, "--unique"]] for x in unsorted]
        jobs = Jobs(index, args=jargs)
        jobs.run()
        # Normalise every name to end in ".sorted.bam" without doubling an
        # existing ".sorted".
        bamfiles = [
            x.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
            for x in bamfiles
        ]

    inputs = " {} {}".format(fastafile, " ".join(bamfiles))
    if caller == "mpileup":
        cmd = "bcftools mpileup -Ou -f" + inputs
        cmd += " | bcftools call -mv -Oz -o {}".format(opts.outfile)
    elif caller == "freebayes":
        # freebayes writes plain-text VCF to stdout. Compress it with
        # bcftools so the output matches the mpileup branch and the
        # `bcftools index` step below, which needs bgzip-compressed input.
        cmd = "freebayes -f" + inputs
        cmd += " | bcftools view -Oz -o {} -".format(opts.outfile)
    sh(cmd)

    cmd = "bcftools index {}".format(opts.outfile)
    sh(cmd)
def split(args):
    """
    %prog split pairs.fastq

    Split shuffled pairs into `.1.fastq` and `.2.fastq`, using `sed`. Can work
    on gzipped file.

    <http://seqanswers.com/forums/showthread.php?t=13776>
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one interleaved FASTQ file
    # (optionally ending in ".gz"). Returns None.
    from jcvi.apps.grid import Jobs

    parser = OptionParser(split.__doc__)
    set_grid(parser)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    pairsfastq = args[0]
    gz = pairsfastq.endswith(".gz")
    prefix = pairsfastq.replace(".gz", "").rsplit(".", 1)[0]
    reader = "zcat" if gz else "cat"

    # Each interleaved pair spans 8 lines: sed prints the 4-line record that
    # starts at line 1 (first mate) or line 5 (second mate) of every 8.
    mates = (
        (".1.fastq", " {0} | sed -ne '1~8{{N;N;N;p}}'"),
        (".2.fastq", " {0} | sed -ne '5~8{{N;N;N;p}}'"),
    )

    outputs = []
    commands = []
    for ext, template in mates:
        target = prefix + ext
        command = reader + template.format(pairsfastq)
        if gz:
            # Compressed input yields compressed outputs.
            command += " | gzip"
            target += ".gz"
        command += " > " + target
        outputs.append(target)
        commands.append(command)

    if opts.grid:
        for command in commands:
            sh(command, grid=True)
    else:
        # Run both extractions as parallel local jobs.
        runner = Jobs(target=sh, args=[(command, ) for command in commands])
        runner.run()

    checkShuffleSizes(outputs[0], outputs[1], pairsfastq)
def parallel_musclewrap(clustfile, cpus, minsamp=0):
    """
    Run `musclewrap` over `clustfile`, optionally fanned out over several jobs.

    With `cpus == 1` the file is processed directly and the return value of
    `musclewrap` is passed through. Otherwise the file is split into `cpus`
    pieces in a scratch directory, each piece is processed by its own job,
    the per-piece ".clustS" outputs are merged into the ".clustS"
    counterpart of `clustfile`, the scratch directory is removed, and None
    is returned.
    """
    worker = partial(musclewrap, minsamp=minsamp)
    if cpus == 1:
        return worker(clustfile)

    from jcvi.apps.grid import Jobs

    workdir = mkdtemp(dir=".")
    pieces = split([clustfile, workdir, str(cpus), "--format=clust"])

    runner = Jobs(worker, pieces.names)
    runner.run()

    # Each piece "X.clust" is expected to have produced "X.clustS".
    piece_outputs = [name.replace(".clust", ".clustS") for name in pieces.names]
    merged = clustfile.replace(".clust", ".clustS")
    FileMerger(piece_outputs, outfile=merged).merge()
    shutil.rmtree(workdir)
def mapped(args):
    """
    %prog mapped sam/bamfile

    Given an input sam/bam file, output a sam/bam file containing only the
    mapped reads. Optionally, extract the unmapped reads into a separate file
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one SAM or BAM file.
    # Returns None. Runs one `pysam.view` job for the mapped reads and, with
    # the unmapped option set, a second one for the unmapped reads.
    import pysam
    from jcvi.apps.grid import Jobs

    p = OptionParser(mapped.__doc__)
    p.set_sam_options(extra=False)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        # `print_help()` returns None, so `not ...` gives a non-zero exit
        # status on a usage error, matching the other commands.
        sys.exit(not p.print_help())

    samfile, = args

    # Input side: SAM input needs -S; BAM input needs no extra flag.
    if samfile.endswith(".sam"):
        oext, mopts = ".sam", ["-S"]
    else:
        oext, mopts = ".bam", []

    # Output side: -b writes BAM, -h writes SAM with header.
    flag, ext = ("-b", ".bam") if opts.bam else ("-h", ".sam")
    mopts.append(flag)

    if opts.uniq:
        mopts.append("-q1")
        ext = ".uniq{0}".format(ext)

    view_opts = []
    if opts.unmapped:
        # Copy the shared flags before adding the unmapped-only filter (-f4).
        uopts = list(mopts)
        uoutfile = samfile.replace(oext, ".unmapped{0}".format(ext))
        uopts.extend(["-f4", samfile, "-o{0}".format(uoutfile)])
        view_opts.append(uopts)

    outfile = samfile.replace(oext, ".mapped{0}".format(ext))
    mopts.extend(["-F4", samfile, "-o{0}".format(outfile)])
    view_opts.append(mopts)

    for vo in view_opts:
        logging.debug('samtools view {0}'.format(" ".join(vo)))

    # Pass concrete tuples rather than one-shot generator objects as the
    # positional arguments of each job.
    jobs = Jobs(pysam.view, [tuple(vo) for vo in view_opts])
    jobs.run()
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one positional FASTA file.
    # Returns None. Writes "<fasta prefix>.gff3" (or ".out" with --nogff3)
    # and, in gff3 mode, a "<prefix>.reformat.gff3" companion.
    parser = OptionParser(augustus.__doc__)
    parser.add_option("--species", default="maize",
                      help="Use species model for prediction")
    parser.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    parser.add_option("--nogff3", default=False, action="store_true",
                      help="Turn --gff3=off")
    parser.set_home("augustus")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    want_gff3 = not opts.nogff3
    if want_gff3:
        suffix = ".gff3"
    else:
        suffix = ".out"
    cfgfile = op.join(opts.augustus_home,
                      "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    # Split the input into one chunk per CPU inside a scratch directory.
    workdir = mkdtemp(dir=".")
    chunks = split([fastafile, workdir, str(opts.cpus)])

    # Bind the shared settings so each job only receives its chunk name.
    worker = partial(
        augustuswrap,
        species=opts.species,
        gff3=want_gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    runner = Jobs(worker, chunks.names)
    runner.run()

    # Per-chunk outputs share the chunk's prefix with the chosen suffix.
    chunk_outputs = [name.rsplit(".", 1)[0] + suffix for name in chunks.names]
    outfile = fastafile.rsplit(".", 1)[0] + suffix
    FileMerger(chunk_outputs, outfile=outfile).merge()
    shutil.rmtree(workdir)

    if not want_gff3:
        return

    from jcvi.annotation.reformat import augustus as reformat_augustus

    reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
    reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
def mdownload(args):
    """
    %prog mdownload links.txt

    Multiple download a list of files. Use formats.html.links() to extract the
    links file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one text file with one link
    # per line. Returns None; starts one `download` job per link.
    from jcvi.apps.grid import Jobs

    p = OptionParser(mdownload.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    linksfile, = args

    # Close the file deterministically, and skip blank lines so no job is
    # started with an empty URL.
    with open(linksfile) as fp:
        links = [(row.strip(), ) for row in fp if row.strip()]

    j = Jobs(download, links)
    j.run()
def mdownload(args):
    """
    %prog mdownload links.txt

    Multiple download a list of files. Use formats.html.links() to extract the
    links file.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one text file with one link
    # per line. Returns None; starts one `download` job per link.
    from jcvi.apps.grid import Jobs

    p = OptionParser(mdownload.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    linksfile, = args

    # Close the file deterministically, and skip blank lines so no job is
    # started with an empty URL.
    with open(linksfile) as fp:
        links = [(row.strip(),) for row in fp if row.strip()]

    j = Jobs(download, links)
    j.run()
def vcf(args):
    """
    %prog vcf fastafile bamfiles > out.vcf.gz

    Call SNPs on bam files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens: the reference FASTA followed by one
    # or more BAM files. Returns None; the caller's stdout is sent to the
    # --outfile target (default "out.vcf.gz") via `sh`.
    from jcvi.apps.grid import Jobs

    valid_callers = ("mpileup", "freebayes")
    parser = OptionParser(vcf.__doc__)
    parser.set_outfile(outfile="out.vcf.gz")
    parser.add_option("--nosort", default=False, action="store_true",
                      help="Do not sort the BAM files")
    parser.add_option("--caller", default="mpileup", choices=valid_callers,
                      help="Use variant caller [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    fastafile, bamfiles = args[0], args[1:]
    caller = opts.caller

    pending = [bam for bam in bamfiles if ".sorted." not in bam]
    if opts.nosort:
        # Only the inputs not already named ".sorted." are kept.
        bamfiles = pending
    else:
        sorter = Jobs(index, args=[[[bam, "--unique"]] for bam in pending])
        sorter.run()
        # Normalise every name to end in ".sorted.bam" without doubling an
        # existing ".sorted".
        bamfiles = [
            bam.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
            for bam in bamfiles
        ]

    inputs = " {0} {1}".format(fastafile, " ".join(bamfiles))
    if caller == "mpileup":
        cmd = "samtools mpileup -E -uf" + inputs + " | bcftools call -vmO v"
    elif caller == "freebayes":
        cmd = "freebayes -f" + inputs

    sh(cmd, outfile=opts.outfile)
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Reads its arguments from sys.argv (via `parse_args()` with no
    # arguments). Returns None. For "BLASTN-" output the work is spread over
    # `cpus` jobs that share one output handle and a lock; for any other
    # format one `lastz_2bit` task per database record is mapped over a
    # process pool, with outputs written under "outdir".
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
            help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2:
        # `print_help()` returns None, so `not ...` gives a non-zero exit
        # status on a usage error, matching the other commands.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    # Local names avoid shadowing the builtins `format` and `id`.
    outfmt = opts.format
    blastline = (outfmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # Tuple layout:
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        jobargs = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir,
                              "{0}.{1}.{2}".format(apf, seqid, outfmt))
            jobargs.append((bfasta, afasta_fn, outfile,
                            lastz_bin, extra, mask, outfmt))

        pool = Pool(cpus)
        try:
            pool.map(lastz_2bit, jobargs)
        finally:
            # Release the worker processes instead of leaving them to
            # interpreter shutdown.
            pool.close()
            pool.join()

        return

    lock = Lock()

    # `range` instead of the Python-2-only `xrange`; the list is tiny.
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
             lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens: the subject (database) FASTA and the
    # query FASTA. Returns None. Raises Exception when --eval is combined
    # with more than one cpu.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--path", help="specify LAST path")
    p.add_option("--format", default="blast", choices=supported_formats,
            help="Output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--eval", default=False, action="store_true",
            help="Use lastex to recalculate E-value [default: %default]")

    set_params(p)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of `raise`: valid on Python 2 and 3 (the comma form is a
        # SyntaxError on Python 3).
        raise Exception("Option --eval cannnot work with multiple threads")

    # Prefix each binary with --path when one was given.
    path = opts.path
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj",
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    if opts.format == "maf":
        cmd = 'echo "##maf version=1"'
        sh(cmd)

    cmd = "{0} -u 0".format(lastal_bin)
    # lastal's -f value is the position of the format in `supported_formats`.
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Use the same lastdb binary as for the subject so --path is honored.
        run_lastdb(infile=query, outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w")
    lock = Lock()

    # `range` instead of the Python-2-only `xrange`; the list is tiny.
    args = [(k + 1, cpus, out_fh, cmd, query, lock)
            for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens: the subject (database) FASTA and the
    # query FASTA. Returns None. Raises Exception when --eval is combined
    # with more than one cpu. `supported_formats` is not defined in this
    # function and is expected to come from the enclosing module.
    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
            help="invoke -c in lastdb [default: %default]")
    p.add_option("--format", default="blast", choices=supported_formats,
            help="Output format [default: %default]")
    p.add_option("--eval", default=False, action="store_true",
            help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of `raise`: valid on Python 2 and 3 (the comma form is a
        # SyntaxError on Python 3).
        raise Exception("Option --eval cannnot work with multiple threads")

    # Prefix each binary with --path when one was given.
    path = opts.path
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    oappend = False
    if opts.format == "maf":
        # Write the MAF header first; alignments are then appended to it.
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    # lastal's -f value is the position of the format in `supported_formats`.
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Use the same lastdb binary as for the subject so --path is honored.
        run_lastdb(infile=query, outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)
    if out_fh is None:
        # must_open returned no handle, so there is nothing to write.
        return

    lock = Lock()

    # `range` instead of the Python-2-only `xrange`; the list is tiny.
    args = [(k + 1, cpus, out_fh, cmd, query, lock)
            for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens: the subject (database) FASTA and the
    # query FASTA. Returns None. Raises Exception when --eval is combined
    # with more than one cpu.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
            help="invoke -c in lastdb [default: %default]")
    p.add_option("--format", default="blast", choices=supported_formats,
            help="Output format [default: %default]")
    p.add_option("--eval", default=False, action="store_true",
            help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of `raise`: valid on Python 2 and 3 (the comma form is a
        # SyntaxError on Python 3).
        raise Exception("Option --eval cannnot work with multiple threads")

    # Prefix each binary with --path when one was given.
    path = opts.path
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    oappend = False
    if opts.format == "maf":
        # Write the MAF header first; alignments are then appended to it.
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    # lastal's -f value is the position of the format in `supported_formats`.
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Use the same lastdb binary as for the subject so --path is honored.
        run_lastdb(infile=query, outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)
    if out_fh is None:
        # must_open returned no handle, so there is nothing to write.
        return

    lock = Lock()

    # `range` instead of the Python-2-only `xrange`; the list is tiny.
    args = [(k + 1, cpus, out_fh, cmd, query, lock)
            for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
def dump(args):
    """
    %prog dump fastbfile

    Export ALLPATHS fastb file to fastq file. Use --dir to indicate a previously
    run allpaths folder.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one fastb file.
    # Returns None. With --dir the work is delegated to `export_fastq`.
    # Otherwise the reads are split by library, converted to FASTQ in
    # parallel, and the resulting A/B files are renamed to 1/2.
    p = OptionParser(dump.__doc__)
    p.add_option("--dir",
                help="Working directory [default: %default]")
    p.add_option("--nosim", default=False, action="store_true",
                 help="Do not simulate qual to 50 [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastbfile, = args
    d = opts.dir
    if d:
        from jcvi.assembly.preprocess import export_fastq
        export_fastq(d, fastbfile)
        return

    sim = not opts.nosim
    # "j" for files with "jump" in the name, "f" otherwise.
    pf = "j" if "jump" in fastbfile else "f"

    # Remove stale stats so the file read below comes from this run.
    statsfile = "{0}.lib_stats".format(pf)
    if op.exists(statsfile):
        os.remove(statsfile)

    cmd = "SplitReadsByLibrary READS_IN={0}".format(fastbfile)
    cmd += " READS_OUT={0} QUALS=True".format(pf)
    sh(cmd)

    libs = []
    with open(statsfile) as fp:
        # Skip two rows. `next(fp, None)` works on Python 2 and 3 and
        # tolerates a file shorter than two lines.
        next(fp, None)
        next(fp, None)
        for row in fp:
            fields = row.split()
            if not fields:
                continue

            libname = fields[0]
            if libname == "Unpaired":
                continue

            libs.append(libname)

    logging.debug("Found libraries: {0}".format(",".join(libs)))

    cmds = []
    for libname in libs:
        cmd = "FastbQualbToFastq"
        cmd += " HEAD_IN={0}.{1}.AB HEAD_OUT={1}".format(pf, libname)
        cmd += " PAIRED=True PHRED_OFFSET=33"
        if sim:
            cmd += " SIMULATE_QUALS=True"
        if pf == 'j':
            cmd += " FLIP=True"

        cmds.append((cmd, ))

    m = Jobs(target=sh, args=cmds)
    m.run()

    # Rename <lib>.A/.B.fastq to <lib>.1/.2.fastq.
    for libname in libs:
        cmd = "mv {0}.A.fastq {0}.1.fastq".format(libname)
        sh(cmd)
        cmd = "mv {0}.B.fastq {0}.2.fastq".format(libname)
        sh(cmd)
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Reads its arguments from sys.argv (via `parse_args()` with no
    # arguments). Returns None. Builds one BLAST+ command line and runs it
    # once per query chunk, all jobs sharing one output handle and a lock.
    parser = OptionParser(main.__doc__)
    parser.add_option("--format", default=" \'6 qseqid sseqid pident length " \
            "mismatch gapopen qstart qend sstart send evalue bitscore\' ",
            help="0-11, learn more with \"blastp -help\". [default: %default]")
    parser.add_option("--path", dest="blast_path", default=None,
            help="specify BLAST+ path including the program name")
    parser.add_option("--prog", dest="blast_program", default="blastp",
            help="specify BLAST+ program to use. See complete list here: " \
            "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
            " [default: %default]")
    parser.set_align(evalue=.01)
    parser.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    parser.set_cpus()
    parser.add_option("--nprocs", default=1, type="int",
            help="number of BLAST processes to run in parallel. " + \
            "split query.fa into `nprocs` chunks, " + \
            "each chunk uses -num_threads=`cpus`")
    parser.set_params()
    parser.set_outfile()
    opts, args = parser.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not parser.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    # --path may name the program itself or only its directory; in the
    # latter case the program name is appended.
    blast_program = opts.blast_program
    blast_bin = opts.blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs, cpus = opts.nprocs, opts.cpus
    queries = [afasta_fn]
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        # One query chunk per process, written under "outdir".
        queries = split([afasta_fn, "outdir", str(nprocs)]).names

    is_protein = op.basename(blast_bin) in ("blastp", "blastx")
    dbtype = "prot" if is_protein else "nucl"

    db = bfasta_fn
    if is_protein:
        nin = db + ".pin"
    else:
        # Prefer "<db>.00.nin" when it exists (presumably the first volume of
        # a multi-volume database; confirm), else fall back to "<db>.nin".
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    pieces = [
        "{0} -db {1} -outfmt {2}".format(blast_bin, bfasta_fn, opts.format),
        " -evalue {0} -max_target_seqs {1}".format(opts.evalue, opts.best),
        " -num_threads {0}".format(cpus),
    ]
    extra = opts.extra
    if extra:
        pieces.append(" " + extra.strip())
    blast_cmd = "".join(pieces)

    jobargs = [(out_fh, blast_cmd, query, lock) for query in queries]
    runner = Jobs(target=blastplus, args=jobargs)
    runner.run()