def aln(args):
    """
    %prog aln database.fasta *.fastq

    Wrapper for `bwa aln` except this will run over a set of files.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens -- the database FASTA followed by one or more
    #       read files.
    # Prints the help text and exits with a non-zero status when fewer than two
    # positional arguments are given.
    p = OptionParser(aln.__doc__)
    # type="int" so that a user-supplied value has the same type as the default.
    p.add_option("--cpus", default=32, type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        # print_help() returns None, so `not ...` yields a failing exit status.
        sys.exit(not p.print_help())

    grid = opts.grid
    dbfile, readfiles = args[0], args[1:]

    # NOTE(review): opts.extra (presumably registered by set_params) is parsed
    # but never forwarded to check_aln -- confirm whether that is intended.
    check_index(dbfile, grid=grid)
    for readfile in readfiles:
        check_aln(dbfile, readfile, grid=grid, cpus=opts.cpus)
def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens -- the database FASTA and one long-read file.
    # Prints the help text and exits with a non-zero status unless exactly two
    # positional arguments are given. Returns None without running anything if
    # the output SAM file already exists.
    p = OptionParser(bwasw.__doc__)
    # Default of 32 reproduces the thread count that used to be hard-coded.
    p.add_option("--cpus", default=32, type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a failing exit status.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid
    dbfile, readfile = args

    check_index(dbfile, grid=grid)
    # NOTE(review): the command below is given the read file directly and does
    # not use the value returned by check_aln; confirm this step is needed.
    check_aln(dbfile, readfile, grid=grid)

    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t {0} {1} {2} ".format(opts.cpus, dbfile, readfile)
    cmd += "{0}".format(extra)
    sh(cmd, grid=grid, outfile=samfile)
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens -- the reference followed by one or more read
    #       files (exactly two when -s is non-zero).
    # Exits with a message when the reference is missing or when paired-end
    # mode is requested without exactly two read files. These checks are
    # explicit rather than `assert`s so they survive `python -O`.
    valid_orientations = ("fb", "bf", "ff", "bb")

    p = OptionParser(map.__doc__)
    p.add_option("-o", dest="outfile", default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
                 help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    # `choices` lets the parser reject an unsupported orientation.
    p.add_option("--orientations", default="fb", choices=valid_orientations,
                 help="The reads have the orientations [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # Copy the license file from the home directory if it is not already in
    # the working directory.
    license_file = "license.properties"
    if not op.exists(license_file):
        sh("cp ~/{0} .".format(license_file))

    ref = args[0]
    if not op.exists(ref):
        sys.exit("Reference file `{0}` does not exist".format(ref))
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)

    if size == 0:
        cmd += fastqs
    else:
        if len(fastqfiles) != 2:
            sys.exit("Paired end mode (-s) requires exactly two read files")
        # Accept inserts within a quarter of the nominal size on either side;
        # `//` keeps the bounds integral on any Python version.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens -- the reference followed by one or more read
    #       files (exactly two when -s is non-zero).
    # Exits with a message when the reference is missing or when paired-end
    # mode is requested without exactly two read files. These checks are
    # explicit rather than `assert`s so they survive `python -O`.
    valid_orientations = ("fb", "bf", "ff", "bb")

    p = OptionParser(map.__doc__)
    p.add_option("-o", dest="outfile", default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
                 help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    # `choices` lets the parser reject an unsupported orientation.
    p.add_option("--orientations", default="fb", choices=valid_orientations,
                 help="The reads have the orientations [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # NOTE(review): presumably this (re)creates license.properties in the
    # working directory from CLCLICENSE -- confirm against write_file.
    write_file("license.properties", CLCLICENSE)

    ref = args[0]
    if not op.exists(ref):
        sys.exit("Reference file `{0}` does not exist".format(ref))
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)

    if size == 0:
        cmd += fastqs
    else:
        if len(fastqfiles) != 2:
            sys.exit("Paired end mode (-s) requires exactly two read files")
        # Accept inserts within a quarter of the nominal size on either side;
        # `//` keeps the bounds integral on any Python version.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
def sampe(args):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens -- the database FASTA and the two mate files.
    # Prints the help text and exits with a non-zero status unless exactly
    # three positional arguments are given. Returns None without running
    # anything if the output file already exists.
    p = OptionParser(sampe.__doc__)
    p.add_option("--bam", default=False, action="store_true",
                 help="write to bam file [default: %default]")
    # type="int" so that a user-supplied value has the same type as the default.
    p.add_option("--cpus", default=32, type="int",
                 help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # print_help() returns None, so `not ...` yields a failing exit status.
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid
    dbfile, read1file, read2file = args

    check_index(dbfile, grid=grid)
    sai1file = check_aln(dbfile, read1file, grid=grid, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, grid=grid, cpus=opts.cpus)

    # The output is named after the first mate; the extension follows --bam.
    prefix = read1file.rsplit(".", 1)[0]
    outfile = prefix + (".bam" if opts.bam else ".sam")
    if op.exists(outfile):
        logging.error("`{0}` exists. `bwa sampe` already run.".format(outfile))
        return

    cmd = "bwa sampe {0} {1} {2} {3} {4} ".format(
        dbfile, sai1file, sai2file, read1file, read2file)
    cmd += "{0}".format(extra)
    if opts.bam:
        # Pipe the SAM stream through samtools to produce the BAM output.
        cmd += " | samtools view -bS -F 4 - "

    sh(cmd, grid=grid, outfile=outfile)
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bwa index`. Same interface, only adds grid submission.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens -- a single database FASTA file.
    # Prints the help text and exits with a non-zero status unless exactly one
    # positional argument is given.
    p = OptionParser(index.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None, so `not ...` yields a failing exit status.
        sys.exit(not p.print_help())

    dbfile, = args
    # NOTE(review): opts.extra (presumably registered by set_params) is parsed
    # but not forwarded to check_index -- confirm whether that is intended.
    check_index(dbfile, grid=opts.grid)
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # Reads its arguments from sys.argv (parse_args() is called without an
    # explicit list). Prints the help text and exits with a non-zero status
    # unless exactly two positional arguments are given; exits with a message
    # when an input file is missing or the LASTZ path does not end in "lastz".
    p = OptionParser(main.__doc__)

    supported_formats = (
        "lav", "lav+text", "axt", "axt+", "maf", "maf+", "maf-", "sam",
        "softsam", "sam-", "softsam-", "cigar", "BLASTN", "BLASTN-",
        "differences", "rdotplot", "text",
    )

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
                 help="treat lower-case letters as mask info [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a failing exit status.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    # Explicit checks (rather than asserts) so they survive `python -O`.
    for fn in (afasta_fn, bfasta_fn):
        if not op.exists(fn):
            sys.exit("File `{0}` does not exist".format(fn))

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        sys.stderr.write("Running jobs on JCVI grid\n")

    extra = opts.extra

    lastz_bin = opts.lastz_path or "lastz"
    if not lastz_bin.endswith("lastz"):
        sys.exit("You need to include lastz in your path")

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # One job tuple per database record:
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        jobs = []
        for bid in bids:
            bfasta = "/".join((bfasta_2bit, bid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, bid, fmt))
            jobs.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in jobs]
            g = Grid(cmds)
            g.run()
            g.writestatus()
        else:
            # Only use the local worker pool when the jobs were not handed to
            # the grid; otherwise the same jobs would be dispatched twice.
            pool = Pool(cpus)
            pool.map(lastz_2bit, jobs)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                      lock, lastz_bin, extra, mask, grid)
                for k in range(cpus)]
        mkdir(outdir)
        outfiles = [op.join(outdir, "out.{0}.lastz").format(i)
                    for i in range(len(cmds))]
        g = Grid(cmds, outfiles=outfiles)
        g.run()
        g.writestatus()
    else:
        jobs = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                 lock, lastz_bin, extra, mask) for k in range(cpus)]
        g = Jobs(target=lastz, args=jobs)
        g.run()
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens; the inputs come from exactly one of
    #       --input_file_list, --input_folder or --input_file (checked in that
    #       order of precedence).
    # Exits with status 1 on invalid parameter values or a missing list
    # file/folder. Individual input files that do not exist are skipped with a
    # warning.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
            "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
            help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
            help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
            help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
            "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n" +\
            "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n" +\
            "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n" +\
            "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
            help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    set_grid(p)
    set_params(p)

    opts, args = p.parse_args(args)

    # All three options are ints with int defaults, so compare directly: a
    # truthiness guard would let a value of 0 slip past the lower bound.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit(1)
    elif opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit(1)
    elif opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit(1)

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit(1)
        with open(opts.input_file_list, 'r') as f:
            # Skip blank lines so they are not reported as missing files.
            file_list = [line.strip() for line in f if line.strip()]
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit(1)
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if not file_list:
        # Reported as a warning only, so the exit status stays 0.
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fasta in file_list:
        if not op.isfile(fasta):
            logging.warning("Input file {0} does not exist".format(fasta))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fasta, opts.max_gap_len, opts.ovl_pct_id,
            opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        # The command's output is captured next to the input file.
        logfile = "{0}.{1}.log".format(fasta, opts.prefix)

        sh(cmd, outfile=logfile, grid=opts.grid)
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # Reads its arguments from sys.argv (parse_args() is called without an
    # explicit list). Prints the help text and exits with a non-zero status
    # unless exactly two positional arguments are given; exits with a message
    # when an input file is missing or the LASTZ path does not end in "lastz".
    p = OptionParser(main.__doc__)

    supported_formats = (
        "lav", "lav+text", "axt", "axt+", "maf", "maf+", "maf-", "sam",
        "softsam", "sam-", "softsam-", "cigar", "BLASTN", "BLASTN-",
        "differences", "rdotplot", "text",
    )

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
                 help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
                 help="Use options tuned for close comparison [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a failing exit status.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    # Explicit checks (rather than asserts) so they survive `python -O`.
    for fn in (afasta_fn, bfasta_fn):
        if not op.exists(fn):
            sys.exit("File `{0}` does not exist".format(fn))

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        sys.stderr.write("Running jobs on JCVI grid\n")

    extra = opts.extra
    if opts.similar:
        # NOTE(review): plain concatenation -- assumes similarOptions carries
        # its own leading separator; confirm against its definition.
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    if not lastz_bin.endswith("lastz"):
        sys.exit("You need to include lastz in your path")

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # One job tuple per database record:
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        jobs = []
        for bid in bids:
            bfasta = "/".join((bfasta_2bit, bid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, bid, fmt))
            jobs.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in jobs]
            g = Grid(cmds)
            g.run()
            g.writestatus()
        else:
            # Only use the local worker pool when the jobs were not handed to
            # the grid; otherwise the same jobs would be dispatched twice.
            pool = Pool(cpus)
            pool.map(lastz_2bit, jobs)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                      lock, lastz_bin, extra, mask, grid)
                for k in range(cpus)]
        mkdir(outdir)
        outfiles = [op.join(outdir, "out.{0}.lastz").format(i)
                    for i in range(len(cmds))]
        g = Grid(cmds, outfiles=outfiles)
        g.run()
        g.writestatus()
    else:
        jobs = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                 lock, lastz_bin, extra, mask) for k in range(cpus)]
        g = Jobs(target=lastz, args=jobs)
        g.run()
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # The docstring above doubles as the usage text handed to OptionParser, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens -- the subject (database) FASTA and the query
    #       FASTA.
    # Prints the help text and exits with a non-zero status unless exactly two
    # positional arguments are given. Raises Exception when --eval is combined
    # with more than one cpu.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--path", help="specify LAST path")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")

    set_params(p)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(binary):
        # Resolve a LAST executable under --path when given, else rely on PATH.
        return op.join(path, binary) if path else binary

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    if opts.format == "maf":
        # NOTE(review): this header is emitted through `sh` before the output
        # handle is opened -- confirm it ends up where --outfile expects it.
        cmd = 'echo "##maf version=1"'
        sh(cmd)

    cmd = "{0} -u 0".format(lastal_bin)
    # The position in supported_formats is used as the value for lastal's -f.
    cmd += " -f {0}".format(supported_formats.index(opts.format))
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Use the same lastdb binary as for the subject so --path is honoured.
        run_lastdb(infile=query, outfile=querydb + ".prj", lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w")
    lock = Lock()

    jobs = [(k + 1, cpus, out_fh, cmd, query, lock) for k in range(cpus)]
    g = Jobs(target=last, args=jobs)
    g.run()