def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus two positionals).
    # Returns None. The command output is directed to `<read prefix>.sam`
    # through sh(); nothing is run when that file already exists.
    p = OptionParser(bwasw.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` gives a non-zero exit
        # status for a usage error.
        sys.exit(not p.print_help())

    grid = opts.grid

    dbfile, readfile = args
    # Return values are not needed here; presumably these calls make sure the
    # index / alignment files exist -- verify against their definitions.
    check_index(dbfile, grid=grid)
    check_aln(dbfile, readfile, grid=grid)

    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t 32 {0} {1} ".format(dbfile, readfile)
    cmd += "{0}".format(opts.extra)
    sh(cmd, grid=grid, outfile=samfile)
def txt(args):
    """
    %prog txt casfile

    convert binary CAS file to tabular output using CLC assembly_table
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus one casfile).
    # Returns the name of the text file the table is written to.
    p = OptionParser(txt.__doc__)
    p.add_option("-m", dest="multi", default=False, action="store_true",
            help="report multi-matches [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    grid = opts.grid

    casfile, = args
    assert op.exists(casfile), "`{0}` not found".format(casfile)

    # Swap only a trailing `.cas`; when the suffix is absent append `.txt`
    # so the output name can never be the same as the input file.
    if casfile.endswith(".cas"):
        txtfile = casfile[:-len(".cas")] + ".txt"
    else:
        txtfile = casfile + ".txt"

    cmd = "assembly_table -n -s -p "
    if opts.multi:
        cmd += "-m "
    cmd += casfile
    sh(cmd, grid=grid, outfile=txtfile)

    return txtfile
def aln(args):
    """
    %prog aln database.fasta *.fastq

    Wrapper for `bwa aln` except this will run over a set of files.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options, database, one or more
    # read files). Returns None.
    p = OptionParser(aln.__doc__)
    # type="int" so that a value given on the command line has the same type
    # as the default.
    p.add_option("--cpus", default=32, type="int",
            help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) < 2:
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    grid = opts.grid

    dbfile, readfiles = args[0], args[1:]
    check_index(dbfile, grid=grid)
    for readfile in readfiles:
        check_aln(dbfile, readfile, grid=grid, cpus=opts.cpus)
def split(args):
    """
    %prog split casfile 1 10

    split the binary casfile by using CLCbio `sub_assembly` program, the two
    numbers are starting and ending index for the `reference`; useful to split
    one big assembly per contig
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options, casfile, start, end).
    # Returns None; one `sub_assembly` command is issued per index in the
    # inclusive range [start, end].
    p = OptionParser(split.__doc__)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    casfile, start, end = args
    start = int(start)
    end = int(end)

    split_cmd = "sub_assembly -a {casfile} -o sa.{i}.cas -s {i} " + \
        "-e sa.{i}.pairs.fasta -f sa.{i}.fragments.fasta -g sa.{i}.ref.fasta"

    for i in range(start, end + 1):
        cmd = split_cmd.format(casfile=casfile, i=i)
        sh(cmd, grid=opts.grid)
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-454` to remove duplicate reads.
    """
    # The docstring is reused as the usage text of the option parser.
    parser = OptionParser(deduplicate.__doc__)
    parser.add_option("--identity", default=.98, type="float",
            help="Sequence identity threshold [default: %default]")
    parser.add_option("--cpus", default=0, type="int",
            help="Number of CPUs to use, 0=unlimited [default: %default]")
    set_grid(parser)

    opts, positional = parser.parse_args(args)
    if len(positional) != 1:
        sys.exit(not parser.print_help())

    fastafile = positional[0]

    from jcvi.apps.command import CDPATH

    # Assemble the command line: identity cutoff, then thread count and the
    # input / output names (output is the input name plus `.cdhit`).
    identity_flag = " -c {0}".format(opts.identity)
    io_flags = " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    command = CDPATH("cd-hit-454") + identity_flag + io_flags

    sh(command, grid=opts.grid)
def trim(args):
    """
    %prog trim fastqfile

    Wraps `fastx_trimmer` to trim from begin or end of reads.
    """
    # The docstring is reused as the usage text of the option parser.
    parser = OptionParser(trim.__doc__)
    set_grid(parser)
    parser.add_option("-f", dest="first", default=0, type="int",
            help="First base to keep. Default is 1.")
    parser.add_option("-l", dest="last", default=0, type="int",
            help="Last base to keep. Default is entire read.")

    opts, positional = parser.parse_args(args)
    if len(positional) != 1:
        sys.exit(not parser.print_help())

    fastqfile = positional[0]
    stem = op.basename(fastqfile).split(".")[0]
    trimmed = stem + ".ntrimmed.fastq"

    # A flag is only emitted when its option is non-zero.
    flags = []
    if opts.first:
        flags.append("-f {0.first} ".format(opts))
    if opts.last:
        flags.append("-l {0.last} ".format(opts))
    command = "fastx_trimmer -Q33 " + "".join(flags)

    sh(command, grid=opts.grid, infile=fastqfile, outfile=trimmed)
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring is reused as the usage text of the option parser.
    parser = OptionParser(map.__doc__)
    parser.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    parser.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    parser.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    parser.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    set_params(parser)
    set_grid(parser)

    opts, positional = parser.parse_args(args)
    if len(positional) < 2:
        sys.exit(not parser.print_help())

    # Copy the license file from the home directory when it is not already
    # present in the working directory.
    license_file = "license.properties"
    if not op.exists(license_file):
        sh("cp ~/{0} .".format(license_file))

    ref = positional[0]
    assert op.exists(ref)
    fastqfiles = positional[1:]

    insert = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    if opts.short:
        cmd = "clc_ref_assemble_short"
    else:
        cmd = "clc_ref_assemble_long"

    # Default output name is `<first read prefix>.<reference prefix>.cas`.
    outfile = opts.outfile
    if not outfile:
        readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
        refprefix = op.basename(ref).split(".", 1)[0]
        outfile = "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)

    fastqs = " ".join(fastqfiles)
    if insert != 0:
        # Paired mode: bounds are the insert size plus or minus a quarter.
        assert len(fastqfiles) == 2
        stddev = insert / 4
        lower, upper = insert - stddev, insert + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations,
                lower, upper, fastqs)
    else:
        cmd += fastqs

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
def map(args):
    """
    %prog map reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring is reused as the usage text of the option parser.
    parser = OptionParser(map.__doc__)
    parser.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    parser.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    parser.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    parser.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    set_params(parser)
    set_grid(parser)

    opts, positional = parser.parse_args(args)
    if len(positional) < 2:
        sys.exit(not parser.print_help())

    # The license file is (re)written into the working directory on each run.
    write_file("license.properties", CLCLICENSE)

    ref = positional[0]
    assert op.exists(ref)
    fastqfiles = positional[1:]

    insert = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    if opts.short:
        cmd = "clc_ref_assemble_short"
    else:
        cmd = "clc_ref_assemble_long"

    # Default output name is `<first read prefix>.<reference prefix>.cas`.
    outfile = opts.outfile
    if not outfile:
        readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
        refprefix = op.basename(ref).split(".", 1)[0]
        outfile = "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus 16"
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)

    fastqs = " ".join(fastqfiles)
    if insert != 0:
        # Paired mode: bounds are the insert size plus or minus a quarter.
        assert len(fastqfiles) == 2
        stddev = insert / 4
        lower, upper = insert - stddev, insert + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations,
                lower, upper, fastqs)
    else:
        cmd += fastqs

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l 0.8 -s 0.98"

    sh(cmd, grid=opts.grid)
def sff(args):
    """
    %prog sff sffiles

    Convert reads formatted as 454 SFF file, and convert to CA frg file.
    Turn --nodedup on if another deduplication mechanism is used (e.g.
    CD-HIT-454). See assembly.sff.deduplicate().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus one or more SFF files).
    # Returns None; the `sffToCA` command is handed to sh().
    p = OptionParser(sff.__doc__)
    p.add_option("--prefix", dest="prefix", default=None,
            help="Output frg filename prefix")
    p.add_option("--nodedup", default=False, action="store_true",
            help="Do not remove duplicates [default: %default]")
    set_grid(p)
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    grid = opts.grid

    sffiles = args
    # Use the basename so that directory components (e.g. a leading `./` or
    # a dotted directory name) cannot leak into the plate name.
    plates = [op.basename(x).split(".")[0].split("_")[-1] for x in sffiles]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if len(plates) > 1:
        # Several plates: keep the first name with its last character
        # replaced by 'X'.
        plate = plates[0][:-1] + 'X'
    else:
        plate = "_".join(plates)

    if mated:
        # Floor division keeps the Kb label an integer.
        libname = "Titan{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "TitanFrags-" + plate

    if opts.prefix:
        libname = opts.prefix

    cmd = CAPATH("sffToCA")
    cmd += " -libraryname {0} -output {0} ".format(libname)
    cmd += " -clear 454 -trim chop "
    if mated:
        cmd += " -linker titanium -insertsize {0} {1} ".format(mean, sv)
    if opts.nodedup:
        cmd += " -nodedup "

    cmd += " ".join(sffiles)

    sh(cmd, grid=grid)
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus one FASTA file).
    # Returns None; the converter output is directed to `<libname>.frg`.
    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None,
            help="matepairs file")
    set_grid(p)
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    grid = opts.grid

    fastafile, = args
    plate = op.basename(fastafile).split(".")[0]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the Kb label an integer.
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    qualfile = make_qual(fastafile)
    if mated:
        # A user-supplied mate file wins; otherwise one is generated.
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = CAPATH("convert-fasta-to-v2.pl")
    cmd += " -l {0} -s {1} -q {2} ".\
            format(libname, fastafile, qualfile)
    if mated:
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, grid=grid, outfile=frgfile)
def trim(args):
    """
    %prog trim fastqfiles

    Use `quality_trim` to trim fastq files. If there are two fastqfiles
    inputted, it is assumed as pairs of fastqs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus one or two fastq files).
    # Returns None; the `quality_trim` command is handed to sh().
    p = OptionParser(trim.__doc__)

    # There are many more options from `quality_trim`, but most useful twos are
    # quality cutoff (-c) and length cutoff (-m)
    p.add_option("-c", "--cutoff", dest="cutoff", type="int", default=20,
            help="Set the minimum quality for a good nucleotide. " +\
                 "[default: %default]")
    p.add_option("-m", "--minlength", dest="minlength", type="int", default=30,
            help="Set the minimum length of output reads. " +\
                 "[default: %default]")
    p.add_option("--offset", dest="offset", type="int", default=64,
            help="Set the ascii offset value in fastq [default: %default]")
    p.add_option("--fasta", dest="fasta", default=False, action="store_true",
            help="Output fasta sequence? [default: fastq]")
    set_grid(p)

    opts, args = p.parse_args(args)

    largs = len(args)
    if largs not in (1, 2):
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    paired = (largs == 2)
    fastqfile1 = args[0]
    assert op.exists(fastqfile1)

    suffix = "fasta" if opts.fasta else "fastq"

    if paired:
        fastqfile2 = args[1]
        assert op.exists(fastqfile2)

    # Cut at the first dot of the file name only; dots in the directory part
    # (e.g. a leading `./`) must not truncate the prefix.
    prefix = op.join(op.dirname(fastqfile1),
                     op.basename(fastqfile1).split('.')[0])
    cmd = "quality_trim -c {0.cutoff} -m {0.minlength} -f {0.offset} ".format(opts)
    if paired:
        cmd += "-r -i {0} {1} ".format(fastqfile1, fastqfile2)
        cmd += "-p {0}.pairs.{1} ".format(prefix, suffix)
    else:
        cmd += "-r {0} ".format(fastqfile1)

    cmd += "-o {0}.fragments.{1}".format(prefix, suffix)
    sh(cmd, grid=opts.grid)
def split(args):
    """
    %prog split pairs.fastq

    Split shuffled pairs into `.1.fastq` and `.2.fastq`, using `sed`. Can work
    on gzipped file.

    <http://seqanswers.com/forums/showthread.php?t=13776>
    """
    from jcvi.apps.grid import Jobs

    # The docstring is reused as the usage text of the option parser.
    parser = OptionParser(split.__doc__)
    set_grid(parser)

    opts, positional = parser.parse_args(args)
    if len(positional) != 1:
        sys.exit(not parser.print_help())

    pairsfastq = positional[0]
    gz = pairsfastq.endswith(".gz")
    stem = pairsfastq.replace(".gz", "").rsplit(".", 1)[0]

    reader = "zcat" if gz else "cat"
    compress = " | gzip" if gz else ""
    gz_ext = ".gz" if gz else ""

    # Mate 1 records start at line 1 of every 8 lines, mate 2 at line 5.
    outputs = []
    commands = []
    for mate, firstline in ((1, 1), (2, 5)):
        target = "{0}.{1}.fastq".format(stem, mate) + gz_ext
        pipeline = reader + " {0} | sed -ne '{1}~8{{N;N;N;p}}'".format(
            pairsfastq, firstline)
        pipeline += compress + " > " + target
        outputs.append(target)
        commands.append(pipeline)

    if opts.grid:
        for pipeline in commands:
            sh(pipeline, grid=True)
    else:
        runner = Jobs(target=sh, args=[(pipeline, ) for pipeline in commands])
        runner.run()

    checkShuffleSizes(outputs[0], outputs[1], pairsfastq)
def sampe(args):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options, database, two read files).
    # Returns None. Output goes to `<read1 prefix>.sam`, or `.bam` with --bam;
    # nothing is run when that file already exists.
    p = OptionParser(sampe.__doc__)
    p.add_option("--bam", default=False, action="store_true",
            help="write to bam file [default: %default]")
    # type="int" so that a value given on the command line has the same type
    # as the default.
    p.add_option("--cpus", default=32, type="int",
            help="Number of cpus to use [default: %default]")
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, read1file, read2file = args
    check_index(dbfile, grid=grid)
    sai1file = check_aln(dbfile, read1file, grid=grid, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, grid=grid, cpus=opts.cpus)

    prefix = read1file.rsplit(".", 1)[0]
    samfile = (prefix + ".bam") if opts.bam else (prefix + ".sam")
    if op.exists(samfile):
        # The message used to name `bwa samse`, which is a different command.
        logging.error("`{0}` exists. `bwa sampe` already run.".format(samfile))
        return

    cmd = "bwa sampe {0} {1} {2} {3} {4} ".format(dbfile, sai1file, sai2file,
            read1file, read2file)
    cmd += "{0}".format(extra)
    if opts.bam:
        cmd += " | samtools view -bS -F 4 - "
    sh(cmd, grid=grid, outfile=samfile)
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bwa index`. Same interface, only adds grid submission.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options plus one database file).
    # Returns None; the work is delegated to check_index().
    p = OptionParser(index.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Non-zero exit status on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    dbfile, = args
    check_index(dbfile, grid=opts.grid)
def convert(args):
    """
    %prog convert in.fastq out.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding
    """
    # The docstring is reused as the usage text of the option parser.
    supported_qvs = ("illumina", "sanger")
    qv_menu = "|".join(supported_qvs)

    parser = OptionParser(convert.__doc__)
    parser.add_option("-Q", dest="infastq", default="illumina", choices=supported_qvs,
            help="input qv, one of {0} [default: %default]".format(qv_menu))
    parser.add_option("-q", dest="outfastq", default="sanger", choices=supported_qvs,
            help="output qv, one of {0} [default: %default]".format(qv_menu))
    set_grid(parser)

    opts, positional = parser.parse_args(args)
    if len(positional) != 2:
        sys.exit(not parser.print_help())

    infastq, outfastq = positional

    from jcvi.apps.command import EMBOSSPATH

    seqret = EMBOSSPATH("seqret")
    if not infastq.endswith(".gz"):
        # Plain input: seqret reads and writes the files directly.
        command = seqret + " fastq-{0}::{1} fastq-{2}::{3}".format(
            opts.infastq, infastq, opts.outfastq, outfastq)
    else:
        # Gzipped input: stream through zcat / seqret / gzip.
        stages = (
            "zcat {0}".format(infastq),
            seqret + " fastq-{0}::stdin fastq-{1}::stdout".format(
                opts.infastq, opts.outfastq),
            "gzip > {0}".format(outfastq),
        )
        command = " | ".join(stages)

    sh(command, grid=opts.grid)

    return outfastq
def info(args):
    """
    %prog info casfile <fastafile>

    Wraps around `assembly_info` and get the following block.

    General info:
    Read info:
    Coverage info:

    In particular, the read info will be reorganized so that it shows the
    percentage of unmapped, mapped, unique and multi-hit reads. When --coverage
    is used, the program expects a second fastafile to replace the contig IDs
    with real ones.

    RPKM = 10^9 x C / NL, which is really just simply C/N

    C = the number of mappable reads that felt onto the gene's exons
    N = total number of mappable reads in the experiment
    L = the sum of the exons in base pairs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options, casfile, optional fasta).
    # Returns None. The read summary is printed to stdout; with --coverage a
    # tab-separated `<prefix>.coverage` file is written as well.
    from jcvi.utils.cbook import percentage

    p = OptionParser(info.__doc__)
    p.add_option("--coverage", default=False, action="store_true",
            help="Generate coverage output, replacing IDs [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    casfile = args[0]
    pf = casfile.rsplit(".", 1)[0]

    if opts.coverage:
        assert len(args) == 2, "You need a fastafile when using --coverage"

    # Both handles are closed in the `finally` clause, whatever happens while
    # the report is generated or parsed.
    fw = None
    fp = None
    try:
        if opts.coverage:
            coveragefile = pf + ".coverage"
            fw = open(coveragefile, "w")

        infofile = pf + ".info"
        cmd = "assembly_info {0}".format(casfile)
        # An existing .info file is reused instead of rerunning the tool.
        if not op.exists(infofile):
            sh(cmd, outfile=infofile, grid=opts.grid)

        inreadblock = False
        incontigblock = False

        fp = open(infofile)
        row = fp.readline()
        while row:
            if row.startswith("Read info:"):
                inreadblock = True
            elif row.startswith("Contig info:"):
                incontigblock = True

            # Following looks like a hack, but to keep compatible between
            # CLC 3.20 and CLC 4.0 beta
            if inreadblock:
                atoms = row.split('s')
                last = atoms[-1].split()[0] if len(atoms) > 1 else "0"
                srow = row.strip()

                if srow.startswith("Reads"):
                    reads = int(last)
                if srow.startswith("Unmapped") or srow.startswith("Unassembled"):
                    unmapped = int(last)
                if srow.startswith("Mapped") or srow.startswith("Assembled"):
                    mapped = int(last)
                if srow.startswith("Multi"):
                    multihits = int(last)

                if row.startswith("Coverage info:"):
                    # Print the Read info: block
                    print("Read info:")
                    assert mapped + unmapped == reads

                    unique = mapped - multihits
                    # print("") emits a blank line on both Python 2 and 3.
                    print("")
                    print("Total reads: {0}".format(reads))
                    print("Unmapped reads: {0}".format(
                        percentage(unmapped, reads, False)))
                    print("Mapped reads: {0}".format(
                        percentage(mapped, reads, False)))
                    print("Unique reads: {0}".format(
                        percentage(unique, reads, False)))
                    print("Multi hit reads: {0}".format(
                        percentage(multihits, reads, False)))
                    print("")
                    inreadblock = False

            if incontigblock and opts.coverage:
                fastafile = args[1]
                s = Sizes(fastafile)
                # This inner loop consumes the rest of the file.
                while row:
                    atoms = row.split()
                    if len(atoms) == 4 and atoms[0][0] != "C":  # Contig
                        # Contig Sites Reads Coverage
                        contig, sites, nreads, _ = atoms
                        # Contig numbers in the report are 1-based.
                        contig = int(contig) - 1
                        size = s.sizes[contig]
                        contig = s.ctgs[contig]
                        assert size == int(sites)

                        # See formula above
                        rpkm = 1e9 * int(nreads) / (size * mapped)
                        fw.write("\t".join((contig, sites, nreads,
                                            "{0:.1f}".format(rpkm))) + "\n")

                    row = fp.readline()

            row = fp.readline()
    finally:
        for handle in (fp, fw):
            if handle is not None:
                handle.close()
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # The docstring is reused as the usage text of the option parser.
    version = "0.20"
    jar_name = "trimmomatic-{0}.jar".format(version)
    phdchoices = ("33", "64")

    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", jar_name),
            help="Path to trimmomatic [default: %default]")
    parser.add_option("--phred", default=None, choices=phdchoices,
            help="Phred score offset {0} [default: guess]".format(phdchoices))
    parser.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=10, type="int",
            help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=30, type="int",
            help="Minimum length after trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    set_grid(parser)

    opts, positional = parser.parse_args(args)
    if len(positional) not in (1, 2):
        sys.exit(not parser.print_help())

    jar = op.expanduser(opts.path)
    url = "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip".format(version)

    # Jar not found: download the archive, unpack it unless the unpacked
    # folder already exists, delete the archive and use the jar inside.
    if not op.exists(jar):
        archive = download(url)
        unzipped = "Trimmomatic-" + version
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, jar_name)

    assert op.exists(jar)

    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    # Without --phred the offset is guessed from the first fastq file.
    if opts.phred is not None:
        offset = int(opts.phred)
    else:
        offset = guessoffset([positional[0]])

    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(jar)

    gz_ext = "" if opts.nogz else ".gz"
    frags = ".frags.fastq" + gz_ext
    pairs = ".pairs.fastq" + gz_ext

    def stem(filename):
        # File name without directory, `.gz` and the last extension.
        return op.basename(filename).replace(".gz", "").rsplit(".", 1)[0]

    if len(positional) == 1:
        mode = ".TrimmomaticSE"
        fastqfile = positional[0]
        files = (fastqfile, stem(fastqfile) + frags)
    else:
        mode = ".TrimmomaticPE"
        fastqfile1, fastqfile2 = positional
        stem1 = stem(fastqfile1)
        stem2 = stem(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1 = stem1 + frags
            frags2 = stem2 + frags
        files = (fastqfile1, fastqfile2,
                 stem1 + pairs, frags1, stem2 + pairs, frags2)

    cmd += mode + phredflag
    cmd += " {0}".format(" ".join(files))

    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # Options are parsed from sys.argv (parse_args() is called without
    # arguments); the function returns None.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        sys.exit(p.print_help())

    # First positional is the database ("b"), second is the query ("a").
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        print >>sys.stderr, "Running jobs on JCVI grid"

    extra = opts.extra
    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    format = opts.format
    # Only the default "BLASTN-" format takes the query-splitting path at
    # the bottom of this function.
    blastline = (format == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for id in bids:
            # One job per database record, written to
            # outdir/<query prefix>.<record id>.<format>.
            bfasta = "/".join((bfasta_2bit, id))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, id, format))
            args.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, format, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in args]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): the pool is run whether or not `grid` is set --
        # confirm that lastz_2bit does no local work when grid is true.
        p = Pool(cpus)
        p.map(lastz_2bit, args)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, \
                lock, lastz_bin, extra, mask, grid) for k in xrange(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        # Local run: one lastz job per cpu, numbered from 1, all sharing the
        # same output handle and lock.
        args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                lock, lastz_bin, extra, mask) for k in xrange(cpus)]
        g = Jobs(target=lastz, args=args)
        g.run()
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # The docstring is reused as the usage text of the option parser.
    version = "0.20"
    jar_name = "trimmomatic-{0}.jar".format(version)
    phdchoices = ("33", "64")

    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", jar_name),
            help="Path to trimmomatic [default: %default]")
    parser.add_option("--phred", default=None, choices=phdchoices,
            help="Phred score offset {0} [default: guess]".format(phdchoices))
    parser.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=10, type="int",
            help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=30, type="int",
            help="Minimum length after trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    set_grid(parser)

    opts, positional = parser.parse_args(args)
    if len(positional) not in (1, 2):
        sys.exit(not parser.print_help())

    jar = op.expanduser(opts.path)
    url = "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip".format(version)

    # Jar not found: download the archive, unpack it unless the unpacked
    # folder already exists, delete the archive and use the jar inside.
    if not op.exists(jar):
        archive = download(url)
        unzipped = "Trimmomatic-" + version
        if not op.exists(unzipped):
            sh("unzip " + archive)
        os.remove(archive)
        jar = op.join(unzipped, jar_name)

    assert op.exists(jar)

    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    # Without --phred the offset is guessed from the first fastq file.
    if opts.phred is not None:
        offset = int(opts.phred)
    else:
        offset = guessoffset([positional[0]])

    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(jar)

    gz_ext = "" if opts.nogz else ".gz"
    frags = ".frags.fastq" + gz_ext
    pairs = ".pairs.fastq" + gz_ext

    def stem(filename):
        # File name without directory, `.gz` and the last extension.
        return op.basename(filename).replace(".gz", "").rsplit(".", 1)[0]

    if len(positional) == 1:
        mode = ".TrimmomaticSE"
        fastqfile = positional[0]
        files = (fastqfile, stem(fastqfile) + frags)
    else:
        mode = ".TrimmomaticPE"
        fastqfile1, fastqfile2 = positional
        stem1 = stem(fastqfile1)
        stem2 = stem(fastqfile2)
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        else:
            frags1 = stem1 + frags
            frags2 = stem2 + frags
        files = (fastqfile1, fastqfile2,
                 stem1 + pairs, frags1, stem2 + pairs, frags2)

    cmd += mode + phredflag
    cmd += " {0}".format(" ".join(files))

    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)
def info(args):
    """
    %prog info casfile <fastafile>

    Wraps around `assembly_info` and get the following block.

    General info:
    Read info:
    Coverage info:

    In particular, the read info will be reorganized so that it shows the
    percentage of unmapped, mapped, unique and multi-hit reads. When --coverage
    is used, the program expects a second fastafile to replace the contig IDs
    with real ones.

    RPKM = 10^9 x C / NL, which is really just simply C/N

    C = the number of mappable reads that felt onto the gene's exons
    N = total number of mappable reads in the experiment
    L = the sum of the exons in base pairs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens (options, casfile, optional fasta).
    # Returns None. The read summary is printed to stdout; with --coverage a
    # tab-separated `<prefix>.coverage` file is written as well.
    from jcvi.utils.cbook import percentage

    p = OptionParser(info.__doc__)
    p.add_option(
        "--coverage", default=False, action="store_true",
        help="Generate coverage output, replacing IDs [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    casfile = args[0]
    pf = casfile.rsplit(".", 1)[0]

    if opts.coverage:
        assert len(args) == 2, "You need a fastafile when using --coverage"

    # Both handles are closed in the `finally` clause, whatever happens while
    # the report is generated or parsed.
    fw = None
    fp = None
    try:
        if opts.coverage:
            coveragefile = pf + ".coverage"
            fw = open(coveragefile, "w")

        infofile = pf + ".info"
        cmd = "assembly_info {0}".format(casfile)
        # An existing .info file is reused instead of rerunning the tool.
        if not op.exists(infofile):
            sh(cmd, outfile=infofile, grid=opts.grid)

        inreadblock = False
        incontigblock = False

        fp = open(infofile)
        row = fp.readline()
        while row:
            if row.startswith("Read info:"):
                inreadblock = True
            elif row.startswith("Contig info:"):
                incontigblock = True

            # Following looks like a hack, but to keep compatible between
            # CLC 3.20 and CLC 4.0 beta
            if inreadblock:
                atoms = row.split('s')
                last = atoms[-1].split()[0] if len(atoms) > 1 else "0"
                srow = row.strip()

                if srow.startswith("Reads"):
                    reads = int(last)
                if srow.startswith("Unmapped") or srow.startswith("Unassembled"):
                    unmapped = int(last)
                if srow.startswith("Mapped") or srow.startswith("Assembled"):
                    mapped = int(last)
                if srow.startswith("Multi"):
                    multihits = int(last)

                if row.startswith("Coverage info:"):
                    # Print the Read info: block
                    print("Read info:")
                    assert mapped + unmapped == reads

                    unique = mapped - multihits
                    # print("") emits a blank line on both Python 2 and 3.
                    print("")
                    print("Total reads: {0}".format(reads))
                    print("Unmapped reads: {0}".format(
                        percentage(unmapped, reads, False)))
                    print("Mapped reads: {0}".format(
                        percentage(mapped, reads, False)))
                    print("Unique reads: {0}".format(
                        percentage(unique, reads, False)))
                    print("Multi hit reads: {0}".format(
                        percentage(multihits, reads, False)))
                    print("")
                    inreadblock = False

            if incontigblock and opts.coverage:
                fastafile = args[1]
                s = Sizes(fastafile)
                # This inner loop consumes the rest of the file.
                while row:
                    atoms = row.split()
                    if len(atoms) == 4 and atoms[0][0] != "C":  # Contig
                        # Contig Sites Reads Coverage
                        contig, sites, nreads, _ = atoms
                        # Contig numbers in the report are 1-based.
                        contig = int(contig) - 1
                        size = s.sizes[contig]
                        contig = s.ctgs[contig]
                        assert size == int(sites)

                        # See formula above
                        rpkm = 1e9 * int(nreads) / (size * mapped)
                        fw.write("\t".join((contig, sites, nreads,
                                            "{0:.1f}".format(rpkm))) + "\n")

                    row = fp.readline()

            row = fp.readline()
    finally:
        for handle in (fp, fw):
            if handle is not None:
                handle.close()
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # Options are parsed from sys.argv (parse_args() is called without
    # arguments); the function returns None.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option(
        "--mask", dest="mask", default=False, action="store_true",
        help="treat lower-case letters as mask info [default: %default]")
    p.add_option(
        "--similar", default=False, action="store_true",
        help="Use options tuned for close comparison [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        sys.exit(p.print_help())

    # First positional is the database ("b"), second is the query ("a").
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        print >> sys.stderr, "Running jobs on JCVI grid"

    extra = opts.extra
    # --similar appends the `similarOptions` value to the extra options.
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith(
        "lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    format = opts.format
    # Only the default "BLASTN-" format takes the query-splitting path at
    # the bottom of this function.
    blastline = (format == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for id in bids:
            # One job per database record, written to
            # outdir/<query prefix>.<record id>.<format>.
            bfasta = "/".join((bfasta_2bit, id))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, id, format))
            args.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, format, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in args]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        # NOTE(review): the pool is run whether or not `grid` is set --
        # confirm that lastz_2bit does no local work when grid is true.
        p = Pool(cpus)
        p.map(lastz_2bit, args)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, \
                lock, lastz_bin, extra, mask, grid) for k in xrange(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        # Local run: one lastz job per cpu, numbered from 1, all sharing the
        # same output handle and lock.
        args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                lock, lastz_bin, extra, mask) for k in xrange(cpus)]
        g = Jobs(target=lastz, args=args)
        g.run()
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: list of command-line tokens. Exactly one of --input_file,
    # --input_folder or --input_file_list is expected; when several are
    # given the list file wins, then the folder, then the single file.
    # Returns None; one `cap3` command is issued per existing input file,
    # with its output directed to `<file>.<prefix>.log`.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
            "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
            help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
            help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
            help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
            "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n" +\
            "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n" +\
            "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n" +\
            "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
            help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    set_grid(p)
    set_params(p)

    opts, args = p.parse_args(args)

    # The three options are ints with defaults, so compare them directly: a
    # truthiness guard would let an explicit 0 slip through the range check.
    # Invalid values end the program with a non-zero status.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit(1)
    elif opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit(1)
    elif opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit(1)

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit(1)
        with open(opts.input_file_list, 'r') as f:
            # Blank lines are not file names; drop them.
            file_list = [x for x in f.read().splitlines() if x.strip()]
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit(1)
        folder = opts.input_folder.rstrip('/')
        # Keep only FASTA-looking names (case-insensitive) and prefix each
        # with the folder.
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if len(file_list) == 0:
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fastafile in file_list:
        if not op.isfile(fastafile):
            # Missing inputs are reported and skipped, not fatal.
            logging.warning("Input file {0} does not exist".format(fastafile))
        else:
            cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(fastafile, opts.max_gap_len, \
                    opts.ovl_pct_id, opts.ovl_sim_score, opts.prefix)
            if opts.extra:
                cmd += " {0}".format(opts.extra)
            logfile = "{0}.{1}.log".format(fastafile, opts.prefix)

            sh(cmd, outfile=logfile, grid=opts.grid)