def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(merge.__doc__)
    p.set_outfile(outfile="merged_results.delta")
    opts, args = p.parse_args(args)

    # Need the reference, the query and at least one delta file
    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, query = args[:2]
    deltafiles = args[2:]
    outfile = opts.outfile

    ref = get_abs_path(ref)
    query = get_abs_path(query)

    # Write the two header lines. `file.write` is used instead of the
    # Python-2-only `print >> fw` form, which raises TypeError on Python 3.
    fw = must_open(outfile, "w")
    try:
        fw.write(" ".join((ref, query)) + "\n")
        fw.write("NUCMER\n")
    finally:
        # Close the handle even if a write fails
        fw.close()

    # Append every input delta minus its own two header lines
    for d in deltafiles:
        cmd = "awk 'NR > 2 {{print $0}}' {0}".format(d)
        sh(cmd, outfile=outfile, append=True)
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(merge.__doc__)
    parser.set_outfile(outfile="merged_results.delta")
    opts, args = parser.parse_args(args)

    if len(args) < 3:
        sys.exit(not parser.print_help())

    merged = opts.outfile
    ref_path = get_abs_path(args[0])
    query_path = get_abs_path(args[1])

    # Header: absolute reference/query paths, then the NUCMER marker line
    header = must_open(merged, "w")
    print(" ".join((ref_path, query_path)), file=header)
    print("NUCMER", file=header)
    header.close()

    # Each delta contributes everything after its own two header lines
    awk_template = "awk 'NR > 2 {{print $0}}' {0}"
    for delta in args[2:]:
        sh(awk_template.format(delta), outfile=merged, append=True)
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    from jcvi.utils.cbook import human_size

    # The docstring doubles as the OptionParser usage text. Link names are
    # read from stdin, one per line; `args` is not consulted.
    p = OptionParser(size.__doc__)
    fp = sys.stdin

    results = []
    for link_name in fp:
        link_name = link_name.strip()
        # Only symlinks are reported; anything else on stdin is skipped
        if not op.islink(link_name):
            continue

        source = get_abs_path(link_name)
        link_name = op.basename(link_name)
        filesize = op.getsize(source)
        results.append((filesize, link_name))

    # sort by descending file size
    for filesize, link_name in sorted(results, reverse=True):
        filesize = human_size(filesize, a_kilobyte_is_1024_bytes=True)
        # `sys.stderr.write` replaces the Python-2-only `print >>sys.stderr`
        sys.stderr.write("%10s\t%s\n" % (filesize, link_name))
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    from jcvi.apps.base import mkdir

    # The docstring doubles as the OptionParser usage text.
    p = OptionParser(link.__doc__)
    p.add_option("--dir", help="Place links in a subdirectory [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    meta, = args
    d = opts.dir
    if d:
        mkdir(d)

    # `with` guarantees the metafile is closed, also when a row is malformed
    with open(meta) as fp:
        for row in fp:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing on the two-column unpack below
            if not row.strip():
                continue
            source, target = row.split()
            source = get_abs_path(source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
def run_megablast(infile=None, outfile=None, db=None, wordsize=None,
                  pctid=98, hitlen=100, best=None, evalue=0.01,
                  task="megablast", cpus=16):
    """Run `blastn` against `db` and optionally filter the tabular output.

    The database is formatted first via `run_formatdb`. When both `pctid`
    and `hitlen` are truthy the output is passed through `run_blast_filter`
    and the filtered file replaces `outfile`.
    """
    assert db, "Need to specify database fasta file."

    db = get_abs_path(db)
    # Use the `.00.nin` name when that file exists, otherwise plain `.nin`
    volume_index = db + ".00.nin"
    nin = volume_index if op.exists(volume_index) else (db + ".nin")
    run_formatdb(infile=db, outfile=nin)

    # Assemble the command from its fragments; optional flags are only
    # added when the corresponding argument is truthy
    fragments = [
        "blastn",
        " -query {0} -db {1} -out {2}".format(infile, db, outfile),
        " -evalue {0} -outfmt 6 -num_threads {1}".format(evalue, cpus),
        " -task {0}".format(task),
    ]
    if wordsize:
        fragments.append(" -word_size {0}".format(wordsize))
    if pctid:
        fragments.append(" -perc_identity {0}".format(pctid))
    if best:
        fragments.append(" -max_target_seqs {0}".format(best))
    sh("".join(fragments))

    if not (pctid and hitlen):
        return

    # Filter into a side file, then move it over the raw output
    filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=outfile, outfile=filtered,
                     pctid=pctid, hitlen=hitlen)
    shutil.move(filtered, outfile)
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically
    pair and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    from jcvi.apps.bowtie import align

    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(contamination.__doc__)
    parser.add_option("--mapped", default=False, action="store_true",
                      help="Retain contaminated reads instead [default: %default]")
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation(mateorientation="+-")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)
    # Keep either the reads that hit the contaminant or those that do not
    if opts.mapped:
        tag = "--mapped"
    else:
        tag = "--unmapped"

    cutoff_opt = "--cutoff={0}".format(opts.cutoff)
    for reads, _prefix in iter_project(folder, 2):
        align_opts = [ecoli] + reads + [tag]
        # --null: per the bowtie wrapper's help, no SAM/BAM output is written
        align_opts += [cutoff_opt, "--null"]
        if opts.mateorientation:
            align_opts.append(
                "--mateorientation={0}".format(opts.mateorientation))
        align(align_opts)
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(pairs.__doc__)
    parser.set_firstN()
    parser.set_mates()
    parser.set_aligner()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    start_dir = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps
    # `align` is only bound for the two aligners below
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    for reads, prefix in iter_project(folder):
        # Subsample the first N records of each mate into the work folder
        samplefq = []
        for mate in (1, 2):
            sample = op.join(work, prefix + "_{0}.first.fastq".format(mate))
            samplefq.append(sample)
            first([str(opts.firstN), reads[mate - 1], "-o", sample])

        # Align inside the work folder, then return to where we started
        os.chdir(work)
        align_args = [ref]
        align_args.extend(op.basename(fq) for fq in samplefq)
        outfile, logfile = align(align_args)
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(start_dir)

        median = stats.median
        if median > 1000:
            tag = "MP"
        else:
            tag = "PE"

        # Round the median up to two leading digits, zero-padded to length
        digits = str(median)
        lead, rest = digits[:2], digits[2:]
        if rest and int(rest) != 0:
            lead = str(int(lead) + 1)
        lib = "{0}-{1}".format(tag, lead + "0" * len(rest))

        clean_prefix = prefix.replace("-", "")
        for idx, fq in enumerate(reads):
            if fq.endswith(".gz"):
                suffix = "fastq.gz"
            else:
                suffix = "fastq"
            target = "{0}-{1}.{2}.{3}".format(lib, clean_prefix, idx + 1, suffix)
            messages.append("\t".join(str(x) for x in (fq, target)))

    write_file("f.meta", "\n".join(messages), tee=True)
def size(args):
    """
    find folder -type l | %prog size

    Get the size for all the paths that are pointed by the links
    """
    from jcvi.utils.cbook import human_size

    # The docstring doubles as the OptionParser usage text. Link names are
    # read from stdin; `args` is not consulted.
    p = OptionParser(size.__doc__)

    sized_links = []
    for raw in sys.stdin:
        candidate = raw.strip()
        if op.islink(candidate):
            target = get_abs_path(candidate)
            sized_links.append((op.getsize(target), op.basename(candidate)))

    # Largest first; ties fall back to reverse order of the link name
    sized_links.sort(reverse=True)
    for nbytes, name in sized_links:
        pretty = human_size(nbytes, a_kilobyte_is_1024_bytes=True)
        print("%10s\t%s" % (pretty, name), file=sys.stderr)
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically
    pair and filtered against Ecoli.fasta to remove contaminants using BOWTIE2.
    """
    from jcvi.apps.bowtie import align

    # The docstring doubles as the OptionParser usage text.
    optparser = OptionParser(contamination.__doc__)
    optparser.add_option(
        "--mapped",
        default=False,
        action="store_true",
        help="Retain contaminated reads instead [default: %default]",
    )
    optparser.set_cutoff(cutoff=800)
    optparser.set_mateorientation(mateorientation="+-")
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)
    tag = "--unmapped" if not opts.mapped else "--mapped"

    # One bowtie run per group of FASTQ files yielded for the folder
    for fastqs, _ in iter_project(folder, 2):
        align_opts = [ecoli] + fastqs + [tag]
        align_opts.append("--cutoff={0}".format(opts.cutoff))
        align_opts.append("--null")
        orientation = opts.mateorientation
        if orientation:
            align_opts.append("--mateorientation={0}".format(orientation))
        align(align_opts)
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    from jcvi.apps.base import mkdir

    # The docstring doubles as the OptionParser usage text.
    p = OptionParser(link.__doc__)
    p.add_option("--dir", help="Place links in a subdirectory")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (meta, ) = args
    d = opts.dir
    if d:
        mkdir(d)

    # Sources listed in the metafile are resolved relative to the folder
    # that holds the metafile itself
    cwd = op.dirname(get_abs_path(meta))

    # `with` guarantees the metafile is closed, also when a row is malformed
    with open(meta) as fp:
        for row in fp:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing on the two-column unpack below
            if not row.strip():
                continue
            source, target = row.split()
            source = op.join(cwd, source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
def update_abs_path(self):
    """Replace each record value that names an existing path with the
    result of `get_abs_path`, logging every rewrite at debug level."""
    for record in self:
        current = record.value
        # Empty values and values that are not existing paths are left alone
        if not current or not op.exists(current):
            continue
        resolved = get_abs_path(current)
        logging.debug("{0}={1} => {2}".format(record.tag, current, resolved))
        record.value = resolved
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(pairs.__doc__)
    parser.set_firstN()
    parser.set_mates()
    parser.set_aligner()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    origin = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # CLC has its own aligner and pairs parser; the others share the SAM one
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps
        if aligner == "bowtie":
            from jcvi.apps.bowtie import align
        elif aligner == "bwa":
            from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    for reads, prefix in iter_project(folder, 2):
        # Subsample the first N records into a single file in the work folder
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + reads + ["-o", samplefq])

        # Align inside the work folder, then return to where we started
        os.chdir(work)
        outfile, logfile = align([ref, op.basename(samplefq)])
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(origin)

        median = stats.median
        if median > 1000:
            tag = "MP"
        else:
            tag = "PE"

        # Round the median up to two leading digits, zero-padded to length
        digits = str(median)
        lead, rest = digits[:2], digits[2:]
        if rest and int(rest) != 0:
            lead = str(int(lead) + 1)
        lib = "{0}-{1}".format(tag, lead + "0" * len(rest))

        clean_prefix = prefix.replace("-", "")
        for idx, fq in enumerate(reads):
            if fq.endswith(".gz"):
                suffix = "fastq.gz"
            else:
                suffix = "fastq"
            target = "{0}-{1}.{2}.{3}".format(lib, clean_prefix, idx + 1, suffix)
            messages.append("\t".join(str(x) for x in (fq, target)))

    write_file("f.meta", "\n".join(messages), tee=True)
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for three modes of BWA - mem (default), aln, bwasw (long reads).
    """
    # The docstring doubles as the OptionParser usage text.
    valid_modes = ("bwasw", "aln", "mem")
    parser = OptionParser(align.__doc__)
    parser.add_option("--mode", default="mem", choices=valid_modes,
                      help="BWA mode [default: %default]")
    parser.add_option("--readtype", choices=("pacbio", "pbread"),
                      help="Read type in bwa-mem")
    parser.set_cutoff(cutoff=800)
    parser.set_sam_options()

    opts, args = parser.parse_args(args)
    mode = opts.mode
    nargs = len(args)

    if nargs not in (2, 3):
        sys.exit(not parser.print_help())

    # Choose the command builder from the number of read files and the mode;
    # `mem` is the fallback in both layouts
    tag = "bwa-{0}: ".format(mode)
    runner = mem
    if nargs == 2:
        tag += "Single-end alignment"
        if mode == "bwasw":
            runner = bwasw
        elif mode == "aln":
            runner = samse
    else:
        assert mode != "bwasw", "Cannot use --bwasw with paired-end mode"
        tag += "Paired-end alignment"
        if mode == "aln":
            runner = sampe

    logging.debug(tag)
    args[0] = get_abs_path(args[0])
    cmd, samfile = runner(args, opts)
    if cmd:
        cmd = output_bam(cmd, samfile)

    sh(cmd)

    if opts.unmapped:
        # Split out the unmapped reads, then discard the full alignment file
        mopts = [samfile, "--unmapped"]
        if not opts.bam:
            mopts.append("--sam")
        mapped(mopts)
        FileShredder([samfile])

    return samfile, None
def make_link(self, firstN=0):
    """Populate `self.link` inside the `self.genome` folder.

    With a positive `firstN` the first N records of `self.fastq` are written
    to `self.link` via `first`; otherwise `self.link` becomes a symlink to
    the absolute path of `self.fastq`, replacing any existing symlink.
    """
    mkdir(self.genome)

    if firstN > 0:
        outfile_opt = "--outfile={0}".format(self.link)
        first([str(firstN), self.fastq, outfile_opt])
    else:
        destination = self.link
        if op.islink(destination):
            os.unlink(destination)
        os.symlink(get_abs_path(self.fastq), destination)
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    # The docstring doubles as the OptionParser usage text.
    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads?",
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # `not p.print_help()` matches the other commands in this file;
        # passing print_help()'s result directly would exit with status 0
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug(
            "[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size marks the library as mated
    mated = size != 0
    # Library name: basename up to the first dot, minus "_1_sequence"
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (
            1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the given phred offset, else guess it from the first file
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def check_index(dbfile):
    """Run `bowtie2-build` on `dbfile` when `need_update` asks for it.

    Returns the absolute path of `dbfile`.
    """
    fasta = get_abs_path(dbfile)
    index_marker = fasta + ".1.bt2"

    if not need_update(fasta, index_marker):
        logging.error(
            "`{0}` exists. `bowtie2-build` already run.".format(index_marker))
        return fasta

    sh("bowtie2-build {0} {0}".format(fasta))
    return fasta
def check_index(dbfile):
    """Run `bwa index` on `dbfile` unless its `.sa` file already exists.

    Returns the absolute path of `dbfile`.
    """
    fasta = get_abs_path(dbfile)
    suffix_array = fasta + ".sa"

    if op.exists(suffix_array):
        logging.error(
            "`{0}` exists. `bwa index` already run.".format(suffix_array))
        return fasta

    sh("bwa index {0}".format(fasta))
    return fasta
def check_index(dbfile):
    """Run `bwa index` on `dbfile` when `need_update` asks for it.

    Returns the absolute path of `dbfile`.
    """
    fasta = get_abs_path(dbfile)
    suffix_array = fasta + ".sa"

    if not need_update(fasta, suffix_array):
        logging.error(
            "`{0}` exists. `bwa index` already run.".format(suffix_array))
        return fasta

    sh("bwa index {0}".format(fasta))
    return fasta
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    # The docstring doubles as the OptionParser usage text.
    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads? [default: %default]"
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # `not p.print_help()` matches the other commands in this file;
        # passing print_help()'s result directly would exit with status 0
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size marks the library as mated
    mated = size != 0
    # Library name: basename up to the first dot, minus "_1_sequence"
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the given phred offset, else guess it from the first file
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def run_vecscreen(infile=None, outfile=None, db="UniVec_Core",
                  pctid=None, hitlen=None):
    """
    BLASTN parameters reference:
    http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    """
    # `pctid` and `hitlen` are accepted but not used by this function.
    db = get_abs_path(db)
    run_formatdb(infile=db, outfile=db + ".nin")

    # Fixed parameter set; only the query, database and output vary
    command = "".join((
        "blastn",
        " -task blastn",
        " -query {0} -db {1} -out {2}".format(infile, db, outfile),
        " -penalty -5 -gapopen 4 -gapextend 4 -dust yes -soft_masking true",
        " -searchsp 1750000000000 -evalue 0.01 -outfmt 6 -num_threads 8",
    ))
    sh(command)
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bowtie2-build`. Same interface.
    """
    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(index.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    # Exactly one positional argument: the database FASTA
    database = get_abs_path(args[0])
    check_index(database)
def check_index(dbfile):
    """Run `gmap_build` for `dbfile` when `need_update` asks for it.

    Returns a `(dbdir, dbname)` tuple: the folder and database name passed
    to `gmap_build` through `-D` and `-d`.
    """
    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."
    dbname = filename.rsplit(".", 1)[0]
    # A FASTA without an extension would share its name with the database
    # folder, so the database gets a `.db` suffix instead
    if dbname == filename:
        dbname = filename + ".db"
    # Build the marker path from the FINAL dbname so it matches the `-d`
    # value below; computing it before the rename made the check always miss
    # for extension-less inputs
    safile = op.join(dbdir, "{0}/{0}.genomecomp".format(dbname))

    if need_update(dbfile, safile):
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    return dbdir, dbname
def check_index(dbfile):
    """Run `gmap_build` for `dbfile` when `need_update` asks for it.

    Returns a `(dbdir, dbname)` tuple: the folder and database name passed
    to `gmap_build` through `-D` and `-d`.
    """
    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."
    dbname = filename.rsplit(".", 1)[0]
    # A FASTA without an extension would share its name with the database
    # folder, so the database gets a `.db` suffix instead
    if dbname == filename:
        dbname = filename + ".db"
    # Build the marker path from the FINAL dbname so it matches the `-d`
    # value below; computing it before the rename made the check always miss
    # for extension-less inputs
    safile = op.join(dbdir, "{0}/{0}.salcpchilddc".format(dbname))

    if need_update(dbfile, safile):
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    return dbdir, dbname
def __init__(self, filename, select=None):
    """Load contig sizes from a `.sizes` file or a FASTA file.

    For a non-`.sizes` input a sibling `<filename>.sizes` file is generated
    when `need_update` asks for it, using `faSize` when it is on the PATH
    and the `Fasta` reader otherwise.

    filename -- path to a `.sizes` file or a FASTA file; must exist
    select   -- optional positive threshold; only entries whose second
                field is >= `select` are kept
    """
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                from jcvi.formats.fasta import Fasta

                f = Fasta(filename)
                # `with` closes the output even if iteration fails midway
                with open(sizesname, "w") as fw:
                    for k, size in f.itersizes_ordered():
                        print("\t".join((k, str(size))), file=fw)

        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    if sizes:
        ctgs, sizes = zip(*sizes)
    else:
        # Nothing left (empty file or `select` removed everything):
        # `zip(*[])` cannot be unpacked into two names
        ctgs, sizes = (), ()
    self.sizes = sizes
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def __init__(self, filename, select=None):
    """Load contig sizes from a `.sizes` file or a FASTA file.

    For a non-`.sizes` input a sibling `<filename>.sizes` file is generated
    when `need_update` asks for it, using `faSize` when it is on the PATH
    and the `Fasta` reader otherwise.

    filename -- path to a `.sizes` file or a FASTA file; must exist
    select   -- optional positive threshold; only entries whose second
                field is >= `select` are kept
    """
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                from jcvi.formats.fasta import Fasta

                f = Fasta(filename)
                # `with` closes the output even if iteration fails midway;
                # `write` replaces the Python-2-only `print >> fw` form
                with open(sizesname, "w") as fw:
                    for k, size in f.itersizes_ordered():
                        fw.write("\t".join((k, str(size))) + "\n")

        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    if sizes:
        ctgs, sizes = zip(*sizes)
    else:
        # Nothing left (empty file or `select` removed everything):
        # `zip(*[])` cannot be unpacked into two names
        ctgs, sizes = (), ()
    self.sizes = sizes
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.grid import MakeManager

    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(merge.__doc__)
    parser.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    merged_bams = args[0]
    mkdir(merged_bams)

    # Collect every BAM from the input folders, skipping "nsorted" ones
    bams = []
    for folder in args[1:]:
        bams.extend(glob(op.join(folder, "*.bam")))
    bams = [bam for bam in bams if "nsorted" not in bam]
    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep

    def prefix_of(path):
        # Group key: the basename up to the first separator
        return op.basename(path).split(sep)[0]

    # groupby only merges adjacent items, hence the sort on the same key
    bams.sort(key=prefix_of)
    mm = MakeManager()
    for _prefix, group in groupby(bams, key=prefix_of):
        members = sorted(group)
        target = op.join(merged_bams, op.basename(members[0]))
        if len(members) == 1:
            # A single file needs no merge, just a symlink
            source = get_abs_path(" ".join(members))
            mm.add("", target, "ln -s {0} {1}".format(source, target))
        else:
            source = " ".join(members)
            cmd = "samtools merge -@ 8 {0} {1}".format(target, source)
            mm.add(members, target, cmd, remove=True)
    mm.write()
def cp(args):
    """
    find folder -type l | %prog cp

    Copy all the softlinks to the current folder, using absolute paths
    """
    # The docstring doubles as the OptionParser usage text. Paths are read
    # from stdin; `args` is not consulted.
    p = OptionParser(cp.__doc__)

    for raw in sys.stdin:
        path = raw.strip()
        if not op.exists(path):
            continue

        source = get_abs_path(path)
        local_name = op.basename(path)
        # Never overwrite something already present in the current folder
        already_there = op.exists(local_name)
        if not already_there:
            os.symlink(source, local_name)
        logging.debug(" => ".join((source, local_name)))
def touch(args):
    """
    find . -type l | %prog touch

    Linux commands `touch` wouldn't modify mtime for links, this script can.
    Use find to pipe in all the symlinks.
    """
    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(touch.__doc__)
    opts, args = parser.parse_args(args)

    for raw in sys.stdin:
        candidate = raw.strip()
        # Only symlinks whose path still exists are re-created
        if not (op.islink(candidate) and op.exists(candidate)):
            continue
        lnsf(get_abs_path(candidate), candidate)
def soapX(args):
    """
    %prog soapX folder tag [*.fastq]

    Run SOAP on a folder of paired reads and apply tag before assembly.
    Optional *.fastq in the argument list will be symlinked in each folder and
    co-assembled.
    """
    # The docstring doubles as the OptionParser usage text.
    parser = OptionParser(soapX.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    folder = args[0]
    # The tag argument is a comma-separated list
    tags = args[1].split(",")
    # Any further arguments are extra files, resolved to absolute paths
    extra = [get_abs_path(path) for path in args[2:]]

    for trio, prefix in iter_project(folder, n=3):
        soap_trios(trio, prefix, tags, extra)
def check_index(dbfile, supercat=False, go=True):
    """Run `gmap_build` for `dbfile` when `need_update` asks for it.

    dbfile   -- FASTA file to index
    supercat -- when true, index `<prefix>.supercat.fasta` instead, first
                generating it with `tGBS-Generate_Pseudo_Genome.pl` when
                `go` is set and `need_update` asks for it
    go       -- when false, nothing is executed; only the names are returned

    Returns a `(dbdir, dbname)` tuple: the folder and database name passed
    to `gmap_build` through `-D` and `-d`.
    """
    if supercat:
        updated = False
        pf = dbfile.rsplit(".", 1)[0]
        supercatfile = pf + ".supercat"
        coordsfile = supercatfile + ".coords"
        if go and need_update(dbfile, supercatfile):
            cmd = "tGBS-Generate_Pseudo_Genome.pl"
            cmd += " -f {0} -o {1}".format(dbfile, supercatfile)
            sh(cmd)
            # Rename .coords file since gmap_build will overwrite it
            coordsbak = backup(coordsfile)
            updated = True
        dbfile = supercatfile + ".fasta"

    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."
    dbname = filename.rsplit(".", 1)[0]
    # A FASTA without an extension would share its name with the database
    # folder, so the database gets a `.db` suffix instead
    if dbname == filename:
        dbname = filename + ".db"
    # Build the marker path from the FINAL dbname so it matches the `-d`
    # value below; computing it before the rename made the check always miss
    # for extension-less inputs
    safile = op.join(dbdir, "{0}/{0}.genomecomp".format(dbname))

    if not go:
        return dbdir, dbname

    if need_update(dbfile, safile):
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    # Restore the .coords file that was set aside before gmap_build ran
    if go and supercat and updated:
        sh("mv {0} {1}".format(coordsbak, coordsfile))

    return dbdir, dbname
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline.
    """
    # The docstring doubles as the OptionParser usage text.
    # Every enzyme must be followed by "|" before a line continuation:
    # a missing separator after "PstI-MspI" used to fuse it with "PstI-TaqI"
    # into one bogus choice and made both real ones unselectable.
    valid_enzymes = "ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|" \
                    "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|" \
                    "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|" \
                    "PstI-TaqI|SalI-MspI|SbfI-MspI".split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme", default="ApeKI", choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    reference = get_abs_path(reference)
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm",
               "tbt", "mergedTBT", "hapmap", "hapmap/raw",
               "hapmap/mergedSNPs", "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)

    # The alignment step runs inside mergedTagCounts, hence the cd pair
    runsh.append("cd mergedTagCounts")
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".\
        format(opts.aligner, opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh))
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    from jcvi.formats.base import split

    # The docstring doubles as the OptionParser usage text.
    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # Message now states the bound that is actually enforced (N == 1 passes)
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        # Write the per-part control file inside its own folder
        os.chdir(bn)
        c.write_file("maker_opts.ctl")
        os.chdir(cwd)

    jobs = "jobs"
    # `with` closes the jobs file even if the write fails
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
        else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    if engine == "PBS":
        return

    # qsub script
    # "\\$" is the same text as the former "\$" (a literal backslash followed
    # by "$") without relying on an invalid escape sequence
    outfile = "maker.\\$TASK_ID.out"
    p = GridProcess(runfile, outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of
    args.
    """
    from jcvi.formats.fastq import guessoffset

    # The docstring doubles as the OptionParser usage text.
    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true",
                 help="Keep the input read order [default: %default]")
    p.add_option("--null", default=False, action="store_true",
                 help="Do not write to SAM/BAM output")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)
    extra = opts.extra
    mo = opts.mateorientation
    # "+-" needs no flag; "-+" maps to --rf and anything else to --ff
    if mo == '+-':
        orientation = ""
    elif mo == '-+':
        orientation = "--rf"
    else:
        orientation = "--ff"
    if orientation:
        # Separate the flag from user-supplied extras with a space; plain
        # concatenation glued it onto the last extra option
        extra = "{0} {1}".format(extra, orientation) if extra else orientation

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(readfile, dbfile, bowtie=True,
                                            mapped=mapped, unmapped=unmapped,
                                            bam=opts.bam)
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the bowtie2 log to stderr. `write` replaces the Python-2-only
    # `print >> sys.stderr`, and `with` closes the log handle.
    with open(logfile) as logfp:
        sys.stderr.write(logfp.read() + "\n")

    return samfile, logfile
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline.
    """
    # The docstring doubles as the OptionParser usage text.
    # Every enzyme must be followed by "|" before a line continuation:
    # a missing separator after "PstI-MspI" used to fuse it with "PstI-TaqI"
    # into one bogus choice and made both real ones unselectable.
    valid_enzymes = "ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|" \
                    "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|" \
                    "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|" \
                    "PstI-TaqI|SalI-MspI|SbfI-MspI".split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme", default="ApeKI", choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    reference = get_abs_path(reference)
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm",
               "tbt", "mergedTBT", "hapmap", "hapmap/raw",
               "hapmap/mergedSNPs", "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)

    # The alignment step runs inside mergedTagCounts, hence the cd pair
    runsh.append("cd mergedTagCounts")
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".\
        format(opts.aligner, opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh), meta="run script")