def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads.
    """
    from jcvi.apps.bowtie import align

    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(contamination.__doc__)
    parser.add_option("--mapped", default=False, action="store_true",
                      help="Retain contaminated reads instead [default: %default]")
    parser.set_cutoff(cutoff=800)
    parser.set_mateorientation()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)

    # Ask the aligner to keep either the reads that hit the contaminant
    # reference (--mapped) or the ones that do not (--unmapped).
    if opts.mapped:
        tag = "--mapped"
    else:
        tag = "--unmapped"

    # One alignment per read set found under `folder`.
    for reads, pf in iter_project(folder, 2):
        align_opts = [ecoli] + reads + ["--bam", tag]
        align_opts.append("--cutoff={0}".format(opts.cutoff))
        if opts.mateorientation:
            align_opts.append(
                "--mateorientation={0}".format(opts.mateorientation))
        samfile, logfile = align(align_opts)
def __init__(self, filename, select=None):
    """
    Load contig sizes from a `.sizes` file or from a FASTA file.

    When `filename` does not end with `.sizes`, a sibling `<filename>.sizes`
    is (re)generated with `faSize -detailed` if it is out of date, and that
    file is loaded instead.

    filename: path to an existing `.sizes` or FASTA file.
    select: optional positive minimum size; records whose size (second
        field) is smaller are dropped.

    Raises AssertionError if `filename` does not exist or `select` is
    truthy but not positive.
    """
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            cmd += " -detailed {0}".format(filename)
            sh(cmd, outfile=sizesname)
        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    if sizes:
        ctgs, sizes = zip(*sizes)
    else:
        # Nothing left (empty file, or `select` filtered everything out):
        # zip(*[]) yields nothing to unpack, so fall back to empty tuples.
        ctgs, sizes = (), ()
    self.sizes = sizes
    # Leading 0 makes cumsizes[i] the offset at which contig i starts.
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def update_abs_path(self):
    """
    Replace each record value that names an existing path with the result
    of get_abs_path(); values that are empty or do not exist on disk are
    left untouched.
    """
    for rec in self:
        current = rec.value
        # Skip empty values and anything that is not a path on disk.
        if not current or not op.exists(current):
            continue

        resolved = get_abs_path(current)
        logging.debug("{0}={1} => {2}".format(rec.tag, current, resolved))
        rec.value = resolved
def run_megablast(infile=None, outfile=None, db=None, wordsize=None,
                  pctid=98, hitlen=100, best=None, evalue=0.01,
                  task="megablast", cpus=16):
    """
    Run `blastn` of `infile` against `db`, writing tabular (-outfmt 6) hits
    to `outfile`, then optionally filter the hits in place.

    wordsize, pctid and best are only passed to blastn when truthy.
    When both pctid and hitlen are truthy, run_blast_filter() is applied
    and its output replaces `outfile`.

    Raises AssertionError when `db` is not given.
    """
    assert db, "Need to specify database fasta file."

    db = get_abs_path(db)
    # Prefer the `.00.nin` index name when it is present on disk
    # (presumably the first volume of a multi-volume database).
    nin00 = db + ".00.nin"
    if op.exists(nin00):
        nin = nin00
    else:
        nin = db + ".nin"
    run_formatdb(infile=db, outfile=nin)

    pieces = [
        "blastn",
        "-query {0} -db {1} -out {2}".format(infile, db, outfile),
        "-evalue {0} -outfmt 6 -num_threads {1}".format(evalue, cpus),
        "-task {0}".format(task),
    ]
    if wordsize:
        pieces.append("-word_size {0}".format(wordsize))
    if pctid:
        pieces.append("-perc_identity {0}".format(pctid))
    if best:
        pieces.append("-max_target_seqs {0}".format(best))
    sh(" ".join(pieces))

    if pctid and hitlen:
        # Filter into a temporary sibling file, then move it over the
        # raw BLAST output.
        filtered_blastfile = outfile + ".P{0}L{1}".format(pctid, hitlen)
        run_blast_filter(infile=outfile, outfile=filtered_blastfile,
                         pctid=pctid, hitlen=hitlen)
        shutil.move(filtered_blastfile, outfile)
def make_link(self, firstN=0):
    """
    Populate self.link inside the self.genome folder.

    With firstN > 0, first() is asked to write the leading firstN records
    of self.fastq to self.link; otherwise self.link becomes a symlink to
    the absolute path of self.fastq (an existing symlink is replaced).
    """
    mkdir(self.genome)

    if firstN > 0:
        # Subsample instead of linking the whole file.
        head_args = [str(firstN), self.fastq]
        head_args.append("--outfile={0}".format(self.link))
        first(head_args)
    else:
        # Only a pre-existing symlink is removed before re-linking.
        if op.islink(self.link):
            os.unlink(self.link)
        source = get_abs_path(self.fastq)
        os.symlink(source, self.link)
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(fastq.__doc__)
    p.add_option("--sanger", dest="sanger", default=False, action="store_true",
                 help="Are the qv sanger encodings? [default: %default]")
    p.add_option("--outtie", dest="outtie", default=False, action="store_true",
                 help="Are these outie reads? [default: %default]")
    add_size_option(p)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, so `not ...` makes the exit status
        # non-zero for a usage error.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]

    # A non-zero --size marks the library as mated.
    mated = (opts.size != 0)
    outtie = opts.outtie

    # Library name: basename up to the first dot, minus "_1_sequence".
    libname = op.basename(fastqfiles[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    prefix = "IlluminaMP_" if outtie else "IlluminaPE_"
    if mated:
        libname = prefix + libname + "_Mated"
    else:
        # Unmated libraries get a fixed name, independent of the file name.
        libname = prefix + "UnMated"
    frgfile = libname + ".frg"

    mean, sv = get_mean_sv(opts.size)

    cmd = CAPATH("fastqToCA")
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), \
            "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    if opts.sanger:
        cmd += " -type sanger "
    if outtie:
        cmd += " -outtie "

    sh(cmd, outfile=frgfile)
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for three modes of BWA - mem (default), aln, bwasw (long reads).
    """
    # The docstring above doubles as the command-line usage text.
    valid_modes = ("bwasw", "aln", "mem")
    parser = OptionParser(align.__doc__)
    parser.add_option("--mode", default="mem", choices=valid_modes,
                      help="BWA mode [default: %default]")
    parser.set_cutoff(cutoff=800)
    parser.set_sam_options()
    opts, args = parser.parse_args(args)

    mode = opts.mode
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())

    # Pick the command builder from the mode and the number of read files.
    tag = "bwa-{0}: ".format(mode)
    if nargs == 2:
        tag += "Single-end alignment"
        if mode == "bwasw":
            builder = bwasw
        elif mode == "aln":
            builder = samse
        else:
            builder = mem
    else:
        assert mode != "bwasw", "Cannot use --bwasw with paired-end mode"
        tag += "Paired-end alignment"
        builder = sampe if mode == "aln" else mem
    logging.debug(tag)

    args[0] = get_abs_path(args[0])
    cmd, samfile = builder(args, opts)
    if cmd:
        cmd = output_bam(cmd, samfile)

    bam = opts.bam
    unmapped = opts.unmapped

    # NOTE(review): sh() is reached even when `cmd` is falsy -- confirm
    # that sh() tolerates an empty command.
    sh(cmd)
    if unmapped:
        # Extract the unmapped reads, then discard the full alignment.
        mopts = [samfile, "--unmapped"]
        if bam:
            mopts.append("--bam")
        mapped(mopts)
        FileShredder([samfile])

    return samfile, None
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(fastq.__doc__)
    p.add_option("--outtie", dest="outtie", default=False, action="store_true",
                 help="Are these outie reads? [default: %default]")
    p.set_phred()
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, so `not ...` makes the exit status
        # non-zero for a usage error.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # A non-zero --size marks the library as mated.
    mated = (size != 0)

    # Library name: basename up to the first dot, minus "_1_sequence".
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")
    frgfile = libname + ".frg"

    mean, sv = get_mean_sv(size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), \
            "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the user-supplied quality offset, else guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = (offset == 64)
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def index(args):
    """
    %prog index database.fasta

    Wrapper for `bowtie2-build`. Same interface.
    """
    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(index.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    # check_index() receives the absolute path of the single argument.
    dbfile = get_abs_path(args[0])
    check_index(dbfile)
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    from jcvi.formats.fastq import guessoffset

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(fastq.__doc__)
    phdchoices = ("33", "64")
    p.add_option("--outtie", dest="outtie", default=False, action="store_true",
                 help="Are these outie reads? [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset {0} [default: guess]".format(phdchoices))
    add_size_option(p)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, so `not ...` makes the exit status
        # non-zero for a usage error.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]

    # A non-zero --size marks the library as mated.
    mated = (opts.size != 0)
    outtie = opts.outtie

    # Library name: basename up to the first dot, minus "_1_sequence".
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")
    frgfile = libname + ".frg"

    mean, sv = get_mean_sv(opts.size)

    cmd = CAPATH("fastqToCA")
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), \
            "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the user-supplied quality offset, else guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = (offset == 64)
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.softlink import get_abs_path
    from jcvi.apps.grid import MakeManager

    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(merge.__doc__)
    parser.add_option("--sep", default="_",
                      help="Separator to group per prefix")
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    merged_bams, bamdirs = args[0], args[1:]
    mkdir(merged_bams)

    # Collect every *.bam in the input folders, skipping "nsorted" files.
    found = []
    for bamdir in bamdirs:
        found.extend(glob(op.join(bamdir, "*.bam")))
    bams = [x for x in found if "nsorted" not in x]
    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep

    def prefix_of(path):
        # Group key: basename up to the first separator.
        return op.basename(path).split(sep)[0]

    # groupby() only groups adjacent items, hence the sort on the same key.
    bams.sort(key=prefix_of)
    mm = MakeManager()
    for prefix, members in groupby(bams, key=prefix_of):
        files = sorted(members)
        target = op.join(merged_bams, op.basename(files[0]))
        if len(files) == 1:
            # Nothing to merge: link the only BAM into the output folder.
            source = get_abs_path(files[0])
            mm.add("", target, "ln -s {0} {1}".format(source, target))
        else:
            source = " ".join(files)
            cmds = [
                "rm {0}".format(target),
                "samtools merge {0} {1}".format(target, source),
            ]
            mm.add(files, target, cmds)
    mm.write()
def run_vecscreen(infile=None, outfile=None, db="UniVec_Core",
                  pctid=None, hitlen=None):
    """
    BLASTN parameters reference:
    http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    """
    # pctid and hitlen are accepted but not used by this function.
    db = get_abs_path(db)
    run_formatdb(infile=db, outfile=db + ".nin")

    pieces = (
        "blastn",
        "-task blastn",
        "-query {0} -db {1} -out {2}".format(infile, db, outfile),
        "-penalty -5 -gapopen 4 -gapextend 4 -dust yes -soft_masking true",
        "-searchsp 1750000000000 -evalue 0.01 -outfmt 6 -num_threads 8",
    )
    sh(" ".join(pieces))
def soapX(args):
    """
    %prog soapX folder tag [*.fastq]

    Run SOAP on a folder of paired reads and apply tag before assembly.
    Optional *.fastq in the argument list will be symlinked in each folder and
    co-assembled.
    """
    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(soapX.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    folder = args[0]
    # The tag argument is a comma-separated list.
    tag = args[1].split(",")
    # Any further arguments are extra read files, passed as absolute paths.
    extra = [get_abs_path(x) for x in args[2:]]

    for trio, pf in iter_project(folder, n=3):
        soap_trios(trio, pf, tag, extra)
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline.
    """
    # The docstring above doubles as the command-line usage text.
    # One enzyme (or enzyme pair) per whitespace-separated token, so a
    # missing separator cannot silently fuse two names into one choice.
    valid_enzymes = """
        ApeKI ApoI BamHI EcoT22I HinP1I HpaII MseI MspI
        NdeI PasI PstI Sau3AI SbfI AsiSI-MspI BssHII-MspI
        FseI-MspI PaeR7I-HhaI PstI-ApeKI PstI-EcoT22I PstI-MspI
        PstI-TaqI SalI-MspI SbfI-MspI
    """.split()
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme", default="ApeKI", choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    # The generated script changes directory, so the reference must be
    # addressed by absolute path.
    reference = get_abs_path(reference)
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm",
               "tbt", "mergedTBT", "hapmap", "hapmap/raw",
               "hapmap/mergedSNPs", "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline: each entry of `runsh` is one line of run.sh
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)

    # Align the master tags with the selected aligner wrapper
    runsh.append("cd mergedTagCounts")
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".\
        format(opts.aligner, opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    # Optional filter flags, currently disabled: -hLD -mnR2 0.2 -mnBonP 0.005
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh), meta="run script")
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of
    args.
    """
    from jcvi.formats.fastq import guessoffset

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true",
                 help="Keep the input read order [default: %default]")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)
    opts, args = p.parse_args(args)

    # Collect the extra bowtie2 flags as separate tokens so that a
    # user-supplied --extra string is never glued to the orientation flag
    # (and an unset --extra does not break the concatenation).
    extra_flags = [opts.extra] if opts.extra else []
    mo = opts.mateorientation
    if mo == "-+":
        extra_flags.append("--rf")
    elif mo != "+-":
        # "+-" needs no flag; anything else is passed as --ff
        extra_flags.append("--ff")
    extra = " ".join(extra_flags)

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    # get_samfile() also returns the values used below for the mapped /
    # unmapped read outputs.
    samfile, mapped, unmapped = get_samfile(
        readfile, dbfile, bowtie=True, mapped=mapped, unmapped=unmapped,
        bam=opts.bam
    )
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the log to stderr, closing the file afterwards.
    with open(logfile) as fp:
        sys.stderr.write(fp.read() + "\n")

    return samfile, logfile
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    from jcvi.formats.base import split

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    # One working folder per genome part, each with its own copy of the
    # control files and a maker_opts.ctl pointing at that part.
    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        try:
            c.write_file("maker_opts.ctl")
        finally:
            # Always return to the starting folder, even if the write fails
            os.chdir(cwd)

    # List of job folders, one per line
    jobs = "jobs"
    with open(jobs, "w") as fw:
        fw.write("\n".join(dirs) + "\n")

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    if engine == "SGE":
        contents = arraysh.format(jobs, cmd)
    else:
        contents = arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents, meta="run script")

    # No qsub wrapper is written for PBS
    if engine == "PBS":
        return

    # qsub script
    outfile = "maker.\\$TASK_ID.out"
    p = GridProcess(runfile, outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub, meta="run script")
def _round_insert_size(median):
    # Round a size up to two significant digits and return it as a string,
    # e.g. 3412 -> "3500", 350 -> "350", 85 -> "85", 9960 -> "10000".
    # Assumes `median` is a non-negative integer -- TODO confirm with
    # whatever the pairs() statistics object returns.
    digits = str(median)
    head, tail = digits[:2], digits[2:]
    # `tail` is empty for sizes below 100: nothing to round in that case
    if tail and int(tail) != 0:
        head = str(int(head) + 1)
    # Pad by the number of dropped digits so a carry (99 -> 100) keeps the
    # right order of magnitude.
    return head + "0" * len(tail)


def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # Pick the aligner wrapper and the matching pair-statistics routine
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps
        if aligner == "bowtie":
            from jcvi.apps.bowtie import align
        elif aligner == "bwa":
            from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    for p, prefix in iter_project(folder, 2):
        # Align only a leading sample of the reads
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + p + ["-o", samplefq])

        os.chdir(work)
        align_args = [ref, op.basename(samplefq)]
        if aligner != "clc":
            align_args += ["--bam"]
        outfile, logfile = align(align_args)
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Library label: tag plus the median rounded up to two digits
        lib = "{0}-{1}".format(tag, _round_insert_size(median))
        for i, xp in enumerate(p):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""),
                                            i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)