def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds,) = args
    # Count the commands: one array task per line.
    with open(cmds) as fp:
        N = sum(1 for x in fp)

    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    # SGE gets the plain array wrapper; other engines get the variant
    # parameterized by task count and thread count.
    contents = (
        arraysh.format(cmds)
        if engine == "SGE"
        else arraysh_ua.format(N, threaded, cmds)
    )
    write_file(runfile, contents)
    if engine == "PBS":
        # PBS users submit the generated script themselves.
        return

    # Raw strings keep the literal `\$TASK_ID` for the grid engine to expand.
    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    errfile = "{0}.{1}.err".format(pf, r"\$TASK_ID")
    # BUGFIX: `arr` previously referenced undefined `ncmds` (NameError);
    # the count was computed into `N` above.
    p = GridProcess(
        "sh {0}".format(runfile),
        outfile=outfile,
        errfile=errfile,
        arr=N,
        grid_opts=opts,
    )
    p.start()
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    # All intermediate files go into an aligner-specific work directory,
    # e.g. "pairs-bwa".
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # Select the aligner-specific `align` and the matching pairs parser:
    # CLC output is parsed from CAS, everything else from SAM.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # NOTE(review): loop variable `p` shadows the OptionParser above; here it
    # is the per-sample list of fastq files yielded by iter_project.
    for p, prefix in iter_project(folder, 2):
        # Subsample the first `firstN` reads to keep the alignment fast.
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + p + ["-o", samplefq])

        # align() and ps() are run from inside the work directory.
        os.chdir(work)
        align_args = [ref, op.basename(samplefq)]
        outfile, logfile = align(align_args)
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(cwd)

        median = stats.median
        # Libraries with median insert > 1kb are labeled mate-pair (MP),
        # otherwise paired-end (PE).
        tag = "MP" if median > 1000 else "PE"
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        # Library name encodes the tag and the rounded insert size.
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(p):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""), i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    # Write the original-name -> normalized-name mapping.
    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
def main():
    """
    %prog scriptname.py

    create a minimal boilerplate for a new script
    """
    p = OptionParser(main.__doc__)
    p.add_option(
        "-g",
        "--graphic",
        default=False,
        action="store_true",
        help="Create boilerplate for a graphic script",
    )
    opts, args = p.parse_args()

    if len(args) != 1:
        sys.exit(not p.print_help())

    (script,) = args
    # Choose the boilerplate flavor based on the --graphic flag.
    if opts.graphic:
        template = graphic_template
    else:
        template = default_template
    write_file(script, template)

    # Log what was written, capitalizing the first letter of the message.
    prefix = "graphic " if opts.graphic else ""
    message = prefix + "template writes to `{0}`".format(script)
    message = message[0].upper() + message[1:]
    logging.debug(message)
def somatic(args):
    """
    %prog somatic ref.fasta *.bam > somatic.sh

    Useful to identify somatic mutations in each sample compared to all other
    samples. Script using SPEEDSEQ-somatic will be written to stdout.
    """
    p = OptionParser(somatic.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, bams = args[0], args[1:]
    # Common prefix shared by every per-sample invocation.
    base = "~/export/speedseq/bin/speedseq somatic"
    base += " -t 32 -F .2 -C 3 -q 30"

    lines = []
    for bam in bams:
        # Each sample is compared against the pool of all other BAMs.
        sample = bam.split(".")[0]
        rest = sorted(set(bams) - set([bam]))
        line = base
        line += " -o {0}".format(sample)
        line += " {0} {1} {2}".format(ref, ",".join(rest), bam)
        lines.append(line)

    write_file("somatic.sh", "\n".join(lines))
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds,) = args
    # Count the commands to size the grid array; close the handle promptly.
    with open(cmds) as fp:
        ncmds = sum(1 for x in fp)

    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, \
        "Commands list file should not have a `.sh` extension"

    contents = arraysh.format(cmds)
    write_file(runfile, contents, meta="run script")

    # FIX: use a raw string so `\$TASK_ID` survives literally for the grid
    # engine (non-raw "\$" is an invalid escape sequence in Python 3).
    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    p = GridProcess(
        "sh {0}".format(runfile),
        outfile=outfile,
        errfile=outfile,
        arr=ncmds,
        grid_opts=opts,
    )
    p.start()
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    # Aligner-specific work directory, e.g. "pairs-bowtie".
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    # Pick the aligner wrapper matching the --aligner choice.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # NOTE(review): loop variable `p` shadows the OptionParser above; it is
    # the per-sample list of mate files from iter_project.
    for p, prefix in iter_project(folder):
        # Subsample the first `firstN` reads from each of the two mate files.
        samplefq = []
        for i in range(2):
            samplefq.append(op.join(work, prefix + "_{0}.first.fastq".format(i + 1)))
            first([str(opts.firstN)] + [p[i]] + ["-o", samplefq[i]])

        # Alignment and insert-size parsing run inside the work directory.
        os.chdir(work)
        align_args = [ref] + [op.basename(fq) for fq in samplefq]
        outfile, logfile = align(align_args)
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(cwd)

        median = stats.median
        # Median insert > 1kb => mate-pair library, otherwise paired-end.
        tag = "MP" if median > 1000 else "PE"
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(p):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""), i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    # Emit original-name -> normalized-name mapping.
    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuation file. Genome size should be entered in Mb.
    """
    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=51, type="int", help="K-mer size")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # Input is in Mb; divided by 1000 here — presumably Meraculous expects
    # Gb in `genome_size` (TODO confirm against the Meraculous manual).
    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # Build the config file text section by section.
    s = comment_banner("Meraculous params file") + "\n"
    s += comment_banner("Basic parameters") + "\n"
    s += "# Describe the libraries ( one line per library )\n"
    s += "# " + " ".join(header.split()) + "\n"

    libs = get_libs(fnames)
    lib_seqs = []
    rank = 0
    for lib, fs in libs:
        size = lib.size
        if size == 0:
            # Unsized (singleton) libraries are skipped.
            continue
        rank += 1
        library_name = lib.library_name
        name = library_name.replace("-", "")
        # Wildcard matching both mate files of the library.
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        rl = max(readlen([x]) for x in fs)
        lib_seq = lib.get_lib_seq(wildcard, name, rl, rank)
        lib_seqs.append(lib_seq)

    s += "\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n"
    params = [("genome_size", genomesize),
              ("is_diploid", 0),
              ("mer_size", opts.K),
              ("num_prefix_blocks", 1),
              ("no_read_validation", 0),
              ("local_num_procs", opts.cpus)]
    s += "\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n"

    cfgfile = "meraculous.config"
    write_file(cfgfile, s, tee=True)

    # Convenience wrapper that launches the pipeline with this config.
    s = "~/export/meraculous/bin/run_meraculous.sh -c {0}"\
        .format(cfgfile)
    runsh = "run.sh"
    write_file(runsh, s)
def dn(args):
    """
    %prog dn folder

    Run Trinity-DN on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain "_1_" and "_2_".
    """
    p = OptionParser(dn.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.set_home("trinity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    paired = opts.paired
    thome = opts.trinity_home
    # Results go into a sibling directory suffixed "_DN".
    tfolder = folder + "_DN"

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = glob("../" + folder + "/*")
    if paired:
        # Split mate files by naming pattern; the two lists must pair up.
        f1 = [x for x in flist if "_1_" in x or ".1." in x]
        f2 = [x for x in flist if "_2_" in x or ".2." in x]
        assert len(f1) == len(f2)
        r1, r2 = "left.fastq", "right.fastq"
        reads = ((f1, r1), (f2, r2))
    else:
        r = "single.fastq"
        reads = ((flist, r), )

    # Concatenate the inputs into left/right/single fastq files.
    for fl, r in reads:
        fm = FileMerger(fl, r)
        fm.merge(checkexists=True)

    # Assemble the Trinity command line; only the run script is written —
    # the assembly itself is not launched here.
    cmd = op.join(thome, "Trinity.pl")
    cmd += " --seqType fq --JM 100G --CPU {0}".format(opts.cpus)
    if paired:
        cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
    else:
        cmd += " --single {0}".format(reads[0][-1])

    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
    os.chdir(cwd)
def snp(args):
    """
    %prog snp reference.fasta

    Run SNP calling on GSNAP native output after apps.gsnap.align --snp.
    Files *native.gz in the current folder will be used as input.
    """
    p = OptionParser(snp.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (ref,) = args
    # Fill the CPU count and reference into the alignment script template.
    contents = alignsh.format(opts.cpus, ref)
    write_file("align.sh", contents)
def tigrload(args):
    """
    %prog tigrload db ev_type

    Load EVM results into TIGR db. Actually, just write a load.sh script. The
    ev_type should be set, e.g. "EVM1", "EVM2", etc.
    """
    p = OptionParser(tigrload.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    db, ev_type = args
    # Emit the load script instead of touching the database directly.
    script = EVMLOAD.format(db, ev_type)
    write_file("load.sh", script, meta="run script")
def write_libraries(fastqs, aligner=None):
    """Infer libraries from fastq names and write a `libraries.txt` manifest.

    Each library becomes one whitespace-separated line; when `aligner` is
    given it is inserted as the second field. Returns the manifest filename.
    """
    from jcvi.assembly.base import get_libs

    libs = get_libs(fastqs)
    assert libs

    libtxt = "libraries.txt"
    lines = []
    for i, (lib, fns) in enumerate(libs):
        # "outward" orientation maps to RF, everything else to FR.
        orientation = "RF" if lib.read_orientation == "outward" else "FR"
        fields = ["lib{0}".format(i + 1), " ".join(fns), lib.size, 0.75, orientation]
        if aligner:
            fields.insert(1, aligner)
        lines.append(" ".join(str(f) for f in fields))

    write_file(libtxt, "\n".join(lines), tee=True)
    return libtxt
def tigrprepare(args):
    """
    %prog tigrprepare asmbl.fasta asmbl.ids db pasa.terminal_exons.gff3

    Run EVM in TIGR-only mode.
    """
    p = OptionParser(tigrprepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    fastafile, asmbl_id, db, pasa_db = args
    if asmbl_id == "all":
        idsfile = fastafile + ".ids"
        if need_update(fastafile, idsfile):
            ids([fastafile, "-o", idsfile])
    else:
        idsfile = asmbl_id

    # Use the first assembly id to seed the weights file.
    # FIX: close the ids file deterministically instead of leaking the handle.
    with open(idsfile) as fp:
        oneid = next(fp).strip()

    weightsfile = "weights.txt"
    if need_update(idsfile, weightsfile):
        cmd = "$EVM/TIGR-only/create_sample_weights_file.dbi"
        cmd += " {0} {1} | tee weights.txt".format(db, oneid)
        sh(cmd)

    evs = [
        "gene_predictions.gff3",
        "transcript_alignments.gff3",
        "protein_alignments.gff3",
    ]
    if need_update(weightsfile, evs):
        cmd = "$EVM/TIGR-only/write_GFF3_files.dbi"
        cmd += " --db {0} --asmbl_id {1} --weights {2}".format(db, idsfile, weightsfile)
        sh(cmd)

    # Transcript alignments are patched before partitioning.
    evs[1] = fix_transcript()
    partition(evs)

    runfile = "run.sh"
    contents = EVMRUN.format(*evs)
    write_file(runfile, contents)
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    p.set_params(prog="grid")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds, ) = args
    # One array task per command line in the list file.
    fp = open(cmds)
    N = sum(1 for _ in fp)
    fp.close()

    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    # SGE gets the plain array wrapper; other engines get the task-splitting
    # variant parameterized by task count and thread count.
    contents = (arraysh.format(cmds) if engine == "SGE" else arraysh_ua.format(
        N, threaded, cmds))
    write_file(runfile, contents)
    if engine == "PBS":
        # PBS users submit the generated script themselves.
        return

    # `\$TASK_ID` is expanded by the grid engine, not by Python.
    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    errfile = "{0}.{1}.err".format(pf, r"\$TASK_ID")
    p = GridProcess(
        "sh {0}".format(runfile),
        outfile=outfile,
        errfile=errfile,
        arr=N,
        extra_opts=opts.extra,
        grid_opts=opts,
    )
    p.start()
def tigrprepare(args):
    """
    %prog tigrprepare asmbl.fasta asmbl.ids db pasa.terminal_exons.gff3

    Run EVM in TIGR-only mode.
    """
    p = OptionParser(tigrprepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    fastafile, asmbl_id, db, pasa_db = args
    if asmbl_id == 'all':
        idsfile = fastafile + ".ids"
        if need_update(fastafile, idsfile):
            ids([fastafile, "-o", idsfile])
    else:
        idsfile = asmbl_id

    # BUGFIX: `open(idsfile).next()` is Python-2-only (file objects have no
    # .next() in Python 3); use the next() builtin and close the handle.
    with open(idsfile) as fp:
        oneid = next(fp).strip()

    weightsfile = "weights.txt"
    if need_update(idsfile, weightsfile):
        cmd = "$EVM/TIGR-only/create_sample_weights_file.dbi"
        cmd += " {0} {1} | tee weights.txt".format(db, oneid)
        sh(cmd)

    evs = ["gene_predictions.gff3", "transcript_alignments.gff3",
           "protein_alignments.gff3"]
    if need_update(weightsfile, evs):
        cmd = "$EVM/TIGR-only/write_GFF3_files.dbi"
        cmd += " --db {0} --asmbl_id {1} --weights {2}".\
            format(db, idsfile, weightsfile)
        sh(cmd)

    # Transcript alignments are patched before partitioning.
    evs[1] = fix_transcript()
    partition(evs)

    runfile = "run.sh"
    contents = EVMRUN.format(*evs)
    write_file(runfile, contents, meta="run script")
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge write to /tmp, so I limit processes here to avoid
    # filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output: every split must have produced a *.all.gff.
    gffnames = glob("*.all.gff")
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    fw = open(gfflist, "w")
    print("\n".join(gffnames), file=fw)
    fw.close()

    # Re-read the list file to be extra careful that all results are included.
    nlines = sum(1 for x in open(gfflist))
    assert nlines == nfs
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
def close(args):
    """
    %prog close scaffolds.fasta PE*.fastq

    Run GapFiller to fill gaps.
    """
    p = OptionParser(close.__doc__)
    p.set_home("gapfiller")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    scaffolds = args[0]
    # Write the library manifest; GapFiller maps reads with bwa.
    libtxt = write_libraries(args[1:], aligner="bwa")

    script = op.join(opts.gapfiller_home, "GapFiller.pl")
    invocation = "perl " + script
    invocation += " -l {0} -s {1} -T {2}".format(libtxt, scaffolds, opts.cpus)
    write_file("run.sh", invocation)
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    contigs = args[0]
    # Write the mate-pair library manifest consumed by SSPACE.
    libtxt = write_libraries(args[1:])

    script = op.join(opts.sspace_home, "SSPACE_Basic_v2.0.pl")
    invocation = "perl " + script
    invocation += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    write_file("run.sh", invocation)
def prepare(args):
    """
    %prog prepare alignAssembly.config est.fasta ref.fasta

    Generate PASA run script.
    """
    p = OptionParser(prepare.__doc__)
    p.set_home("pasa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    cfg, est, ref = args
    launcher = op.join(opts.pasa_home, "scripts/Launch_PASA_pipeline.pl")
    # Assemble the PASA pipeline invocation piece by piece.
    pieces = [
        launcher,
        " -c {0} --CPU {1}".format(cfg, opts.cpus),
        " -C -R --ALIGNERS blat,gmap",
        " -t {0} -g {1}".format(est, ref),
    ]
    write_file("run.sh", "".join(pieces), meta="run script")
def main():
    """
    %prog scriptname.py

    create a minimal boilerplate for a new script
    """
    p = OptionParser(main.__doc__)
    p.add_option(
        "--graphic",
        default=False,
        action="store_true",
        help="Create boilerplate for a graphic script",
    )
    opts, args = p.parse_args()

    if len(args) != 1:
        sys.exit(not p.print_help())

    (script,) = args
    # Pick the template flavor, then write it out tagged as a python script.
    chosen = graphic_template if opts.graphic else default_template
    write_file(script, chosen, meta="python script")

    # Log what was written, capitalizing the first letter of the message.
    note = "template writes to `{0}`".format(script)
    if opts.graphic:
        note = "graphic " + note
    logging.debug(note[0].upper() + note[1:])
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_aligner(aligner="bwa")
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    contigs = args[0]
    # Library manifest for SSPACE, mapped with the chosen aligner.
    libtxt = write_libraries(args[1:], aligner=opts.aligner)

    # SSPACE requires getopts.pl, which may be missing from the perl install.
    download("http://mflib.org/xampp/perl/lib/getopts.pl")

    script = op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    invocation = "perl " + script
    invocation += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    write_file("run.sh", invocation)
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_aligner(aligner="bwa")
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    contigs = args[0]
    # Library manifest for SSPACE, mapped with the chosen aligner.
    libtxt = write_libraries(args[1:], aligner=opts.aligner)

    # SSPACE requires getopts.pl, which may be missing from the perl install.
    download("http://web.vims.edu/bridge/bridge2/aw/lib/getopts.pl")

    script = op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    invocation = "perl " + script
    invocation += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    write_file("run.sh", invocation)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    # Genome-guided transcripts are optional (4th positional arg).
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) +
                          "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")
    # Resolve the PASA helper scripts on PATH.
    accn_extract = which(op.join(PASA_HOME, "misc_utilities",
                                 "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts",
                                        "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    cmds = []

    # set PASAHOME env variable if preparing shell script
    if prepare:
        env_cmd = 'export PASAHOME="{0}"'.format(PASA_HOME)
        cmds.append(env_cmd)

    if ggfasta:
        # NOTE(review): `tfasta` and `tdn` (and `aaconf`, `gfasta`, `flaccs`
        # below) appear to be filenames defined elsewhere in this module —
        # confirm they are in scope here.
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # `prjobid` chains grid jobs via hold_jid so each step waits on the last.
    prjobid = None
    if clean:
        ccpus = 16 if cpus >= 16 else cpus
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, ccpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    # Write the alignment-assembly config for this PASA database.
    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
                                    pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
             " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners),
                                                       opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    # Optional Comprehensive Transcriptome DB step (needs both fasta inputs).
    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)
        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If coord-sorted BAM is provided, prepare script for GG-Trinity, using BAM
    as starting point.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before
    assembling
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single"
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    # A coord-sorted BAM switches the run to genome-guided (GG) mode.
    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        bam = op.abspath(bam)
        method = "GG"

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    cmds = []

    # set TRINITY_HOME env variable when preparing shell script
    env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
    cmds.append(env_cmd)

    if method == "DN":
        assert op.exists("../" + inparam)

        flist = iglob("../" + inparam, opts.names)
        if paired:
            # Split mate files by naming pattern; counts must pair up.
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        if merge:
            # Concatenate inputs into left/right/single fastq files.
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

    cmd = op.join(trinity_home, "Trinity")
    cmd += " --seqType fq --max_memory {0} --CPU {1}".format(opts.max_memory, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)
    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    if method == "GG":
        cmd += " --genome_guided_bam {0}".format(bam)
        cmd += " --genome_guided_max_intron {0}".format(opts.max_intron)
    else:
        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
            else:
                cmd += " --left {0}".format(",".join(f1))
                cmd += " --right {0}".format(",".join(f2))
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)

    if opts.grid and opts.grid_conf_file:
        hpc_grid_runner = op.join(hpc_grid_runner_home, "hpc_cmds_GridRunner.pl")
        hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf", opts.grid_conf_file)
        assert op.exists(hpc_grid_conf_file), "HpcGridRunner conf file does not exist: {0}".format(hpc_grid_conf_file)
        cmd += ' --grid_exec "{0} --grid_conf {1} -c"'.format(hpc_grid_runner, hpc_grid_conf_file)

    if opts.extra:
        cmd += " {0}".format(opts.extra)

    cmds.append(cmd)

    if opts.cleanup:
        cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")' \
            if method == "DN" else \
            'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
        # BUGFIX: was `cmd.append(cleanup_cmd)` — `cmd` is a string, so every
        # --cleanup run raised AttributeError; append to the command list.
        cmds.append(cleanup_cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(cmds))
    os.chdir(cwd)
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--assemble_1st_rank_only", default=False, action="store_true",
                 help="Assemble the first rank only, other libs asm_flags=2 [default: %default]")
    p.add_option("--scaffold",
                 help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose",
                 help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    K = opts.K
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    # Two configs are written: the main assembly config and a gap-closure
    # config that only receives libraries with asm_flags > 2.
    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"
    fw = open(cfgfile, "w")
    fw_gc = open(gc_cfgfile, "w")

    libs = get_libs(fnames)
    rank = 0
    singletons = []
    max_rd_len = max(readlen([f]) for f in fnames)

    block = "max_rd_len={0}\n".format(max_rd_len)
    for stream in (sys.stderr, fw, fw_gc):
        print(block, file=stream)

    # Collect singletons first
    singletons = []
    for lib, fs in libs:
        if lib.size == 0:
            singletons += fs
            continue

    for lib, fs in libs:
        size = lib.size
        if size == 0:
            # Unsized libraries were already collected as singletons.
            continue

        rank += 1
        block = "[LIB]\n"
        block += "avg_ins={0}\n".format(size)
        f = fs[0]
        block += "reverse_seq={0}\n".format(lib.reverse_seq)
        # With --assemble_1st_rank_only, libraries after the first are
        # demoted to scaffolding-only (asm_flags=2).
        asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
        block += "asm_flags={0}\n".format(asm_flags)
        block += "rank={0}\n".format(rank)
        if lib.reverse_seq:
            pair_num_cutoff = 3
            block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
        block += "map_len=35\n"

        for f in fs:
            if ".1." in f:
                tag = "q1"
            elif ".2." in f:
                tag = "q2"
            # NOTE(review): if a filename contains neither ".1." nor ".2.",
            # `tag` is stale from the previous iteration (or unbound on the
            # first one) — confirm inputs are always mate-tagged.
            block += "{0}={1}\n".format(tag, f)

        if rank == 1:
            # Singletons ride along with the first-rank library.
            for s in singletons:
                tag = "q" if is_fastq(s) else "f"
                block += tag + "={0}\n".format(s)

        print(block, file=sys.stderr)
        print(block, file=fw)
        if asm_flags > 2:
            print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    # Pick the SOAPdenovo binary variant by K-mer size.
    bb = 63 if K <= 63 else 127
    binary = "SOAPdenovo-{0}mer".format(bb)
    header = SOAPHEADER.format(opts.cpus, K, binary)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template)
    fw.close()
    fw_gc.close()
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # BUGFIX: `setup` was only assigned inside the `if` below, so the later
    # `runsh = [setup]` raised NameError whenever fakeQuals.py was already
    # on PATH. Define it unconditionally.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        version = networkx.version  # NOTE(review): modern releases expose __version__ — confirm
    except Exception:
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = px not in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))

    cwd = os.getcwd()
    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline: source the env, then run each Jelly stage in order.
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic jar file [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset [default: guess]")
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
                 help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
                 help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    # Download and unpack Trimmomatic if the jar is not already present.
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    # Materialize the bundled adapter sequences next to the run.
    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    # Guess the phred offset from the first file unless given explicitly.
    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)
    threadsflag = " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        trimlog = " -trimlog {0}".format(opts.trimlog)

    cmd = "java -Xmx4g -jar {0}".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    # Strip .gz and the final extension to recover the file stem.
    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]

    if len(args) == 1:
        # Single-end mode.
        cmd += " SE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        # Paired-end mode: each mate yields a paired and an unpaired output.
        cmd += " PE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            # Discard unpaired survivors entirely.
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2,
                                       pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    # Normalize non-33 offsets back to phred33 on output.
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)
def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    from itertools import groupby

    from jcvi.assembly.base import FastqNamings, Library

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # Library name = first two dash-separated fields of the basename
    library_name = lambda x: "-".join(\
                op.basename(x).split(".")[0].split("-")[:2])
    # NOTE(review): itertools.groupby only groups consecutive items; this
    # assumes fnames arrive already grouped by library name — confirm callers
    # pass them sorted.
    libs = [(Library(x), sorted(fs)) for x, fs in \
                groupby(fnames, key=library_name)]

    libs.sort(key=lambda x: x[0].size)
    singletons = []
    pairs = []
    write_file("license.properties", CLCLICENSE, skipcheck=True)

    for lib, fs in libs:
        size = lib.size
        stddev = lib.stddev

        # size == 0 marks unpaired (fragment) data
        if size == 0:
            singletons += fs
            continue

        for f in fs:
            # Corrected reads are assumed forward-backward oriented
            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            minsize, maxsize = size - 2 * stddev, size + 2 * stddev
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)
            if ".1." in f:
                # Interleave mate files via wildcard; skip the .2. partner below
                f = f.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(f))
            elif ".2." in f:
                continue
            else:
                pairs.append(pair_opt + f)

    cmd = "clc_novo_assemble --cpus {0} -o contigs.fasta \\\n".format(opts.cpus)
    cmd += "\t-q {0} \\\n".format(" ".join(singletons))
    cmd += "\n".join("\t{0} \\".format(x) for x in pairs)

    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline: set up the GBS folder structure and write a
    `run.sh` containing the plugin invocations from raw fastq to filtered
    HapMap genotypes.
    """
    # BUGFIX: a "|" was missing between "PstI-MspI" and "PstI-TaqI"; implicit
    # string concatenation produced one bogus choice "PstI-MspIPstI-TaqI" and
    # silently dropped both real enzymes from --enzyme choices.
    valid_enzymes = "ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|" \
                    "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|" \
                    "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|" \
                    "PstI-TaqI|SalI-MspI|SbfI-MspI".split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme", default="ApeKI", choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    reference = get_abs_path(reference)

    # Directory layout expected by the TASSEL GBS plugins
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm",
               "tbt", "mergedTBT", "hapmap", "hapmap/raw",
               "hapmap/mergedSNPs", "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline, one shell command per stage
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)
    runsh.append("cd mergedTagCounts")

    # Align the master tags against the reference with the chosen aligner
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".\
                format(opts.aligner, opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh))
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads: stage the inputs under data/, write
    Protocol.xml, and emit a `run.sh` driving the Jelly.py stages.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # BUGFIX: `setup` is used later when building run.sh, so define it
    # unconditionally; previously it was only assigned inside the `if`,
    # raising NameError whenever fakeQuals.py was already on PATH.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        version = networkx.version  # NOTE(review): PBJelly needs networkx==1.1
    except (ImportError, AttributeError):
        # Narrowed from a bare `except:` — only missing/incompatible networkx
        # should trigger this message.
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual; PBJelly wants .qual companions
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))

    cwd = os.getcwd()
    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Make sure we have the patched version of Extraction.py
    # See discussion <http://seqanswers.com/forums/showthread.php?t=27599>
    # This check has been removed

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    #pcmds = """find assembly -name "ref*" -exec echo \\
    # "Assembly.py {} \\
    # > {}/assembly.out 2> {}/assembly.err" \; > commands.list"""
    #runsh.append(pcmds)

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir
    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # BUGFIX: message now matches the condition (N == 1 is allowed)
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"
    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    # One working directory per genome chunk, each with its own ctl files
    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        c.write_file("maker_opts.ctl")
        os.chdir(cwd)

    jobs = "jobs"
    # Context manager ensures the job list is closed before grid submission
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)
    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)
    if engine == "PBS":
        return

    # qsub script
    outfile = "maker.\$TASK_ID.out"
    p = GridProcess(runfile, outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
                 help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
                 help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
                 help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
                 help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # Output name defaults to <reads>.<ref>.cas
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # Unpaired mode: reads appended directly after -q
        cmd += fastqs
    else:
        # Paired mode: insert range is size +/- stddev where stddev = size/4
        # NOTE(review): `size / 4` is a float under Python 3 division — the
        # bounds are formatted into the command as e.g. "750.0"; confirm CLC
        # accepts non-integer bounds.
        assert len(fastqfiles) == 2
        stddev = size / 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    # -l/-s only apply to the long-read mapper
    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    from itertools import groupby

    from jcvi.assembly.base import FastqNamings, Library

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # Library name = first two dash-separated fields of the basename
    library_name = lambda x: "-".join(\
                op.basename(x).split(".")[0].split("-")[:2])
    # NOTE(review): groupby only groups consecutive items — assumes fnames
    # already arrive grouped by library; verify callers pass sorted input.
    libs = [(Library(x), sorted(fs)) for x, fs in \
                groupby(fnames, key=library_name)]

    libs.sort(key=lambda x: x[0].size)
    singletons = []
    pairs = []
    write_file("license.properties", CLCLICENSE, skipcheck=True)

    for lib, fs in libs:
        size = lib.size
        stddev = lib.stddev

        # size == 0 marks unpaired (fragment) data
        if size == 0:
            singletons += fs
            continue

        for f in fs:
            # Corrected reads are treated as forward-backward oriented
            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            minsize, maxsize = size - 2 * stddev, size + 2 * stddev
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)
            if ".1." in f:
                # Interleave mates via wildcard; the .2. partner is skipped
                f = f.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(f))
            elif ".2." in f:
                continue
            else:
                pairs.append(pair_opt + f)

    cmd = "clc_novo_assemble --cpus {0} -o contigs.fasta \\\n".format(opts.cpus)
    cmd += "\t-q {0} \\\n".format(" ".join(singletons))
    cmd += "\n".join("\t{0} \\".format(x) for x in pairs)

    runfile = "run.sh"
    write_file(runfile, cmd)
def maker(args):
    """
    %prog maker maker.gff3 genome.fasta

    Prepare EVM inputs by separating tracks from MAKER.
    """
    from jcvi.formats.base import SetFile, FileShredder

    A, T, P = "ABINITIO_PREDICTION", "TRANSCRIPT", "PROTEIN"
    # Stores default weights and types
    Registry = {\
            "maker": (A, 5),
            "augustus_masked": (A, 1),
            "snap_masked": (A, 1),
            "genemark": (A, 1),
            "est2genome": (T, 5),
            "est_gff": (T, 5),
            "protein2genome": (P, 5),
            "blastx": (P, 1)
    }

    p = OptionParser(maker.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, fastafile = args

    # Collect the distinct source column (field 2) values from the GFF
    types = "type.ids"
    if need_update(gffile, types):
        cmd = "cut -f2 -s {0} | sort -u".format(gffile)
        sh(cmd, outfile=types)

    types = SetFile(types)
    reg = defaultdict(list)
    weightsfile = "weights.txt"
    contents = []
    for s in types:
        # Sources may carry a ":suffix"; the registry keys on the bare name
        rs = s.split(":")[0]
        if rs not in Registry:
            continue

        type, weight = Registry[rs]
        reg[type].append(s)
        contents.append("\t".join(str(x) for x in (type, s, weight)))

    contents = "\n".join(sorted(contents))
    write_file(weightsfile, contents, meta="weights file")

    # Separate evidence into one GFF per EVM category, appending per track
    evs = [x + ".gff" for x in (A, T, P)]
    FileShredder(evs)

    for type, tracks in reg.items():
        for t in tracks:
            # NOTE: "\t" here is a literal tab embedded in the grep pattern
            cmd = "grep '\t{0}' {1} | grep -v '_match\t' >> {2}.gff".format(t, gffile, type)
            sh(cmd)

    partition(evs)
    runfile = "run.sh"
    contents = EVMRUN.format(*evs)
    write_file(runfile, contents, meta="run script")
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cpus = opts.cpus
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(cmd)
    # Source the PBJelly environment if its scripts are not already on PATH
    if not which("fakeQuals.py"):
        sh(setup)

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual; PBJelly wants .qual companions
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure; skip staging if already copied in place
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    outputDir = cwd
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline: each Jelly.py stage becomes one line of run.sh
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))
    runsh.append('Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus))
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cpus = opts.cpus
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(cmd)
    # Source the PBJelly environment if its scripts are not already on PATH
    if not which("fakeQuals.py"):
        sh(setup)

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual; PBJelly wants .qual companions
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure; skip staging if already copied in place
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    outputDir = cwd
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline: each Jelly.py stage becomes one line of run.sh
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))
    runsh.append(
        'Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus))
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
def prepare(args):
    """
    %prog prepare barcode_key.csv reference.fasta

    Prepare TASSEL pipeline: set up the GBS folder structure and write a
    `run.sh` containing the plugin invocations from raw fastq to filtered
    HapMap genotypes.
    """
    # BUGFIX: a "|" was missing between "PstI-MspI" and "PstI-TaqI"; implicit
    # string concatenation produced one bogus choice "PstI-MspIPstI-TaqI" and
    # silently dropped both real enzymes from --enzyme choices.
    valid_enzymes = "ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|" \
                    "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|" \
                    "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI|" \
                    "PstI-TaqI|SalI-MspI|SbfI-MspI".split("|")
    p = OptionParser(prepare.__doc__)
    p.add_option("--enzyme", default="ApeKI", choices=valid_enzymes,
                 help="Restriction enzyme used [default: %default]")
    p.set_home("tassel")
    p.set_aligner(aligner="bwa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    barcode, reference = args
    thome = opts.tassel_home
    reference = get_abs_path(reference)

    # Directory layout expected by the TASSEL GBS plugins
    folders = ("fastq", "tagCounts", "mergedTagCounts", "topm",
               "tbt", "mergedTBT", "hapmap", "hapmap/raw",
               "hapmap/mergedSNPs", "hapmap/filt", "hapmap/bpec")
    for f in folders:
        mkdir(f)

    # Build the pipeline, one shell command per stage
    runsh = []
    o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme)
    cmd = run_pipeline(thome, "FastqToTagCountPlugin", o)
    runsh.append(cmd)

    o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt"
    o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq"
    cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o)
    runsh.append(cmd)
    runsh.append("cd mergedTagCounts")

    # Align the master tags against the reference with the chosen aligner
    cmd = "python -m jcvi.apps.{0} align --cpus {1}".\
                format(opts.aligner, opts.cpus)
    cmd += " {0} myMasterTags.cnt.fq".format(reference)
    runsh.append(cmd)
    runsh.append("cd ..")

    o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm"
    cmd = run_pipeline(thome, "SAMConverterPlugin", o)
    runsh.append(cmd)

    o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm"
    o += " -mUpd topm/myMasterTagsWithVariants.topm"
    o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000"
    o += " -ref {0} -sC 1 -eC 10".format(reference)
    cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt"
    o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10"
    cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o)
    runsh.append(cmd)

    o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt"
    o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt"
    o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10"
    #o += "-hLD -mnR2 0.2 -mnBonP 0.005"
    cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o)
    runsh.append(cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(runsh), meta="run script")
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM
    files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before
    assembling
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    assert op.exists(inparam)

    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    # Work inside <prefix>_<DN|GG>/; inputs are referenced as ../<inparam>
    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = iglob("../" + inparam, opts.names)
    if paired:
        # Split into mate-1 / mate-2 lists based on filename patterns
        f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
        f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
        assert len(f1) == len(f2)
        if merge:
            r1, r2 = "left.fastq", "right.fastq"
            reads = ((f1, r1), (f2, r2))
    else:
        if merge:
            r = "single.fastq"
            reads = ((flist, r), )

    # Optionally concatenate inputs into left/right/single file(s)
    if merge:
        for fl, r in reads:
            fm = FileMerger(fl, r)
            fm.merge(checkexists=True)

    cmd = op.join(thome, "Trinity")
    cmd += " --seqType fq --max_memory {0} --CPU {1}".format(
            opts.max_memory, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)
    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    # Genome-guided mode extras
    if method == "GG":
        cmd += " --genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron)
        if use_bam:
            cmd += " --genome_guided_use_bam {0}".format(use_bam)
        if gg_cpu:
            cmd += " --genome_guided_CPU {0}".format(gg_cpu)
    if opts.grid and opts.grid_conf_file:
        cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

    # Wire the read files into the command line
    if paired:
        if merge:
            cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
        else:
            for lf, rf in zip(f1, f2):
                cmd += " --left {0}".format(lf)
                cmd += " --right {0}".format(rf)
    else:
        if merge:
            cmd += " --single {0}".format(reads[0][-1])
        else:
            for f in flist:
                cmd += " --single {0}".format(f)
    if opts.extra:
        cmd += " {0}".format(opts.extra)

    cmd += " --bypass_java_version_check"

    runfile = "run.sh"
    write_file(runfile, cmd)
    os.chdir(cwd)
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM
    files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before
    assembling
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]

    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    # Work inside <prefix>_<DN|GG>/; inputs are referenced as ../<inparam>
    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = iglob("../" + inparam, "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz")
    if paired:
        # Split into mate-1 / mate-2 lists based on filename patterns
        f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
        f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
        assert len(f1) == len(f2)
        if merge:
            r1, r2 = "left.fastq", "right.fastq"
            reads = ((f1, r1), (f2, r2))
    else:
        if merge:
            r = "single.fastq"
            reads = ((flist, r), )

    # Optionally concatenate inputs into left/right/single file(s)
    if merge:
        for fl, r in reads:
            fm = FileMerger(fl, r)
            fm.merge(checkexists=True)

    cmd = op.join(thome, "Trinity")
    # Older Trinity releases use --JM (Jellyfish memory) rather than --max_memory
    cmd += " --seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)
    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    # Genome-guided mode extras
    if method == "GG":
        cmd += " --genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron)
        if use_bam:
            cmd += " --genome_guided_use_bam {0}".format(use_bam)
        if gg_cpu:
            cmd += " --genome_guided_CPU {0}".format(gg_cpu)
    if opts.grid and opts.grid_conf_file:
        cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

    # Wire the read files into the command line
    if paired:
        if merge:
            cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
        else:
            for lf, rf in zip(f1, f2):
                cmd += " --left {0}".format(lf)
                cmd += " --right {0}".format(rf)
    else:
        if merge:
            cmd += " --single {0}".format(reads[0][-1])
        else:
            for f in flist:
                cmd += " --single {0}".format(f)
    if opts.extra:
        cmd += " {0}".format(opts.extra)

    runfile = "run.sh"
    write_file(runfile, cmd)
    os.chdir(cwd)
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files, SNP_Het file. Speedup for fragmented genomes
    are also supported.
    """
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    # Highly fragmented references (>= 1000 seqs) use the supercat speedup
    supercat = nseqs >= 1000
    if supercat:
        logging.debug("Total seqs in ref: {0} (supercat={1})".\
                      format(nseqs, supercat))

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    # `supercatfile` is only defined (and only read) when supercat is True
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".\
                    format(gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        # Convert supercat coordinates back to the original pseudo-genome,
        # then run the parallel speedup script for SNP discovery
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x)) for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            flist = [x for x in allnatives if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir, "{0}.SNPs_Het.allele_counts".format(s))
        cmd = "count_reads_per_allele.pl -m snps.matrix.txt"
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --h**o 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project code = initials of the organism name, e.g. "B. oleracea" => "BO"
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With no explicit file arguments, pick up every fastq in the cwd
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # All inputs must share one quality offset; phred64 is passed into the
    # generated run script
    offset = guessoffset([fnames[0]])
    phred64 = offset == 64
    assert all(guessoffset([x]) == offset for x in fnames[1:])

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Collapse paired files into one wildcard entry: keep the `.1.`
        # file (renamed to `.?.`) and drop the matching `.2.` file
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size or ""
        stddev = L.stddev or ""
        lib_type = L.type  # renamed local: avoid shadowing the builtin `type`
        paired = L.paired
        read_orientation = L.read_orientation

        # Fragment libraries fill the frag_* columns; all other library
        # types fill the insert_* columns instead
        frag_size = size if lib_type == "fragment" else ""
        frag_stddev = stddev if lib_type == "fragment" else ""
        insert_size = size if lib_type != "fragment" else ""
        insert_stddev = stddev if lib_type != "fragment" else ""
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            lib_type, paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    # Fixed log message: the file written above is `in_groups.csv`, but the
    # original message claimed `in_group.csv`
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))
    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"
    extra = ""
    if opts.corr:
        # Extra ALLPATHS parameters for error-corrected input reads
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"
    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra)
        write_file(runfile, contents)
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that PASA alignment assembly has already been completed and
    run directory contains `genome.fasta` and `transcript.fasta` files.

    If `--annots_gff3` is specified, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    # PASA must be installed; bail out early if its home directory is absent
    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    # All subsequent paths (config, fasta, run script) are relative to the
    # PASA database directory
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    # Write the annotCompare config from the comparison thresholds passed on
    # the command line; the database name is "<pasa_db>_pasa"
    # NOTE(review): `acconf`, `gfasta`, `tfasta` and `annotation` appear to be
    # module-level path constants defined elsewhere in this file
    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    # Prefer the seqclean-ed transcripts when available
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    # -A selects PASA's annotation-comparison mode
    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, gfasta, transcripts, opts.genetic_code)

    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        # Symlink the gff3 into the run directory and ask PASA to (re)load it
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        # Record the command in run.sh instead of executing it
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2 [default: %default]",
    )
    p.add_option("--scaffold", help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose", help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"
    fw = open(cfgfile, "w")
    fw_gc = open(gc_cfgfile, "w")

    libs = get_libs(fnames)
    rank = 0
    max_rd_len = max(readlen([f]) for f in fnames)

    # Global header, echoed to stderr and both config files
    block = "max_rd_len={0}\n".format(max_rd_len)
    for stream in (sys.stderr, fw, fw_gc):
        # print() with file= replaces the legacy `print >> stream` form,
        # consistent with the rest of this module
        print(block, file=stream)

    # Collect singleton (size-0) libraries first; their reads are attached
    # to the first ranked library below
    singletons = []
    for lib, fs in libs:
        if lib.size == 0:
            singletons += fs

    for lib, fs in libs:
        size = lib.size
        if size == 0:
            continue  # singletons already collected above

        rank += 1
        block = "[LIB]\n"
        block += "avg_ins={0}\n".format(size)
        block += "reverse_seq={0}\n".format(lib.reverse_seq)
        # With --assemble_1st_rank_only, libs after rank 1 are used for
        # scaffolding only (asm_flags=2)
        asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
        block += "asm_flags={0}\n".format(asm_flags)
        block += "rank={0}\n".format(rank)
        if lib.reverse_seq:
            # Mate-pair (reverse-seq) libraries need a stricter pair cutoff
            pair_num_cutoff = 3
            block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
        block += "map_len=35\n"

        for f in fs:
            # Files are expected to be named *.1.* / *.2.* pairs; an
            # unmatched name would leave `tag` unset — TODO confirm the
            # naming is guaranteed by get_libs()
            if ".1." in f:
                tag = "q1"
            elif ".2." in f:
                tag = "q2"
            block += "{0}={1}\n".format(tag, f)

        if rank == 1:
            for s in singletons:
                block += "q={0}\n".format(s)

        print(block, file=sys.stderr)
        print(block, file=fw)
        if asm_flags > 2:
            print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    header = SOAPHEADER.format(opts.cpus, opts.K)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template, meta="run script")
    fw.close()
    fw_gc.close()