def spades(args):
    """
    %prog spades folder

    Run automated SPADES.
    """
    # Prints one `spades.py` command line per project found in `folder`;
    # nothing is executed here.
    from jcvi.formats.fastq import readlen

    p = OptionParser(spades.__doc__)
    opts, args = p.parse_args(args)

    # Exactly one folder is expected; any other count shows the usage text
    # instead of failing on the tuple unpacking below.
    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder,) = args
    # The loop variable is `pair`, not `p`, so the parser is not shadowed.
    for pair, pf in iter_project(folder, 2):
        rl = readlen([pair[0], "--silent"])

        # <http://spades.bioinf.spbau.ru/release3.1.0/manual.html#sec3.4>
        # The larger threshold must be tested first: with `>= 150` checked
        # first, the `>= 250` branch could never be reached.
        kmers = None
        if rl >= 250:
            kmers = "21,33,55,77,99,127"
        elif rl >= 150:
            kmers = "21,33,55,77"

        cmd = "spades.py"
        if kmers:
            cmd += " -k {0}".format(kmers)
        cmd += " --careful"
        cmd += " --pe1-1 {0} --pe1-2 {1}".format(*pair)
        cmd += " -o {0}_spades".format(pf)
        # Single-argument call form prints the same text on Python 2 and 3.
        print(cmd)
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuation file. Genome size should be entered in Mb.
    """
    # Writes `meraculous.config` (library table plus basic parameters) and a
    # one-line `run.sh` that points the Meraculous runner at that config.
    parser = OptionParser(prepare.__doc__ + FastqNamings)
    parser.add_option("-K", default=51, type="int", help="K-mer size")
    parser.set_cpus(cpus=32)
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    # First positional is the genome size (Mb per the usage text); the value
    # written to the config is that number divided by 1000.
    genomesize = float(args[0]) / 1000
    fastqs = args[1:]
    for fastq in fastqs:
        assert op.exists(fastq), "File `{0}` not found.".format(fastq)

    # The config text is collected piecewise and joined once at the end.
    sections = [
        comment_banner("Meraculous params file") + "\n",
        comment_banner("Basic parameters") + "\n",
        "# Describe the libraries ( one line per library )\n",
        "# " + " ".join(header.split()) + "\n",
    ]

    lib_seqs = []
    rank = 0
    for lib, fs in get_libs(fastqs):
        # Libraries with size 0 are left out of the library table.
        if lib.size == 0:
            continue
        rank += 1
        libname = lib.library_name
        longest = max(readlen([fq]) for fq in fs)
        lib_seqs.append(
            lib.get_lib_seq(
                "{0}*.1.*,{0}*.2.*".format(libname),
                libname.replace("-", ""),
                longest,
                rank,
            )
        )

    sections.append("\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n")

    params = [
        ("genome_size", genomesize),
        ("is_diploid", 0),
        ("mer_size", opts.K),
        ("num_prefix_blocks", 1),
        ("no_read_validation", 0),
        ("local_num_procs", opts.cpus),
    ]
    sections.append("\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n")

    cfgfile = "meraculous.config"
    write_file(cfgfile, "".join(sections), tee=True)

    launcher = "~/export/meraculous/bin/run_meraculous.sh -c {0}".format(cfgfile)
    write_file("run.sh", launcher)
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # Outputs: `soap.config` (all sized libraries), `soap.gc.config` (only
    # libraries whose asm_flags > 2) and the `run.sh` driver script. Every
    # config block is also echoed to stderr.
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2 [default: %default]",
    )
    p.add_option("--scaffold", help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose", help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"

    libs = get_libs(fnames)
    max_rd_len = max(readlen([f]) for f in fnames)

    # Collect singletons first: files of size-0 libraries are attached to the
    # rank-1 library block below.
    singletons = []
    for lib, fs in libs:
        if lib.size == 0:
            singletons += fs

    # `with` guarantees both config files are closed even if a library block
    # fails part-way. `stream.write(text + "\n")` emits exactly what the old
    # `print >> stream, text` did, and also works on Python 3.
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        block = "max_rd_len={0}\n".format(max_rd_len)
        for stream in (sys.stderr, fw, fw_gc):
            stream.write(block + "\n")

        rank = 0
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only, every library after the first is
            # forced to asm_flags=2.
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                else:
                    # No mate marker in the name. Previously `tag` was either
                    # unbound (NameError) or silently reused from the previous
                    # file; list such a file as single-end reads, the same tag
                    # this function uses for singletons.
                    tag = "q"
                block += "{0}={1}\n".format(tag, f)

            if rank == 1:
                for s in singletons:
                    block += "q={0}\n".format(s)

            sys.stderr.write(block + "\n")
            fw.write(block + "\n")

            if asm_flags > 2:
                fw_gc.write(block + "\n")

    runfile = "run.sh"
    scaffold = opts.scaffold
    header = SOAPHEADER.format(opts.cpus, opts.K)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template, meta="run script")
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    # Outputs: `soap.config` (all sized libraries), `soap.gc.config` (only
    # libraries whose asm_flags > 2) and the `run.sh` driver script. Every
    # config block is also echoed to stderr.
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size [default: %default]")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2 [default: %default]",
    )
    p.add_option("--scaffold", help="Only perform scaffolding [default: %default]")
    p.add_option("--gapclose", help="Only perform gap closure [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    K = opts.K
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"

    libs = get_libs(fnames)
    max_rd_len = max(readlen([f]) for f in fnames)

    # Collect singletons first: files of size-0 libraries are attached to the
    # rank-1 library block below.
    singletons = []
    for lib, fs in libs:
        if lib.size == 0:
            singletons += fs

    # `with` guarantees both config files are closed even if a library block
    # fails part-way through.
    with open(cfgfile, "w") as fw, open(gc_cfgfile, "w") as fw_gc:
        block = "max_rd_len={0}\n".format(max_rd_len)
        for stream in (sys.stderr, fw, fw_gc):
            print(block, file=stream)

        rank = 0
        for lib, fs in libs:
            size = lib.size
            if size == 0:
                continue

            rank += 1
            block = "[LIB]\n"
            block += "avg_ins={0}\n".format(size)
            block += "reverse_seq={0}\n".format(lib.reverse_seq)
            # With --assemble_1st_rank_only, every library after the first is
            # forced to asm_flags=2.
            asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
            block += "asm_flags={0}\n".format(asm_flags)
            block += "rank={0}\n".format(rank)
            if lib.reverse_seq:
                pair_num_cutoff = 3
                block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
            block += "map_len=35\n"

            for f in fs:
                if ".1." in f:
                    tag = "q1"
                elif ".2." in f:
                    tag = "q2"
                else:
                    # No mate marker in the name. Previously `tag` was either
                    # unbound (NameError) or silently reused from the previous
                    # file; tag it the same way singletons are tagged below.
                    tag = "q" if is_fastq(f) else "f"
                block += "{0}={1}\n".format(tag, f)

            if rank == 1:
                for s in singletons:
                    tag = "q" if is_fastq(s) else "f"
                    block += tag + "={0}\n".format(s)

            print(block, file=sys.stderr)
            print(block, file=fw)

            if asm_flags > 2:
                print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    # Binary name is chosen from K: the 63mer build for K <= 63, otherwise
    # the 127mer build.
    bb = 63 if K <= 63 else 127
    binary = "SOAPdenovo-{0}mer".format(bb)
    header = SOAPHEADER.format(opts.cpus, K, binary)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template)
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # Outputs: `in_groups*.csv` (one per quality offset plus a combined one),
    # `in_libs.csv`, and - unless --norun - a `run.sh` driver script.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset, readlen

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option(
        "--corr",
        default=False,
        action="store_true",
        help="Extra parameters for corrected data",
    )
    p.add_option(
        "--norun",
        default=False,
        action="store_true",
        help="Don't write `run.sh` script",
    )
    p.add_option("--ploidy", default="2", choices=("1", "2"), help="Ploidy")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name is the upper-cased initials of the organism name.
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With only the organism given, fall back to every *.fastq* in the cwd.
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    if not fnames:
        # Without this guard a header-only `in_libs.csv` was written and the
        # run then died in max() with "arg is an empty sequence".
        logging.error("No `*.fastq*` files found; nothing to prepare.")
        sys.exit(1)
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = (
        "library_name project_name organism_name type paired "
        "frag_size frag_stddev insert_size insert_stddev read_orientation "
        "genomic_start genomic_end".split()
    )
    groups_33 = []
    groups_64 = []
    libs = []
    for file_name in fnames:
        offset = guessoffset([file_name])
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # The mate is already covered by the wildcard of its ".1." file.
            continue

        groupscontents = groups_64 if offset == 64 else groups_33
        groupscontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size or ""
        stddev = L.stddev or ""
        # Named `libtype` so the builtin `type` is not shadowed.
        libtype = L.type
        paired = L.paired
        read_orientation = L.read_orientation

        # Size/stddev go in the frag_* columns for "fragment" libraries and
        # in the insert_* columns for everything else.
        is_fragment = libtype == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append(
            (
                library_name,
                project_name,
                organism_name,
                libtype,
                paired,
                frag_size,
                frag_stddev,
                insert_size,
                insert_stddev,
                read_orientation,
                genomic_start,
                genomic_end,
            )
        )

    for groups, csvfile in (
        (groups_33, "in_groups_33.csv"),
        (groups_64, "in_groups_64.csv"),
        (groups_33 + groups_64, "in_groups.csv"),
    ):
        if not groups:
            continue
        write_csv(groupheader, groups, filename=csvfile, tee=True)
        logging.debug(
            "`{0}` created (# of groups = {1}).".format(csvfile, len(groups))
        )

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug(
        "`in_libs.csv` created (# of libs = {0}).".format(len(libcontents))
    )

    runfile = "run.sh"

    # ALLPATHS stalls on reads over 250bp <https://www.biostars.org/p/122091/>
    max_rd_len = max(readlen([f]) for f in fnames)
    extra = "CLOSE_UNIPATH_GAPS=False " if max_rd_len > 200 else ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, extra)
        write_file(runfile, contents)
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # Pipeline: align reads to the templates, keep the first `nreads` mapped
    # reads, assemble them with `runAssembly`, BLAST the contigs back to the
    # templates, and write the matched contigs (renamed and ordered like the
    # templates) to `<pf>.<bespf>.fasta`. Returns that file name.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Total template size plus two read lengths (presumably one flank per
    # side of the template - see the usage text above).
    expected_size = size + 2 * rl
    # Number of reads needed to cover `expected_size` at `opts.depth`,
    # rounded up to the next multiple of 1000.
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])

    # `samfile` is rebound here; `logfile` from align() is kept only so it
    # can be handed to FileShredder at the end.
    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    # Prefix: the mapped file name up to its first "." and then up to its
    # first "-". It also serves as the runAssembly output folder below.
    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    # From here on `reads` names the subset file, not the original input.
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    # Stays "" when the conversion below is skipped; it is still passed to
    # FileShredder in that case.
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    # The assembly is expected to leave its contigs at this path.
    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    # Best hit per query; the keys are looked up by contig id below.
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    # `key` is unused; only the record `v` is needed.
    for key, v in f.iteritems_ordered():
        vid = v.id
        # Contigs without a best hit are dropped from the output.
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        # Flip the contig when its hit is on the minus strand.
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        # Sort key: position of the matched template in `bes`, then the id.
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    # Hand all intermediates (including the assembly folder `pf`) to
    # FileShredder - presumably for deletion; confirm against its definition.
    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                    format(len(recs), annotatedfasta))

    return annotatedfasta
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # Outputs: `in_groups*.csv` (one per quality offset plus a combined one),
    # `in_libs.csv`, and - unless --norun - a `run.sh` driver script.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset, readlen

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name is the upper-cased initials of the organism name.
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With only the organism given, fall back to every *.fastq* in the cwd.
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    if not fnames:
        # Without this guard a header-only `in_libs.csv` was written and the
        # run then died in max() with "arg is an empty sequence".
        logging.error("No `*.fastq*` files found; nothing to prepare.")
        sys.exit(1)
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groups_33 = []
    groups_64 = []
    libs = []
    for file_name in fnames:
        offset = guessoffset([file_name])
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            # The mate is already covered by the wildcard of its ".1." file.
            continue

        groupscontents = groups_64 if offset == 64 else groups_33
        groupscontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size or ""
        stddev = L.stddev or ""
        # Named `libtype` so the builtin `type` is not shadowed.
        libtype = L.type
        paired = L.paired
        read_orientation = L.read_orientation

        # Size/stddev go in the frag_* columns for "fragment" libraries and
        # in the insert_* columns for everything else.
        is_fragment = libtype == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            libtype, paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, read_orientation,
                            genomic_start, genomic_end))

    for groups, csvfile in ((groups_33, "in_groups_33.csv"),
                            (groups_64, "in_groups_64.csv"),
                            (groups_33 + groups_64, "in_groups.csv")):
        if not groups:
            continue
        write_csv(groupheader, groups, filename=csvfile, tee=True)
        logging.debug("`{0}` created (# of groups = {1}).".
                      format(csvfile, len(groups)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"

    # ALLPATHS stalls on reads over 250bp <https://www.biostars.org/p/122091/>
    max_rd_len = max(readlen([f]) for f in fnames)
    extra = "CLOSE_UNIPATH_GAPS=False " if max_rd_len > 200 else ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, extra)
        write_file(runfile, contents)