def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(merge.__doc__)
    p.set_outdir(outdir="outdir")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # Sample id is whatever precedes the first dot of the file name.
        return op.basename(path).split(".")[0]

    # groupby() only groups adjacent items, hence the sort on the same key.
    fastqs = sorted(
        flatten(glob("{0}/*.*.fastq".format(folder)) for folder in folders),
        key=sample_of,
    )

    for sample, members in groupby(fastqs, key=sample_of):
        merged = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(list(members), outfile=merged).merge(checkexists=True)
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap minimap2 aligner using query against sequences. When query and ref
    is the same, we are in "self-scan" mode (e.g. useful for finding internal
    duplications resulted from mis-assemblies).
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir

    # Only the "self-scan" mode (ref == query) is implemented.
    if ref != query:
        raise NotImplementedError

    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))
    f = Fasta(ref)
    mkdir(outdir)

    mm = MakeManager()
    for name, size in f.itersizes():
        # NOTE(review): only full windows ending strictly before `size` are
        # emitted, so the trailing partial chunk (and any sequence no longer
        # than `chunks`) is never scanned - confirm this is intended.
        for end in range(chunks, size, chunks):
            first = end - chunks + 1  # 1-based start of this window
            fafile = op.join(outdir, "{}_{}_{}.fa".format(name, first, end))
            stem = fafile.rsplit(".", 1)[0]
            paffile = stem + ".paf"
            epsfile = stem + ".eps"

            # extract window -> self-align -> dot plot
            mm.add(
                ref,
                fafile,
                "samtools faidx {} {}:{}-{} -o {}".format(ref, name, first, end, fafile),
            )
            mm.add(
                fafile,
                paffile,
                "minimap2 -P {} {} > {}".format(fafile, fafile, paffile),
            )
            mm.add(paffile, epsfile, "minidot {} > {}".format(paffile, epsfile))

    mm.write()
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # NOTE(review): `cluster` is defined more than once in this file; the
    # definition that appears last is the one bound at import time.
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix, fastqfiles = args[0], args[1:]
    cpus, pctid = opts.cpus, opts.pctid
    mindepth, minlength = opts.mindepth, opts.minlength

    # Pool all reads into <outdir>/<prefix>.fasta via seqtk.
    fasta_args = fastqfiles + [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ]
    fastafile, qualfile = fasta(fasta_args)

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)  # outputs that depend on pctid

    # Each step below is skipped when its output is already up to date.
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile, notmatchedfile = pf + ".u", pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile, minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile, userfile, notmatchedfile, clustfile, mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # Returns a (fastafile, qualfile) tuple:
    #   * (first input, None) when the first input does not look like FASTQ;
    #   * (outfile, None) when --seqtk is used;
    #   * (<stem>.fasta, <stem>.qual) for the Biopython conversion.
    p = OptionParser(fasta.__doc__)
    p.add_option("--seqtk", default=False, action="store_true", help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    # Output names are derived from the first input file only.
    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    if pf.endswith(".gz"):
        pf = pf.rsplit(".", 1)[0]

    # rpartition() copes with names that have no extension at all (the
    # previous two-value unpack of rsplit() raised ValueError for those);
    # such names simply fall through to the "assumed FASTA" branch.
    stem, _, sf = pf.rpartition(".")
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None
    pf = stem

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)
    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    # Open each output once and stream every input into it. Passing the file
    # *names* to SeqIO.convert() inside the loop re-created the outputs on
    # every iteration, so only the last fastq file survived.
    # NOTE(review): this path still ignores --outdir/--outfile and writes next
    # to the current directory, as before - confirm whether that is intended.
    with open(fastafile, "w") as fasta_out, open(qualfile, "w") as qual_out:
        for fastqfile in fastqfiles:
            SeqIO.convert(fastqfile, "fastq", fasta_out, "fasta")
            SeqIO.convert(fastqfile, "fastq", qual_out, "qual")

    return fastafile, qualfile
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # NOTE(review): `cluster` is defined more than once in this file; the
    # definition that appears last is the one bound at import time.
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength

    # Step 0: merge the reads into one FASTA file inside the output folder.
    seqtk_opts = [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ]
    fastafile, qualfile = fasta(fastqfiles + seqtk_opts)

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)

    # File names of every intermediate product, in pipeline order.
    derepfile = prefix + ".derep"
    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    clustfile = pf + ".clust"
    clustSfile = pf + ".clustS"
    statsfile = pf + ".stats"

    # Step 1: dereplicate.
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    # Step 2: cluster at the requested identity.
    if need_update(derepfile, userfile):
        cluster_smallmem(
            derepfile, userfile, notmatchedfile, minlength, pctid, cpus
        )

    # Step 3: assemble the cluster file.
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(
            derepfile, userfile, notmatchedfile, clustfile, mindepth=mindepth
        )

    # Step 4: align every cluster.
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    # Step 5: summary statistics.
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # NOTE(review): `fromsra` is defined more than once in this file; the
    # definition that appears last is the one bound at import time.
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files",
    )
    p.set_outdir()
    p.set_grid()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile,) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        # Exit with a failure status; a bare sys.exit() reported success (0)
        # to the calling shell even though nothing was converted.
        sys.exit(1)

    # Assemble the fastq-dump command line piece by piece.
    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # NOTE(review): `fromsra` is defined more than once in this file; the
    # definition that appears last is the one bound at import time.
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end " + "[default: %default]",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files [default: %default]",
    )
    p.set_outdir()
    p.set_grid()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile,) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        # Exit with a failure status; a bare sys.exit() reported success (0)
        # to the calling shell even though nothing was converted.
        sys.exit(1)

    # Assemble the fastq-dump command line piece by piece.
    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-3")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4 1 1608 1 W ca-bacs.5638.frag11.22000-23608 1 1608 -
    scaffold4 1609 1771 2 N 163 scaffold yes paired-ends
    scaffold4 1772 3771 3 W ca-bacs.5638.frag10.20000-22000 1 2000 -

    These are most likely shreds, which we look for based on names.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # Returns the name of the FASTA file built from the annealed AGP.
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length [default: %default]")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    # NOTE(review): the per-contig split is assumed to run only when the
    # folder is first created - confirm against the original layout.
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    newagp = deepcopy(agp)
    clrstore = {}  # component id -> CLR (clear range), adjusted by anneal()
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        # Prefer the precomputed hit; otherwise align this pair on demand.
        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                    cutoff, qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".
                  format(len(clrstore)))

    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        # The old `print >> sys.stderr, ...` is a Python 2 statement; when
        # print is a function it evaluates `print >> sys.stderr` and raises
        # TypeError. Writing to the stream works on either interpreter.
        sys.stderr.write("Remove {0}\n".format(c))
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that has modified clr
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    from jcvi.formats.fastq import guessoffset

    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # Returns the (gsnapfile, logfile) pair of output paths.
    p = OptionParser(align.__doc__)
    p.add_option("--rnaseq", default=False, action="store_true",
                 help="Input is RNA-seq reads, turn splicing on")
    p.add_option("--native", default=False, action="store_true",
                 help="Convert GSNAP output to NATIVE format")
    p.set_home("eddyyeh")
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not p.print_help())
    logging.debug("Single-end alignment" if nargs == 2 else "Paired-end alignment")

    dbfile, readfile = args[:2]
    outdir = opts.outdir
    assert op.exists(dbfile) and op.exists(readfile)

    prefix = get_prefix(readfile, dbfile)
    logfile = op.join(outdir, prefix + ".log")
    gsnapfile = op.join(outdir, prefix + ".gsnap")
    nativefile = gsnapfile.rsplit(".", 1)[0] + ".unique.native"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        pieces = [
            "gsnap -D {0} -d {1}".format(dbdir, dbname),
            "-B 5 -m 0.1 -i 2 -n 3",  # memory, mismatch, indel penalty, nhits
        ]
        if opts.rnaseq:
            pieces.append("-N 1")
        pieces.append("-t {0}".format(opts.cpus))
        pieces.append("--gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            pieces.append("--gunzip")

        # The quality flag is simply omitted when the offset guess asserts.
        try:
            offset = "sanger" if guessoffset([readfile]) == 33 else "illumina"
        except AssertionError:
            pass
        else:
            pieces.append("--quality-protocol {0}".format(offset))

        pieces.extend(args[1:])  # one or two read files
        sh(" ".join(pieces), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.native and need_update(gsnapfile, nativefile):
        EYHOME = opts.eddyyeh_home
        convert = op.join(EYHOME, "convert2native.pl")
        convert += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        convert += " -proc {0}".format(opts.cpus)
        sh(convert)

    return gsnapfile, logfile
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # NOTE(review): `entrez` is defined more than once in this file; the
    # definition that appears last is the one bound at import time.
    #
    # Returns the path of the (last) output file, or None when the user
    # declines to overwrite an existing single output file.
    p = OptionParser(entrez.__doc__)

    # Which databases can serve each download format.
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out", help="output file name prefix")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not` turns print_help()'s None into a non-zero exit status, like
        # the other commands in this file (it used to exit with 0).
        sys.exit(not p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        with open(filename) as fp:  # close the term list once it is read
            list_of_terms = [row.strip() for row in fp]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must >= 1"

    # A multi-word search term makes a poor file name; use --outprefix.
    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    fw = None
    if not outdir:
        # Single-file mode: every record goes into `outfile`.
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    try:
        for rec_id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
        ):
            # Check for duplicates *before* opening a per-term file, so that
            # a repeat cannot truncate a file that was already written. The
            # message reports the key itself, not the whole record body.
            if rec_id in seen:
                logging.error("Duplicate key ({0}) found".format(rec_id))
                continue

            if outdir:
                # One file per term. op.join is spelled out so the file lands
                # inside `outdir` whatever `urljoin` is bound to in this
                # module (urllib's urljoin would drop a base without "/").
                outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
                fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
                if fw is None:
                    continue

            try:
                print(handle.read(), file=fw)
                print(file=fw)
            finally:
                if outdir:
                    fw.close()  # per-term file is complete

            totalsize += size
            seen.add(rec_id)
    finally:
        # Flush and close the single output before its path is returned.
        if not outdir:
            fw.close()

    if seen:
        printf(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
        )

    return outfile
def last(args, dbtype=None):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # `dbtype`, when given by a programmatic caller, overrides --dbtype.
    # Returns the path of the LAST output file.
    p = OptionParser(last.__doc__)
    p.add_option(
        "--dbtype",
        default="nucl",
        choices=("nucl", "prot"),
        help="Molecule type of subject database",
    )
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask", default=False, action="store_true", help="Invoke -c in lastdb")
    p.add_option(
        "--format",
        default="BlastTab",
        choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
        help="Output format",
    )
    p.add_option(
        "--minlen",
        default=0,
        type="int",
        help="Filter alignments by how many bases match",
    )
    p.add_option("--minid", default=0, type="int", help="Minimum sequence identity")
    p.set_cpus()
    p.set_outdir()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    if not dbtype:
        dbtype = opts.dbtype

    def getpath(program):
        # Prefix the program name with --path when one was supplied.
        return op.join(path, program) if path else program

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    # Build the database next to the subject file.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(
        infile=subject,
        outfile=subjectdb + ".prj",
        mask=opts.mask,
        lastdb_bin=lastdb_bin,
        dbtype=dbtype,
    )

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    # Mismatch/gap cost relative to a match score of 1. Floor division keeps
    # it an integer on Python 3 as well; true division produced values such
    # as 19.0, which then ended up on the command line as "-q19.0".
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
    sh(cmd, outfile=lastfile)
    return lastfile
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # NOTE(review): `entrez` is defined more than once in this file; the
    # definition that appears last is the one bound at import time.
    #
    # Returns the path of the (last) output file, or None when the user
    # declines to overwrite an existing single output file.
    p = OptionParser(entrez.__doc__)

    # Which databases can serve each download format.
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option("--noversion", dest="noversion",
                 default=False, action="store_true",
                 help="Remove trailing accession versions")
    p.add_option("--format", default="fasta", choices=valid_formats,
                 help="download format [default: %default]")
    p.add_option("--database", default="nuccore", choices=valid_databases,
                 help="search database [default: %default]")
    p.add_option("--retmax", default=1000000, type="int",
                 help="how many results to return [default: %default]")
    p.add_option("--skipcheck", default=False, action="store_true",
                 help="turn off prompt to check file existence [default: %default]")
    p.add_option("--batchsize", default=500, type="int",
                 help="download the results in batch for speed-up [default: %default]")
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not` turns print_help()'s None into a non-zero exit status, like
        # the other commands in this file (it used to exit with 0).
        sys.exit(not p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        with open(filename) as fp:  # close the term list once it is read
            list_of_terms = [row.strip() for row in fp]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must >= 1"

    # A multi-word search term makes a poor file name; use --outprefix.
    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    fw = None
    if not outdir:
        # Single-file mode: every record goes into `outfile`.
        fw = must_open(outfile, "w", checkexists=True,
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    try:
        for rec_id, size, term, handle in batch_entrez(list_of_terms,
                                                       retmax=opts.retmax,
                                                       rettype=fmt,
                                                       db=database,
                                                       batchsize=batchsize,
                                                       email=opts.email):
            # Check for duplicates *before* opening a per-term file, so that
            # a repeat cannot truncate a file that was already written. The
            # message reports the key itself, not the whole record body.
            if rec_id in seen:
                logging.error("Duplicate key ({0}) found".format(rec_id))
                continue

            if outdir:
                # One file per term. op.join is spelled out so the file lands
                # inside `outdir` whatever `urljoin` is bound to in this
                # module (urllib's urljoin would drop a base without "/").
                outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
                fw = must_open(outfile, "w", checkexists=True,
                               skipcheck=opts.skipcheck)
                if fw is None:
                    continue

            try:
                print(handle.read(), file=fw)
                print(file=fw)
            finally:
                if outdir:
                    fw.close()  # per-term file is complete

            totalsize += size
            seen.add(rec_id)
    finally:
        # Flush and close the single output before its path is returned.
        if not outdir:
            fw.close()

    if seen:
        print("A total of {0} {1} records downloaded.".
              format(totalsize, fmt.upper()), file=sys.stderr)

    return outfile
def split(args):
    """
    %prog split barcodefile fastqfile1 ..

    Deconvolute fastq files into subsets of fastq reads, based on the barcodes
    in the barcodefile, which is a two-column file like:
    ID01 AGTCCAG

    Input fastqfiles can be several files. Output files are ID01.fastq,
    ID02.fastq, one file per line in barcodefile.

    When --paired is set, the number of input fastqfiles must be two. Output
    file (the deconvoluted reads) will be in interleaved format.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    p = OptionParser(split.__doc__)
    p.set_outdir(outdir="deconv")
    p.add_option("--nocheckprefix", default=False, action="store_true",
                 help="Don't check shared prefix [default: %default]")
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end data [default: %default]")
    p.add_option("--append", default=False, action="store_true",
                 help="Append barcode to 2nd read [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    barcodefile = args[0]
    fastqfile = args[1:]
    paired = opts.paired
    append = opts.append
    if append:
        assert paired, "--append only works with --paired"

    nfiles = len(fastqfile)

    # One BarcodeLine per concrete sequence; ambiguous codes are expanded.
    barcodes = []
    with open(barcodefile) as fp:  # closed as soon as parsing is done
        for row in fp:
            if not row.strip():
                # Tolerate blank (e.g. trailing) lines rather than failing
                # on the two-value unpack below.
                continue
            bc_id, seq = row.split()
            for s in unpack_ambiguous(seq):
                barcodes.append(BarcodeLine._make((bc_id, s)))

    nbc = len(barcodes)
    logging.debug("Imported {0} barcodes (ambiguous codes expanded).".format(nbc))
    checkprefix = not opts.nocheckprefix

    if checkprefix:
        # Sanity check of shared prefix: for every barcode, collect the
        # longer barcodes of other ids that start with it.
        excludebarcodes = []
        for bc in barcodes:
            exclude = []
            for s in barcodes:
                if bc.id == s.id:
                    continue

                assert bc.seq != s.seq
                if s.seq.startswith(bc.seq) and len(s.seq) > len(bc.seq):
                    logging.error("{0} shares same prefix as {1}.".format(s, bc))
                    exclude.append(s)
            excludebarcodes.append(exclude)
    else:
        excludebarcodes = nbc * [[]]

    outdir = opts.outdir
    mkdir(outdir)

    cpus = opts.cpus
    logging.debug("Create a pool of {0} workers.".format(cpus))
    pool = Pool(cpus)
    try:
        if paired:
            assert nfiles == 2, "You asked for --paired, but sent in {0} files".\
                format(nfiles)
            split_fun = append_barcode_paired if append else split_barcode_paired
            mode = "paired"
        else:
            split_fun = split_barcode
            mode = "single"

        logging.debug("Mode: {0}".format(mode))

        # One task per barcode: (barcode, excluded barcodes, outdir, inputs).
        pool.map(split_fun,
                 zip(barcodes, excludebarcodes, nbc * [outdir], nbc * [fastqfile]))
    finally:
        # Release the worker processes even when a task (or the --paired
        # check above) fails; the pool used to be left open.
        pool.close()
        pool.join()
def prepare(args):
    """
    %prog prepare mcscanfile cdsfile [options]

    Pick sequences from cdsfile to form fasta files, according to multiple
    alignment in the mcscanfile.
    The fasta sequences can then be used to construct phylogenetic tree.

    Use --addtandem=tandemfile to collapse tandems of anchors into single row.
    The tandemfile must be provided with *ALL* genomes involved, otherwise
    result will be incomplete and redundant.
    """
    from jcvi.graphics.base import discrete_rainbow

    # NOTE: the docstring above is passed to OptionParser as the usage text.
    # Returns the folder that holds one <pivot>.cds file per usable row.
    p = OptionParser(prepare.__doc__)
    p.add_option("--addtandem", help="path to tandemfile [default: %default]")
    p.add_option("--writecolors", default=False, action="store_true",
                 help="generate a gene_name to color mapping file which will be taken "
                 "by jcvi.apps.phylo.draw [default: %default]")
    p.set_outdir(outdir="sequences")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mcscanfile, cdsfile = args

    if opts.addtandem:
        tandemfile = opts.addtandem
        mcscanfile_with_tandems = add_tandems(mcscanfile, tandemfile)
        mcscanfile = mcscanfile_with_tandems

    seqdir = opts.outdir
    mkdir(seqdir)
    f = Fasta(cdsfile)
    fp = must_open(mcscanfile)
    # The colour map is optional. The handle stays None without --writecolors
    # and every write below is guarded; previously `fc` was written to
    # unconditionally, raising NameError whenever the flag was absent.
    fc = must_open("leafcolors.txt", "w") if opts.writecolors else None

    n = 0  # number of rows (syntelog groups) actually written
    try:
        for i, row in enumerate(fp):
            row = row.strip().split("\t")
            if i == 0:
                # The first row fixes the column count and the palette.
                ncols = len(row)
                if ncols <= 20:
                    colors = discrete_rainbow(ncols, shuffle=False)[1]
                else:
                    colors = discrete_rainbow(ncols, usepreset=False, shuffle=False)[1]
                    warnings.warn("*** WARNING ***\n"
                                  "Too many columns. Colors may not be all distinctive.")

            assert len(row) == ncols, "All rows should have same number of fields."

            anchors = set()
            for j, atom in enumerate(row):
                color = "%s,%s,%s" % colors[j]
                if atom == ".":
                    continue
                # A cell holds one gene or a comma-separated tandem group.
                for gene in atom.split(","):
                    if fc is not None:
                        fc.write("{0}\t{1}\n".format(gene, color))
                    anchors.add(gene)

            if len(anchors) <= 3:
                print("Not enough seqs to build trees for {0}".format(anchors), file=sys.stderr)
                continue

            pivot = row[0]
            fw = must_open("%s/%s.cds" % (seqdir, pivot), "w")
            try:
                for a in anchors:
                    if a not in f:
                        # Fall back to the first isoform of a missing gene.
                        print(a)
                        a = find_first_isoform(a, f)
                        assert a, a
                    arec = f[a]
                    SeqIO.write(arec, fw, "fasta")
            finally:
                fw.close()
            n += 1
    finally:
        fp.close()
        if fc is not None:
            fc.close()

    if fc is not None:
        logging.debug("leaf colors written to `{0}`".format(fc.name))

    logging.debug("cds of {0} syntelog groups written to {1}/".format(n, seqdir))

    return seqdir
def draw(args):
    """
    %prog draw --input newicktrees [options]

    Draw phylogenetic trees into single or combined plots.
    Input trees should be one of the following:
    1.  single Newick format tree file
    2.  a dir containing *ONLY* the tree files to be drawn

    Newick format:
    http://evolution.genetics.washington.edu/phylip/newicktree.html

    This function wraps on jcvi.graphics.tree
    This function is better used for trees generated by jcvi.apps.phylo (rooted
    if possible). For drawing general Newick trees from external sources invoke
    jcvi.graphics.tree directly, which also gives more drawing options.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    trunc_name_options = ['headn', 'oheadn', 'tailn', 'otailn']
    p = OptionParser(draw.__doc__)
    p.add_option("--input", help="path to single input tree file or a dir "
                 "containing ONLY the input tree files")
    p.add_option("--combine", type="string", default="1x1",
                 help="combine multiple trees into one plot in nrowxncol")
    p.add_option("--trunc_name", default=None, help="Options are: {0}. "
                 "truncate first n chars, retains only first n chars, "
                 "truncate last n chars, retain only last chars. "
                 "n=1~99. [default: %default]".format(trunc_name_options))
    p.add_option("--SH", default=None,
                 help="path to a file containing SH test p-values in format:"
                 "tree_file_name<tab>p-values "
                 "This file can be generated with jcvi.apps.phylo build [default: %default]")
    p.add_option("--scutoff", default=50, type="int",
                 help="cutoff for displaying node support, 0-100 [default: %default]")
    p.add_option("--barcode", default=None,
                 help="path to seq/taxon name barcode mapping file: "
                 "barcode<tab>new_name "
                 "This option is downstream of `--trunc_name` [default: %default]")
    p.add_option("--leafcolorfile", default=None,
                 help="path to a mapping file containing font colors "
                 "for the OTUs: leafname<tab>color [default: %default]")
    p.set_outdir()
    # NOTE(review): `args` is not forwarded to the parser here, so it is not
    # evident from this block that the caller's arguments are the ones being
    # parsed - confirm against set_image_options().
    opts, args, iopts = p.set_image_options(figsize="8x6")
    tree_input = opts.input
    outdir = opts.outdir
    combine = opts.combine.split("x")
    trunc_name = opts.trunc_name
    SH = opts.SH

    mkdir(outdir)
    combined_file = None  # set only when this run concatenates a folder
    if not tree_input:
        sys.exit(not p.print_help())
    elif op.isfile(tree_input):
        trees_file = tree_input
        treenames = [op.basename(tree_input)]
    elif op.isdir(tree_input):
        # Concatenate every file of the folder into one temporary file; the
        # file names become the tree names, in sorted order.
        trees_file = combined_file = op.join(outdir, "alltrees.dnd")
        treenames = []
        for f in sorted(os.listdir(tree_input)):
            sh("cat {0}/{1} >> {2}".format(tree_input, f, trees_file), log=False)
            treenames.append(f)
    else:
        sys.exit(not p.print_help())

    trees = OrderedDict()
    tree = ""  # Newick text accumulated until the closing ";"
    i = 0  # index of the next tree name to assign
    for row in LineFile(trees_file, comment="#", load=True).lines:
        if i == len(treenames):
            break
        if not len(row):
            continue

        if ";" in row:
            # sanity check
            if row.index(";") != len(row) - 1:
                # Several statements on one line: split them and put the
                # terminator back on every piece but the last.
                ts = row.split(";")
                # range() is valid on Python 2 and 3; xrange() is undefined
                # on Python 3.
                for ii in range(len(ts) - 1):
                    ts[ii] += ";"
            else:
                ts = [row]
            for t in ts:
                if ";" in t:
                    tree += t
                    if tree:
                        trees[treenames[i]] = tree
                        tree = ""
                        i += 1
                else:
                    tree += t
        else:
            tree += row

    logging.debug("A total of {0} trees imported.".format(len(trees)))
    # Remove the temporary concatenation only if this run created it; the
    # unconditional `rm` also ran (and failed, or deleted an unrelated file)
    # when a single tree file was given.
    if combined_file:
        sh("rm {0}".format(combined_file))

    _draw_trees(trees, nrow=int(combine[0]), ncol=int(combine[1]), rmargin=.3,
                iopts=iopts, outdir=outdir, shfile=SH, trunc_name=trunc_name,
                scutoff=opts.scutoff, barcodefile=opts.barcode,
                leafcolorfile=opts.leafcolorfile)
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* maybe slow with large dataset

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail truncated names can be provided so long as it is unique among the seqs.
    If not uniq, the first occurrence will be used. For example, if you have
    two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are monophylic,
    in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    from jcvi.formats.fasta import translate

    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # Returns the path of the final tree file; the codon alignment when no
    # tree was requested; or None when the codon alignment (or every
    # requested tree build) failed.
    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH", help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt",
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree", help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: "
                 "gene_name_pattern<tab>species_name [default: %default]")
    p.set_outdir()

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    stree = smap = None
    if opts.treefix:
        stree = opts.stree
        smap = opts.smap
        assert stree and smap, "TreeFix requires stree and smap files."
        opts.ml = "raxml"  # TreeFix is run on the RAxML tree

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        # No peptides given: translate the CDS first.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    with open(protein_file) as handle:  # do not leak the input handle
        p_recs = list(SeqIO.parse(handle, "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    with open(dna_file) as handle:
        n_recs = list(SeqIO.parse(handle, "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. "
                      "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta
    else:
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")

        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))
    stem = op.basename(dna_file).rsplit(".", 1)[0]

    # Stay None until a tree is actually built. The old code swallowed a
    # failed build with a bare `except:` and then used the unbound names.
    outfile = phy_file = None

    if neighbor:
        out_file = op.join(treedir, stem + ".NJ.unrooted.dnd")
        nj_built = False
        try:
            outfile, phy_file = build_nj_phylip(
                alignment, outfile=out_file, outgroup=outgroup, work_dir=treedir)
            nj_built = True
        except Exception:
            # `except Exception` lets Ctrl-C and SystemExit propagate.
            print("NJ tree cannot be built for {0}".format(dna_file))
            logging.debug("NJ tree build failed", exc_info=True)

        if nj_built and opts.SH:
            reftree = opts.SH
            querytree = outfile
            SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, stem + ".ML.unrooted.dnd")
        ml_builders = {"phyml": build_ml_phyml, "raxml": build_ml_raxml}
        ml_built = False
        try:
            outfile, phy_file = ml_builders[opts.ml](
                alignment, outfile=out_file, work_dir=treedir)
            ml_built = True
        except Exception:
            print("ML tree cannot be built for {0}".format(dna_file))
            logging.debug("ML tree build failed", exc_info=True)

        # Post-processing only makes sense when the ML tree exists.
        if ml_built:
            if outgroup:
                new_out_file = out_file.replace(".unrooted", "")
                t = smart_reroot(treefile=out_file, outgroupfile=outgroup,
                                 outfile=new_out_file)
                if t == new_out_file:
                    sh("rm %s" % out_file)
                    outfile = new_out_file

            if opts.SH:
                reftree = opts.SH
                querytree = outfile
                SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

            if opts.treefix:
                treefix_dir = op.join(treedir, "treefix")
                # Call mkdir() outside the assert so that the folder is
                # still created when assertions are disabled (python -O).
                created = mkdir(treefix_dir, overwrite=True)
                assert created

                sh("cp {0} {1}/".format(outfile, treefix_dir))
                treefix_input = op.join(treefix_dir, op.basename(outfile))

                aln_file = treefix_input.rsplit(".", 1)[0] + ".fasta"
                SeqIO.write(alignment, aln_file, "fasta")

                outfile = run_treefix(input=treefix_input, stree_file=stree,
                                      smap_file=smap, a_ext=".fasta",
                                      o_ext=".dnd", n_ext=".treefix.dnd")

    return outfile