def soap_trios(p, pf, tag, extra):
    """
    Take one pair of reads and 'widow' reads after correction and run SOAP.
    """
    from jcvi.assembly.soap import prepare

    logging.debug("Work on {0} ({1})".format(pf, ",".join(p)))
    asm = "{0}.closed.scafSeq".format(pf)
    if not need_update(p, asm):
        logging.debug("Assembly found: {0}. Skipped.".format(asm))
        return

    slink(p, pf, tag, extra)

    cwd = os.getcwd()
    os.chdir(pf)
    prepare(
        sorted(glob("*.fastq") + glob("*.fastq.gz"))
        + ["--assemble_1st_rank_only", "-K 31"]
    )
    sh("./run.sh")
    sh("cp asm31.closed.scafSeq ../{0}".format(asm))
    logging.debug("Assembly finished: {0}".format(asm))
    os.chdir(cwd)
def correct_pairs(p, pf, tag):
    """
    Take one pair of reads and correct to generate *.corr.fastq.
    """
    from jcvi.assembly.preprocess import correct as cr

    logging.debug("Work on {0} ({1})".format(pf, ",".join(p)))
    itag = tag[0]
    cm = ".".join((pf, itag))
    targets = (cm + ".1.corr.fastq", cm + ".2.corr.fastq",
               pf + ".PE-0.corr.fastq")
    if not need_update(p, targets):
        logging.debug("Corrected reads found: {0}. Skipped.".format(targets))
        return

    slink(p, pf, tag)

    cwd = os.getcwd()
    os.chdir(pf)
    cr(sorted(glob("*.fastq") + glob("*.fastq.gz")) + ["--nofragsdedup"])
    sh("mv {0}.1.corr.fastq ../{1}".format(itag, targets[0]))
    sh("mv {0}.2.corr.fastq ../{1}".format(itag, targets[1]))
    sh("mv frag_reads_corr.corr.fastq ../{0}".format(targets[2]))
    logging.debug("Correction finished: {0}".format(targets))
    os.chdir(cwd)
def get_info():
    infofiles = glob("*.info")
    info = {}
    for row in must_open(infofiles):
        a = row.split()[0]
        info[a] = row.rstrip()
    return info
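get_info() keys each row of the *.info files by its first whitespace-delimited token. A minimal, self-contained sketch of that behavior, with hypothetical in-memory rows standing in for must_open(infofiles):

def demo_get_info():
    # Hypothetical rows; the real data comes from must_open(glob("*.info"))
    rows = [
        "sample1 path/to/sample1.fastq 2x100\n",
        "sample2 path/to/sample2.fastq 2x150\n",
    ]
    info = {}
    for row in rows:
        info[row.split()[0]] = row.rstrip()
    return info

# demo_get_info()["sample1"] == "sample1 path/to/sample1.fastq 2x100"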
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files.
    This program merges them.
    """
    p = OptionParser(merge.__doc__)
    p.add_option("--outdir", default="outdir",
                 help="Output final reads in [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    files = flatten(glob("{0}/*.*.fastq".format(x)) for x in folders)
    files = list(files)
    key = lambda x: op.basename(x).split(".")[0]
    files.sort(key=key)
    for id, fns in groupby(files, key=key):
        fns = list(fns)
        outfile = op.join(outdir, "{0}.fastq".format(id))
        FileMerger(fns, outfile=outfile).merge(checkexists=True)
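The merge step relies on sorting and grouping by the same sample key; a small self-contained sketch with made-up file paths shows why the sort must precede groupby():

import os.path as op
from itertools import groupby

files = [
    "folder1/sampleA.1.fastq",
    "folder2/sampleA.2.fastq",
    "folder1/sampleB.1.fastq",
]
key = lambda x: op.basename(x).split(".")[0]
files.sort(key=key)  # groupby() only merges adjacent items with equal keys
for sample, fns in groupby(files, key=key):
    print(sample, list(fns))
# sampleA ['folder1/sampleA.1.fastq', 'folder2/sampleA.2.fastq']
# sampleB ['folder1/sampleB.1.fastq']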
def trace(args):
    """
    %prog trace unitig{version}.{partID}.{unitigID}

    Call `grep` to get the erroneous fragment placement.
    """
    p = OptionParser(trace.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    s, = args
    version, partID, unitigID = get_ID(s)

    flist = glob("../5-consensus/*_{0:03d}.err".format(int(partID)))
    assert len(flist) == 1
    fp = open(flist[0])

    instate = False
    for row in fp:
        if working in row and unitigID in row:
            rows = []
            instate = True
        if instate:
            rows.append(row)
        if failed in row:
            instate = False

    if len(rows) > 20:
        ignore_line = "... ({0} lines skipped)\n".format(len(rows) - 20)
        rows = rows[:10] + [ignore_line] + rows[-10:]

    print("".join(rows), file=sys.stderr)
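get_ID() is defined elsewhere in the module; judging from the unitig{version}.{partID}.{unitigID} convention in the docstring, it presumably splits a name such as "unitig2.015.3948" into its three fields. A hypothetical sketch of that parsing:

def demo_get_ID(s):
    # "unitig2.015.3948" -> ("2", "015", "3948"); illustrative only
    version, partID, unitigID = s.replace("unitig", "", 1).split(".")
    return version, partID, unitigID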
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    p = OptionParser(tracedb.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg")

    CMD = "tracedb-to-frg.pl"
    xmls = glob("xml*")

    if action == "xml":
        for xml in xmls:
            cmd = CMD + " -xml {0}".format(xml)
            sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True)

    elif action == "lib":
        cmd = CMD + " -lib {0}".format(" ".join(xmls))
        sh(cmd)

    elif action == "frg":
        for xml in xmls:
            cmd = CMD + " -frg {0}".format(xml)
            sh(cmd, background=True)
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    os.chdir(folder)
    bams = glob("*tophat/accepted_hits.bam")
    for bam in bams:
        pf, ab = op.split(bam)
        outdir = op.join(pf, "cufflinks")
        if op.exists(outdir):
            logging.debug("Directory {0} found. Skipping.".format(outdir))
            continue
        cmd = "cufflinks"
        cmd += " -o {0}".format(outdir)
        cmd += " -p {0}".format(opts.cpus)
        if opts.gtf:
            cmd += " -g {0}".format(opts.gtf)
        cmd += " --frag-bias-correct {0}".format(reference)
        cmd += " --multi-read-correct"
        cmd += " {0}".format(bam)
        sh(cmd)
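For orientation, this is roughly the command the loop above assembles for a hypothetical sample1_tophat/accepted_hits.bam with 8 CPUs, an optional GTF, and genome.fa as the reference (all paths illustrative):

bam = "sample1_tophat/accepted_hits.bam"
cmd = "cufflinks -o sample1_tophat/cufflinks -p 8"
cmd += " -g annotation.gtf"              # only added when --gtf is given
cmd += " --frag-bias-correct genome.fa --multi-read-correct"
cmd += " " + bam
print(cmd)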
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files.
    This program merges them.
    """
    p = OptionParser(merge.__doc__)
    p.set_outdir(outdir="outdir")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    files = flatten(glob("{0}/*.*.fastq".format(x)) for x in folders)
    files = list(files)
    key = lambda x: op.basename(x).split(".")[0]
    files.sort(key=key)
    for id, fns in groupby(files, key=key):
        fns = list(fns)
        outfile = op.join(outdir, "{0}.fastq".format(id))
        FileMerger(fns, outfile=outfile).merge(checkexists=True)
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contain the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    p = OptionParser(batch.__doc__)

    ahrd_weights = {"blastp": [0.5, 0.3, 0.2], "blastx": [0.6, 0.4, 0.0]}
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                 help="Specify the blast program being run. Based on this option,"
                      " the AHRD parameters (score_weights) will be modified."
                      " [default: %default]")
    p.add_option("--iprscan", default=None,
                 help="Specify path to InterProScan results file if available."
                      " If specified, the yml conf file will be modified"
                      " appropriately. [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    for f in glob("{0}/*.fa*".format(splits)):
        fb = op.basename(f).rsplit(".", 1)[0]
        fw = open(op.join(output, fb + ".yml"), "w")
        path = op.expanduser(opts.path)
        dir = op.join(path, "test/resources")
        outfile = op.join(output, fb + ".csv")
        interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

        print(Template.format(dir, fb, f, outfile, bit_score, db_score,
                              ovl_score, interpro), file=fw)

    if opts.iprscan:
        if not op.lexists("interpro.xml"):
            symlink(op.join(iprscan_datadir, "interpro.xml"), "interpro.xml")
        if not op.lexists("interpro.dtd"):
            symlink(op.join(iprscan_datadir, "interpro.dtd"), "interpro.dtd")
def get_prefix(dir="../"):
    """
    Look for prefix.gkpStore in the upper directory.
    """
    prefix = glob(dir + "*.gkpStore")[0]
    prefix = op.basename(prefix).rsplit(".", 1)[0]
    return prefix
def batch(args): """ %prog batch splits output The arguments are two folders. Input FASTA sequences are in splits/. Output csv files are in output/. Must have folders swissprot/, tair/, trembl/ that contains the respective BLAST output. Once finished, you can run, for example: $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml """ p = OptionParser(batch.__doc__) ahrd_weights = { "blastp": [0.5, 0.3, 0.2], "blastx": [0.6, 0.4, 0.0] } blast_progs = tuple(ahrd_weights.keys()) p.add_option("--path", default="~/code/AHRD/", help="Path where AHRD is installed [default: %default]") p.add_option("--blastprog", default="blastp", choices=blast_progs, help="Specify the blast program being run. Based on this option," \ + " the AHRD parameters (score_weights) will be modified." \ + " [default: %default]") p.add_option("--iprscan", default=None, help="Specify path to InterProScan results file if available." \ + " If specified, the yml conf file will be modified" \ + " appropriately. [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) splits, output = args mkdir(output) bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog] for f in glob("{0}/*.fasta".format(splits)): fb = op.basename(f).rsplit(".", 1)[0] fw = open(op.join(output, fb + ".yml"), "w") path = op.expanduser(opts.path) dir = op.join(path, "test/resources") outfile = op.join(output, fb + ".csv") interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else "" print >> fw, Template.format(dir, fb, f, outfile, bit_score, db_score, ovl_score, interpro) if opts.iprscan: if not op.lexists("interpro.xml"): symlink(op.join(iprscan_datadir, "interpro.xml"), "interpro.xml") if not op.lexists("interpro.dtd"): symlink(op.join(iprscan_datadir, "interpro.dtd"), "interpro.dtd")
def get_weights(weightsfiles=None):
    if weightsfiles is None:
        weightsfiles = glob("*.weights")

    weights = defaultdict(list)
    for row in must_open(weightsfiles):
        a, b, c = row.split()
        weights[a].append((a, b, c))
    return weights
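Both get_weights() and get_edges() (below) assume each *.weights row holds three whitespace-separated fields, two node names and a weight; a hypothetical row illustrates the two resulting lookups:

row = "scaffold_1 scaffold_2 37"          # made-up example row
a, b, c = row.split()
# get_weights(): weights["scaffold_1"] == [("scaffold_1", "scaffold_2", "37")]
# get_edges():   edges[("scaffold_1", "scaffold_2")] == 37
#                edges[("scaffold_2", "scaffold_1")] == 37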
def __init__(self, fig, root, canvas, chr, xlim, datadir, order=None, hlsuffix=None, palette=None, cap=50, gauge="bottom", plot_label=True, plot_chr_label=True, gauge_step=5000000, vlines=None): x, y, w, h = canvas p = .01 root.add_patch(Rectangle((x - p, y - p), w + 2 * p, h + 2 * p, lw=1, fill=False, ec="darkslategray", zorder=10)) datafiles = glob(op.join(datadir, chr + "*")) ntracks = len(datafiles) yinterval = h / ntracks yy = y + h if palette is None: # Get the palette import brewer2mpl set2 = brewer2mpl.get_map('Set2', 'qualitative', ntracks).mpl_colors else: set2 = [palette] * ntracks if order: datafiles.sort(key=lambda x: order.index(x.split(".")[1])) if gauge == "top": gauge_ax = fig.add_axes([x, yy + p, w, .0001]) adjust_spines(gauge_ax, ["top"]) tpos = yy + .07 elif gauge == "bottom": gauge_ax = fig.add_axes([x, y - p, w, .0001]) adjust_spines(gauge_ax, ["bottom"]) tpos = y - .07 start, end = xlim fs = gauge_step < 1000000 setup_gauge_ax(gauge_ax, start, end, gauge_step, float_formatter=fs) if plot_chr_label: root.text(x + w / 2, tpos, chr, ha="center", va="center", color="darkslategray", size=16) for label, datafile, c in zip(order, datafiles, set2): yy -= yinterval ax = fig.add_axes([x, yy, w, yinterval * .9]) xy = XYtrack(ax, datafile, color=c) xy.interpolate(end) xy.cap(ymax=cap) if vlines: xy.vlines(vlines) if hlsuffix: hlfile = op.join(datadir, ".".join((label, hlsuffix))) xy.import_hlfile(hlfile, chr) if plot_label: root.text(x - .035, yy + yinterval / 2, label, ha="center", va="center", color=c) xy.draw() ax.set_xlim(*xlim)
def iter_project(folder, n=2):
    # Check for paired reads and extract project id
    filelist = [x for x in glob(folder + "/*.*")
                if x.rsplit(".", 1)[-1] in ("fq", "fastq", "txt", "gz")]
    for p in grouper(filelist, n):
        if len(p) != n:
            continue

        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
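grouper() comes from the jcvi utility module; it chunks the file list into consecutive n-tuples so that read pairs stay together. A rough stand-in with hypothetical file names (only an approximation of the real helper):

def demo_grouper(iterable, n):
    # Yield consecutive n-tuples; in this approximation the trailing group may
    # be shorter, which is what the len(p) != n guard in iter_project() skips.
    items = list(iterable)
    for i in range(0, len(items), n):
        yield tuple(items[i:i + n])

filelist = ["proj/libA_1.fastq", "proj/libA_2.fastq", "proj/libB_1.fastq"]
print(list(demo_grouper(filelist, 2)))
# [('proj/libA_1.fastq', 'proj/libA_2.fastq'), ('proj/libB_1.fastq',)]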
def assemble_dir(pf, target, ploidy="1"):
    from jcvi.assembly.allpaths import prepare

    logging.debug("Work on {0}".format(pf))
    asm = [x.replace("final", pf) for x in target]
    if not need_update(pf, asm):
        logging.debug("Assembly found: {0}. Skipped.".format(asm))
        return

    cwd = os.getcwd()
    os.chdir(pf)
    prepare([pf] + sorted(glob("*.fastq") + glob("*.fastq.gz")) +
            ["--ploidy={0}".format(ploidy)])
    sh("./run.sh")
    for a, t in zip(asm, target):
        sh("cp allpaths/ASSEMBLIES/run/{0} ../{1}".format(t, a))
    logging.debug("Assembly finished: {0}".format(asm))
    os.chdir(cwd)
def prepare(args):
    """
    %prog prepare countfolder families

    Parse list of count files and group per family into families folder.
    """
    p = OptionParser(prepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    counts, families = args
    countfiles = glob(op.join(counts, "*.count"))
    countsdb = defaultdict(list)
    for c in countfiles:
        rs = RiceSample(c)
        countsdb[(rs.tissue, rs.ind)].append(rs)

    # Merge duplicates - data sequenced in different batches
    key = lambda x: (x.label, x.rep)
    for (tissue, ind), rs in sorted(countsdb.items()):
        rs.sort(key=key)
        nrs = len(rs)
        for i in range(nrs):
            ri = rs[i]
            if not ri.working:
                continue
            for j in range(i + 1, nrs):
                rj = rs[j]
                if key(ri) != key(rj):
                    continue
                ri.merge(rj)
                rj.working = False
        countsdb[(tissue, ind)] = [x for x in rs if x.working]

    # Group into families
    mkdir(families)
    for (tissue, ind), r in sorted(countsdb.items()):
        r = list(r)
        if r[0].label != "F1":
            continue

        P1, P2 = r[0].P1, r[0].P2
        P1, P2 = countsdb[(tissue, P1)], countsdb[(tissue, P2)]
        rs = P1 + P2 + r
        groups = [1] * len(P1) + [2] * len(P2) + [3] * len(r)
        assert len(rs) == len(groups)

        outfile = "-".join((tissue, ind))
        merge_counts(rs, op.join(families, outfile))
        groupsfile = outfile + ".groups"
        fw = open(op.join(families, groupsfile), "w")
        print(",".join(str(x) for x in groups), file=fw)
        fw.close()
def get_edges(weightsfiles=None):
    if weightsfiles is None:
        weightsfiles = glob("*.weights")

    edges = {}
    for row in must_open(weightsfiles):
        a, b, c = row.split()
        c = int(c)
        edges[(a, b)] = c
        edges[(b, a)] = c
    return edges
def preparegb(p, args):
    p.add_option("--gb_dir", default=None,
                 help="path to dir containing GenBank files (.gb)")
    p.add_option(
        "--id",
        default=None,
        help="GenBank accession IDs in a file. One ID per row, or all IDs"
        " in one row comma separated.",
    )
    p.add_option(
        "--simple",
        default=None,
        type="string",
        help="GenBank accession IDs comma separated "
        "(for lots of IDs please use --id instead).",
    )
    p.add_option(
        "--individual",
        default=False,
        action="store_true",
        help="parse gb accessions individually",
    )
    opts, args = p.parse_args(args)
    accessions = opts.id
    filenames = opts.gb_dir
    if not (opts.gb_dir or opts.id or opts.simple):
        sys.exit(not p.print_help())

    if opts.gb_dir:
        filenames = glob(opts.gb_dir + "/*.gb")

    if opts.id:
        rows = open(opts.id).readlines()
        accessions = []
        for row in rows:
            accessions += map(str.strip, row.strip().split(","))

    if opts.simple:
        accessions = opts.simple.split(",")

    if opts.id or opts.simple:
        fw = must_open("GenBank_accession_IDs.txt", "w")
        for atom in accessions:
            print(atom, file=fw)
        fw.close()
        idfile = fw.name
    else:
        idfile = None

    return filenames, accessions, idfile, opts, args
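A small sketch of how the --id file is flattened into an accession list above: each row may carry one ID or several comma-separated IDs (the IDs here are hypothetical):

rows = ["AC148290.3\n", "AC239792, AC148291\n"]   # pretend file contents
accessions = []
for row in rows:
    accessions += [x.strip() for x in row.strip().split(",")]
# accessions == ['AC148290.3', 'AC239792', 'AC148291']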
def get_libs(args):
    from itertools import groupby

    fnames = args or glob("*.fastq*")
    fnames = sorted(fnames)
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    library_name = lambda x: "-".join(op.basename(x).split(".")[0].split("-")[:2])
    libs = [(Library(x), sorted(fs)) for x, fs in
            groupby(fnames, key=library_name)]
    libs.sort(key=lambda x: x[0].size)
    return libs
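The library_name key above takes the first two dash-separated tokens of the basename, so paired files from the same library group together; with made-up file names:

import os.path as op

library_name = lambda x: "-".join(op.basename(x).split(".")[0].split("-")[:2])
print(library_name("MP-5000-2.1.corr.fastq"))   # -> "MP-5000"
print(library_name("PE-400-1.2.corr.fastq"))    # -> "PE-400"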
def dn(args):
    """
    %prog dn folder

    Run Trinity-DN on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain "_1_" and "_2_".
    """
    p = OptionParser(dn.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.set_home("trinity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    paired = opts.paired
    thome = opts.trinity_home
    tfolder = folder + "_DN"

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = glob("../" + folder + "/*")
    if paired:
        f1 = [x for x in flist if "_1_" in x or ".1." in x]
        f2 = [x for x in flist if "_2_" in x or ".2." in x]
        assert len(f1) == len(f2)
        r1, r2 = "left.fastq", "right.fastq"
        reads = ((f1, r1), (f2, r2))
    else:
        r = "single.fastq"
        reads = ((flist, r), )

    for fl, r in reads:
        fm = FileMerger(fl, r)
        fm.merge(checkexists=True)

    cmd = op.join(thome, "Trinity.pl")
    cmd += " --seqType fq --JM 100G --CPU {0}".format(opts.cpus)
    if paired:
        cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
    else:
        cmd += " --single {0}".format(reads[0][-1])

    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
    os.chdir(cwd)
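A rough illustration of how the paired-end branch above partitions a read folder into left/right inputs (file names are made up):

flist = ["../reads/brain_1_sequence.fastq", "../reads/brain_2_sequence.fastq",
         "../reads/libA.1.fastq", "../reads/libA.2.fastq"]
f1 = [x for x in flist if "_1_" in x or ".1." in x]   # left reads
f2 = [x for x in flist if "_2_" in x or ".2." in x]   # right reads
assert len(f1) == len(f2) == 2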
def error(args): """ %prog error version backup_folder Find all errors in ../5-consensus/*.err and pull the error unitigs into backup/ folder. """ p = OptionParser(error.__doc__) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) version, backup_folder = args mkdir(backup_folder) fw = open("errors.log", "w") seen = set() for g in glob("../5-consensus/*.err"): if "partitioned" in g: continue fp = open(g) partID = op.basename(g).rsplit(".err", 1)[0] partID = int(partID.split("_")[-1]) for row in fp: if row.startswith(working): unitigID = row.split("(")[0].split()[-1] continue if not failed.upper() in row.upper(): continue uu = (version, partID, unitigID) if uu in seen: continue seen.add(uu) print >> fw, "\t".join(str(x) for x in (partID, unitigID)) s = [str(x) for x in uu] unitigfile = pull(s) cmd = "mv {0} {1}".format(unitigfile, backup_folder) sh(cmd) fp.close() logging.debug("A total of {0} unitigs saved to {1}.".\ format(len(seen), backup_folder))
def error(args):
    """
    %prog error version backup_folder

    Find all errors in ../5-consensus/*.err and pull the error unitigs into
    backup/ folder.
    """
    p = OptionParser(error.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    version, backup_folder = args
    mkdir(backup_folder)

    fw = open("errors.log", "w")
    seen = set()
    for g in glob("../5-consensus/*.err"):
        if "partitioned" in g:
            continue

        fp = open(g)
        partID = op.basename(g).rsplit(".err", 1)[0]
        partID = int(partID.split("_")[-1])

        for row in fp:
            if row.startswith(working):
                unitigID = row.split("(")[0].split()[-1]
                continue

            if not failed.upper() in row.upper():
                continue

            uu = (version, partID, unitigID)
            if uu in seen:
                continue
            seen.add(uu)

            print("\t".join(str(x) for x in (partID, unitigID)), file=fw)

            s = [str(x) for x in uu]
            unitigfile = pull(s)
            cmd = "mv {0} {1}".format(unitigfile, backup_folder)
            sh(cmd)

        fp.close()

    logging.debug("A total of {0} unitigs saved to {1}.".format(
        len(seen), backup_folder))
def merge(args): """ %prog merge merged_bams bams1_dir bams2_dir ... Merge BAM files. Treat the bams with the same prefix as a set. Output the commands first. """ from jcvi.apps.softlink import get_abs_path from jcvi.apps.grid import MakeManager p = OptionParser(merge.__doc__) p.add_option("--sep", default="_", help="Separator to group per prefix") opts, args = p.parse_args(args) if len(args) < 2: sys.exit(not p.print_help()) merged_bams = args[0] bamdirs = args[1:] mkdir(merged_bams) bams = [] for x in bamdirs: bams += glob(op.join(x, "*.bam")) bams = [x for x in bams if "nsorted" not in x] logging.debug("Found a total of {0} BAM files.".format(len(bams))) sep = opts.sep key = lambda x: op.basename(x).split(sep)[0] bams.sort(key=key) mm = MakeManager() for prefix, files in groupby(bams, key=key): files = sorted(list(files)) nfiles = len(files) source = " ".join(files) target = op.join(merged_bams, op.basename(files[0])) if nfiles == 1: source = get_abs_path(source) cmd = "ln -s {0} {1}".format(source, target) mm.add("", target, cmd) else: cmds = [] cmds.append("rm {0}".format(target)) cmds.append("samtools merge {0} {1}".format(target, source)) mm.add(files, target, cmds) mm.write()
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.softlink import get_abs_path
    from jcvi.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]

    mkdir(merged_bams)
    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    bams = [x for x in bams if "nsorted" not in x]
    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep
    key = lambda x: op.basename(x).split(sep)[0]
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(list(files))
        nfiles = len(files)
        source = " ".join(files)
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            cmd = "samtools merge -@ 8 {0} {1}".format(target, source)
            mm.add(files, target, cmd, remove=True)
    mm.write()
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge writes to /tmp, so limit the number of processes
    # here to avoid filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output
    gffnames = glob("*.all.gff")
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    fw = open(gfflist, "w")
    print("\n".join(gffnames), file=fw)
    fw.close()

    nlines = sum(1 for x in open(gfflist))
    assert nlines == nfs  # Be extra, extra careful to include all results
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
def assemble(args):
    """
    %prog assemble sffdir

    Assemble each BAC separately using newbler.
    """
    from jcvi.formats.fasta import join

    p = OptionParser(assemble.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
                 help="Overwrite the separate BAC assembly [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    sffdir, = args
    asmdir = "newbler"
    fastadir = "fasta"
    mkdir(asmdir, overwrite=opts.overwrite)
    mkdir(fastadir, overwrite=opts.overwrite)
    cmd = "runAssembly -cpu 8 -o {0} {1}"
    for sffile in glob("{0}/*.sff".format(sffdir)):
        pf = op.basename(sffile).split(".")[1]
        pf = pf.lower()
        outdir = op.join(asmdir, pf)
        if op.exists(outdir):
            logging.debug("`{0}` exists. Ignored.".format(outdir))
            continue
        acmd = cmd.format(outdir, sffile)
        sh(acmd)

        ctgfile = op.join(outdir, "454LargeContigs.fna")
        if not op.exists(ctgfile):  # newbler failure
            logging.error("File `{0}` not found (newbler failure).".format(ctgfile))
            continue
        outfile = op.join(fastadir, "{0}.fasta".format(pf))
        newidopt = "--newid={0}".format(pf)
        minctgsizeopt = "--minctgsize=200"
        join([ctgfile, outfile, newidopt, minctgsizeopt])
def omgparse(args):
    """
    %prog omgparse work

    Parse the OMG outputs to get gene lists.
    """
    p = OptionParser(omgparse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    work, = args
    omgfiles = glob(op.join(work, "gf*.out"))
    for omgfile in omgfiles:
        omg = OMGFile(omgfile)
        best = omg.best()
        for bb in best:
            genes, taxa = zip(*bb)
            print("\t".join((",".join(genes), ",".join(taxa))))
def __init__(self, filenames=None, accessions=None, idfile=None):
    self.accessions = accessions
    self.idfile = idfile

    if filenames is not None:
        self.accessions = [op.basename(f).split(".")[0] for f in filenames]
        d = dict(next(iter(SeqIO.to_dict(SeqIO.parse(f, "gb")).items()))
                 for f in filenames)
        for (k, v) in d.items():
            self[k.split(".")[0]] = v
    elif idfile is not None:
        gbdir = self._get_records()
        d = dict(next(iter(SeqIO.to_dict(SeqIO.parse(f, "gb")).items()))
                 for f in glob(gbdir + "/*.gb"))
        for (k, v) in d.items():
            self[k.split(".")[0]] = v
    else:
        sys.exit("GenBank object is initiated from either gb files or "
                 "accession IDs.")
def preparegb(p, args): p.add_option("--gb_dir", default=None, help="path to dir containing GanBank files (.gb)") p.add_option("--id", default=None, help="GenBank accession IDs in a file. One ID per row, or all IDs" \ " in one row comma separated.") p.add_option("--simple", default=None, type="string", help="GenBank accession IDs comma separated " \ "(for lots of IDs please use --id instead).") p.add_option("--individual", default=False, action="store_true", help="parse gb accessions individually [default: %default]") opts, args = p.parse_args(args) accessions = opts.id filenames = opts.gb_dir if not (opts.gb_dir or opts.id or opts.simple): sys.exit(not p.print_help()) if opts.gb_dir: filenames = glob(opts.gb_dir+"/*.gb") if opts.id: rows = file(opts.id).readlines() accessions = [] for row in rows: accessions += map(str.strip, row.strip().split(",")) if opts.simple: accessions = opts.simple.split(",") if opts.id or opts.simple: fw = must_open("GenBank_accession_IDs.txt", "w") for atom in accessions: print >>fw, atom fw.close() idfile = fw.name else: idfile=None return (filenames, accessions, idfile, opts, args)
def draw_tree( ax, t, hpd=None, margin=0.1, rmargin=0.2, tip=0.01, treecolor="k", supportcolor="k", internal=True, outgroup=None, dashedoutgroup=False, reroot=True, gffdir=None, sizes=None, trunc_name=None, SH=None, scutoff=0, leafcolor="k", leaffont=12, leafinfo=None, wgdinfo=None, geoscale=False, ): """ main function for drawing phylogenetic tree """ if reroot: if outgroup: R = t.get_common_ancestor(*outgroup) else: # Calculate the midpoint node R = t.get_midpoint_outgroup() if R is not t: t.set_outgroup(R) # By default, the distance to outgroup and non-outgroup is the same # we re-adjust the distances so that the outgroups will appear # farthest from everything else if dashedoutgroup: a, b = t.children # Avoid even split total = a.dist + b.dist newR = t.get_common_ancestor(*outgroup) a.dist = 0.9 * total b.dist = total - a.dist farthest, max_dist = t.get_farthest_leaf() print("max_dist = {}".format(max_dist), file=sys.stderr) xstart = margin ystart = 2 * margin # scale the tree scale = (1 - margin - rmargin) / max_dist def rescale(dist): return xstart + scale * dist def rescale_divergence(divergence): return rescale(max_dist - divergence) num_leaves = len(t.get_leaf_names()) yinterval = (1 - ystart) / num_leaves # get exons structures, if any structures = {} if gffdir: gffiles = glob("{0}/*.gff*".format(gffdir)) setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True) structures = dict((a, (b, c)) for a, b, c in setups) if sizes: sizes = Sizes(sizes).mapping coords = {} i = 0 for n in t.traverse("postorder"): dist = n.get_distance(t) xx = rescale(dist) if n.is_leaf(): yy = ystart + i * yinterval i += 1 if trunc_name: name = truncate_name(n.name, rule=trunc_name) else: name = n.name if leafinfo and n.name in leafinfo: line = leafinfo[n.name] lc = line.color sname = line.new_name else: lc = leafcolor sname = None lc = lc or "k" sname = sname or name.replace("_", "-") # if color is given as "R,G,B" if "," in lc: lc = [float(x) for x in lc.split(",")] ax.text( xx + tip, yy, markup(sname), va="center", fontstyle="italic", size=leaffont, color=lc, ) gname = n.name.split("_")[0] if gname in structures: mrnabed, cdsbeds = structures[gname] ExonGlyph( ax, 1 - rmargin / 2, yy, mrnabed, cdsbeds, align="right", ratio=ratio, ) if sizes and gname in sizes: size = sizes[gname] size = size / 3 - 1 # base pair converted to amino acid size = "{0}aa".format(size) ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont) else: linestyle = "--" if (dashedoutgroup and n is t) else "-" children = [coords[x] for x in n.get_children()] children_x, children_y = zip(*children) min_y, max_y = min(children_y), max(children_y) # plot the vertical bar ax.plot((xx, xx), (min_y, max_y), linestyle, color=treecolor) # plot the horizontal bar for cx, cy in children: ax.plot((xx, cx), (cy, cy), linestyle, color=treecolor) yy = sum(children_y) * 1.0 / len(children_y) # plot HPD if exists if hpd and n.name in hpd: a, b = hpd[n.name] ax.plot( (rescale_divergence(a), rescale_divergence(b)), (yy, yy), "-", color="darkslategray", alpha=0.4, lw=2, ) support = n.support if support > 1: support = support / 100.0 if not n.is_root() and supportcolor: if support > scutoff / 100.0: ax.text( xx, yy + 0.005, "{0:d}".format(int(abs(support * 100))), ha="right", size=leaffont, color=supportcolor, ) if internal and n.name: TextCircle(ax, xx, yy, n.name, size=9) coords[n] = (xx, yy) # WGD info draw_wgd(ax, yy, rescale_divergence, n.name, wgdinfo) # scale bar if geoscale: draw_geoscale(ax, margin=margin, rmargin=rmargin, yy=margin, 
max_dist=max_dist) else: br = 0.1 x1 = xstart + 0.1 x2 = x1 + br * scale yy = margin ax.plot([x1, x1], [yy - tip, yy + tip], "-", color=treecolor) ax.plot([x2, x2], [yy - tip, yy + tip], "-", color=treecolor) ax.plot([x1, x2], [yy, yy], "-", color=treecolor) ax.text( (x1 + x2) / 2, yy - tip, "{0:g}".format(br), va="top", ha="center", size=leaffont, color=treecolor, ) if SH is not None: xs = x1 ys = (margin + yy) / 2.0 ax.text( xs, ys, "SH test against ref tree: {0}".format(SH), ha="left", size=leaffont, color="g", ) normalize_axes(ax)
def draw_tree(ax, tx, rmargin=.3, leafcolor="k", supportcolor="k", outgroup=None, reroot=True, gffdir=None, sizes=None, trunc_name=None, SH=None, scutoff=0, barcodefile=None, leafcolorfile=None, leaffont=12): """ main function for drawing phylogenetic tree """ t = Tree(tx) if reroot: if outgroup: R = t.get_common_ancestor(*outgroup) else: # Calculate the midpoint node R = t.get_midpoint_outgroup() if R != t: t.set_outgroup(R) farthest, max_dist = t.get_farthest_leaf() margin = .05 xstart = margin ystart = 1 - margin canvas = 1 - rmargin - 2 * margin tip = .005 # scale the tree scale = canvas / max_dist num_leaves = len(t.get_leaf_names()) yinterval = canvas / (num_leaves + 1) # get exons structures, if any structures = {} if gffdir: gffiles = glob("{0}/*.gff*".format(gffdir)) setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True) structures = dict((a, (b, c)) for a, b, c in setups) if sizes: sizes = Sizes(sizes).mapping if barcodefile: barcodemap = DictFile(barcodefile, delimiter="\t") if leafcolorfile: leafcolors = DictFile(leafcolorfile, delimiter="\t") coords = {} i = 0 for n in t.traverse("postorder"): dist = n.get_distance(t) xx = xstart + scale * dist if n.is_leaf(): yy = ystart - i * yinterval i += 1 if trunc_name: name = truncate_name(n.name, rule=trunc_name) else: name = n.name if barcodefile: name = decode_name(name, barcodemap) sname = name.replace("_", "-") try: lc = leafcolors[n.name] except Exception: lc = leafcolor else: # if color is given as "R,G,B" if "," in lc: lc = map(float, lc.split(",")) ax.text(xx + tip, yy, sname, va="center", fontstyle="italic", size=leaffont, color=lc) gname = n.name.split("_")[0] if gname in structures: mrnabed, cdsbeds = structures[gname] ExonGlyph(ax, 1 - rmargin / 2, yy, mrnabed, cdsbeds, align="right", ratio=ratio) if sizes and gname in sizes: size = sizes[gname] size = size / 3 - 1 # base pair converted to amino acid size = "{0}aa".format(size) ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont) else: children = [coords[x] for x in n.get_children()] children_x, children_y = zip(*children) min_y, max_y = min(children_y), max(children_y) # plot the vertical bar ax.plot((xx, xx), (min_y, max_y), "k-") # plot the horizontal bar for cx, cy in children: ax.plot((xx, cx), (cy, cy), "k-") yy = sum(children_y) * 1. / len(children_y) support = n.support if support > 1: support = support / 100. if not n.is_root(): if support > scutoff / 100.: ax.text(xx, yy+.005, "{0:d}".format(int(abs(support * 100))), ha="right", size=leaffont, color=supportcolor) coords[n] = (xx, yy) # scale bar br = .1 x1 = xstart + .1 x2 = x1 + br * scale yy = ystart - i * yinterval ax.plot([x1, x1], [yy - tip, yy + tip], "k-") ax.plot([x2, x2], [yy - tip, yy + tip], "k-") ax.plot([x1, x2], [yy, yy], "k-") ax.text((x1 + x2) / 2, yy - tip, "{0:g}".format(br), va="top", ha="center", size=leaffont) if SH is not None: xs = x1 ys = (margin + yy) / 2. ax.text(xs, ys, "SH test against ref tree: {0}"\ .format(SH), ha="left", size=leaffont, color="g")
def __init__(self, fig, root, canvas, chr, xlim, datadir, order=None, hlsuffix=None, palette=None, cap=50, gauge="bottom", plot_label=True, plot_chr_label=True, gauge_step=5000000, vlines=None, labels_dict={}, diverge=('r', 'g')): x, y, w, h = canvas p = .01 root.add_patch( Rectangle((x - p, y - p), w + 2 * p, h + 2 * p, lw=1, fill=False, ec="darkslategray", zorder=10)) datafiles = glob(op.join(datadir, chr + "*")) if order: datafiles = [z for z in datafiles if z.split(".")[1] in order] datafiles.sort(key=lambda x: order.index(x.split(".")[1])) ntracks = len(datafiles) yinterval = h / ntracks yy = y + h if palette is None: # Get the palette set2 = get_map('Set2', 'qualitative', ntracks).mpl_colors else: set2 = [palette] * ntracks if gauge == "top": gauge_ax = fig.add_axes([x, yy + p, w, .0001]) adjust_spines(gauge_ax, ["top"]) tpos = yy + .07 elif gauge == "bottom": gauge_ax = fig.add_axes([x, y - p, w, .0001]) adjust_spines(gauge_ax, ["bottom"]) tpos = y - .07 start, end = xlim if gauge: fs = gauge_step < 1000000 setup_gauge_ax(gauge_ax, start, end, gauge_step, float_formatter=fs) if plot_chr_label: root.text(x + w / 2, tpos, chr, ha="center", va="center", color="darkslategray", size=16) yys = [] for label, datafile, c in zip(order, datafiles, set2): yy -= yinterval yys.append(yy) ax = fig.add_axes([x, yy, w, yinterval * .9]) xy = XYtrack(ax, datafile, color=c) xy.interpolate(end) xy.cap(ymax=cap) if vlines: xy.vlines(vlines) if hlsuffix: hlfile = op.join(datadir, ".".join((label, hlsuffix))) xy.import_hlfile(hlfile, chr, diverge=diverge) if plot_label: label = labels_dict.get(label, label.capitalize()) label = r"\textit{{{0}}}".format(label) root.text(x - .015, yy + yinterval / 2, label, ha="right", va="center") xy.draw() ax.set_xlim(*xlim) self.yys = yys
def prepare(args): """ %prog prepare "B. oleracea" *.fastq Scan input fastq files (see below) and create `in_groups.csv` and `in_libs.csv`. The species name does not really matter. """ from jcvi.utils.table import write_csv from jcvi.formats.base import write_file from jcvi.formats.fastq import guessoffset p = OptionParser(prepare.__doc__ + FastqNamings) p.add_option("--corr", default=False, action="store_true", help="Extra parameters for corrected data [default: %default]") p.add_option("--norun", default=False, action="store_true", help="Don't write `run.sh` script [default: %default]") p.add_option("--ploidy", default="2", choices=("1", "2"), help="Ploidy [default: %default]") p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) organism_name = args[0] project_name = "".join(x[0] for x in organism_name.split()).upper() fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:]) for x in fnames: assert op.exists(x), "File `{0}` not found.".format(x) offset = guessoffset([fnames[0]]) phred64 = offset == 64 assert all(guessoffset([x]) == offset for x in fnames[1:]) groupheader = "group_name library_name file_name".split() libheader = "library_name project_name organism_name type paired "\ "frag_size frag_stddev insert_size insert_stddev read_orientation "\ "genomic_start genomic_end".split() groupcontents = [] libs = [] for file_name in fnames: group_name = op.basename(file_name).split(".")[0] library_name = "-".join(group_name.split("-")[:2]) # Handle paired files and convert to wildcard if ".1." in file_name: file_name = file_name.replace(".1.", ".?.") elif ".2." in file_name: continue groupcontents.append((group_name, library_name, file_name)) if library_name not in libs: libs.append(library_name) libcontents = [] for library_name in libs: L = Library(library_name) size = L.size stddev = L.stddev type = L.type paired = L.paired read_orientation = L.read_orientation size = size or "" stddev = stddev or "" frag_size = size if type == "fragment" else "" frag_stddev = stddev if type == "fragment" else "" insert_size = size if type != "fragment" else "" insert_stddev = stddev if type != "fragment" else "" genomic_start, genomic_end = "", "" libcontents.append((library_name, project_name, organism_name, type, \ paired, frag_size, frag_stddev, insert_size, insert_stddev, \ read_orientation, genomic_start, genomic_end)) write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True) logging.debug("`in_group.csv` created (# of groups = {0}).".\ format(len(groupcontents))) write_csv(libheader, libcontents, filename="in_libs.csv", tee=True) logging.debug("`in_libs.csv` created (# of libs = {0}).".\ format(len(libcontents))) runfile = "run.sh" extra = "" if opts.corr: extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0" extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1" if not opts.norun: contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra) write_file(runfile, contents)
def htg(args): """ %prog htg fastafile template.sbt Prepare sqnfiles for Genbank HTG submission to update existing records. `fastafile` contains the records to update, multiple records are allowed (with each one generating separate sqn file in the sqn/ folder). The record defline has the accession ID. For example, >AC148290.3 Internally, this generates two additional files (phasefile and namesfile) and download records from Genbank. Below is implementation details: `phasefile` contains, for each accession, phase information. For example: AC148290.3 3 HTG 2 mth2-45h12 which means this is a Phase-3 BAC. Record with only a single contig will be labeled as Phase-3 regardless of the info in the `phasefile`. Template file is the Genbank sbt template. See jcvi.formats.sbt for generation of such files. Another problem is that Genbank requires the name of the sequence to stay the same when updating and will kick back with a table of name conflicts. For example: We are unable to process the updates for these entries for the following reason: Seqname has changed Accession Old seq_name New seq_name --------- ------------ ------------ AC239792 mtg2_29457 AC239792.1 To prepare a submission, this script downloads genbank and asn.1 format, and generate the phase file and the names file (use formats.agp.phase() and apps.gbsubmit.asn(), respectively). These get automatically run. However, use --phases if the genbank files contain outdated information. For example, the clone name changes or phase upgrades. In this case, run formats.agp.phase() manually, modify the phasefile and use --phases to override. """ from jcvi.formats.fasta import sequin, ids from jcvi.formats.agp import phase from jcvi.apps.fetch import entrez p = OptionParser(htg.__doc__) p.add_option("--phases", default=None, help="Use another phasefile to override [default: %default]") p.add_option("--comment", default="", help="Comments for this update [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) fastafile, sbtfile = args pf = fastafile.rsplit(".", 1)[0] idsfile = pf + ".ids" phasefile = pf + ".phases" namesfile = pf + ".names" ids([fastafile, "--outfile={0}".format(idsfile)]) asndir = "asn.1" mkdir(asndir) entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)]) asn(glob("{0}/*".format(asndir)) + \ ["--outfile={0}".format(namesfile)]) if opts.phases is None: gbdir = "gb" mkdir(gbdir) entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)]) phase(glob("{0}/*".format(gbdir)) + \ ["--outfile={0}".format(phasefile)]) else: phasefile = opts.phases assert op.exists(namesfile) and op.exists(phasefile) newphasefile = phasefile + ".new" newphasefw = open(newphasefile, "w") comment = opts.comment fastadir = "fasta" sqndir = "sqn" mkdir(fastadir) mkdir(sqndir) from jcvi.graphics.histogram import stem_leaf_plot names = DictFile(namesfile) assert len(set(names.keys())) == len(set(names.values())) phases = DictFile(phasefile) ph = [int(x) for x in phases.values()] # vmin 1, vmax 4, bins 3 stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates") logging.debug("Information loaded for {0} records.".format(len(phases))) assert len(names) == len(phases) newph = [] cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir) sh(cmd, outfile="/dev/null", errfile="/dev/null") acmd = 'tbl2asn -a z -p fasta -r {sqndir}' acmd += ' -i {splitfile} -t {sbtfile} -C tigr' acmd += ' -j "{qualifiers}"' acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr' acmd += ' -y "{comment}" 
-W T -T T' qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]" nupdated = 0 for row in open(phasefile): atoms = row.rstrip().split("\t") # see formats.agp.phase() for column contents accession, phase, clone = atoms[0], atoms[1], atoms[-1] fafile = op.join(fastadir, accession + ".fa") accession_nv = accession.split(".", 1)[0] newid = names[accession_nv] newidopt = "--newid={0}".format(newid) cloneopt = "--clone={0}".format(clone) splitfile, gaps = sequin([fafile, newidopt, cloneopt]) splitfile = op.basename(splitfile) phase = int(phase) assert phase in (1, 2, 3) oldphase = phase if gaps == 0 and phase != 3: phase = 3 if gaps != 0 and phase == 3: phase = 2 print("{0}\t{1}\t{2}".\ format(accession_nv, oldphase, phase), file=newphasefw) newph.append(phase) qualifiers = qq.format(phase=phase) if ";" in clone: qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]" cmd = acmd.format(accession=accession, accession_nv=accession_nv, sqndir=sqndir, sbtfile=sbtfile, splitfile=splitfile, qualifiers=qualifiers, comment=comment) sh(cmd) verify_sqn(sqndir, accession) nupdated += 1 stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates") print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
def draw_tree(ax, tx, rmargin=.3, treecolor="k", leafcolor="k", supportcolor="k", outgroup=None, reroot=True, gffdir=None, sizes=None, trunc_name=None, SH=None, scutoff=0, barcodefile=None, leafcolorfile=None, leaffont=12): """ main function for drawing phylogenetic tree """ t = Tree(tx) if reroot: if outgroup: R = t.get_common_ancestor(*outgroup) else: # Calculate the midpoint node R = t.get_midpoint_outgroup() if R != t: t.set_outgroup(R) farthest, max_dist = t.get_farthest_leaf() margin = .05 xstart = margin ystart = 1 - margin canvas = 1 - rmargin - 2 * margin tip = .005 # scale the tree scale = canvas / max_dist num_leaves = len(t.get_leaf_names()) yinterval = canvas / (num_leaves + 1) # get exons structures, if any structures = {} if gffdir: gffiles = glob("{0}/*.gff*".format(gffdir)) setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True) structures = dict((a, (b, c)) for a, b, c in setups) if sizes: sizes = Sizes(sizes).mapping if barcodefile: barcodemap = DictFile(barcodefile, delimiter="\t") if leafcolorfile: leafcolors = DictFile(leafcolorfile, delimiter="\t") coords = {} i = 0 for n in t.traverse("postorder"): dist = n.get_distance(t) xx = xstart + scale * dist if n.is_leaf(): yy = ystart - i * yinterval i += 1 if trunc_name: name = truncate_name(n.name, rule=trunc_name) else: name = n.name if barcodefile: name = decode_name(name, barcodemap) sname = name.replace("_", "-") try: lc = leafcolors[n.name] except Exception: lc = leafcolor else: # if color is given as "R,G,B" if "," in lc: lc = map(float, lc.split(",")) ax.text(xx + tip, yy, sname, va="center", fontstyle="italic", size=leaffont, color=lc) gname = n.name.split("_")[0] if gname in structures: mrnabed, cdsbeds = structures[gname] ExonGlyph(ax, 1 - rmargin / 2, yy, mrnabed, cdsbeds, align="right", ratio=ratio) if sizes and gname in sizes: size = sizes[gname] size = size / 3 - 1 # base pair converted to amino acid size = "{0}aa".format(size) ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont) else: children = [coords[x] for x in n.get_children()] children_x, children_y = zip(*children) min_y, max_y = min(children_y), max(children_y) # plot the vertical bar ax.plot((xx, xx), (min_y, max_y), "-", color=treecolor) # plot the horizontal bar for cx, cy in children: ax.plot((xx, cx), (cy, cy), "-", color=treecolor) yy = sum(children_y) * 1. / len(children_y) support = n.support if support > 1: support = support / 100. if not n.is_root(): if support > scutoff / 100.: ax.text(xx, yy + .005, "{0:d}".format(int(abs(support * 100))), ha="right", size=leaffont, color=supportcolor) coords[n] = (xx, yy) # scale bar br = .1 x1 = xstart + .1 x2 = x1 + br * scale yy = ystart - i * yinterval ax.plot([x1, x1], [yy - tip, yy + tip], "-", color=treecolor) ax.plot([x2, x2], [yy - tip, yy + tip], "-", color=treecolor) ax.plot([x1, x2], [yy, yy], "-", color=treecolor) ax.text((x1 + x2) / 2, yy - tip, "{0:g}".format(br), va="top", ha="center", size=leaffont, color=treecolor) if SH is not None: xs = x1 ys = (margin + yy) / 2. ax.text(xs, ys, "SH test against ref tree: {0}".format(SH), ha="left", size=leaffont, color="g") normalize_axes(ax)
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating a separate sqn file in the sqn/ folder). The
    record defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and downloads records from Genbank. Below are implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3  3  HTG  2  mth2-45h12

    which means this is a Phase-3 BAC. Records with only a single contig will
    be labeled as Phase-3 regardless of the info in the `phasefile`. Template
    file is the Genbank sbt template. See jcvi.formats.sbt for generation of
    such files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:
    Seqname has changed

    Accession  Old seq_name  New seq_name
    ---------  ------------  ------------
    AC239792   mtg2_29457    AC239792.1

    To prepare a submission, this script downloads genbank and asn.1 format,
    and generates the phase file and the names file (use formats.agp.phase()
    and apps.gbsubmit.asn(), respectively). These get automatically run.

    However, use --phases if the genbank files contain outdated information.
    For example, the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to
    override.
    """
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.fetch import entrez

    p = OptionParser(htg.__doc__)
    p.add_option("--phases", default=None,
                 help="Use another phasefile to override")
    p.add_option("--comment", default="",
                 help="Comments for this update")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    asndir = "asn.1"
    mkdir(asndir)
    entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        gbdir = "gb"
        mkdir(gbdir)
        entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    newphasefw = open(newphasefile, "w")
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = "tbl2asn -a z -p fasta -r {sqndir}"
    acmd += " -i {splitfile} -t {sbtfile} -C tigr"
    acmd += ' -j "{qualifiers}"'
    acmd += " -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr"
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    for row in open(phasefile):
        atoms = row.rstrip().split("\t")
        # see formats.agp.phase() for column contents
        accession, phase, clone = atoms[0], atoms[1], atoms[-1]
        fafile = op.join(fastadir, accession + ".fa")
        accession_nv = accession.split(".", 1)[0]

        newid = names[accession_nv]
        newidopt = "--newid={0}".format(newid)
        cloneopt = "--clone={0}".format(clone)
        splitfile, gaps = sequin([fafile, newidopt, cloneopt])
        splitfile = op.basename(splitfile)
        phase = int(phase)
        assert phase in (1, 2, 3)

        oldphase = phase
        if gaps == 0 and phase != 3:
            phase = 3

        if gaps != 0 and phase == 3:
            phase = 2

        print("{0}\t{1}\t{2}".format(accession_nv, oldphase, phase),
              file=newphasefw)
        newph.append(phase)

        qualifiers = qq.format(phase=phase)
        if ";" in clone:
            qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

        cmd = acmd.format(
            accession=accession,
            accession_nv=accession_nv,
            sqndir=sqndir,
            sbtfile=sbtfile,
            splitfile=splitfile,
            qualifiers=qualifiers,
            comment=comment,
        )
        sh(cmd)
        verify_sqn(sqndir, accession)
        nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
def get_fsnames(outdir):
    fnames = glob(op.join(outdir, "*.fa*"))
    suffix = "." + fnames[0].split(".")[-1]
    fsnames = [op.basename(x).rsplit(".", 1)[0] for x in fnames]
    return fsnames, suffix
mb_float_formatter = ticker.FuncFormatter(
    lambda x, pos: "{0:.1f}M".format(x / 1000000.0))
kb_formatter = ticker.FuncFormatter(lambda x, pos: "{0}K".format(int(x / 1000)))
tex_1digit_formatter = ticker.FuncFormatter(lambda x, pos: _("{0:.1f}".format(x)))
tex_2digit_formatter = ticker.FuncFormatter(lambda x, pos: _("{0:.2f}".format(x)))


def set_tex_axis(ax, formatter=tex_formatter):
    ax.xaxis.set_major_formatter(formatter)
    ax.yaxis.set_major_formatter(formatter)


set_human_axis = partial(set_tex_axis, formatter=human_formatter)
set_human_base_axis = partial(set_tex_axis, formatter=human_base_formatter)

font_dir = op.join(op.dirname(__file__), "fonts")
available_fonts = [op.basename(x) for x in glob(font_dir + "/*")]


def fontprop(ax, name, size=12):
    assert name in available_fonts, "Font must be one of {0}.".format(available_fonts)

    import matplotlib.font_manager as fm

    fname = op.join(font_dir, name)
    prop = fm.FontProperties(fname=fname, size=size)
    logging.debug("Set font to `{0}` (`{1}`).".format(name, prop.get_file()))
    for text in ax.texts:
        text.set_fontproperties(prop)
human_readable_base = partial(human_readable, base=True)
human_formatter = ticker.FuncFormatter(human_readable)
human_base_formatter = ticker.FuncFormatter(human_readable_base)
mb_formatter = ticker.FuncFormatter(lambda x, pos: "{0}M".format(int(x / 1000000)))
mb_float_formatter = ticker.FuncFormatter(
    lambda x, pos: "{0:.1f}M".format(x / 1000000.))
kb_formatter = ticker.FuncFormatter(lambda x, pos: "{0}K".format(int(x / 1000)))


def set_human_axis(ax, formatter=human_formatter):
    ax.xaxis.set_major_formatter(formatter)
    ax.yaxis.set_major_formatter(formatter)


set_human_base_axis = partial(set_human_axis, formatter=human_base_formatter)

available_fonts = [op.basename(x) for x in glob(datadir + "/*.ttf")]


def fontprop(ax, name, size=12):
    assert name in available_fonts, "Font must be one of {0}.".format(available_fonts)

    import matplotlib.font_manager as fm

    fname = op.join(datadir, name)
    prop = fm.FontProperties(fname=fname, size=size)
    logging.debug("Set font to `{0}` (`{1}`).".format(name, prop.get_file()))
    for text in ax.texts:
        text.set_fontproperties(prop)
def set_human_axis(ax, formatter=human_formatter):
    ax.xaxis.set_major_formatter(formatter)
    ax.yaxis.set_major_formatter(formatter)


set_human_base_axis = partial(set_human_axis, formatter=human_base_formatter)


def set_helvetica_axis(ax):
    ax.set_xticklabels([int(x) for x in ax.get_xticks()], family="Helvetica")
    ax.set_yticklabels([int(x) for x in ax.get_yticks()], family="Helvetica")


available_fonts = [op.basename(x) for x in glob(datadir + "/*.ttf")]


def fontprop(ax, name, size=12):
    assert name in available_fonts, "Font must be one of {0}.".format(available_fonts)

    import matplotlib.font_manager as fm

    fname = op.join(datadir, name)
    prop = fm.FontProperties(fname=fname, size=size)
    logging.debug("Set font to `{0}` (`{1}`).".format(name, prop.get_file()))
    for text in ax.texts:
        text.set_fontproperties(prop)
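A minimal usage sketch for the tick formatters and axis helpers defined above, on a throwaway matplotlib figure (the output file name is illustrative):

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 5000000], [0, 1])
ax.xaxis.set_major_formatter(mb_formatter)   # 5000000 -> "5M"
# set_human_axis(ax)  # alternative: human-readable ticks on both axes
fig.savefig("ticks_demo.png")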