def epoch(args):
    """
    %prog epoch

    Illustrate the methods used in Maggie's epoch paper, in particular, how to
    classify S/G/F/FB/FN for the genes.
    """
    p = OptionParser(__doc__)
    opts, args = p.parse_args()

    fig = plt.figure(1, (6, 4))
    root = fig.add_axes([0, 0, 1, 1])

    # Separators
    linestyle = dict(lw=2, color="b", alpha=.2, zorder=2)
    root.plot((0, 1), (.5, .5), "--", **linestyle)
    for i in (1. / 3, 2. / 3):
        root.plot((i, i), (.5, 1), "--", **linestyle)
    for i in (1. / 6, 3. / 6, 5. / 6):
        root.plot((i, i), (0, .5), "--", **linestyle)

    # Diagrams
    plot_diagram(root, 1. / 6, 3. / 4, "S", "syntenic")
    plot_diagram(root, 3. / 6, 3. / 4, "F", "missing, with both flankers")
    plot_diagram(root, 5. / 6, 3. / 4, "G", "missing, with one flanker")
    plot_diagram(root, 2. / 6, 1. / 4, "FB", "has non-coding matches")
    plot_diagram(root, 4. / 6, 1. / 4, "FN", "syntenic region has gap")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = fname() + ".pdf"
    savefig(figname, dpi=300)
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from a web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
def traits(args):
    """
    %prog traits directory

    Make an HTML page that reports eye and skin color.
    """
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)
    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        js = json.load(open(filename))
        js["skin_rgb"] = make_rgb(
            js["traits"]["skin-color"]["L"],
            js["traits"]["skin-color"]["A"],
            js["traits"]["skin-color"]["B"])
        js["eye_rgb"] = make_rgb(
            js["traits"]["eye-color"]["L"],
            js["traits"]["eye-color"]["A"],
            js["traits"]["eye-color"]["B"])
        samples.append(js)

    template = Template(traits_template)
    fw = open("report.html", "w")
    print(template.render(samples=samples), file=fw)
    logging.debug("Report written to `{}`".format(fw.name))
    fw.close()
def bed(args):
    """
    %prog bed genes.ids

    Get gene BED from Phytozome. `genes.ids` contains the list of genes you
    want to pull from Phytozome. Write output to a .bed file.
    """
    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    idsfile, = args
    ids = set(x.strip() for x in open(idsfile))
    data = get_bed_from_phytozome(list(ids))

    pf = idsfile.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    fw = open(bedfile, "w")
    for i, row in enumerate(data):
        row = row.strip()
        if row == "":
            continue
        print(row, file=fw)
    logging.debug("A total of {0} records written to `{1}`.".format(i + 1, bedfile))
def compile(args): """ %prog compile directory Extract telomere length and ccn. """ p = OptionParser(compile.__doc__) p.set_outfile(outfile="age.tsv") opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) dfs = [] for folder in args: ofolder = os.listdir(folder) # telomeres subdir = [x for x in ofolder if x.startswith("telomeres")][0] subdir = op.join(folder, subdir) filename = op.join(subdir, "tel_lengths.txt") df = pd.read_csv(filename, sep="\t") d1 = df.ix[0].to_dict() # ccn subdir = [x for x in ofolder if x.startswith("ccn")][0] subdir = op.join(folder, subdir) filename = iglob(subdir, "*.ccn.json")[0] js = json.load(open(filename)) d1.update(js) df = pd.DataFrame(d1, index=[0]) dfs.append(df) df = pd.concat(dfs, ignore_index=True) df.to_csv(opts.outfile, sep="\t", index=False)
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against the GenBank file, and determine
    whether or not to flip the sequence. This is useful before updating the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()
        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    fp = open(pairsfile)
    cmds = []
    mkdir("overlaps")
    for row in fp:
        a, b = row.split()[:2]
        oa = op.join(outdir, a + ".fa")
        ob = op.join(outdir, b + ".fa")
        cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
        cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
        cmds.append(cmd)
    print("\n".join(cmds))
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for x in gff_files:
            pf = op.basename(x).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            ar = [int(x.strip()) for x in open(numberfile)]
            stats = SummaryStats(ar).todict().items()
            keys, vals = zip(*stats)
            keys = [(pf, x) for x in keys]
            table.update(dict(zip(keys, vals)))

        print(tabulate(table), file=sys.stderr)
def histogram(args): """ %prog histogram *.gff Plot gene statistics based on output of stats. For each gff file, look to see if the metrics folder (i.e. Exon_Length) contains the data and plot them. """ from jcvi.graphics.histogram import histogram_multiple p = OptionParser(histogram.__doc__) p.add_option("--bins", dest="bins", default=40, type="int", help="number of bins to plot in the histogram [default: %default]") opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) gff_files = args # metrics = ("Exon_Length", "Intron_Length", "Gene_Length", "Exon_Count") colors = ("red", "green", "blue", "black") vmaxes = (1000, 1000, 4000, 20) xlabels = ("bp", "bp", "bp", "number") for metric, color, vmax, xlabel in zip(metrics, colors, vmaxes, xlabels): logging.debug("Parsing files in `{0}`..".format(metric)) numberfiles = [op.join(metric, op.basename(x).split(".")[0] + ".txt") \ for x in gff_files] histogram_multiple(numberfiles, 0, vmax, xlabel, metric, bins=opts.bins, facet=True, fill=color, prefix=metric + ".")
def unitigs(args):
    """
    %prog unitigs best.edges

    Read Celera Assembler's "best.edges" and extract all unitigs.
    """
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    for k, v in G.items():
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    for h in nx.connected_component_subgraphs(H, copy=False):
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        print("|".join(str(x) for x in path))
        nunitigs += 1
        nreads += len(path)
    logging.debug("A total of {0} unitigs built from {1} reads.".format(nunitigs, nreads))
def tracedb(args): """ %prog tracedb <xml|lib|frg> Run `tracedb-to-frg.pl` within current folder. """ p = OptionParser(tracedb.__doc__) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(p.print_help()) action, = args assert action in ("xml", "lib", "frg") CMD = "tracedb-to-frg.pl" xmls = glob("xml*") if action == "xml": for xml in xmls: cmd = CMD + " -xml {0}".format(xml) sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True) elif action == "lib": cmd = CMD + " -lib {0}".format(" ".join(xmls)) sh(cmd) elif action == "frg": for xml in xmls: cmd = CMD + " -frg {0}".format(xml) sh(cmd, background=True)
def ids(args): """ %prog ids cdhit.clstr Get the representative ids from clstr file. """ p = OptionParser(ids.__doc__) p.add_option("--prefix", type="int", help="Find rep id for prefix of len [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) clstrfile, = args cf = ClstrFile(clstrfile) prefix = opts.prefix if prefix: reads = list(cf.iter_reps_prefix(prefix=prefix)) else: reads = list(cf.iter_reps()) nreads = len(reads) idsfile = clstrfile.replace(".clstr", ".ids") fw = open(idsfile, "w") for i, name in reads: print("\t".join(str(x) for x in (i, name)), file=fw) logging.debug("A total of {0} unique reads written to `{1}`.".\ format(nreads, idsfile)) fw.close() return idsfile
def csv(args):
    """
    %prog csv excelfile

    Convert Excel file to CSV file.
    """
    from xlrd import open_workbook

    p = OptionParser(csv.__doc__)
    p.set_sep(sep=',')
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    excelfile, = args
    sep = opts.sep
    csvfile = excelfile.rsplit(".", 1)[0] + ".csv"
    wb = open_workbook(excelfile)
    fw = open(csvfile, "w")
    for s in wb.sheets():
        print('Sheet:', s.name, file=sys.stderr)
        for row in range(s.nrows):
            values = []
            for col in range(s.ncols):
                values.append(s.cell(row, col).value)
            print(sep.join(str(x) for x in values), file=fw)
def passthrough(args): """ %prog passthrough chrY.vcf chrY.new.vcf Pass through Y and MT vcf. """ p = OptionParser(passthrough.__doc__) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) vcffile, newvcffile = args fp = open(vcffile) fw = open(newvcffile, "w") gg = ["0/0", "0/1", "1/1"] for row in fp: if row[0] == "#": print(row.strip(), file=fw) continue v = VcfLine(row) v.filter = "PASS" v.format = "GT:GP" probs = [0] * 3 probs[gg.index(v.genotype)] = 1 v.genotype = v.genotype.replace("/", "|") + \ ":{0}".format(",".join("{0:.3f}".format(x) for x in probs)) print(v, file=fw) fw.close()
def agp(args): """ %prog agp <fastafile|sizesfile> Convert the sizes file to a trivial AGP file. """ from jcvi.formats.agp import OO p = OptionParser(agp.__doc__) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) sizesfile, = args sizes = Sizes(sizesfile) agpfile = sizes.filename.rsplit(".", 1)[0] + ".agp" fw = open(agpfile, "w") o = OO() # Without a filename for ctg, size in sizes.iter_sizes(): o.add(ctg, ctg, size) o.write_AGP(fw) fw.close() logging.debug("AGP file written to `{0}`.".format(agpfile)) return agpfile
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".format(
        len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print(",".join(sorted(g)), file=fw)
    fw.close()

    return outfile
def nucmer(args): """ %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3 Select specific chromosome region based on MTR mapping. The above command will extract chr1:2,000,001-3,000,000. """ p = OptionParser(nucmer.__doc__) opts, args = p.parse_args(args) if len(args) != 5: sys.exit(not p.print_help()) mapbed, mtrfasta, asmfasta, chr, idx = args idx = int(idx) m1 = 1000000 bedfile = "sample.bed" bed = Bed() bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1))) bed.print_to_file(bedfile) cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile) idsfile = "query.ids" sh(cmd, outfile=idsfile) sfasta = fastaFromBed(bedfile, mtrfasta) qfasta = "query.fasta" cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta) sh(cmd) cmd = "nucmer {0} {1}".format(sfasta, qfasta) sh(cmd) mummerplot_main(["out.delta", "--refcov=0"]) sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
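# Illustrative sketch (hypothetical helper, not part of the original module): how
# the `nucmer` command above turns a chromosome index into a 1-Mb window. The
# real code builds the same interval through a Bed object before intersecting.
def _example_mb_window(idx, window=1000000):
    """Return the 0-based, half-open interval for the idx-th 1-Mb window."""
    start = (idx - 1) * window  # e.g. idx=3 -> 2,000,000
    end = idx * window          # e.g. idx=3 -> 3,000,000 (chr1:2,000,001-3,000,000 in 1-based coords)
    return start, end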
def fromimpute2(args):
    """
    %prog fromimpute2 impute2file fastafile 1

    Convert impute2 output to vcf file. Imputed file looks like:

    --- 1:10177:A:AC 10177 A AC 0.451 0.547 0.002
    """
    p = OptionParser(fromimpute2.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    impute2file, fastafile, chr = args
    fasta = Fasta(fastafile)
    print(get_vcfstanza(fastafile, fasta))

    fp = open(impute2file)
    seen = set()
    for row in fp:
        snp_id, rsid, pos, ref, alt, aa, ab, bb = row.split()
        pos = int(pos)
        if pos in seen:
            continue
        seen.add(pos)
        code = max((float(aa), "0/0"), (float(ab), "0/1"), (float(bb), "1/1"))[-1]
        tag = "PR" if snp_id == chr else "IM"
        print("\t".join(str(x) for x in
              (chr, pos, rsid, ref, alt, ".", ".", tag,
               "GT:GP", code + ":" + ",".join((aa, ab, bb)))))
def uclust(args): """ %prog uclust fastafile Use `usearch` to remove duplicate reads. """ p = OptionParser(uclust.__doc__) p.set_align(pctid=98) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastafile, = args identity = opts.pctid / 100. pf, sf = fastafile.rsplit(".", 1) sortedfastafile = pf + ".sorted.fasta" if need_update(fastafile, sortedfastafile): cmd = "usearch -sortbylength {0} -fastaout {1}".\ format(fastafile, sortedfastafile) sh(cmd) pf = fastafile + ".P{0}.uclust".format(opts.pctid) clstrfile = pf + ".clstr" centroidsfastafile = pf + ".centroids.fasta" if need_update(sortedfastafile, centroidsfastafile): cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile) cmd += " -id {0}".format(identity) cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile) sh(cmd)
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    from urllib.parse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            print(row.strip())
            continue
        v = VcfLine(row)
        data.append(v)

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        bestv = max(vv, key=lambda x: float(parse_qs(x.info)["R2"][0]))
        print(bestv)
def sample(args):
    """
    %prog sample vcffile 0.9

    Sample a subset of the vcf file.
    """
    from random import random

    p = OptionParser(sample.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, ratio = args
    ratio = float(ratio)
    fp = open(vcffile)
    pf = vcffile.rsplit(".", 1)[0]
    kept = pf + ".kept.vcf"
    withheld = pf + ".withheld.vcf"
    fwk = open(kept, "w")
    fww = open(withheld, "w")
    nkept = nwithheld = 0
    for row in fp:
        if row[0] == '#':
            print(row.strip(), file=fwk)
            continue
        if random() < ratio:
            nkept += 1
            print(row.strip(), file=fwk)
        else:
            nwithheld += 1
            print(row.strip(), file=fww)

    logging.debug("{0} records kept to `{1}`".format(nkept, kept))
    logging.debug("{0} records withheld to `{1}`".format(nwithheld, withheld))
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent the new genome (query).
    """
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    fw = open(chainfile, "w")
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping
    chain = "chain"
    score = 1000
    tStrand = "+"
    id = 0
    for a in agp:
        if a.is_gap:
            continue

        tName = a.component_id
        tSize = componentsizes[tName]
        tStart = a.component_beg
        tEnd = a.component_end
        tStart -= 1

        qName = a.object
        qSize = objectsizes[qName]
        qStrand = "-" if a.orientation == "-" else "+"
        qStart = a.object_beg
        qEnd = a.object_end
        if qStrand == '-':
            _qStart = qSize - qEnd + 1
            _qEnd = qSize - qStart + 1
            qStart, qEnd = _qStart, _qEnd
        qStart -= 1

        id += 1
        size = a.object_span
        headerline = "\t".join(str(x) for x in (
            chain, score, tName, tSize, tStrand, tStart,
            tEnd, qName, qSize, qStrand, qStart, qEnd, id
        ))
        alignmentline = size
        print(headerline, file=fw)
        print(alignmentline, file=fw)
        print(file=fw)

    fw.close()
    logging.debug("File written to `{0}`.".format(chainfile))
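# Illustrative sketch (assumption, not from the original module): the minus-strand
# coordinate flip used when `fromagp` above writes chain query intervals. UCSC chain
# format expects reverse-strand query coordinates to be given relative to the
# reverse-complemented sequence.
def _example_flip_interval(qstart, qend, qsize):
    """Flip a 1-based inclusive interval onto the opposite strand."""
    return qsize - qend + 1, qsize - qstart + 1

# e.g. on a 100-bp sequence, bases 11..20 on '-' correspond to 81..90
assert _example_flip_interval(11, 20, 100) == (81, 90)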
def genestatus(args):
    """
    %prog genestatus diploid.gff3.exon.ids

    Tag genes based on translation from GMAP models, using fasta.translate()
    --ids.
    """
    p = OptionParser(genestatus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    idsfile, = args
    data = get_tags(idsfile)
    key = lambda x: x[0].split(".")[0]
    for gene, cc in groupby(data, key=key):
        cc = list(cc)
        tags = [x[-1] for x in cc]
        if "complete" in tags:
            tag = "complete"
        elif "partial" in tags:
            tag = "partial"
        else:
            tag = "pseudogene"
        print("\t".join((gene, tag)))
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print("\t".join(str(x) for x in
              (bac, 0, min(flank, size), bac + "L")), file=fw)
        print("\t".join(str(x) for x in
              (bac, max(size - flank, 0), size, bac + "R")), file=fw)
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
def spades(args):
    """
    %prog spades folder

    Run automated SPAdes.
    """
    from jcvi.formats.fastq import readlen

    p = OptionParser(spades.__doc__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    folder, = args
    for p, pf in iter_project(folder, 2):
        rl = readlen([p[0], "--silent"])

        # Check longer reads first, otherwise the 250-bp branch is unreachable
        # <http://spades.bioinf.spbau.ru/release3.1.0/manual.html#sec3.4>
        kmers = None
        if rl >= 250:
            kmers = "21,33,55,77,99,127"
        elif rl >= 150:
            kmers = "21,33,55,77"

        cmd = "spades.py"
        if kmers:
            cmd += " -k {0}".format(kmers)
        cmd += " --careful"
        cmd += " --pe1-1 {0} --pe1-2 {1}".format(*p)
        cmd += " -o {0}_spades".format(pf)
        print(cmd)
def mergecsv(args):
    """
    %prog mergecsv *.csv

    Combine CSV files into a binary array.
    """
    p = OptionParser(mergecsv.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    csvfiles = args
    arrays = []
    samplekeys = []
    for csvfile in csvfiles:
        samplekey = op.basename(csvfile).split(".")[0]
        a = np.fromfile(csvfile, sep=",", dtype=np.int32)
        x1 = a[::2]
        x2 = a[1::2]
        a = x1 * 1000 + x2
        a[a < 0] = -1
        arrays.append(a)
        samplekeys.append(samplekey)
        print(samplekey, a, file=sys.stderr)

    print("Merging", file=sys.stderr)
    b = np.concatenate(arrays)
    b.tofile("data.bin")

    fw = open("samples", "w")
    print("\n".join(samplekeys), file=fw)
    fw.close()
def merge(args): """ %prog merge folder1 ... Consolidate split contents in the folders. The folders can be generated by the split() process and several samples may be in separate fastq files. This program merges them. """ p = OptionParser(merge.__doc__) p.set_outdir(outdir="outdir") opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) folders = args outdir = opts.outdir mkdir(outdir) files = flatten(glob("{0}/*.*.fastq".format(x)) for x in folders) files = list(files) key = lambda x: op.basename(x).split(".")[0] files.sort(key=key) for id, fns in groupby(files, key=key): fns = list(fns) outfile = op.join(outdir, "{0}.fastq".format(id)) FileMerger(fns, outfile=outfile).merge(checkexists=True)
def diff(args):
    """
    %prog diff simplefile

    Calculate difference of pairwise syntenic regions.
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(diff.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    simplefile, = args
    fp = open(simplefile)
    data = [x.split() for x in fp]
    spans = []
    for block_id, ab in groupby(data[1:], key=lambda x: x[0]):
        a, b = list(ab)
        aspan, bspan = a[4], b[4]
        aspan, bspan = int(aspan), int(bspan)
        spans.append((aspan, bspan))
    aspans, bspans = zip(*spans)
    dspans = [b - a for a, b in spans]
    s = SummaryStats(dspans)
    print("For a total of {0} blocks:".format(len(dspans)), file=sys.stderr)
    print("Sum of A: {0}".format(sum(aspans)), file=sys.stderr)
    print("Sum of B: {0}".format(sum(bspans)), file=sys.stderr)
    print("Sum of Delta: {0} ({1})".format(sum(dspans), s), file=sys.stderr)
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid", default=False, action="store_true",
                 help="Check minscore, period and length")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    fp = open(refbed)
    for i, row in enumerate(fp):
        s = STRLine(row)
        seq = genome[s.seqid][s.start - 1: s.end].upper()
        s.motif = get_motif(seq, len(s.motif))
        s.fix_counts(seq)
        if opts.checkvalid and not s.is_valid():
            continue
        edits.append(s)
        if i % 10000 == 0:
            print(i, "lines read", file=sys.stderr)

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        print(str(e))
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    print("File `{0}` contains {1} chains.".format(chainfile, len(chain)),
          file=sys.stderr)
    print("ungapped={0} dt={1} dq={2}".format(
          human_size(ungapped), human_size(dt), human_size(dq)), file=sys.stderr)

    oldreal, oldnn, oldlen = fsummary([oldfasta, "--outfile=/dev/null"])
    print("Old fasta (`{0}`) mapped: {1}".format(
          oldfasta, percentage(ungapped, oldreal)), file=sys.stderr)

    newreal, newnn, newlen = fsummary([newfasta, "--outfile=/dev/null"])
    print("New fasta (`{0}`) mapped: {1}".format(
          newfasta, percentage(ungapped, newreal)), file=sys.stderr)
def run(args): """ %prog run command ::: file1 file2 Parallelize a set of commands on grid. The syntax is modeled after GNU parallel <http://www.gnu.org/s/parallel/man.html#options> {} - input line {.} - input line without extension {_} - input line first part {/} - basename of input line {/.} - basename of input line without extension {/_} - basename of input line first part {#} - sequence number of job to run ::: - Use arguments from the command line as input source instead of stdin (standard input). If file name is `t/example.tar.gz`, then, {} is "t/example.tar.gz", {.} is "t/example.tar", {_} is "t/example" {/} is "example.tar.gz", {/.} is "example.tar", {/_} is "example" A few examples: ls -1 *.fastq | %prog run process {} {.}.pdf # use stdin %prog run process {} {.}.pdf ::: *fastq # use ::: %prog run "zcat {} > {.}" ::: *.gz # quote redirection %prog run < commands.list # run a list of commands """ p = OptionParser(run.__doc__) p.set_grid_opts() opts, args = p.parse_args(args) if len(args) == 0: sys.exit(not p.print_help()) sep = ":::" if sep in args: sepidx = args.index(sep) filenames = args[sepidx + 1:] args = args[:sepidx] if not filenames: filenames = [""] else: filenames = sys.stdin if not sys.stdin.isatty() else [""] cmd = " ".join(args) cmds = [] if filenames else [(cmd, None)] for i, filename in enumerate(filenames): filename = filename.strip() noextname = filename.rsplit(".", 1)[0] prefix, basename = op.split(filename) basenoextname = basename.rsplit(".", 1)[0] basefirstname = basename.split(".")[0] firstname = op.join(prefix, basefirstname) ncmd = cmd if "{" in ncmd: ncmd = ncmd.replace("{}", filename) else: ncmd += " " + filename ncmd = ncmd.replace("{.}", noextname) ncmd = ncmd.replace("{_}", firstname) ncmd = ncmd.replace("{/}", basename) ncmd = ncmd.replace("{/.}", basenoextname) ncmd = ncmd.replace("{/_}", basefirstname) ncmd = ncmd.replace("{#}", str(i)) outfile = None if ">" in ncmd: ncmd, outfile = ncmd.split(">", 1) ncmd, outfile = ncmd.strip(), outfile.strip() ncmd = ncmd.strip() cmds.append((ncmd, outfile)) for ncmd, outfile in cmds: p = GridProcess(ncmd, outfile=outfile, grid_opts=opts) p.start()
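# Illustrative sketch (hypothetical helper, not part of the original module): the
# placeholder expansion documented in `run` above, applied to a single filename.
# It mirrors the {}, {.}, {_}, {/}, {/.}, {/_} and {#} substitutions in a
# simplified, standalone form.
def _example_expand_placeholders(cmd, filename, jobno=0):
    import os.path as _op
    noext = filename.rsplit(".", 1)[0]
    prefix, base = _op.split(filename)
    basenoext = base.rsplit(".", 1)[0]
    basefirst = base.split(".")[0]
    first = _op.join(prefix, basefirst)
    return (cmd.replace("{.}", noext)
               .replace("{_}", first)
               .replace("{/.}", basenoext)
               .replace("{/_}", basefirst)
               .replace("{/}", base)
               .replace("{#}", str(jobno))
               .replace("{}", filename))

# e.g. "process {} {.}.pdf" on "t/example.tar.gz"
#   -> "process t/example.tar.gz t/example.tar.pdf"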
def optimize(args): """ %prog optimize test.clm Optimize the contig order and orientation, based on CLM file. """ p = OptionParser(optimize.__doc__) p.add_option("--skiprecover", default=False, action="store_true", help="Do not import 'recover' contigs") p.add_option("--startover", default=False, action="store_true", help="Do not resume from existing tour file") p.add_option("--skipGA", default=False, action="store_true", help="Skip GA step") p.set_outfile(outfile=None) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) clmfile, = args startover = opts.startover runGA = not opts.skipGA cpus = opts.cpus # Load contact map clm = CLMFile(clmfile, skiprecover=opts.skiprecover) tourfile = opts.outfile or clmfile.rsplit(".", 1)[0] + ".tour" if startover: tourfile = None tour = clm.activate(tourfile=tourfile) fwtour = open(tourfile, "w") # Store INIT tour print_tour(fwtour, clm.tour, "INIT", clm.active_contigs, clm.oo, signs=clm.signs) if runGA: for phase in range(1, 3): tour = optimize_ordering(fwtour, clm, phase, cpus) tour = clm.prune_tour(tour, cpus) # Flip orientations phase = 1 while True: tag1, tag2 = optimize_orientations(fwtour, clm, phase, cpus) if tag1 == REJECT and tag2 == REJECT: logging.debug("Terminating ... no more {}".format(ACCEPT)) break phase += 1 fwtour.close()
def simulate(args): """ %prog simulate test Simulate CLM and IDS files with given names. The simulator assumes several distributions: - Links are distributed uniformly across genome - Log10(link_size) are distributed normally - Genes are distributed uniformly """ p = OptionParser(simulate.__doc__) p.add_option("--genomesize", default=10000000, type="int", help="Genome size") p.add_option("--genes", default=1000, type="int", help="Number of genes") p.add_option("--contigs", default=100, type="int", help="Number of contigs") p.add_option("--coverage", default=10, type="int", help="Link coverage") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) pf, = args GenomeSize = opts.genomesize Genes = opts.genes Contigs = opts.contigs Coverage = opts.coverage PE = 500 Links = int(GenomeSize * Coverage / PE) # Simulate the contig sizes that sum to GenomeSize # See also: # <https://en.wikipedia.org/wiki/User:Skinnerd/Simplex_Point_Picking> ContigSizes, = np.random.dirichlet([1] * Contigs, 1) * GenomeSize ContigSizes = np.array(np.round_(ContigSizes, decimals=0), dtype=int) ContigStarts = np.zeros(Contigs, dtype=int) ContigStarts[1:] = np.cumsum(ContigSizes)[:-1] # Write IDS file idsfile = pf + ".ids" fw = open(idsfile, "w") for i, s in enumerate(ContigSizes): print >> fw, "tig{:04d}\t{}".format(i, s) fw.close() # Simulate the gene positions GenePositions = np.sort( np.random.random_integers(0, GenomeSize - 1, size=Genes)) write_last_and_beds(pf, GenePositions, ContigStarts) # Simulate links, uniform start, with link distances following 1/x, where x # is the distance between the links. As an approximation, we have link sizes # between [1e3, 1e7], so we map from uniform [1e-7, 1e-3] LinkStarts = np.sort( np.random.random_integers(0, GenomeSize - 1, size=Links)) a, b = 1e-7, 1e-3 LinkSizes = np.array(np.round_(1 / ((b - a) * np.random.rand(Links) + a), decimals=0), dtype="int") LinkEnds = LinkStarts + LinkSizes # Find link to contig membership LinkStartContigs = np.searchsorted(ContigStarts, LinkStarts) - 1 LinkEndContigs = np.searchsorted(ContigStarts, LinkEnds) - 1 # Extract inter-contig links InterContigLinks = (LinkStartContigs != LinkEndContigs) & \ (LinkEndContigs != Contigs) ICLinkStartContigs = LinkStartContigs[InterContigLinks] ICLinkEndContigs = LinkEndContigs[InterContigLinks] ICLinkStarts = LinkStarts[InterContigLinks] ICLinkEnds = LinkEnds[InterContigLinks] # Write CLM file write_clm(pf, ICLinkStartContigs, ICLinkEndContigs, ICLinkStarts, ICLinkEnds, ContigStarts, ContigSizes)
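# Illustrative sketch (assumption, not from the original module): drawing contig
# sizes that sum exactly to a target genome size, as `simulate` above does via a
# symmetric Dirichlet draw (simplex point picking).
def _example_contig_sizes(genomesize=10000, ncontigs=5, seed=42):
    import numpy as np
    rng = np.random.RandomState(seed)
    fracs = rng.dirichlet([1] * ncontigs)    # ncontigs fractions summing to 1
    sizes = np.round(fracs * genomesize).astype(int)
    sizes[-1] += genomesize - sizes.sum()    # absorb rounding drift in the last contig
    return sizes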
def score(args): """ %prog score main_results/ cached_data/ contigsfasta Score the current LACHESIS CLM. """ p = OptionParser(score.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) mdir, cdir, contigsfasta = args orderingfiles = natsorted(iglob(mdir, "*.ordering")) sizes = Sizes(contigsfasta) contig_names = list(sizes.iter_names()) contig_ids = dict((name, i) for (i, name) in enumerate(contig_names)) oo = [] # Load contact matrix glm = op.join(cdir, "all.GLM") N = len(contig_ids) M = np.zeros((N, N), dtype=int) fp = open(glm) for row in fp: if row[0] == '#': continue x, y, z = row.split() if x == 'X': continue M[int(x), int(y)] = int(z) fwtour = open("tour", "w") def callback(tour, gen, oo): fitness = tour.fitness if hasattr(tour, "fitness") else None label = "GA-{0}".format(gen) if fitness: fitness = "{0}".format(fitness).split(",")[0].replace("(", "") label += "-" + fitness print_tour(fwtour, tour, label, contig_names, oo) return tour for ofile in orderingfiles: co = ContigOrdering(ofile) for x in co: contig_id = contig_ids[x.contig_name] oo.append(contig_id) pf = op.basename(ofile).split(".")[0] print pf print oo tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M) # Store INIT tour print_tour(fwtour, tour, "INIT", contig_names, oo) # Faster Cython version for evaluation from .chic import score_evaluate_M callbacki = partial(callback, oo=oo) toolbox = GA_setup(tour) toolbox.register("evaluate", score_evaluate_M, tour_sizes=tour_sizes, tour_M=tour_M) tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus, callback=callbacki) print tour, tour.fitness break fwtour.close()
def main(): p = OptionParser(__doc__) p.add_option( "--switch", help="Rename the seqid with two-column file [default: %default]") p.add_option( "--tree", help="Display trees on the bottom of the figure [default: %default]") p.add_option("--extra", help="Extra features in BED format") p.add_option("--scalebar", default=False, action="store_true", help="Add scale bar to the plot") opts, args, iopts = p.set_image_options(figsize="8x7") if len(args) != 3: sys.exit(not p.print_help()) datafile, bedfile, layoutfile = args switch = opts.switch tree = opts.tree pf = datafile.rsplit(".", 1)[0] fig = plt.figure(1, (iopts.w, iopts.h)) root = fig.add_axes([0, 0, 1, 1]) Synteny(fig, root, datafile, bedfile, layoutfile, switch=switch, tree=tree, extra_features=opts.extra, scalebar=opts.scalebar) root.set_xlim(0, 1) root.set_ylim(0, 1) root.set_axis_off() image_name = pf + "." + iopts.format savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def prepare(args): """ %prog prepare [--options] folder [genome.fasta] Run Trinity on a folder of reads. When paired-end (--paired) mode is on, filenames will be scanned based on whether they contain the patterns ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2."). By default, prepare script for DN If genome.fasta is provided, prepare script for GG If coord-sorted BAM is provided, then it will use it as starting point Newer versions of trinity can take multiple fastq files as input. If "--merge" is specified, the fastq files are merged together before assembling """ p = OptionParser(prepare.__doc__) p.add_option("--paired", default=False, action="store_true", help="Paired-end mode [default: %default]") p.add_option("--merge", default=False, action="store_true", help="Merge individual input fastq's into left/right/single" + \ " file(s) [default: %default]") p.set_trinity_opts() p.set_grid() opts, args = p.parse_args(args) if len(args) not in (1, 2): sys.exit(not p.print_help()) inparam, = args[:1] genome = args[1] if len(args) == 2 else None method = "GG" if genome is not None else "DN" paired = opts.paired merge = opts.merge thome = opts.trinity_home use_bam = opts.use_bam pf = inparam.split(".")[0] tfolder = "{0}_{1}".format(pf, method) cwd = os.getcwd() mkdir(tfolder) os.chdir(tfolder) flist = iglob("../" + inparam, "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz") if paired: f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x] f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x] assert len(f1) == len(f2) if merge: r1, r2 = "left.fastq", "right.fastq" reads = ((f1, r1), (f2, r2)) else: if merge: r = "single.fastq" reads = ((flist, r), ) if merge: for fl, r in reads: fm = FileMerger(fl, r) fm.merge(checkexists=True) cmd = op.join(thome, "Trinity") cmd += " --seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus) cmd += " --min_contig_length {0}".format(opts.min_contig_length) if opts.bflyGCThreads: cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads) if method == "GG": cmd += " --genome {0} --genome_guided_max_intron {1}".format( genome, opts.max_intron) if use_bam: cmd += " --genome_guided_use_bam {0}".format(use_bam) if opts.grid and opts.grid_conf_file: cmd += " --grid_conf_file={0}".format(opts.grid_conf_file) if paired: if merge: cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1]) else: for lf, rf in zip(f1, f2): cmd += " --left {0}".format(lf) cmd += " --right {0}".format(rf) else: if merge: cmd += " --single {0}".format(reads[0][-1]) else: for f in flist: cmd += " --single {0}".format(f) if opts.extra: cmd += " {0}".format(opts.extra) runfile = "run.sh" write_file(runfile, cmd) os.chdir(cwd)
def embed(args): """ %prog embed evidencefile scaffolds.fasta contigs.fasta Use SSPACE evidencefile to scaffold contigs into existing scaffold structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE directly to scaffold. Rules: 1. Only update existing structure by embedding contigs small enough to fit. 2. Promote singleton contigs only if they are big (>= min_length). """ p = OptionParser(embed.__doc__) p.set_mingap(default=10) p.add_option("--min_length", default=200, type="int", help="Minimum length to consider [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) evidencefile, scaffolds, contigs = args min_length = opts.min_length splitfasta, oagp, cagp = gaps( [scaffolds, "--split", "--mingap={0}".format(opts.mingap)]) agp = AGP(cagp) p = agp.graph ef = EvidenceFile(evidencefile, contigs) sizes = ef.sz q = ef.graph logging.debug("Reference graph: {0}".format(p)) logging.debug("Patch graph: {0}".format(q)) newagp = deepcopy(agp) seen = set() deleted = set() for a in agp: if a.is_gap: continue name = a.component_id object = a.object if name in deleted: print >> sys.stderr, "* Skip {0}, already embedded".format(name) continue seen.add(name) target_name, tag = get_target(p, name) path = q.get_path(name, target_name, tag=tag) path_size = sum([sizes[x.v] for x, t in path]) if path else None status = NO_UPDATE # Heuristic, the patch must not be too long if path and path_size > min_length and len(path) > 3: path = None if not path: print >> sys.stderr, name, target_name, path, path_size, status continue backward = False for x, t in path: if x.v in seen: print >> sys.stderr, "* Does not allow backward" \ " patch on {0}".format(x.v) backward = True break if backward: continue # Build the path plus the ends vv = q.get_node(name) path.appendleft((vv, tag)) if tag == ">": path.reverse() status = INSERT_BEFORE elif target_name is None: status = INSERT_AFTER else: target = q.get_node(target_name) path.append((target, tag)) status = INSERT_BETWEEN print >> sys.stderr, name, target_name, path, path_size, status # Trim the ends off from the constructed AGPLines lines = path_to_agp(q, path, object, sizes, status) if status == INSERT_BEFORE: lines = lines[:-1] td = newagp.insert_lines(name, lines, \ delete=True, verbose=True) elif status == INSERT_AFTER: lines = lines[1:] td = newagp.insert_lines(name, lines, after=True, \ delete=True, verbose=True) else: lines = lines[1:-1] td = newagp.update_between(name, target_name, lines, \ delete=True, verbose=True) deleted |= td seen |= td # Recruite big singleton contigs CUTOFF = opts.min_length for ctg, size in sizes.items(): if ctg in seen: continue if size < CUTOFF: continue newagp.append(AGPLine.cline(ctg, ctg, sizes, "?")) # Write a new AGP file newagpfile = "embedded.agp" newagp.print_to_file(newagpfile, index=True) tidy([newagpfile, contigs])
def histogram(args): """ %prog histogram meryl.histogram species K Plot the histogram based on meryl K-mer distribution, species and N are only used to annotate the graphic. """ p = OptionParser(histogram.__doc__) p.add_option("--vmin", dest="vmin", default=1, type="int", help="minimum value, inclusive [default: %default]") p.add_option("--vmax", dest="vmax", default=100, type="int", help="maximum value, inclusive [default: %default]") p.add_option("--pdf", default=False, action="store_true", help="Print PDF instead of ASCII plot [default: %default]") p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]") p.add_option("--nopeaks", default=False, action="store_true", help="Do not annotate K-mer peaks") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) histfile, species, N = args ascii = not opts.pdf peaks = not opts.nopeaks N = int(N) if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"): logging.debug("CA kmer index found") histfile = merylhistogram(histfile) ks = KmerSpectrum(histfile) ks.analyze(K=N) Total_Kmers = int(ks.totalKmers) coverage = opts.coverage Kmer_coverage = ks.max2 if not coverage else coverage Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage)) Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers)) Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage) Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\ format(Genome_size / 1e6) Repetitive_msg = ks.repetitive SNPrate_msg = ks.snprate for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg): print(msg, file=sys.stderr) x, y = ks.get_xy(opts.vmin, opts.vmax) title = "{0} {1}-mer histogram".format(species, N) if ascii: asciiplot(x, y, title=title) return Genome_size plt.figure(1, (6, 6)) plt.plot(x, y, 'g-', lw=2, alpha=.5) ax = plt.gca() if peaks: t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3) tcounts = [(x, y) for x, y in ks.counts if x in t] if tcounts: x, y = zip(*tcounts) tcounts = dict(tcounts) plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w') ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top") ax.text(ks.max2, tcounts[ks.max2], "Main peak") messages = [ Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg, Repetitive_msg, SNPrate_msg ] write_messages(ax, messages) ymin, ymax = ax.get_ylim() ymax = ymax * 7 / 6 ax.set_title(markup(title)) ax.set_ylim((ymin, ymax)) xlabel, ylabel = "Coverage (X)", "Counts" ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) set_human_axis(ax) imagename = histfile.split(".")[0] + ".pdf" savefig(imagename, dpi=100) return Genome_size
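# Illustrative sketch (assumption, not from the original module): the genome size
# estimate printed by `histogram` above is simply the total number of K-mers
# divided by the K-mer coverage at the main peak.
def _example_genome_size(total_kmers, kmer_coverage):
    """E.g. 3e9 23-mers at a ~50x main peak -> ~60 Mb genome."""
    return int(round(total_kmers * 1.0 / kmer_coverage))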
def jellyfish(args): """ %prog jellyfish [*.fastq|*.fasta] Run jellyfish to dump histogram to be used in kmer.histogram(). """ from jcvi.apps.base import getfilesize from jcvi.utils.cbook import human_size p = OptionParser(jellyfish.__doc__) p.add_option("-K", default=23, type="int", help="K-mer size [default: %default]") p.add_option("--coverage", default=40, type="int", help="Expected sequence coverage [default: %default]") p.add_option("--prefix", default="jf", help="Database prefix [default: %default]") p.add_option("--nohist", default=False, action="store_true", help="Do not print histogram [default: %default]") p.set_home("jellyfish") p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) fastqfiles = args K = opts.K coverage = opts.coverage totalfilesize = sum(getfilesize(x) for x in fastqfiles) fq = fastqfiles[0] pf = opts.prefix gzip = fq.endswith(".gz") hashsize = totalfilesize / coverage logging.debug("Total file size: {0}, hashsize (-s): {1}".\ format(human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize)) jfpf = "{0}-K{1}".format(pf, K) jfdb = jfpf fastqfiles = " ".join(fastqfiles) jfcmd = op.join(opts.jellyfish_home, "jellyfish") cmd = jfcmd cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf) cmd += " -s {0} -m {1}".format(hashsize, K) if gzip: cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0" else: cmd += " " + fastqfiles if need_update(fastqfiles, jfdb): sh(cmd) if opts.nohist: return jfhisto = jfpf + ".histogram" cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto) if need_update(jfdb, jfhisto): sh(cmd)
def scaffold(args): """ %prog scaffold ctgfasta reads1.fasta mapping1.bed reads2.fasta mapping2.bed ... Run BAMBUS on set of contigs, reads and read mappings. """ from jcvi.formats.base import FileMerger from jcvi.formats.bed import mates from jcvi.formats.contig import frombed from jcvi.formats.fasta import join from jcvi.utils.iter import grouper p = OptionParser(scaffold.__doc__) p.set_rclip(rclip=1) p.add_option("--conf", help="BAMBUS configuration file [default: %default]") p.add_option("--prefix", default=False, action="store_true", help="Only keep links between IDs with same prefix [default: %default]") opts, args = p.parse_args(args) nargs = len(args) if nargs < 3 or nargs % 2 != 1: sys.exit(not p.print_help()) rclip = opts.rclip ctgfasta = args[0] duos = list(grouper(args[1:], 2)) trios = [] for fastafile, bedfile in duos: prefix = bedfile.rsplit(".", 1)[0] matefile = prefix + ".mates" matebedfile = matefile + ".bed" if need_update(bedfile, [matefile, matebedfile]): matesopt = [bedfile, "--lib", "--nointra", "--rclip={0}".format(rclip), "--cutoff={0}".format(opts.cutoff)] if opts.prefix: matesopt += ["--prefix"] matefile, matebedfile = mates(matesopt) trios.append((fastafile, matebedfile, matefile)) # Merge the readfasta, bedfile and matefile bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates" for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)): FileMerger(files, outfile=outfile).merge(checkexists=True) ctgfile = "bambus.contig" idsfile = "bambus.ids" frombedInputs = [bbbed, ctgfasta, bbfasta] if need_update(frombedInputs, ctgfile): frombed(frombedInputs) inputfasta = "bambus.contigs.fasta" singletonfasta = "bambus.singletons.fasta" cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile) sh(cmd + inputfasta) sh(cmd + singletonfasta + " -exclude") # Run bambus prefix = "bambus" cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix) if opts.conf: cmd += " -C {0}".format(opts.conf) sh(cmd) cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".\ format(prefix) sh(cmd) final = "final" cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \ "-merge -detail -oo -sum -o {1}".format(prefix, final) sh(cmd) oofile = final + ".oo" join([inputfasta, "--oo={0}".format(oofile)])
def ld(args): """ %prog ld map Calculate pairwise linkage disequilibrium given MSTmap. """ from random import sample from jcvi.algorithms.matrix import symmetrize p = OptionParser(ld.__doc__) p.add_option( "--subsample", default=1000, type="int", help="Subsample markers to speed up", ) opts, args, iopts = p.set_image_options(args, figsize="8x8") if len(args) != 1: sys.exit(not p.print_help()) (mstmap, ) = args subsample = opts.subsample data = MSTMap(mstmap) markerbedfile = mstmap + ".subsample.bed" ldmatrix = mstmap + ".subsample.matrix" # Take random subsample while keeping marker order if subsample < data.nmarkers: data = [data[x] for x in sorted(sample(range(len(data)), subsample))] else: logging.debug("Use all markers, --subsample ignored") nmarkers = len(data) if need_update(mstmap, (ldmatrix, markerbedfile)): fw = open(markerbedfile, "w") print("\n".join(x.bedline for x in data), file=fw) logging.debug("Write marker set of size {0} to file `{1}`.".format( nmarkers, markerbedfile)) fw.close() M = np.zeros((nmarkers, nmarkers), dtype=float) for i, j in combinations(range(nmarkers), 2): a = data[i] b = data[j] M[i, j] = calc_ldscore(a.genotype, b.genotype) M = symmetrize(M) logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix)) M.tofile(ldmatrix) else: nmarkers = len(Bed(markerbedfile)) M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers) logging.debug("LD matrix `{0}` exists ({1}x{1}).".format( ldmatrix, nmarkers)) from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap plt.rcParams["axes.linewidth"] = 0 fig = plt.figure(1, (iopts.w, iopts.h)) root = fig.add_axes([0, 0, 1, 1]) ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # the heatmap ax.matshow(M, cmap=iopts.cmap) # Plot chromosomes breaks bed = Bed(markerbedfile) xsize = len(bed) extent = (0, nmarkers) chr_labels = [] ignore_size = 20 for (seqid, beg, end) in bed.get_breaks(): ignore = abs(end - beg) < ignore_size pos = (beg + end) / 2 chr_labels.append((seqid, pos, ignore)) if ignore: continue ax.plot((end, end), extent, "w-", lw=1) ax.plot(extent, (end, end), "w-", lw=1) # Plot chromosome labels for label, pos, ignore in chr_labels: pos = 0.1 + pos * 0.8 / xsize if not ignore: root.text(pos, 0.91, label, ha="center", va="bottom", rotation=45, color="grey") root.text(0.09, pos, label, ha="right", va="center", color="grey") ax.set_xlim(extent) ax.set_ylim(extent) ax.set_axis_off() draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap) root.add_patch(Rectangle((0.1, 0.1), 0.8, 0.8, fill=False, ec="k", lw=2)) m = mstmap.split(".")[0] root.text(0.5, 0.06, "Linkage Disequilibrium between {0} markers".format(m), ha="center") root.set_xlim(0, 1) root.set_ylim(0, 1) root.set_axis_off() image_name = m + ".subsample" + "." + iopts.format savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer histograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    if len(args) < 1:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([.08, .12, .38, .76])
    B = fig.add_axes([.58, .12, .38, .76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        line, = A.plot(x, y, '-', lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc='w')
    t = np.linspace(opts.kmin - .5, opts.kmax + .5, 100)
    p = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, p(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((.04, .96, 'A'), (.54, .96, 'B'))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
def demo(args): """ %prog demo Draw sample gene features to illustrate the various fates of duplicate genes - to be used in a book chapter. """ p = OptionParser(demo.__doc__) opts, args = p.parse_args(args) fig = plt.figure(1, (8, 5)) root = fig.add_axes([0, 0, 1, 1]) panel_space = .23 dup_space = .025 # Draw a gene and two regulatory elements at these arbitrary locations locs = [ (.5, .9), # ancestral gene (.5, .9 - panel_space + dup_space), # identical copies (.5, .9 - panel_space - dup_space), (.5, .9 - 2 * panel_space + dup_space), # degenerate copies (.5, .9 - 2 * panel_space - dup_space), (.2, .9 - 3 * panel_space + dup_space), # sub-functionalization (.2, .9 - 3 * panel_space - dup_space), (.5, .9 - 3 * panel_space + dup_space), # neo-functionalization (.5, .9 - 3 * panel_space - dup_space), (.8, .9 - 3 * panel_space + dup_space), # non-functionalization (.8, .9 - 3 * panel_space - dup_space), ] default_regulator = "gm" regulators = [ default_regulator, default_regulator, default_regulator, "wm", default_regulator, "wm", "gw", "wb", default_regulator, "ww", default_regulator, ] width = .24 for i, (xx, yy) in enumerate(locs): regulator = regulators[i] x1, x2 = xx - .5 * width, xx + .5 * width Glyph(root, x1, x2, yy) if i == 9: # upper copy for non-functionalization continue # coding region x1, x2 = xx - .16 * width, xx + .45 * width Glyph(root, x1, x2, yy, fc="k") # two regulatory elements x1, x2 = xx - .4 * width, xx - .28 * width for xx, fc in zip((x1, x2), regulator): if fc == 'w': continue DoubleCircle(root, xx, yy, fc=fc) rotation = 30 tip = .02 if i == 0: ya = yy + tip root.text(x1, ya, "Flower", rotation=rotation, va="bottom") root.text(x2, ya, "Root", rotation=rotation, va="bottom") elif i == 7: ya = yy + tip root.text(x2, ya, "Leaf", rotation=rotation, va="bottom") # Draw arrows between panels (center) arrow_dist = .08 ar_xpos = .5 for ar_ypos in (.3, .53, .76): root.annotate(" ", (ar_xpos, ar_ypos), (ar_xpos, ar_ypos + arrow_dist), arrowprops=arrowprops) ar_ypos = .3 for ar_xpos in (.2, .8): root.annotate(" ", (ar_xpos, ar_ypos), (.5, ar_ypos + arrow_dist), arrowprops=arrowprops) # Duplication, Degeneration xx = .6 ys = (.76, .53) processes = ("Duplication", "Degeneration") for yy, process in zip(ys, processes): root.text(xx, yy + .02, process, fontweight="bold") # Label of fates xs = (.2, .5, .8) fates = ("Subfunctionalization", "Neofunctionalization", "Nonfunctionalization") yy = .05 for xx, fate in zip(xs, fates): RoundLabel(root, xx, yy, fate) root.set_xlim(0, 1) root.set_ylim(0, 1) root.set_axis_off() figname = "demo.pdf" savefig(figname, dpi=300)
def kmc(args): """ %prog kmc folder Run kmc3 on Illumina reads. """ p = OptionParser(kmc.__doc__) p.add_option("-k", default=21, type="int", help="Kmer size") p.add_option("--ci", default=2, type="int", help="Exclude kmers with less than ci counts") p.add_option("--cs", default=2, type="int", help="Maximal value of a counter") p.add_option("--cx", default=None, type="int", help="Exclude kmers with more than cx counts") p.add_option("--single", default=False, action="store_true", help="Input is single-end data, only one FASTQ/FASTA") p.add_option("--fasta", default=False, action="store_true", help="Input is FASTA instead of FASTQ") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) folder, = args K = opts.k n = 1 if opts.single else 2 pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \ "*.fq,*.fq.gz,*.fastq,*.fastq.gz" mm = MakeManager() for p, pf in iter_project(folder, pattern=pattern, n=n, commonprefix=False): pf = pf.split("_")[0] + ".ms{}".format(K) infiles = pf + ".infiles" fw = open(infiles, "w") print("\n".join(p), file=fw) fw.close() cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus) cmd += " -ci{} -cs{}".format(opts.ci, opts.cs) if opts.cx: cmd += " -cx{}".format(opts.cx) if opts.fasta: cmd += " -fm" cmd += " @{} {} .".format(infiles, pf) outfile = pf + ".kmc_suf" mm.add(p, outfile, cmd) mm.write()
        grouper.join(g.parent, g.left_child, g.right_child)

    parents = {}
    for i, group in enumerate(grouper):
        for g in group:
            parents[g] = i

    partitions = [[parents.get(a, x), x] for a, x in names]
    for key, parts in groupby(partitions, key=lambda x: x[0]):
        yield list(x[1] for x in parts)


def main(args):
    cdt_file, nwk_file = args
    cdt = CDT(cdt_file)
    cdt.get_gtr_tree()
    cdt.print_newick(nwk_file)


if __name__ == '__main__':
    p = OptionParser(__doc__)
    opts, args = p.parse_args()

    if len(args) != 2:
        sys.exit(not p.print_help())

    main(args)
def dotplot(args): """ %prog dotplot map.csv ref.fasta Make dotplot between chromosomes and linkage maps. The input map is csv formatted, for example: ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition scaffold_2707,11508,1,0 scaffold_2707,11525,1,1.2 """ from natsort import natsorted from jcvi.assembly.allmaps import CSVMapLine from jcvi.formats.sizes import Sizes from jcvi.graphics.base import shorten from jcvi.graphics.dotplot import ( plt, savefig, markup, normalize_axes, downsample, plot_breaks_and_labels, thousands, ) p = OptionParser(dotplot.__doc__) p.set_outfile(outfile=None) opts, args, iopts = p.set_image_options(args, figsize="8x8", style="dark", dpi=90, cmap="copper") if len(args) != 2: sys.exit(not p.print_help()) csvfile, fastafile = args sizes = natsorted(Sizes(fastafile).mapping.items()) seen = set() raw_data = [] fig = plt.figure(1, (iopts.w, iopts.h)) root = fig.add_axes([0, 0, 1, 1]) # the whole canvas ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # the dot plot fp = must_open(csvfile) for row in fp: m = CSVMapLine(row) seen.add(m.seqid) raw_data.append(m) # X-axis is the genome assembly ctgs, ctg_sizes = zip(*sizes) xsize = sum(ctg_sizes) qb = list(np.cumsum(ctg_sizes)) qbreaks = list(zip(ctgs, [0] + qb, qb)) qstarts = dict(zip(ctgs, [0] + qb)) # Y-axis is the map key = lambda x: x.lg raw_data.sort(key=key) ssizes = {} for lg, d in groupby(raw_data, key=key): ssizes[lg] = max([x.cm for x in d]) ssizes = natsorted(ssizes.items()) lgs, lg_sizes = zip(*ssizes) ysize = sum(lg_sizes) sb = list(np.cumsum(lg_sizes)) sbreaks = list(zip([("LG" + x) for x in lgs], [0] + sb, sb)) sstarts = dict(zip(lgs, [0] + sb)) # Re-code all the scatter dots data = [(qstarts[x.seqid] + x.pos, sstarts[x.lg] + x.cm, "g") for x in raw_data if (x.seqid in qstarts)] npairs = downsample(data) x, y, c = zip(*data) ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0) # Flip X-Y label gy, gx = op.basename(csvfile).split(".")[:2] gx, gy = shorten(gx, maxchar=30), shorten(gy, maxchar=30) xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize, qbreaks, sbreaks) ax.set_xlim(xlim) ax.set_ylim(ylim) title = "Alignment: {} vs {}".format(gx, gy) title += " ({} markers)".format(thousands(npairs)) root.set_title(markup(title), x=0.5, y=0.96, color="k") logging.debug(title) normalize_axes(root) image_name = opts.outfile or (csvfile.rsplit(".", 1)[0] + "." + iopts.format) savefig(image_name, dpi=iopts.dpi, iopts=iopts) fig.clear()
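# Illustrative sketch (hypothetical helper, not part of the original module): the
# cumulative-offset trick used by `dotplot` above to place per-contig positions
# onto one concatenated axis.
def _example_concat_offsets(sizes):
    """sizes: list of (name, length) -> dict of name -> start offset on the joint axis."""
    import numpy as np
    names, lengths = zip(*sizes)
    starts = np.concatenate(([0], np.cumsum(lengths)[:-1]))
    return dict(zip(names, starts.tolist()))

# e.g. [("chr1", 100), ("chr2", 50)] -> {"chr1": 0, "chr2": 100}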
def rename(args): """ %prog rename genes.bed [gaps.bed] Rename genes for annotation release. For genes on chromosomes (e.g. the 12th gene on C1): Bo1g00120 For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285): Bo00285s120 The genes identifiers will increment by 10. So assuming no gap, these are the consecutive genes: Bo1g00120, Bo1g00130, Bo1g00140... Bo00285s120, Bo00285s130, Bo00285s140... When we encounter gaps, we would like the increment to be larger. For example, Bo1g00120, <gap>, Bo1g01120... Gaps bed file is optional. """ import string p = OptionParser(rename.__doc__) p.add_option( "-a", dest="gene_increment", default=10, type="int", help="Increment for continuous genes", ) p.add_option( "-b", dest="gap_increment", default=1000, type="int", help="Increment for gaps", ) p.add_option( "--pad0", default=6, type="int", help="Pad gene identifiers with 0", ) p.add_option( "--spad0", default=4, type="int", help="Pad gene identifiers on small scaffolds", ) p.add_option("--prefix", default="Bo", help="Genome prefix") p.add_option( "--jgi", default=False, action="store_true", help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1", ) opts, args = p.parse_args(args) if len(args) not in (1, 2): sys.exit(not p.print_help()) genebed = args[0] gapbed = args[1] if len(args) == 2 else None prefix = opts.prefix gene_increment = opts.gene_increment gap_increment = opts.gap_increment genes = Bed(genebed) if gapbed: fp = open(gapbed) for row in fp: genes.append(BedLine(row)) genes.sort(key=genes.key) idsfile = prefix + ".ids" newbedfile = prefix + ".bed" gap_increment -= gene_increment assert gap_increment >= 0 if opts.jgi: prefix += "." fw = open(idsfile, "w") for chr, lines in groupby(genes, key=lambda x: x.seqid): lines = list(lines) pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0 isChr = chr[0].upper() == "C" digits = "".join(x for x in chr if x in string.digits) gs = "g" if isChr else "s" pp = prefix + digits + gs idx = 0 if isChr: idx += gap_increment for r in lines: isGap = r.strand not in ("+", "-") if isGap: idx += gap_increment continue else: idx += gene_increment accn = pp + "{0:0{1}d}".format(idx, pad0) oldaccn = r.accn print("\t".join((oldaccn, accn)), file=fw) r.accn = accn genes.print_to_file(newbedfile) logging.debug("Converted IDs written to `{0}`.".format(idsfile)) logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def main(args): p = OptionParser(__doc__) p.set_beds() p.add_option( "--quota", default="1:1", help="`quota mapping` procedure -- screen blocks to constrain mapping" " (useful for orthology), " "put in the format like (#subgenomes expected for genome X):" "(#subgenomes expected for genome Y)", ) p.add_option( "--Nm", dest="Nmax", type="int", default=10, help="distance cutoff to tolerate two blocks that are " "slightly overlapping (cutoff for `quota mapping`) " "[default: %default units (gene or bp dist)]", ) p.add_option( "--self", dest="self_match", action="store_true", default=False, help="you might turn this on when screening paralogous blocks, " "esp. if you have reduced mirrored blocks into non-redundant set", ) p.set_verbose(help="Show verbose solver output") p.add_option( "--screen", default=False, action="store_true", help="generate new anchors file", ) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) (qa_file, ) = args _, _, qorder, sorder, _ = check_beds(qa_file, p, opts) # sanity check for the quota if opts.quota: try: qa, qb = opts.quota.split(":") qa, qb = int(qa), int(qb) except ValueError: logging.error( "quota string should be the form x:x (2:4, 1:3, etc.)") sys.exit(1) if opts.self_match and qa != qb: raise Exception("when comparing genome to itself, " "quota must be the same number " "(like 1:1, 2:2) you have %s" % opts.quota) quota = (qa, qb) self_match = opts.self_match clusters = read_clusters(qa_file, qorder, sorder) for cluster in clusters: assert len(cluster) > 0 # below runs `quota mapping` work_dir = op.join(op.dirname(op.abspath(qa_file)), "work") selected_ids = solve_lp( clusters, quota, work_dir=work_dir, Nmax=opts.Nmax, self_match=self_match, verbose=opts.verbose, ) logging.debug("Selected %d blocks", len(selected_ids)) prefix = qa_file.rsplit(".", 1)[0] suffix = "{}x{}".format(qa, qb) outfile = ".".join((prefix, suffix)) fw = must_open(outfile, "w") print(",".join(str(x) for x in selected_ids), file=fw) fw.close() logging.debug("Screened blocks ids written to `%s`", outfile) if opts.screen: from jcvi.compara.synteny import screen new_qa_file = ".".join((prefix, suffix, "anchors")) largs = [qa_file, new_qa_file, "--ids", outfile] if opts.qbed and opts.sbed: largs += ["--qbed={0}".format(opts.qbed)] largs += ["--sbed={0}".format(opts.sbed)] screen(largs)
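# Illustrative sketch (not the library routine): parsing the --quota string into an
# integer pair and applying the self-match sanity check performed above.
def _example_parse_quota(quota="1:1", self_match=False):
    qa, qb = (int(x) for x in quota.split(":"))
    if self_match and qa != qb:
        raise ValueError("self comparison requires a symmetric quota, e.g. 1:1 or 2:2")
    return qa, qb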
def renumber(args): """ %prog renumber Mt35.consolidated.bed > tagged.bed Renumber genes for annotation updates. """ from jcvi.algorithms.lis import longest_increasing_subsequence from jcvi.utils.grouper import Grouper p = OptionParser(renumber.__doc__) p.set_annot_reformat_opts() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) (bedfile, ) = args pf = bedfile.rsplit(".", 1)[0] abedfile = pf + ".a.bed" bbedfile = pf + ".b.bed" if need_update(bedfile, (abedfile, bbedfile)): prepare(bedfile) mbed = Bed(bbedfile) g = Grouper() for s in mbed: accn = s.accn g.join(*accn.split(";")) bed = Bed(abedfile) for chr, sbed in bed.sub_beds(): current_chr = chr_number(chr) if not current_chr: continue ranks = [] gg = set() for s in sbed: accn = s.accn achr, arank = atg_name(accn) if achr != current_chr: continue ranks.append(arank) gg.add(accn) lranks = longest_increasing_subsequence(ranks) print( current_chr, len(sbed), "==>", len(ranks), "==>", len(lranks), file=sys.stderr, ) granks = set( gene_name( current_chr, x, prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc) for x in lranks) | set( gene_name(current_chr, x, prefix=opts.prefix, pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks) tagstore = {} for s in sbed: achr, arank = atg_name(s.accn) accn = s.accn if accn in granks: tag = (accn, FRAME) elif accn in gg: tag = (accn, RETAIN) else: tag = (".", NEW) tagstore[accn] = tag # Find cases where genes overlap for s in sbed: accn = s.accn gaccn = g[accn] tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn] group = [(PRIORITY.index(tag), x) for tag, x in tags] best = min(group)[-1] if accn != best: tag = (best, OVERLAP) else: tag = tagstore[accn] print("\t".join((str(s), "|".join(tag))))
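# Illustrative sketch of the longest increasing subsequence idea used above to keep
# the subset of gene ranks that already sit in consistent chromosomal order. This is
# a generic bisect-based LIS, not jcvi's implementation.
def _example_lis(ranks):
    from bisect import bisect_left

    idx, prev = [], [None] * len(ranks)
    for i, r in enumerate(ranks):
        j = bisect_left([ranks[t] for t in idx], r)
        prev[i] = idx[j - 1] if j > 0 else None
        if j == len(idx):
            idx.append(i)
        else:
            idx[j] = i
    # Walk the predecessor links back from the last tail to recover one LIS
    out, i = [], idx[-1] if idx else None
    while i is not None:
        out.append(ranks[i])
        i = prev[i]
    return out[::-1]  # _example_lis([10, 30, 20, 40]) -> [10, 20, 40]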
def main(): """ %prog database.fa query.fa [options] Wrapper for NCBI BLAST+. """ p = OptionParser(main.__doc__) p.add_option("--format", default=" \'6 qseqid sseqid pident length " \ "mismatch gapopen qstart qend sstart send evalue bitscore\' ", help="0-11, learn more with \"blastp -help\". [default: %default]") p.add_option("--path", dest="blast_path", default=None, help="specify BLAST+ path including the program name") p.add_option("--prog", dest="blast_program", default="blastp", help="specify BLAST+ program to use. See complete list here: " \ "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation" " [default: %default]") p.set_align(evalue=.01) p.add_option("--best", default=1, type="int", help="Only look for best N hits [default: %default]") p.set_cpus() p.set_params() p.set_outfile() opts, args = p.parse_args() if len(args) != 2 or opts.blast_program is None: sys.exit(not p.print_help()) bfasta_fn, afasta_fn = args for fn in (afasta_fn, bfasta_fn): assert op.exists(fn) afasta_fn = op.abspath(afasta_fn) bfasta_fn = op.abspath(bfasta_fn) out_fh = must_open(opts.outfile, "w") extra = opts.extra blast_path = opts.blast_path blast_program = opts.blast_program blast_bin = blast_path or blast_program if op.basename(blast_bin) != blast_program: blast_bin = "".join([blast_bin, "/", blast_program]) cpus = opts.cpus if cpus > 1: logging.debug("Dispatch job to %d cpus" % cpus) outdir = "outdir" fs = split([afasta_fn, outdir, str(cpus)]) queries = fs.names else: queries = [afasta_fn] dbtype = "prot" if op.basename(blast_bin) in ["blastp", "blastx"] \ else "nucl" db = bfasta_fn if dbtype == "prot": nin = db + ".pin" else: nin = db + ".nin" nin00 = db + ".00.nin" nin = nin00 if op.exists(nin00) else (db + ".nin") run_formatdb(infile=db, outfile=nin, dbtype=dbtype) lock = Lock() blastplus_template = "{0} -db {1} -outfmt {2}" blast_cmd = blastplus_template.format(blast_bin, bfasta_fn, opts.format) blast_cmd += " -evalue {0} -max_target_seqs {1}".\ format(opts.evalue, opts.best) if extra: blast_cmd += " " + extra.strip() args = [(k + 1, cpus, out_fh, blast_cmd, query, lock) \ for k, query in zip(range(cpus), queries)] g = Jobs(target=blastplus, args=args) g.run()
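# Illustrative, single-process sketch of the command string assembled above, using
# standard BLAST+ options only (-db, -outfmt, -evalue, -max_target_seqs, -query,
# -out). File names are toy values; it assumes `blastp` is on PATH and that the
# database has already been formatted.
def _example_blast_cmd(query="query.fa", db="database.fa"):
    outfmt = ("6 qseqid sseqid pident length mismatch gapopen "
              "qstart qend sstart send evalue bitscore")
    return ("blastp -db {db} -outfmt '{fmt}' -evalue 0.01 "
            "-max_target_seqs 1 -query {q} -out {q}.blast").format(db=db, fmt=outfmt, q=query)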
def plot(args): """ %prog plot tagged.new.bed chr1 Plot gene identifiers along a particular chromosome, often to illustrate the gene id assignment procedure. """ from jcvi.graphics.base import plt, savefig from jcvi.graphics.chromosome import ChromosomeMap p = OptionParser(plot.__doc__) p.add_option("--firstn", type="int", help="Only plot the first N genes") p.add_option("--ymax", type="int", help="Y-axis max value") p.add_option("--log", action="store_true", help="Write plotting data") opts, args, iopts = p.set_image_options(args, figsize="6x4") if len(args) != 2: sys.exit(not p.print_help()) taggedbed, chr = args bed = Bed(taggedbed) beds = list(bed.sub_bed(chr)) old, new = [], [] i = 0 for b in beds: accn = b.extra[0] if "te" in accn: continue accn, tag = accn.split("|") if tag == "OVERLAP": continue c, r = atg_name(accn) if tag == "NEW": new.append((i, r)) else: old.append((i, r)) i += 1 ngenes = i assert ngenes == len(new) + len(old) logging.debug("Imported {0} ranks on {1}.".format(ngenes, chr)) fig = plt.figure(1, (iopts.w, iopts.h)) root = fig.add_axes([0, 0, 1, 1]) xstart, xend = 0.2, 0.8 ystart, yend = 0.2, 0.8 pad = 0.02 ngenes = opts.firstn or ngenes ymax = opts.ymax or 500000 title = "Assignment of Medtr identifiers" if opts.ymax: subtitle = "{0}, first {1} genes".format(chr, ngenes) else: subtitle = "{0}, {1} genes ({2} new)".format(chr, ngenes, len(new)) chr_map = ChromosomeMap(fig, root, xstart, xend, ystart, yend, pad, 0, ymax, 5, title, subtitle) ax = chr_map.axes if opts.log: from jcvi.utils.table import write_csv header = ["x", "y"] write_csv(header, new, filename=chr + ".new") write_csv(header, old, filename=chr + ".old") x, y = zip(*new) ax.plot(x, y, "b,") x, y = zip(*old) ax.plot(x, y, "r,") # Legends ymid = (ystart + yend) / 2 y = ymid + pad root.plot([0.2], [y], "r.", lw=2) root.text(0.2 + pad, y, "Existing Medtr ids", va="center", size=10) y = ymid - pad root.plot([0.2], [y], "b.", lw=2) root.text(0.2 + pad, y, "Newly instantiated ids", va="center", size=10) ax.set_xlim(0, ngenes) ax.set_ylim(0, ymax) ax.set_axis_off() root.set_xlim(0, 1) root.set_ylim(0, 1) root.set_axis_off() image_name = chr + ".identifiers." + iopts.format savefig(image_name, dpi=iopts.dpi, iopts=iopts)
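# Illustrative sketch (toy identifiers): splitting tagged accessions of the form
# "<accn>|<tag>" into existing vs. newly instantiated series, which is the data
# preparation step plot() performs before drawing the two point series.
def _example_split_tagged(tagged=("Medtr1g004960|FRAME", "Medtr1g004970|NEW")):
    old, new = [], []
    for i, nametag in enumerate(tagged):
        accn, tag = nametag.split("|")
        (new if tag == "NEW" else old).append((i, accn))
    return old, new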
def annotate(args): """ %prog annotate new.bed old.bed 2> log Annotate the `new.bed` with features from `old.bed` for the purpose of gene numbering. Ambiguity in ID assignment can be resolved by either of the following 2 methods: - `alignment`: make use of global sequence alignment score (calculated by `needle`) - `overlap`: make use of overlap length (calculated by `intersectBed`) Transfer over as many identifiers as possible while following guidelines: http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing Note: Following RegExp pattern describes the structure of the identifier assigned to features in the `new.bed` file. new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+") Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144 Adjust the value of `new_id_pat` manually as per your ID naming conventions. """ from jcvi.utils.grouper import Grouper valid_resolve_choices = ["alignment", "overlap"] p = OptionParser(annotate.__doc__) p.add_option( "--resolve", default="alignment", choices=valid_resolve_choices, help="Resolve ID assignment based on a certain metric", ) p.add_option( "--atg_name", default=False, action="store_true", help="Specify is locus IDs in `new.bed` file follow ATG nomenclature", ) g1 = OptionGroup( p, "Optional parameters (alignment):\n" + "Use if resolving ambiguities based on sequence `alignment`", ) g1.add_option( "--pid", dest="pid", default=35.0, type="float", help="Percent identity cutoff", ) g1.add_option( "--score", dest="score", default=250.0, type="float", help="Alignment score cutoff", ) p.add_option_group(g1) g2 = OptionGroup( p, "Optional parameters (overlap):\n" + "Use if resolving ambiguities based on `overlap` length\n" + "Parameters equivalent to `intersectBed`", ) g2.add_option( "-f", dest="f", default=0.5, type="float", help="Minimum overlap fraction (0.0 - 1.0)", ) g2.add_option( "-r", dest="r", default=False, action="store_true", help="Require fraction overlap to be reciprocal", ) g2.add_option( "-s", dest="s", default=True, action="store_true", help="Require same strandedness", ) p.add_option_group(g2) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) nbedfile, obedfile = args npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0] # Make consolidated.bed cbedfile = "consolidated.bed" if not os.path.isfile(cbedfile): consolidate(nbedfile, obedfile, cbedfile) else: logging.warning("`{0}` already exists. Skipping step".format(cbedfile)) logging.warning("Resolving ID assignment ambiguity based on `{0}`".format( opts.resolve)) if opts.resolve == "alignment": # Get pairs and prompt to run needle pairsfile = "nw.pairs" scoresfile = "nw.scores" if not os.path.isfile(pairsfile): get_pairs(cbedfile, pairsfile) else: logging.warning( "`{0}` already exists. Checking for needle output".format( pairsfile)) # If needle scores do not exist, prompt user to run needle if not os.path.isfile(scoresfile): logging.error( "`{0}` does not exist. Please process {1} using `needle`". format(scoresfile, pairsfile)) sys.exit() else: scoresfile = "ovl.scores" # Calculate overlap length using intersectBed calculate_ovl(nbedfile, obedfile, opts, scoresfile) logging.warning( "`{0}' exists. 
Storing scores in memory".format(scoresfile)) scores = read_scores(scoresfile, opts) # Iterate through consolidated bed and # filter piles based on score abedline = {} cbed = Bed(cbedfile) g = Grouper() for c in cbed: accn = c.accn g.join(*accn.split(";")) nbedline = {} nbed = Bed(nbedfile) for line in nbed: nbedline[line.accn] = line splits = set() for chr, chrbed in nbed.sub_beds(): abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits) if splits is not None: abedline = process_splits(splits, scores, nbedline, abedline) abedfile = npf + ".annotated.bed" afh = open(abedfile, "w") for accn in abedline: print(abedline[accn], file=afh) afh.close() sort([abedfile, "-i"])
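# Illustrative check of the `new_id_pat` regular expression quoted in the annotate()
# docstring above, run against the example identifiers it lists. As the docstring
# advises, the pattern should be adjusted to your own ID naming convention.
def _example_new_id_pat():
    import re

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")
    ids = ["23231.m312389", "23231.t004898", "23231.tRNA.144"]
    return [bool(new_id_pat.match(x)) for x in ids]  # [True, True, True]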
def reindex(args): """ %prog reindex gffile pep.fasta ref.pep.fasta Reindex the splice isoforms (mRNA) in input GFF file, preferably generated after PASA annotation update In the input GFF file, there can be several types of mRNA within a locus: * CDS matches reference, UTR extended, inherits reference mRNA ID * CDS (slightly) different from reference, inherits reference mRNA ID * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2" * Multiple mRNA collapsed due to shared structure, have IDs like "LOCUS.1-LOCUS.1.1" In the case of multiple mRNA which have inherited the same reference mRNA ID, break ties by comparing the new protein with the reference protein using EMBOSS `needle` to decide which mRNA retains ID and which is assigned a new ID. All mRNA identifiers should follow the AGI naming conventions. When reindexing the isoform identifiers, order mRNA based on: * decreasing transcript length * decreasing support from multiple input datasets used to run pasa.consolidate() """ from jcvi.formats.gff import make_index from jcvi.formats.fasta import Fasta from jcvi.apps.emboss import needle from jcvi.formats.base import FileShredder from tempfile import mkstemp p = OptionParser(reindex.__doc__) p.add_option("--scores", type="str", help="read from existing EMBOSS `needle` scores file") p.set_outfile() opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) gffile, pep, refpep, = args gffdb = make_index(gffile) reffasta = Fasta(refpep) if not opts.scores: fh, pairsfile = mkstemp(prefix="pairs", suffix=".txt", dir=".") fw = must_open(pairsfile, "w") conflict, novel = AutoVivification(), {} for gene in gffdb.features_of_type("gene", order_by=("seqid", "start")): geneid = atg_name(gene.id, retval="locus") novel[geneid] = [] updated_mrna, hybrid_mrna = [], [] for mrna in gffdb.children(gene, featuretype="mRNA", order_by=("seqid", "start")): if re.match(atg_name_pat, mrna.id) is not None and "_" not in mrna.id: pf, mrnaid = parse_prefix(mrna.id) mlen = gffdb.children_bp(mrna, child_featuretype="exon") if "-" in mrna.id: hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf))) else: updated_mrna.append((mrna.id, mrna.start, mlen, len(pf))) for mrna in sorted(updated_mrna, key=lambda k: (k[1], -k[2], -k[3])): pf, mrnaid = parse_prefix(mrna[0]) mstart, mlen = mrna[1], mrna[2] iso = atg_name(mrnaid, retval="iso") newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid)) if iso == newiso: if iso not in conflict[geneid]: conflict[geneid][iso] = [] conflict[geneid][iso].append( (mrna[0], iso, newiso, mstart, mlen, len(pf))) else: novel[geneid].append( (mrna[0], None, newiso, mstart, mlen, len(pf))) for mrna in sorted(hybrid_mrna, key=lambda k: (k[1], -k[2], -k[3])): pf, mrnaid = parse_prefix(mrna[0]) mstart, mlen = mrna[1], mrna[2] _iso, _newiso = [], [] for id in sorted(mrnaid.split("-")): a = atg_name(id, retval="iso") b = "{0}{1}".format(a, re.sub(atg_name_pat, "", id)) _iso.append(a) _newiso.append(b) _novel = None newiso = "-".join(str(x) for x in set(_newiso)) for iso, niso in zip(_iso, _newiso): if iso == niso: if iso not in conflict[geneid]: conflict[geneid][iso] = [(mrna[0], iso, newiso, mstart, mlen, len(pf))] _novel = None break _novel = True if _novel is not None: novel[geneid].append( (mrna[0], None, newiso, mstart, mlen, len(pf))) if not opts.scores: for isoform in sorted(conflict[geneid]): mrnaid = "{0}.{1}".format(geneid, isoform) if mrnaid in reffasta.keys(): for mrna in conflict[geneid][isoform]: print("\t".join(str(x) for x in (mrnaid, 
mrna[0])), file=fw) scoresfile = None if not opts.scores: fw.close() needle([pairsfile, refpep, pep]) FileShredder([pairsfile], verbose=False) scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0]) else: scoresfile = opts.scores scores = read_scores(scoresfile, sort=True, trimsuffix=False) primary = {} for geneid in conflict: primary[geneid] = [] for iso in sorted(conflict[geneid]): conflict[geneid][iso].sort(key=lambda k: (k[3], -k[4], -k[5])) _iso = "{0}.{1}".format(geneid, iso) if _iso not in scores: novel[geneid].extend(conflict[geneid][iso]) continue top_score = scores[_iso][0][1] result = next( (i for i, v in enumerate(conflict[geneid][iso]) if v[0] == top_score), None, ) if result is not None: primary[geneid].append(conflict[geneid][iso][result]) del conflict[geneid][iso][result] if geneid not in novel: novel[geneid] = [] novel[geneid].extend(conflict[geneid][iso]) novel[geneid].sort(key=lambda k: (k[3], -k[4], -k[5])) fw = must_open(opts.outfile, "w") for gene in gffdb.features_of_type("gene", order_by=("seqid", "start")): geneid = gene.id print(gene, file=fw) seen = [] if geneid in primary: all_mrna = primary[geneid] all_mrna.extend(novel[geneid]) for iso, mrna in enumerate(all_mrna): _mrna = gffdb[mrna[0]] _iso = mrna[1] if mrna not in novel[geneid]: seen.append(int(mrna[1])) else: mseen = 0 if len(seen) == 0 else max(seen) _iso = (mseen + iso + 1) - len(seen) _mrnaid = "{0}.{1}".format(geneid, _iso) _mrna["ID"], _mrna["_old_ID"] = [_mrnaid], [_mrna.id] print(_mrna, file=fw) for c in gffdb.children(_mrna, order_by=("start")): c["Parent"] = [_mrnaid] print(c, file=fw) else: for feat in gffdb.children(gene, order_by=("seqid", "start")): print(feat, file=fw) fw.close()
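# Illustrative sketch of the ordering used in reindex() when re-assigning isoform
# numbers: mRNAs are sorted by start coordinate, then by decreasing length and
# decreasing prefix length. Tuples below are toy (id, start, length, prefix_len)
# values, not real annotation records.
def _example_order_isoforms():
    mrnas = [
        ("LOCUS.1.2", 120, 900, 2),
        ("LOCUS.1.1", 100, 1500, 2),
        ("LOCUS.1.3", 100, 1200, 2),
    ]
    return sorted(mrnas, key=lambda k: (k[1], -k[2], -k[3]))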
def instantiate(args): """ %prog instantiate tagged.bed blacklist.ids big_gaps.bed instantiate NEW genes tagged by renumber. """ p = OptionParser(instantiate.__doc__) p.set_annot_reformat_opts() p.add_option( "--extended_stride", default=False, action="store_true", help="Toggle extended strides for gene numbering", ) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) taggedbed, blacklist, gapsbed = args r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc) r.get_blacklist(blacklist) r.get_gaps(gapsbed) # Run through the bed, identify stretch of NEW ids to instantiate, # identify the flanking FRAMEs, interpolate! bed = Bed(taggedbed) outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed" fw = open(outputbed, "w") tagkey = lambda x: x.rsplit("|", 1)[-1] for chr, sbed in bed.sub_beds(): current_chr = chr_number(chr) if not current_chr: continue sbed = list(sbed) ranks = [] for i, s in enumerate(sbed): nametag = s.extra[0] tag = tagkey(nametag) if tag in (NEW, FRAME): ranks.append((i, nametag)) blocks = [] for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])): names = list(names) if tag == NEW: blocks.append((tag, [sbed[x[0]] for x in names])) else: start, end = names[0][-1], names[-1][-1] start, end = ( atg_name(start, retval="rank"), atg_name(end, retval="rank"), ) blocks.append((tag, [start, end])) id_table = {} # old to new name conversion for i, (tag, info) in enumerate(blocks): if tag != NEW: continue start_id = 0 if i == 0 else blocks[i - 1][1][-1] end_id = start_id + 10000 if i == len(blocks) - 1 else blocks[ i + 1][1][0] r.allocate( info, chr, start_id, end_id, id_table, extended_stride=opts.extended_stride, ) # Output new names for i, s in enumerate(sbed): nametag = s.extra[0] name, tag = nametag.split("|") if tag == NEW: assert name == "." name = id_table[s.accn] elif tag == OVERLAP: if name in id_table: name = id_table[name] s.extra[0] = "|".join((name, tag)) print(s, file=fw) fw.close()
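# Illustrative sketch (not jcvi's NameRegister): instantiating NEW genes by
# interpolating evenly spaced ranks between the two flanking FRAME ranks, snapped to
# the x10 grid the naming scheme above uses. The real allocator also respects gaps
# and a blacklist; the prefix, padding and ranks below are toy values.
def _example_interpolate(start_rank=120, end_rank=200, n_new=3, pad0=5, prefix="Bo1g"):
    step = (end_rank - start_rank) // (n_new + 1)
    ranks = [start_rank + step * (i + 1) for i in range(n_new)]
    ranks = [r // 10 * 10 for r in ranks]  # keep identifiers on multiples of 10
    return ["{}{:0{}d}".format(prefix, r, pad0) for r in ranks]
    # -> ['Bo1g00140', 'Bo1g00160', 'Bo1g00180']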
def sizes(args): """ %prog sizes gaps.bed a.fasta b.fasta Take the flanks of gaps within a.fasta, map them onto b.fasta. Compile the results to the gap size estimates in b. The output is detailed below: Columns are: 1. A scaffold 2. Start position 3. End position 4. Gap identifier 5. Gap size in A (= End - Start) 6. Gap size in B (based on BLAST, see below) For each gap, I extracted the left and right sequence (mostly 2Kb, but can be shorter if it runs into another gap) flanking the gap. The flanker names look like gap.00003L and gap.00003R means the left and right flanker of this particular gap, respectively. The BLAST output is used to calculate the gap size. For each flanker sequence, I took the best hit, and calculate the inner distance between the L match range and R range. The two flankers must map with at least 98% identity, and in the same orientation. NOTE the sixth column in the list file is not always a valid number. Other values are: - na: both flankers are missing in B - Singleton: one flanker is missing - Different chr: flankers map to different scaffolds - Strand +|-: flankers map in different orientations - Negative value: the R flanker map before L flanker """ from jcvi.formats.base import DictFile from jcvi.apps.align import blast p = OptionParser(sizes.__doc__) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) gapsbed, afasta, bfasta = args pf = gapsbed.rsplit(".", 1)[0] extbed = pf + ".ext.bed" extfasta = pf + ".ext.fasta" if need_update(gapsbed, extfasta): extbed, extfasta = flanks([gapsbed, afasta]) q = op.basename(extfasta).split(".")[0] r = op.basename(bfasta).split(".")[0] blastfile = "{0}.{1}.blast".format(q, r) if need_update([extfasta, bfasta], blastfile): blastfile = blast([bfasta, extfasta, "--wordsize=50", "--pctid=98"]) labelsfile = blast_to_twobeds(blastfile) labels = DictFile(labelsfile, delimiter='\t') bed = Bed(gapsbed) for b in bed: b.score = b.span accn = b.accn print("\t".join((str(x) for x in (b.seqid, b.start - 1, b.end, accn, b.score, labels.get(accn, "na")))))
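# Illustrative sketch of the gap-size estimate described in sizes() above: given the
# best hit of the left and right flanker on B, the estimate is the inner distance
# between the two match ranges (negative if the R hit starts before the L hit), with
# the special labels for missing or inconsistent hits. Hits are toy
# (seqid, start, end, orientation) tuples, 1-based inclusive.
def _example_gap_size(lhit=("scaffold_7", 1000, 2990, "+"),
                      rhit=("scaffold_7", 4000, 5980, "+")):
    if lhit is None or rhit is None:
        return "na" if lhit is rhit else "Singleton"
    if lhit[0] != rhit[0]:
        return "Different chr"
    if lhit[3] != rhit[3]:
        return "Strand {}|{}".format(lhit[3], rhit[3])
    return rhit[1] - lhit[2] - 1  # inner distance; here 4000 - 2990 - 1 = 1009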
def tRNAscan(args):
    """
    %prog tRNAscan all.trna > all.trna.gff3

    Convert tRNAscan-SE output into gff3 format.

    Sequence    tRNA    Bounds          tRNA    Anti    Intron Bounds   Cove
    Name        tRNA #  Begin   End     Type    Codon   Begin   End     Score
    --------    ------  ----    ------  ----    -----   -----   ----    ------
    23231       1       335355  335440  Tyr     GTA     335392  335404  69.21
    23231       2       1076190 1076270 Leu     AAG     0       0       66.33

    Conversion based on PERL one-liner in:
    <https://github.com/sujaikumar/assemblage/blob/master/README-annotation.md>
    """
    from jcvi.formats.gff import sort

    p = OptionParser(tRNAscan.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (trnaout, ) = args
    gffout = trnaout + ".gff3"
    fp = open(trnaout)
    fw = open(gffout, "w")

    next(fp)
    next(fp)
    row = next(fp)
    assert row.startswith("--------")

    for row in fp:
        atoms = [x.strip() for x in row.split("\t")]
        contig, trnanum, start, end, aa, codon, intron_start, intron_end, score = atoms
        start, end = int(start), int(end)
        orientation = "+"
        if start > end:
            start, end = end, start
            orientation = "-"

        source = "tRNAscan"
        type = "tRNA"
        if codon == "???":
            codon = "XXX"
        comment = "ID={0}.tRNA.{1};Name=tRNA-{2} (anticodon: {3})".format(
            contig, trnanum, aa, codon)

        print("\t".join(str(x) for x in (
            contig, source, type, start, end, score, orientation, ".", comment,
        )), file=fw)

    fw.close()
    sort([gffout, "-i"])
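# Illustrative sketch using the first example record from the tRNAscan() docstring
# above: how one tRNAscan-SE row becomes a GFF3 line (coordinates are swapped for
# minus-strand genes, "???" anticodons are rewritten as "XXX").
def _example_trna_gff3():
    contig, num, start, end, aa, codon, score = "23231", "1", 335355, 335440, "Tyr", "GTA", "69.21"
    orientation = "+" if start <= end else "-"
    start, end = min(start, end), max(start, end)
    attrs = "ID={0}.tRNA.{1};Name=tRNA-{2} (anticodon: {3})".format(contig, num, aa, codon)
    return "\t".join(str(x) for x in (
        contig, "tRNAscan", "tRNA", start, end, score, orientation, ".", attrs))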
def prepare(args): """ %prog prepare mcscanfile cdsfile [options] Pick sequences from cdsfile to form fasta files, according to multiple alignment in the mcscanfile. The fasta sequences can then be used to construct phylogenetic tree. Use --addtandem=tandemfile to collapse tandems of anchors into single row. The tandemfile must be provided with *ALL* genomes involved, otherwise result will be incomplete and redundant. """ from jcvi.graphics.base import discrete_rainbow p = OptionParser(prepare.__doc__) p.add_option("--addtandem", help="path to tandemfile") p.add_option( "--writecolors", default=False, action="store_true", help="generate a gene_name to color mapping file which will be taken " "by jcvi.apps.phylo.draw", ) p.set_outdir(outdir="sequences") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) mcscanfile, cdsfile = args if opts.addtandem: tandemfile = opts.addtandem mcscanfile_with_tandems = add_tandems(mcscanfile, tandemfile) mcscanfile = mcscanfile_with_tandems seqdir = opts.outdir mkdir(seqdir) f = Fasta(cdsfile) fp = must_open(mcscanfile) if opts.writecolors: fc = must_open("leafcolors.txt", "w") n = 0 for i, row in enumerate(fp): row = row.strip().split("\t") if i == 0: l = len(row) if l <= 20: colors = discrete_rainbow(l, shuffle=False)[1] else: colors = discrete_rainbow(l, usepreset=False, shuffle=False)[1] warnings.warn( "*** WARNING ***\n" "Too many columns. Colors may not be all distinctive.") assert len(row) == l, "All rows should have same number of fields." anchors = set() for j, atom in enumerate(row): color = "%s,%s,%s" % colors[j] if atom == ".": continue elif "," in atom: atom = atom.split(",") for a in atom: fc.write("{0}\t{1}\n".format(a, color)) anchors.add(a) else: fc.write("{0}\t{1}\n".format(atom, color)) anchors.add(atom) if len(anchors) <= 3: print( "Not enough seqs to build trees for {0}".format(anchors), file=sys.stderr, ) continue pivot = row[0] fw = must_open("%s/%s.cds" % (seqdir, pivot), "w") for a in anchors: if a not in f: print(a) a = find_first_isoform(a, f) assert a, a arec = f[a] SeqIO.write((arec), fw, "fasta") fw.close() n += 1 if opts.writecolors: fc.close() logging.debug("leaf colors written to `{0}`".format(fc.name)) logging.debug("cds of {0} syntelog groups written to {1}/".format( n, seqdir)) return seqdir
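# Illustrative sketch of how one MCscan row is turned into the anchor set used in
# prepare() above: "." marks a missing gene and "," separates collapsed tandem
# members. The gene names are made up for the example.
def _example_anchors(row="GeneA\t.\tGeneB,GeneC\tGeneD"):
    anchors = set()
    for atom in row.strip().split("\t"):
        if atom == ".":
            continue
        anchors.update(atom.split(","))
    return anchors  # {'GeneA', 'GeneB', 'GeneC', 'GeneD'}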
def annotate(args): """ %prog annotate agpfile gaps.linkage.bed assembly.fasta Annotate AGP file with linkage info of `paired-end` or `map`. File `gaps.linkage.bed` is generated by assembly.gaps.estimate(). """ from jcvi.formats.agp import AGP, bed, tidy p = OptionParser(annotate.__doc__) p.add_option("--minsize", default=200, help="Smallest component size [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) agpfile, linkagebed, assemblyfasta = args linkagebed = Bed(linkagebed) spannedgaps = set() for b in linkagebed: score = int(b.score) if score == 0: spannedgaps.add((b.accn, b.start, b.end)) agp = AGP(agpfile) newagpfile = agpfile.rsplit(".", 1)[0] + ".linkage.agp" newagp = open(newagpfile, "w") contig_id = 0 minsize = opts.minsize for a in agp: if not a.is_gap: cs = a.component_span if cs < minsize: a.is_gap = True a.component_type = "N" a.gap_length = cs a.gap_type = "scaffold" a.linkage = "yes" a.linkage_evidence = [] else: contig_id += 1 a.component_id = "contig{0:04d}".format(contig_id) a.component_beg = 1 a.component_end = cs a.component_type = "W" print(a, file=newagp) continue gapinfo = (a.object, a.object_beg, a.object_end) gaplen = a.gap_length if gaplen == 100 and gapinfo not in spannedgaps: a.component_type = "U" tag = "map" else: tag = "paired-ends" a.linkage_evidence.append(tag) print(a, file=newagp) newagp.close() logging.debug("Annotated AGP written to `{0}`.".format(newagpfile)) contigbed = assemblyfasta.rsplit(".", 1)[0] + ".contigs.bed" bedfile = bed([newagpfile, "--nogaps", "--outfile=" + contigbed]) contigfasta = fastaFromBed(bedfile, assemblyfasta, name=True, stranded=True) tidy([newagpfile, contigfasta])
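# Illustrative sketch of the per-gap decision rule applied in annotate() above: a
# 100 bp gap that is not spanned by linkage becomes a "U" (unknown size) gap with
# "map" evidence, while every other gap keeps its type and gets "paired-ends".
def _example_gap_evidence(gap_length=100, spanned=False):
    if gap_length == 100 and not spanned:
        return "U", "map"
    return "N", "paired-ends"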
def draw(args): """ %prog draw --input newicktrees [options] Draw phylogenetic trees into single or combined plots. Input trees should be one of the following: 1. single Newick format tree file 2. a dir containing *ONLY* the tree files to be drawn Newick format: http://evolution.genetics.washington.edu/phylip/newicktree.html This function wraps on jcvi.graphics.tree This function is better used for trees generated by jcvi.apps.phylo (rooted if possible). For drawing general Newick trees from external sources invoke jcvi.graphics.tree directly, which also gives more drawing options. """ trunc_name_options = ["headn", "oheadn", "tailn", "otailn"] p = OptionParser(draw.__doc__) p.add_option( "--input", help="path to single input tree file or a dir " "containing ONLY the input tree files", ) p.add_option( "--combine", type="string", default="1x1", help="combine multiple trees into one plot in nrowxncol", ) p.add_option( "--trunc_name", default=None, help="Options are: {0}. " "truncate first n chars, retains only first n chars, " "truncate last n chars, retain only last chars. " "n=1~99.".format(trunc_name_options), ) p.add_option( "--SH", default=None, help="path to a file containing SH test p-values in format:" "tree_file_name<tab>p-values " "This file can be generated with jcvi.apps.phylo build", ) p.add_option( "--scutoff", default=50, type="int", help="cutoff for displaying node support, 0-100", ) p.add_option( "--barcode", default=None, help="path to seq/taxon name barcode mapping file: " "barcode<tab>new_name " "This option is downstream of `--trunc_name`", ) p.add_option( "--leafcolorfile", default=None, help="path to a mapping file containing font colors " "for the OTUs: leafname<tab>color", ) p.set_outdir() opts, args, iopts = p.set_image_options(figsize="8x6") input = opts.input outdir = opts.outdir combine = opts.combine.split("x") trunc_name = opts.trunc_name SH = opts.SH mkdir(outdir) if not input: sys.exit(not p.print_help()) elif op.isfile(input): trees_file = input treenames = [op.basename(input)] elif op.isdir(input): trees_file = op.join(outdir, "alltrees.dnd") treenames = [] for f in sorted(os.listdir(input)): sh("cat {0}/{1} >> {2}".format(input, f, trees_file), log=False) treenames.append(f) else: sys.exit(not p.print_help()) trees = OrderedDict() tree = "" i = 0 for row in LineFile(trees_file, comment="#", load=True).lines: if i == len(treenames): break if not len(row): continue if ";" in row: # sanity check if row.index(";") != len(row) - 1: ts = row.split(";") for ii in range(len(ts) - 1): ts[ii] += ";" else: ts = [row] for t in ts: if ";" in t: tree += t if tree: trees[treenames[i]] = tree tree = "" i += 1 else: tree += t else: tree += row logging.debug("A total of {0} trees imported.".format(len(trees))) sh("rm {0}".format(op.join(outdir, "alltrees.dnd"))) _draw_trees( trees, nrow=int(combine[0]), ncol=int(combine[1]), rmargin=0.3, iopts=iopts, outdir=outdir, shfile=SH, trunc_name=trunc_name, scutoff=opts.scutoff, barcodefile=opts.barcode, leafcolorfile=opts.leafcolorfile, )
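# Illustrative sketch of the tree-splitting step in draw() above: a concatenated
# Newick string can hold several trees, each terminated by ";". Toy trees shown.
def _example_split_newick(text="((a,b),c);((d,e),f);"):
    return [t + ";" for t in text.split(";") if t.strip()]
    # -> ['((a,b),c);', '((d,e),f);']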
def build(args): """ %prog build [prot.fasta] cds.fasta [options] --outdir=outdir This function wraps on the following steps: 1. msa using ClustalW2 or MUSCLE(default) 2. (optional) alignment editing using Gblocks 3. build NJ tree using PHYLIP in EMBOSS package seq names should be unique by first 10 chars (restriction of PHYLIP) 4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml, *WARNING* maybe slow with large dataset If an outgroup file is provided, the result tree will be rooted on the outgroup according to order in the file, i.e. the name in row1 will be tried first. If not found, row2 will be used, etc. Tail truncated names can be provided so long as it is unique among the seqs. If not uniq, the first occurrence will be used. For example, if you have two moss sequences in your input, then the tree will be rooted on the first moss sequence encountered by the program, unless they are monophylic, in which case the root will be their common ancestor. --stree and --smap are required if --treefix is set. Trees can be edited again using an editor such as Dendroscope. This is the recommended way to get highly customized trees. Newick format trees will be deposited into outdir (. by default). """ from jcvi.formats.fasta import translate p = OptionParser(build.__doc__) p.add_option( "--longest", action="store_true", help="Get longest ORF, only works if no pep file, e.g. ESTs", ) p.add_option( "--nogblocks", action="store_true", help="don't use Gblocks to edit alignment", ) p.add_option( "--synonymous", action="store_true", help="extract synonymous sites of the alignment", ) p.add_option( "--fourfold", action="store_true", help="extract fourfold degenerate sites of the alignment", ) p.add_option( "--msa", default="muscle", choices=("clustalw", "muscle"), help="software used to align the proteins", ) p.add_option( "--noneighbor", action="store_true", help="don't build NJ tree", ) p.add_option( "--ml", default=None, choices=("raxml", "phyml"), help="software used to build ML tree", ) p.add_option("--outgroup", help="path to file containing outgroup orders") p.add_option("--SH", help="path to reference Newick tree") p.add_option("--shout", default="SH_out.txt", help="SH output file name") p.add_option( "--treefix", action="store_true", help="use TreeFix to rearrange ML tree", ) p.add_option("--stree", help="path to species Newick tree") p.add_option( "--smap", help="path to smap file: gene_name_pattern<tab>species_name", ) p.set_outdir() opts, args = p.parse_args(args) gblocks = not opts.nogblocks synonymous = opts.synonymous fourfold = opts.fourfold neighbor = not opts.noneighbor outgroup = opts.outgroup outdir = opts.outdir if len(args) == 1: protein_file, dna_file = None, args[0] elif len(args) == 2: protein_file, dna_file = args else: print("Incorrect arguments", file=sys.stderr) sys.exit(not p.print_help()) if opts.treefix: stree = opts.stree smap = opts.smap assert stree and smap, "TreeFix requires stree and smap files." 
opts.ml = "raxml" treedir = op.join(outdir, "tree") mkdir(treedir) if not protein_file: protein_file = dna_file + ".pep" translate_args = [dna_file, "--outfile=" + protein_file] if opts.longest: translate_args += ["--longest"] dna_file, protein_file = translate(translate_args) work_dir = op.join(outdir, "alignment") mkdir(work_dir) p_recs = list(SeqIO.parse(open(protein_file), "fasta")) if opts.msa == "clustalw": align_fasta = clustal_align_protein(p_recs, work_dir) elif opts.msa == "muscle": align_fasta = muscle_align_protein(p_recs, work_dir) n_recs = list(SeqIO.parse(open(dna_file), "fasta")) mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta") if not mrtrans_fasta: logging.debug("pal2nal aborted. Cannot reliably build tree for %s", dna_file) return codon_aln_fasta = mrtrans_fasta if gblocks: gb_fasta = run_gblocks(mrtrans_fasta) codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta else: if synonymous: codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous") if fourfold: codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold") if not neighbor and not opts.ml: return codon_aln_fasta alignment = AlignIO.read(codon_aln_fasta, "fasta") if len(alignment) <= 3: raise ValueError("Too few seqs to build tree.") mkdir(op.join(treedir, "work")) if neighbor: out_file = op.join( treedir, op.basename(dna_file).rsplit(".", 1)[0] + ".NJ.unrooted.dnd") try: outfile, phy_file = build_nj_phylip(alignment, outfile=out_file, outgroup=outgroup, work_dir=treedir) except: print("NJ tree cannot be built for {0}".format(dna_file)) if opts.SH: reftree = opts.SH querytree = outfile SH_raxml(reftree, querytree, phy_file, shout=opts.shout) if opts.ml: out_file = op.join( treedir, op.basename(dna_file).rsplit(".", 1)[0] + ".ML.unrooted.dnd") if opts.ml == "phyml": try: outfile, phy_file = build_ml_phyml(alignment, outfile=out_file, work_dir=treedir) except: print("ML tree cannot be built for {0}".format(dna_file)) elif opts.ml == "raxml": try: outfile, phy_file = build_ml_raxml(alignment, outfile=out_file, work_dir=treedir) except: print("ML tree cannot be built for {0}".format(dna_file)) if outgroup: new_out_file = out_file.replace(".unrooted", "") t = smart_reroot(treefile=out_file, outgroupfile=outgroup, outfile=new_out_file) if t == new_out_file: sh("rm %s" % out_file) outfile = new_out_file if opts.SH: reftree = opts.SH querytree = outfile SH_raxml(reftree, querytree, phy_file, shout=opts.shout) if opts.treefix: treefix_dir = op.join(treedir, "treefix") assert mkdir(treefix_dir, overwrite=True) sh("cp {0} {1}/".format(outfile, treefix_dir)) input = op.join(treefix_dir, op.basename(outfile)) aln_file = input.rsplit(".", 1)[0] + ".fasta" SeqIO.write(alignment, aln_file, "fasta") outfile = run_treefix( input=input, stree_file=stree, smap_file=smap, a_ext=".fasta", o_ext=".dnd", n_ext=".treefix.dnd", ) return outfile
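# Illustrative sketch of the outgroup policy described in the build() docstring
# above: candidate names are tried in file order and the first one found among the
# leaf labels is used for rooting; prefix matching allows tail-truncated names.
# Outgroup and leaf names below are toy values.
def _example_pick_outgroup(outgroups=("moss_1", "algae_1"),
                           leaves=("rice_A", "moss_12", "maize_B")):
    for og in outgroups:
        hit = next((leaf for leaf in leaves if leaf.startswith(og)), None)
        if hit:
            return hit
    return None  # no outgroup found; the tree would be left unrooted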