def subset(args):
    """
    %prog subset blastfile qbedfile sbedfile

    Extract blast hits between given query and subject chrs.

    If --qchrs or --schrs is not given, then all chrs from q/s genome will
    be included. However one of --qchrs and --schrs must be specified.
    Otherwise the script will do nothing.
    """
    p = OptionParser(subset.__doc__)
    p.add_option("--qchrs", default=None,
            help="query chrs to extract, comma sep [default: %default]")
    p.add_option("--schrs", default=None,
            help="subject chrs to extract, comma sep [default: %default]")
    p.add_option("--convert", default=False, action="store_true",
            help="convert accns to chr_rank [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, qbedfile, sbedfile = args
    qchrs = opts.qchrs
    schrs = opts.schrs
    # At least one side must be restricted, otherwise there is nothing to do
    assert qchrs or schrs, p.print_help()
    convert = opts.convert

    # Build the output filename from the requested chromosome subsets
    outfile = blastfile + "."
    if qchrs:
        outfile += qchrs + "."
        qchrs = set(qchrs.split(","))
    else:
        qchrs = set(Bed(qbedfile).seqids)
    if schrs:
        schrs = set(schrs.split(","))
        if qbedfile != sbedfile or qchrs != schrs:
            outfile += ",".join(schrs) + "."
    else:
        schrs = set(Bed(sbedfile).seqids)
    outfile += "blast"

    qo = Bed(qbedfile).order
    so = Bed(sbedfile).order

    fw = must_open(outfile, "w")
    for b in Blast(blastfile):
        q, s = b.query, b.subject
        if qo[q][1].seqid in qchrs and so[s][1].seqid in schrs:
            if convert:
                # Rename accessions to seqid_rank (e.g. chr1_00012)
                b.query = qo[q][1].seqid + "_" + "{0:05d}".format(qo[q][0])
                b.subject = so[s][1].seqid + "_" + "{0:05d}".format(so[s][0])
            # py3: print function, was py2 `print >> fw, b`
            print(b, file=fw)
    fw.close()
    logging.debug("Subset blastfile written to `{0}`".format(outfile))
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
            help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    # Choose the tag/attribute pair depending on whether images are wanted
    tag, attribute = ('img', 'src') if opts.img else ('a', 'href')

    htmlfile = download(url)
    soup = BeautifulSoup(open(htmlfile).read())
    for node in soup.findAll(tag):
        # Resolve relative links against the page URL before printing
        print(urljoin(url, node.get(attribute)))
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        js = json.load(open(filename))
        # Convert the L/A/B color coordinates into displayable RGB strings
        js["skin_rgb"] = make_rgb(
            js["traits"]["skin-color"]["L"],
            js["traits"]["skin-color"]["A"],
            js["traits"]["skin-color"]["B"])
        js["eye_rgb"] = make_rgb(
            js["traits"]["eye-color"]["L"],
            js["traits"]["eye-color"]["A"],
            js["traits"]["eye-color"]["B"])
        samples.append(js)

    template = Template(traits_template)
    fw = open("report.html", "w")
    # py3: print function, was py2 `print >> fw`
    print(template.render(samples=samples), file=fw)
    logging.debug("Report written to `{}`".format(fw.name))
    fw.close()
def diff(args):
    """
    %prog diff simplefile

    Calculate difference of pairwise syntenic regions.
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(diff.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    simplefile, = args
    fp = open(simplefile)
    data = [x.split() for x in fp]
    spans = []
    # Skip the header row; rows grouped by block id come in (A, B) pairs
    for block_id, ab in groupby(data[1:], key=lambda x: x[0]):
        a, b = list(ab)
        aspan, bspan = a[4], b[4]
        aspan, bspan = int(aspan), int(bspan)
        spans.append((aspan, bspan))
    aspans, bspans = zip(*spans)
    # Removed stray trailing comma in the unpacking target
    dspans = [b - a for a, b in spans]
    s = SummaryStats(dspans)
    # py3: print function, was py2 `print >> sys.stderr`
    print("For a total of {0} blocks:".format(len(dspans)), file=sys.stderr)
    print("Sum of A: {0}".format(sum(aspans)), file=sys.stderr)
    print("Sum of B: {0}".format(sum(bspans)), file=sys.stderr)
    print("Sum of Delta: {0} ({1})".format(sum(dspans), s), file=sys.stderr)
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        # .ix was removed from pandas; .iloc[0] gives the same first row
        d1 = df.iloc[0].to_dict()

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        js = json.load(open(filename))
        d1.update(js)

        # One combined row per sample folder
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        # Write the record to a scratch file so `overlap` can compare it
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()

        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
    # Fix: output handle was previously leaked; close to flush all records
    fo.close()
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    fp = open(pairsfile)
    cmds = []
    mkdir("overlaps")
    for row in fp:
        # Only the first two columns name the pair; extra columns are ignored
        a, b = row.split()[:2]
        oa = op.join(outdir, a + ".fa")
        ob = op.join(outdir, b + ".fa")
        cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
        cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
        cmds.append(cmd)

    # py3: print function, was py2 print statement
    print("\n".join(cmds))
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for x in gff_files:
            pf = op.basename(x).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            ar = [int(x.strip()) for x in open(numberfile)]
            # Renamed from `sum` to avoid shadowing the builtin
            stats = SummaryStats(ar).todict().items()
            keys, vals = zip(*stats)
            # Key each stat by (file prefix, stat name) for the 2D table
            keys = [(pf, x) for x in keys]
            table.update(dict(zip(keys, vals)))

        # py3: print function, was py2 `print >> sys.stderr`
        print(tabulate(table), file=sys.stderr)
def histogram(args):
    """
    %prog histogram *.gff

    Plot gene statistics based on output of stats. For each gff file, look to
    see if the metrics folder (i.e. Exon_Length) contains the data and plot
    them.
    """
    from jcvi.graphics.histogram import histogram_multiple

    p = OptionParser(histogram.__doc__)
    p.add_option("--bins", dest="bins", default=40, type="int",
            help="number of bins to plot in the histogram [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # metrics = ("Exon_Length", "Intron_Length", "Gene_Length", "Exon_Count")
    # Per-metric plot settings, aligned positionally with the module-level
    # `metrics` tuple (see commented reminder above)
    colors = ("red", "green", "blue", "black")
    vmaxes = (1000, 1000, 4000, 20)
    xlabels = ("bp", "bp", "bp", "number")
    for metric, color, vmax, xlabel in zip(metrics, colors, vmaxes, xlabels):
        logging.debug("Parsing files in `{0}`..".format(metric))
        # One <prefix>.txt data file per input gff, inside the metric folder
        numberfiles = [op.join(metric, op.basename(x).split(".")[0] + ".txt") \
                        for x in gff_files]
        histogram_multiple(numberfiles, 0, vmax, xlabel, metric,
                        bins=opts.bins, facet=True, fill=color,
                        prefix=metric + ".")
def unitigs(args):
    """
    %prog unitigs best.edges

    Reads Celera Assembler's "best.edges" and extract all unitigs.
    """
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    # Keep only mutual best edges: k's best is v AND v's best is k
    # py3: dict.iteritems() is gone, use items()
    for k, v in G.items():
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    # NOTE(review): connected_component_subgraphs was removed in networkx
    # 2.4 -- if upgrading, replace with (H.subgraph(c) for c in
    # nx.connected_components(H))
    for h in nx.connected_component_subgraphs(H, copy=False):
        # A simple path (unitig) has exactly two degree-1 endpoints
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        # py3: print function, was py2 print statement
        print("|".join(str(x) for x in path))
        nunitigs += 1
        nreads += len(path)
    logging.debug("A total of {0} unitigs built from {1} reads.".format(nunitigs, nreads))
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    p = OptionParser(tracedb.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg")

    CMD = "tracedb-to-frg.pl"
    xmls = glob("xml*")

    if action == "xml":
        # One background conversion per xml file, output silenced
        for xml in xmls:
            sh(CMD + " -xml {0}".format(xml),
               outfile="/dev/null", errfile="/dev/null", background=True)
    elif action == "lib":
        # All xml files are handed to a single invocation
        sh(CMD + " -lib {0}".format(" ".join(xmls)))
    elif action == "frg":
        for xml in xmls:
            sh(CMD + " -frg {0}".format(xml), background=True)
def ids(args):
    """
    %prog ids cdhit.clstr

    Get the representative ids from clstr file.
    """
    p = OptionParser(ids.__doc__)
    p.add_option("--prefix", type="int",
            help="Find rep id for prefix of len [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    cf = ClstrFile(clstrfile)
    # One representative per cluster, optionally collapsed by id prefix
    if opts.prefix:
        reads = list(cf.iter_reps_prefix(prefix=opts.prefix))
    else:
        reads = list(cf.iter_reps())
    nreads = len(reads)

    idsfile = clstrfile.replace(".clstr", ".ids")
    fw = open(idsfile, "w")
    for rep in reads:
        # rep is an (index, name) pair
        print("\t".join(str(x) for x in rep), file=fw)

    logging.debug("A total of {0} unique reads written to `{1}`.".\
            format(nreads, idsfile))
    fw.close()

    return idsfile
def csv(args):
    """
    %prog csv excelfile

    Convert EXCEL to csv file.
    """
    from xlrd import open_workbook

    p = OptionParser(csv.__doc__)
    p.set_sep(sep=',')
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    excelfile, = args
    sep = opts.sep
    csvfile = excelfile.rsplit(".", 1)[0] + ".csv"
    wb = open_workbook(excelfile)
    fw = open(csvfile, "w")
    for s in wb.sheets():
        # py3: print function, was py2 `print >> sys.stderr, 'Sheet:', s.name`
        print('Sheet:', s.name, file=sys.stderr)
        for row in range(s.nrows):
            values = []
            for col in range(s.ncols):
                values.append(s.cell(row, col).value)
            print(sep.join(str(x) for x in values), file=fw)
    # Fix: output handle was previously leaked; close to flush the csv
    fw.close()
def main():
    """
    %prog numbers1.txt number2.txt ...

    Print histogram of the data files. The data files contain one number per
    line. If more than one file is inputted, the program will combine the
    histograms into the same plot.
    """
    allowed_format = ("emf", "eps", "pdf", "png", "ps", \
                      "raw", "rgba", "svg", "svgz")
    p = OptionParser(main.__doc__)
    p.add_option("--skip", default=0, type="int",
            help="skip the first several lines [default: %default]")
    p.set_histogram()
    p.add_option("--tags", dest="tags", default=None,
            help="tags for data if multiple input files, comma sep")
    p.add_option("--ascii", default=False, action="store_true",
            help="print ASCII text stem-leaf plot [default: %default]")
    p.add_option("--base", default="0", choices=("0", "2", "10"),
            help="use logarithm axis with base, 0 to disable [default: %default]")
    p.add_option("--facet", default=False, action="store_true",
            help="place multiple histograms side-by-side [default: %default]")
    p.add_option("--fill", default="white",
            help="color of the bin [default: %default]")
    p.add_option("--format", default="pdf", choices=allowed_format,
            help="Generate image of format [default: %default]")
    p.add_option("--quick", default=False, action="store_true",
            help="Use quick plot, assuming bins are already counted")
    p.add_option("--noprintstats", default=False, action="store_true",
            help="Write basic stats when using --quick")
    opts, args = p.parse_args()

    if len(args) < 1:
        sys.exit(not p.print_help())

    skip = opts.skip
    vmin, vmax = opts.vmin, opts.vmax
    bins = opts.bins
    xlabel, title = opts.xlabel, opts.title
    # Default the plot title to the first input filename
    title = title or args[0]
    base = int(opts.base)
    fileno = len(args)

    if opts.quick:
        # Input is already a (value, count) table; plot it directly
        assert fileno == 1, "Single input file expected using --quick"
        filename = args[0]
        figname = filename.rsplit(".", 1)[0] + ".pdf"
        data = DictFile(filename, keycast=int, cast=int)
        quickplot(data, vmin, vmax, xlabel, title, figname=figname,
                  print_stats=(not opts.noprintstats))
        return

    # Single file gets a plain histogram; multiple files are overlaid or
    # faceted via histogram_multiple
    if fileno == 1:
        histogram(args[0], vmin, vmax, xlabel, title,
                  outfmt=opts.format, bins=bins, skip=skip, ascii=opts.ascii,
                  base=base, fill=opts.fill)
    else:
        histogram_multiple(args, vmin, vmax, xlabel, title,
                  outfmt=opts.format, tags=opts.tags, bins=bins, skip=skip,
                  ascii=opts.ascii, facet=opts.facet, fill=opts.fill)
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filter the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast", \
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=None, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
            help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    # Output name is <query-prefix>.<ref-prefix>.blast
    qprefix, rprefix = (op.basename(x).split(".")[0]
                        for x in (queryfasta, reffasta))
    blastfile = "{0}.{1}.blast".format(qprefix, rprefix)
    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid,
                  evalue=opts.evalue, hitlen=None, best=opts.best,
                  task=opts.task, cpus=opts.cpus)

    return blastfile
def passthrough(args):
    """
    %prog passthrough chrY.vcf chrY.new.vcf

    Pass through Y and MT vcf.
    """
    p = OptionParser(passthrough.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, newvcffile = args
    gg = ["0/0", "0/1", "1/1"]
    fw = open(newvcffile, "w")
    for row in open(vcffile):
        # Header lines are copied through unchanged
        if row[0] == "#":
            print(row.strip(), file=fw)
            continue

        v = VcfLine(row)
        v.filter = "PASS"
        v.format = "GT:GP"
        # One-hot genotype probabilities matching the called genotype
        probs = [0] * 3
        probs[gg.index(v.genotype)] = 1
        v.genotype = v.genotype.replace("/", "|") + \
                ":{0}".format(",".join("{0:.3f}".format(x) for x in probs))
        print(v, file=fw)
    fw.close()
def agp(args):
    """
    %prog agp <fastafile|sizesfile>

    Convert the sizes file to a trivial AGP file.
    """
    from jcvi.formats.agp import OO

    p = OptionParser(agp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    sizesfile, = args
    sizes = Sizes(sizesfile)
    agpfile = sizes.filename.rsplit(".", 1)[0] + ".agp"
    layout = OO()  # Without a filename
    # Each contig becomes its own AGP object, spanning its full length
    for ctg, size in sizes.iter_sizes():
        layout.add(ctg, ctg, size)
    fw = open(agpfile, "w")
    layout.write_AGP(fw)
    fw.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))

    return agpfile
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, chr, idx = args
    idx = int(idx)
    m1 = 1000000  # window size: 1 Mb
    bedfile = "sample.bed"
    bed = Bed()
    # Window covers [(idx - 1) Mb, idx Mb) on the requested chromosome
    bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1)))
    bed.print_to_file(bedfile)

    # Collect ids of assembly sequences mapped inside the window
    cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile)
    idsfile = "query.ids"
    sh(cmd, outfile=idsfile)

    # Reference slice vs the selected query records
    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta)
    sh(cmd)

    cmd = "nucmer {0} {1}".format(sfasta, qfasta)
    sh(cmd)

    # Render the dotplot (nucmer writes out.delta by default)
    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, chr = args
    pf = vcffile.rsplit(".", 1)[0]
    outpf = pf + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    beagle_cmd = opts.beagle_home
    # 1000 Genomes Phase 3 reference panel and genetic map for this chrom
    kg = op.join(opts.ref, "1000GP_Phase3")
    cmd = beagle_cmd + " gt={0}".format(vcffile)
    cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chr)
    cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chr)
    cmd += " out={0}".format(outpf)
    # Fix: honor the --cpus option instead of hard-coding nthreads=16
    cmd += " nthreads={0} gprobs=true".format(opts.cpus)
    mm.add(vcffile, outfile, cmd)

    mm.write()
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    nativefile = pf + ".native"
    # Step 1: convert GSNAP output to the "native" format the caller expects
    if need_update(gsnapfile, nativefile):
        cmd = op.join(EYHOME, "convert2native.pl")
        cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        cmd += " -proc {0}".format(opts.cpus)
        sh(cmd)

    # Step 2: run SNP discovery on the converted file
    snpfile = pf + ".snp"
    if need_update(nativefile, snpfile):
        cmd = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl")
        cmd += " --native {0} -o {1}".format(nativefile, snpfile)
        # Thresholds passed verbatim to SNP_Discovery-short.pl -- semantics
        # defined by that script; presumably allele count / frequency /
        # consensus cutoffs (verify against the tool's docs)
        cmd += " -a 2 -ac 0.3 -c 0.8"
        sh(cmd)
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        # Union-find over anchor pairs; block index is not needed
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
            format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        # py3: print function, was py2 `print >> fw`
        print(",".join(sorted(g)), file=fw)
    fw.close()

    return outfile
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int", help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    fw = must_open(opts.outfile, "w")
    for desc, rec in Fasta(fastafile, lazy=True).iterdescriptions_ordered():
        # Singletons are dropped outright
        if desc.startswith("singleton"):
            continue

        # Description looks like: consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue

        SeqIO.write(rec, fw, "fasta")
def fromimpute2(args):
    """
    %prog fromimpute2 impute2file fastafile 1

    Convert impute2 output to vcf file. Imputed file looks like:

    --- 1:10177:A:AC 10177 A AC 0.451 0.547 0.002
    """
    p = OptionParser(fromimpute2.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    impute2file, fastafile, chr = args
    fasta = Fasta(fastafile)
    # py3: print function, was py2 print statement
    print(get_vcfstanza(fastafile, fasta))
    fp = open(impute2file)
    seen = set()
    for row in fp:
        snp_id, rsid, pos, ref, alt, aa, ab, bb = row.split()
        pos = int(pos)
        # Keep only the first record seen at any position
        if pos in seen:
            continue
        seen.add(pos)
        # Genotype call = the genotype with the highest probability
        code = max((float(aa), "0/0"), (float(ab), "0/1"), (float(bb), "1/1"))[-1]
        # PR marks sites present on the reference panel row; IM is imputed
        tag = "PR" if snp_id == chr else "IM"
        print("\t".join(str(x) for x in \
                (chr, pos, rsid, ref, alt, ".", ".", tag, \
                "GT:GP", code + ":" + ",".join((aa, ab, bb)))))
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    # py3: parse_qs moved from py2's urlparse to urllib.parse
    from urllib.parse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        # Header lines pass straight through to stdout
        if row[0] == '#':
            print(row.strip())
            continue
        v = VcfLine(row)
        data.append(v)

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        # Of the duplicates at this position, keep the highest R2 (from INFO)
        bestv = max(vv, key=lambda x: float(parse_qs(x.info)["R2"][0]))
        print(bestv)
def sample(args):
    """
    %prog sample vcffile 0.9

    Sample subset of vcf file.
    """
    from random import random

    p = OptionParser(sample.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, ratio = args
    ratio = float(ratio)
    fp = open(vcffile)
    pf = vcffile.rsplit(".", 1)[0]
    kept = pf + ".kept.vcf"
    withheld = pf + ".withheld.vcf"
    fwk = open(kept, "w")
    fww = open(withheld, "w")
    nkept = nwithheld = 0
    for row in fp:
        # Header lines always go to the kept file
        if row[0] == '#':
            print(row.strip(), file=fwk)
            continue
        # Bernoulli split: keep with probability `ratio`
        if random() < ratio:
            nkept += 1
            print(row.strip(), file=fwk)
        else:
            nwithheld += 1
            print(row.strip(), file=fww)

    logging.debug("{0} records kept to `{1}`".format(nkept, kept))
    logging.debug("{0} records withheld to `{1}`".format(nwithheld, withheld))
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
            help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
            help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    # Pre-convert both fasta files to 2bit for blat
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)
    oldtwobit, newtwobit = twobitfiles
    # Use multi-threaded pblat when it is on PATH, otherwise plain blat
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
            format(opts.minscore, opts.minid)
    # Output named <new-prefix>.<old-prefix>.psl
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    # py3: print function, was py2 `print >> sys.stderr`
    print("File `{0}` contains {1} chains.".\
            format(chainfile, len(chain)), file=sys.stderr)
    print("ungapped={0} dt={1} dq={2}".\
            format(human_size(ungapped), human_size(dt),
                   human_size(dq)), file=sys.stderr)

    # Fraction of each genome covered by the ungapped alignment blocks
    oldreal, oldnn, oldlen = fsummary([oldfasta, "--outfile=/dev/null"])
    print("Old fasta (`{0}`) mapped: {1}".\
            format(oldfasta, percentage(ungapped, oldreal)), file=sys.stderr)

    newreal, newnn, newlen = fsummary([newfasta, "--outfile=/dev/null"])
    print("New fasta (`{0}`) mapped: {1}".\
            format(newfasta, percentage(ungapped, newreal)), file=sys.stderr)
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.  # usearch -id takes a fraction, not a percent
    pf, sf = fastafile.rsplit(".", 1)
    # Step 1: sort reads by length (input is sorted before clustering)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                format(fastafile, sortedfastafile)
        sh(cmd)

    # Step 2: cluster at the requested identity; keep cluster table + centroids
    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    fw = open(chainfile, "w")
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping
    chain = "chain"
    score = 1000
    tStrand = "+"
    chain_id = 0  # renamed from `id` to avoid shadowing the builtin
    for a in agp:
        if a.is_gap:
            continue

        tName = a.component_id
        tSize = componentsizes[tName]
        tStart = a.component_beg
        tEnd = a.component_end
        # Chain coords are 0-based half-open; AGP is 1-based inclusive
        tStart -= 1

        qName = a.object
        qSize = objectsizes[qName]
        qStrand = "-" if a.orientation == "-" else "+"
        qStart = a.object_beg
        qEnd = a.object_end
        if qStrand == '-':
            # Minus-strand query coords are counted from the sequence end
            _qStart = qSize - qEnd + 1
            _qEnd = qSize - qStart + 1
            qStart, qEnd = _qStart, _qEnd
        qStart -= 1

        chain_id += 1
        size = a.object_span
        headerline = "\t".join(str(x) for x in (
            chain, score, tName, tSize, tStrand, tStart,
            tEnd, qName, qSize, qStrand, qStart, qEnd, chain_id
        ))
        alignmentline = size
        # py3: print function, was py2 `print >> fw`; blank line ends a chain
        print(headerline, file=fw)
        print(alignmentline, file=fw)
        print(file=fw)

    fw.close()
    logging.debug("File written to `{0}`.".format(chainfile))
def bam(args):
    """
    %prog snp input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        # Flags passed verbatim to gsnap2gff3.pl; presumably -u keeps unique
        # alignments and -l supplies chromosome sizes -- verify against the
        # script's own usage
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    # Sort/convert to BAM and index (jcvi.formats.sam.index)
    index([uniqsam])
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
    reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    from more_itertools import grouper

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf", help="BAMBUS configuration file [default: %default]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]",
    )
    opts, args = p.parse_args(args)

    nargs = len(args)
    # Expect ctgfasta plus one or more (reads.fasta, mapping.bed) pairs,
    # hence an odd argument count of at least 3
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    ctgfasta = args[0]
    duos = list(grouper(args[1:], 2))
    trios = []
    for fastafile, bedfile in duos:
        prefix = bedfile.rsplit(".", 1)[0]
        matefile = prefix + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            # NOTE(review): opts.cutoff is not declared by a visible
            # add_option here -- presumably registered by set_rclip (or a
            # related option group); confirm before changing
            matesopt = [
                bedfile,
                "--lib",
                "--nointra",
                "--rclip={0}".format(rclip),
                "--cutoff={0}".format(opts.cutoff),
            ]
            if opts.prefix:
                matesopt += ["--prefix"]
            matefile, matebedfile = mates(matesopt)

        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates"

    for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Split contigs with read placements from singletons
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(cmd + inputfasta)
    sh(cmd + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        cmd += " -C {0}".format(opts.conf)
    sh(cmd)

    cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".format(
        prefix)
    sh(cmd)

    final = "final"
    cmd = ("printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib "
           "-merge -detail -oo -sum -o {1}".format(prefix, final))
    sh(cmd)

    # Stitch the scaffolded contigs following the .oo ordering file
    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
def calibrate(args):
    """
    %prog calibrate calibrate.JPG boxsize

    Calibrate pixel-inch ratio and color adjustment.
    - `calibrate.JPG` is the photo containig a colorchecker
    - `boxsize` is the measured size for the boxes on printed colorchecker, in
      squared centimeter (cm2) units
    """
    xargs = args[2:]
    p = OptionParser(calibrate.__doc__)
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    imagefile, boxsize = args
    boxsize = float(boxsize)

    # Read in color checker
    colorcheckerfile = op.join(datadir, "colorchecker.txt")
    colorchecker = []
    expected = 0
    for row in open(colorcheckerfile):
        boxes = row.split()
        colorchecker.append(boxes)
        expected += len(boxes)

    folder = op.split(imagefile)[0]
    objects = seeds([imagefile, "--outdir={0}".format(folder)] + xargs)
    nseeds = len(objects)
    logging.debug("Found {0} boxes (expected={1})".format(nseeds, expected))
    # Allow a small detection slack of +/- 4 boxes
    assert expected - 4 <= nseeds <= expected + 4, \
            "Number of boxes drastically different from {0}".format(expected)

    # Calculate pixel-cm ratio
    boxes = [t.area for t in objects]
    reject = reject_outliers(boxes)
    retained_boxes = [b for r, b in zip(reject, boxes) if not r]
    mbox = np.median(retained_boxes)  # in pixels
    pixel_cm_ratio = (mbox / boxsize) ** .5
    logging.debug("Median box size: {0} pixels. Measured box size: {1} cm2".\
            format(mbox, boxsize))
    logging.debug("Pixel-cm ratio: {0}".format(pixel_cm_ratio))

    # Rank detected boxes into the colorchecker's 4x6 grid via k-means on
    # the x (6 columns) and y (4 rows) coordinates
    xs = [t.x for t in objects]
    ys = [t.y for t in objects]
    idx_xs = get_kmeans(xs, 6)
    idx_ys = get_kmeans(ys, 4)
    for xi, yi, s in zip(idx_xs, idx_ys, objects):
        s.rank = (yi, xi)

    objects.sort(key=lambda x: x.rank)

    colormap = []
    for s in objects:
        x, y = s.rank
        observed, expected = s.rgb, rgb_to_triplet(colorchecker[x][y])
        colormap.append((np.array(observed), np.array(expected)))

    # Color transfer: fit a 3x3 linear RGB transform minimizing total error
    tr0 = np.eye(3).flatten()
    # py3: print function, was py2 `print >> sys.stderr`
    print("Initial distance:", total_error(tr0, colormap), file=sys.stderr)
    tr = fmin(total_error, tr0, args=(colormap,))
    tr.resize((3, 3))
    print("RGB linear transform:\n", tr, file=sys.stderr)
    calib = {"PixelCMratio": pixel_cm_ratio,
             "RGBtransform": tr.tolist()}

    jsonfile = op.join(folder, "calibrate.json")
    fw = must_open(jsonfile, "w")
    print(json.dumps(calib, indent=4), file=fw)
    fw.close()
    logging.debug("Calibration specs written to `{0}`.".format(jsonfile))

    return jsonfile
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    p = OptionParser(A50.__doc__)
    p.add_option(
        "--overwrite",
        default=False,
        action="store_true",
        help="overwrite .rplot file if exists",
    )
    p.add_option(
        "--cutoff",
        default=0,
        type="int",
        dest="cutoff",
        help="use contigs above certain size",
    )
    p.add_option(
        "--stepsize",
        default=10,
        type="int",
        dest="stepsize",
        help="stepsize for the distribution",
    )
    opts, args = p.parse_args(args)
    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    # The .rplot data file is cached; regenerate only if absent or forced
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                       "Counts")
        statsrows = []
        print(header, file=fw)
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)

            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append(
                (fastafile, l50, n50, cmin, cmax, cmean, csum, counts))

            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(
                op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)

            # Subsample the cumulative-size curve every `stepsize` contigs
            for i, s in zip(range(0, len(a50), stepsize), a50[::stepsize]):
                print("\t".join((str(i), str(s / 1000000.0), tag)), file=fw)

        fw.close()

        table = loadtable(statsheader, statsrows)
        print(table, file=sys.stderr)

    generate_plot(rplot)
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=1000000, type="int", help="Max contig size")
    # Fix: help text was a copy-paste of --maxsize ("Max contig size")
    p.add_option("--maxcov", default=100, type="int", help="Max depth of coverage")
    p.add_option("--color", default="m", help="Color of the data points")
    p.add_option(
        "--kind",
        default="scatter",
        choices=("scatter", "reg", "resid", "kde", "hex"),
        help="Kind of plot to draw",
    )
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        # Contigs missing from the coverage file count as zero coverage
        c = cov.get(ctg, 0)
        # Clip outliers so the joint plot stays readable
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    # NOTE(review): stat_func was removed in newer seaborn releases; verify
    # the pinned seaborn version still accepts this keyword
    sns.jointplot(
        xlab,
        ylab,
        kind=opts.kind,
        data=df,
        xlim=(0, maxsize),
        ylim=(0, maxcov),
        stat_func=None,
        edgecolor="w",
        color=opts.color,
    )

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    (prefix, ) = args
    scf = prefix

    # All these files *must* be present in the current folder
    # (fix: removed a duplicated `bedpefile` assignment)
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold, drawn as a black bar at the bottom of the canvas
    root.add_patch(Rectangle((0.1, 0.15), 0.8, 0.03, fc="k"))

    # basecoverage and matecoverage curves share the upper axes
    ax = fig.add_axes([0.1, 0.45, 0.8, 0.45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    (baseline, ) = ax.plot(x, y, "g-")
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    (mateline, ) = ax.plot(x, y, "r-")
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    fp = open(bedpefile)
    pairs = []
    for row in fp:
        # NOTE(review): `scf` appears twice in this unpacking and is clobbered
        # by each row; preserved from the original — confirm intent
        scf, astart, aend, scf, bstart, bend, clonename = row.split()
        astart, bstart = int(astart), int(bstart)
        aend, bend = int(aend), int(bend)
        start = min(astart, bstart) + 1
        end = max(aend, bend)
        pairs.append((start, end))

    bpratio = 0.8 / size
    cutoff = 1000  # inserts smaller than this are not plotted
    # this convert from base => x-coordinate
    pos = lambda x: (0.1 + x * bpratio)
    ypos = 0.15 + 0.03

    for start, end in pairs:
        dist = end - start
        if dist < cutoff:
            continue
        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = 0.25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold, drawn white to punch holes in the black bar
    fp = open(gapsbedfile)
    for row in fp:
        b = BedLine(row)
        start, end = b.start, b.end
        xstart = pos(start)
        xend = pos(end)
        root.add_patch(Rectangle((xstart, 0.15), xend - xstart, 0.03, fc="w"))

    # fix: the scaffold label was drawn twice at the same coordinates;
    # draw it once, followed by the insert-size warning
    root.text(0.5, 0.1, scf, color="b", ha="center")
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(0.5, 0.05, warn_msg, color="gray", ha="center")

    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
    figname = prefix + ".pdf"
    savefig(figname, dpi=300)
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
    physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option(
        "--cutoff",
        type="int",
        default=1000000,
        help="Plot scaffolds with size larger than",
    )
    p.add_option(
        "--highlights",
        help="A set of regions in BED format to highlight",
    )
    opts, args, iopts = p.set_image_options(args, figsize="14x8", dpi=150)

    # Arguments come as one scaffold fasta followed by evidence trios
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(args[1:], 3))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    if highlights:
        hlbed = Bed(highlights)

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
            thousands(scafsize)))

        tmpname = scaffoldID + ".sizes"
        tmp = open(tmpname, "w")
        tmp.write("{0}\t{1}".format(scaffoldID, scafsize))
        tmp.close()

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Fix: `subhighlights` was only assigned when --highlights was given,
        # causing a NameError in the call below otherwise; default to None.
        subhighlights = None
        if highlights:
            subhighlights = list(hlbed.sub_bed(scaffoldID))

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(
            scaffoldID,
            tmpsizes,
            None,
            trios,
            imagename,
            iopts,
            highlights=subhighlights,
        )
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read span will be converted to a new bedfile that contain the happy
    mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    from jcvi.formats.bed import mates, bedpe

    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax", default=None, type="int", help="Limit ymax")
    p.add_option(
        "--spans",
        default=False,
        action="store_true",
        help="BED files already contain clone spans",
    )
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    not_covered = []
    yy = 0.9
    # One colored curve per bedfile; up to six files ("rgbcky" colors)
    for bedfile, c in zip(bedfiles, "rgbcky"):
        if not opts.spans:
            # Derive clone spans from read mappings: mates first, then bedpe
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, matesbedfile = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                bedpefile, bedspanfile = bedpe(
                    [bedfile, "--span", "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend, thousands(notcoveredbases))
        not_covered.append(msg)
        print(msg, file=sys.stderr)
        ax.text(0.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= 0.08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        (line, ) = ax.plot(x, y, "-", color=c, lw=2, alpha=0.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    # Fix: use floor division so the label stays a whole number of Kb;
    # `/` under Python 3 would print a float (e.g. "10.24Kb")
    ylabel = "Average depth per {0}Kb".format(size // bins // 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname = "{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset some pre-calculated ks ka values (in ksfile) according to pairs in
    tab delimited pairsfile/anchorfile.
    """
    p = OptionParser(subset.__doc__)
    p.add_option(
        "--noheader", action="store_true", help="don't write ksfile header line"
    )
    p.add_option(
        "--block", action="store_true", help="preserve block structure in input"
    )
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    # Fix: usage advertises multiple ksfiles, but the original required
    # exactly two args; accept pairsfile plus one or more ksfiles.
    if len(args) < 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    noheader = opts.noheader
    block = opts.block
    if block:
        noheader = True  # block output carries its own structure, no header
    outfile = opts.outfile

    # Merge all ksfiles into one lookup keyed on the "a;b" pair name;
    # later files override earlier ones on duplicate keys
    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(
            dict(
                (line.name, line)
                for line in KsFile(ksfile, strip_names=opts.strip_names)
            )
        )

    fp = open(pairsfile)
    fw = must_open(outfile, "w")

    if not noheader:
        print(fields, file=fw)

    i = j = 0
    for row in fp:
        if row[0] == "#":
            if block:
                print(row.strip(), file=fw)
            continue
        a, b = row.split()[:2]
        name = ";".join((a, b))
        # Pairs are unordered: try "a;b" then "b;a" before giving up
        if name not in ksvals:
            name = ";".join((b, a))
            if name not in ksvals:
                j += 1
                print("\t".join((a, b, ".", ".")), file=fw)
                continue
        ksline = ksvals[name]
        if block:
            print("\t".join(str(x) for x in (a, b, ksline.ks)), file=fw)
        else:
            ksline.name = ";".join((a, b))
            print(ksline, file=fw)
        i += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(j))
    logging.debug("{0} ks records written to `{1}`".format(i, outfile))
    return outfile
def report(args):
    """
    %prog report ksfile

    generate a report given a Ks result file (as produced by synonymous_calc.py).
    describe the median Ks, Ka values, as well as the distribution in stem-leaf plot
    """
    from jcvi.utils.cbook import SummaryStats
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(report.__doc__)
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Generate graphic output for the histogram",
    )
    p.add_option(
        "--components",
        default=1,
        type="int",
        help="Number of components to decompose peaks",
    )
    add_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="5x5")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (ks_file,) = args
    data = KsFile(ks_file)
    ks_min = opts.vmin
    ks_max = opts.vmax
    bins = opts.bins

    # One stem-and-leaf summary per ks-like column (the first field is skipped)
    for f in fields.split(",")[1:]:
        columndata = [getattr(x, f) for x in data]
        ks = "ks" in f
        if not ks:
            continue

        # Clamp to the [vmin, vmax] window before summarizing
        columndata = [x for x in columndata if ks_min <= x <= ks_max]

        st = SummaryStats(columndata)
        title = "{0} ({1}): ".format(descriptions[f], ks_file)
        title += "Median:{0:.3f} (1Q:{1:.3f}|3Q:{2:.3f}||".format(
            st.median, st.firstq, st.thirdq
        )
        title += "Mean:{0:.3f}|Std:{1:.3f}||N:{2})".format(st.mean, st.sd, st.size)

        tbins = (0, ks_max, bins) if ks else (0, 0.6, 10)
        # Use more digits when bins are fine-grained (< 0.1 per bin)
        digit = 2 if (ks_max * 1.0 / bins) < 0.1 else 1
        stem_leaf_plot(columndata, *tbins, digit=digit, title=title)

    if not opts.pdf:
        return

    # Optional graphic output: histogram of ng_ks with mixture decomposition
    components = opts.components
    data = [x.ng_ks for x in data]
    data = [x for x in data if ks_min <= x <= ks_max]

    fig = plt.figure(1, (iopts.w, iopts.h))
    ax = fig.add_axes([0.12, 0.1, 0.8, 0.8])
    kp = KsPlot(ax, ks_max, opts.bins, legendp=opts.legendp)
    kp.add_data(data, components, fill=opts.fill, fitted=opts.fit, kde=opts.kde)
    kp.draw(title=opts.title)
def last(args, dbtype=None):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option(
        "--dbtype",
        default="nucl",
        choices=("nucl", "prot"),
        help="Molecule type of subject database",
    )
    p.add_option("--path", help="Specify LAST path")
    p.add_option(
        "--mask", default=False, action="store_true", help="Invoke -c in lastdb"
    )
    p.add_option(
        "--format",
        default="BlastTab",
        choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
        help="Output format",
    )
    p.add_option(
        "--minlen",
        default=0,
        type="int",
        help="Filter alignments by how many bases match",
    )
    p.add_option("--minid", default=0, type="int", help="Minimum sequence identity")
    p.set_cpus()
    p.set_outdir()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    if not dbtype:
        dbtype = opts.dbtype
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(
        infile=subject,
        outfile=subjectdb + ".prj",
        mask=opts.mask,
        lastdb_bin=lastdb_bin,
        dbtype=dbtype,
    )
    # -u 2 pairs with lastdb -c masking; -i3G batches 3GB of queries
    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    # Fix: lastal score options (-q/-a/-b) take integers; under Python 3 the
    # original `/` produced a float (e.g. "-q0.25"). Floor division restores
    # the intended Python 2 integer semantics.
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
    sh(cmd, outfile=lastfile)

    return lastfile
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    # Index every GFF under a short unique name; collect gene extents and
    # the mRNA children per (gene, dataset)
    gffdbx = {}
    gene_coords = {}
    mrna = AutoVivification()
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                order_by=('seqid', 'start')):
            if gene.id not in gene_coords:
                gene_coords[gene.id] = []
            gene_coords[gene.id].extend([gene.start, gene.stop])
            c = list(gffdbx[dbn].children(gene, featuretype='mRNA',
                order_by='start'))
            if len(c) > 0:
                mrna[gene.id][dbn] = c

    # Fix: converted Python 2 `print >>` statements (SyntaxError on py3)
    # to the print() function used elsewhere in this file
    fw = must_open(opts.outfile, "w")
    print("##gff-version 3", file=fw)
    summary = ["id"]
    summary.extend(gffdbx.keys())
    print("\t".join(str(x) for x in summary), file=sys.stderr)

    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            # Compare every mRNA pair across dataset pairs; group isoforms
            # whose CDS (and optionally both UTRs) match
            for dbn1, dbn2 in dbns:
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    g.join((dbn1, mrna1.id))
                    g.join((dbn2, mrna2.id))
                    fUTR, tUTR = None, None
                    if match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2]):
                        fUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1],
                                gffdbx[dbn2], featuretype='five_prime_UTR',
                                slop=slop)
                        tUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1],
                                gffdbx[dbn2], featuretype='three_prime_UTR',
                                slop=slop)
                    if fUTR and tUTR:
                        g.join((dbn1, mrna1.id), (dbn2, mrna2.id))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id))

        # Fix: dict views are not subscriptable on py3; was `.keys()[0]`
        dbn = list(mrna[gene])[0]
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        print(_gene, file=fw)

        logging.debug(list(g))
        for group in g:
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            if slop:
                # With slop, pick the longest representative mRNA
                mlen = 0
                for D, M in zip(dbs, mrnas):
                    _mrna = gffdbx[D][M]
                    _mlen = (_mrna.stop - _mrna.start) + 1
                    if _mlen > mlen:
                        d, m, mlen = D, M, _mlen

            dbid, _mrnaid = "".join(str(x) for x in set(dbs)), []
            _mrnaid = [x for x in mrnas if x not in _mrnaid]
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            children = gffdbx[d].children(m, order_by='start')
            print(_mrna, file=fw)
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print(child, file=fw)

            # Per-group presence/absence summary to stderr
            summary = [mrnaid]
            summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
            print("\t".join(str(x) for x in summary), file=sys.stderr)

    fw.close()
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
    1. Fetches a protein pair.
    2. Aligns the protein pair with clustalw (default) or muscle.
    3. Convert the output to Fasta format.
    4. Use this alignment info to align gene sequences using PAL2NAL
    5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option(
        "--longest",
        action="store_true",
        help="Get longest ORF, only works if no pep file, e.g. ESTs",
    )
    p.add_option(
        "--msa",
        default="clustalw",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins",
    )
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    # Without a protein file, translate the CDS (optionally longest ORF)
    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    # Passing the same iterator twice to zip() yields consecutive record
    # pairs: (rec1, rec2), (rec3, rec4), ...
    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in zip(
        prot_iterator, prot_iterator, dna_iterator, dna_iterator
    ):
        print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        # PAL2NAL: project the protein alignment back onto the codons
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            # yn00 gives both Yang-Nielsen and Nei-Gojobori estimates
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(
                mrtrans_fasta, work_dir
            )
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write(
                    "%s\n"
                    % (
                        ",".join(
                            str(x)
                            for x in (
                                pair_name,
                                ds_subs_yn,
                                dn_subs_yn,
                                ds_subs_ng,
                                dn_subs_ng,
                            )
                        )
                    )
                )
                output_h.flush()

    # Clean-up PAML/yn00 scratch files
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the
    annotations first before starting annotation comparison. Otherwise, it uses
    previously loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands
    without executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts, = args[:3]
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts",
            "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Write the annotCompare config populated with the comparison cutoffs.
    # Fix: converted Python 2 `print >>` statement (SyntaxError on py3).
    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db),
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL,
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene,
            opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    # Prefer the seqclean-ed transcripts when available
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(
            launch_pasa, acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)
    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands
    without executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format(
                    "|".join(ALLOWED_ALIGNERS)) +
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities",
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts",
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts",
            "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    # Fix: compreh_pctid was referenced below but never assigned (NameError
    # whenever --compreh ran with a genome-guided fasta)
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        # Merge de novo + genome-guided transcripts; record the de novo
        # accessions so PASA can distinguish them via --TDN
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(
            dnfasta, accn_extract, tdn)
        if prepare:
            write_file(runfile, accn_extract_cmd, append=True)
        else:
            sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    # Write the alignAssembly config.
    # Fix: converted Python 2 `print >>` statement (SyntaxError on py3).
    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
            pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    if clean:
        aacmd += " -t {0}.clean -T -u {0} ".format(transcripts)
    else:
        aacmd += " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(
            ",".join(aligners), opts.intron, cpus)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid  # chain grid jobs: wait for seqclean
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(
            build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(
            compreh_pctid, compreh_pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and download records from Genbank. Below is implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Record with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. Template file
    is the Genbank sbt template. See jcvi.formats.sbt for generation of such
    files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads genbank and asn.1 format,
    and generate the phase file and the names file (use formats.agp.phase() and
    apps.gbsubmit.asn(), respectively). These get automatically run.

    However, use --phases if the genbank files contain outdated information.
    For example, the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to
    override.
    """
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.entrez import fetch

    p = OptionParser(htg.__doc__)
    p.add_option("--phases", default=None,
            help="Use another phasefile to override [default: %default]")
    p.add_option("--comment", default="",
            help="Comments for this update [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    # Download asn.1 records and derive the accession -> seq_name mapping
    asndir = "asn.1"
    mkdir(asndir)
    fetch([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) +
            ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        # Download genbank records and derive phase info automatically
        gbdir = "gb"
        mkdir(gbdir)
        fetch([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) +
                ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    newphasefw = open(newphasefile, "w")
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "{qualifiers}"'
    acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    for row in open(phasefile):
        atoms = row.rstrip().split("\t")
        # see formats.agp.phase() for column contents
        accession, phase, clone = atoms[0], atoms[1], atoms[-1]
        fafile = op.join(fastadir, accession + ".fa")
        accession_nv = accession.split(".", 1)[0]

        newid = names[accession_nv]
        newidopt = "--newid={0}".format(newid)
        cloneopt = "--clone={0}".format(clone)
        splitfile, gaps = sequin([fafile, newidopt, cloneopt])
        splitfile = op.basename(splitfile)
        phase = int(phase)
        assert phase in (1, 2, 3)

        # Adjust phase from observed gaps: no gaps => finished (Phase 3);
        # gaps present cannot be Phase 3
        oldphase = phase
        if gaps == 0 and phase != 3:
            phase = 3

        if gaps != 0 and phase == 3:
            phase = 2

        # Fix: converted Python 2 `print >>` statements (SyntaxError on py3)
        print("{0}\t{1}\t{2}".format(accession_nv, oldphase, phase),
                file=newphasefw)
        newph.append(phase)

        qualifiers = qq.format(phase=phase)
        if ";" in clone:
            qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

        cmd = acmd.format(accession=accession, accession_nv=accession_nv,
                sqndir=sqndir, sbtfile=sbtfile, splitfile=splitfile,
                qualifiers=qualifiers, comment=comment)
        sh(cmd)

        verify_sqn(sqndir, accession)
        nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus is found in
    `output.subclusters.out`. In particular the lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
            help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    fw = open(idsfile, "w")
    sizes = Sizes(fastafile).mapping

    name_convert = lambda x: x.replace("asmbl", opts.prefix)

    keep = set()  # List of IDs to write
    fp = open(subclusters)
    nrecs = 0
    for row in fp:
        if not row.startswith("sub-cluster:"):
            continue
        asmbls = row.split()[1:]
        longest_asmbl = max(asmbls, key=lambda x: sizes[x])
        longest_size = sizes[longest_asmbl]
        # Fix: converted Python 2 `print >>` statement (SyntaxError on py3)
        print(name_convert(longest_asmbl), file=fw)
        nrecs += 1
        # Fix: floor division keeps the cutoff an int as under Python 2;
        # `/` would shift the >= boundary by 0.5 for odd sizes
        cutoff = max(longest_size // 2, 200)
        keep.update(set(x for x in asmbls if sizes[x] >= cutoff))

    fw.close()
    logging.debug("{0} fl-cDNA records written to `{1}`.".format(
        nrecs, idsfile))

    # Second pass: emit the kept records with renamed IDs
    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    fw = open(newfastafile, "w")
    nrecs = 0
    for name, rec in f.iteritems_ordered():
        if name not in keep:
            continue

        rec.id = name_convert(name)
        rec.description = ""
        SeqIO.write([rec], fw, "fasta")
        nrecs += 1

    fw.close()
    logging.debug("{0} valid records written to `{1}`.".format(
        nrecs, newfastafile))
def names(args):
    """
    %prog names namelist templatefile

    Generate name blocks from the `namelist` file. The `namelist` file is
    tab-delimited that contains >=4 columns of data. Three columns are mandatory.
    First name, middle initial and last name. First row is table header. For the
    extra columns, the first column will go in the `$N0` field in the template
    file, second to the `$N1` field, etc.

    In the alternative mode, the namelist just contains several sections. First
    row will go in the `$N0` in the template file, second to the `$N1` field.
    The namelist may look like:

    [Sequence]
    Bruce A. Roe, Frederic Debelle, Giles Oldroyd, Rene Geurts

    [Manuscript]
    Haibao Tang1, Vivek Krishnakumar1, Shelby Bidwell1, Benjamin Rosen1

    Then in this example Sequence section goes into N0, Manuscript goes into N1.

    Useful hints for constructing the template file can be found in:
    <http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/asn_spec/seq.asn.html>

    Often the template file can be retrieved from web form:
    <http://www.ncbi.nlm.nih.gov/WebSub/template.cgi>
    """
    p = OptionParser(names.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    namelist, templatefile = args

    # First check the alternative (sectioned) format
    if open(namelist).read()[0] == '[':
        out = parse_names(namelist)
        make_template(templatefile, out)
        return

    reader = csv.reader(open(namelist), delimiter="\t")
    # Fix: `reader.next()` is Python 2 only; use the builtin next()
    header = next(reader)
    ncols = len(header)
    assert ncols > 3
    nextras = ncols - 3

    blocks = []
    bools = []
    for row in reader:
        first, middle, last = row[:3]
        extras = row[3:]
        # Each extra column is a Y/N flag selecting this person for list Ni
        bools.append([(x.upper() == 'Y') for x in extras])
        middle = middle.strip()
        if middle != "":
            middle = middle.rstrip('.') + '.'
        initials = "{0}.{1}".format(first[0], middle)
        suffix = ""
        nameblock = NameTemplate.format(last=last, first=first,
                initials=initials, suffix=suffix)
        blocks.append(nameblock)

    # Transpose: one tuple of flags per extra column
    selected_idx = zip(*bools)
    # Fix: `[] * nextras` always evaluates to []; use a plain list and
    # append one joined block per extra column
    out = []
    for i, sbools in enumerate(selected_idx):
        selected = []
        for b, ss in zip(blocks, sbools):
            if ss:
                selected.append(b)
        bigblock = ",\n".join(selected)
        out.append(bigblock)
        logging.debug("List N{0} contains a total of {1} names.".format(
            i, len(selected)))

    make_template(templatefile, out)
def gss(args):
    """
    %prog gss fastafile plateMapping

    Generate sequence files and metadata templates suited for gss submission.
    The FASTA file is assumed to be exported from the JCVI data delivery folder
    which looks like:

    >1127963806024 /library_name=SIL1T054-B-01-120KB /clear_start=0
    /clear_end=839 /primer_id=1049000104196 /trace_id=1064147620169
    /trace_file_id=1127963805941 /clone_insert_id=1061064364776
    /direction=reverse /sequencer_run_id=1064147620155
    /sequencer_plate_barcode=B906423 /sequencer_plate_well_coordinates=C3
    /sequencer_plate_96well_quadrant=1 /sequencer_plate_96well_coordinates=B02
    /template_plate_barcode=CC0251602AB /growth_plate_barcode=BB0273005AB
    AGCTTTAGTTTCAAGGATACCTTCATTGTCATTCCCGGTTATGATGATATCATCAAGATAAACAAGAATG
    ACAATGATACCTGTTTGGTTCTGAAGTGTAAAGAGGGTATGTTCAGCTTCAGATCTTCTAAACCCTTTGT
    CTAGTAAGCTGGCACTTAGCTTCCTATACCAAACCCTTTGTGATTGCTTCAGTCCATAAATTGCCTTTTT

    Plate mapping file maps the JTC `sequencer_plate_barcode` to external IDs.
    For example:

    B906423 SIL-001
    """
    p = OptionParser(gss.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    fastafile, mappingfile = args
    seen = defaultdict(int)   # gssID -> number of times emitted (dedup counter)
    clone = defaultdict(set)  # cloneID -> set of gssIDs sequenced from it
    plateMapping = DictFile(mappingfile)

    # NOTE(review): `vars` here must be a module-level dict defined elsewhere in
    # this file (it shadows the `vars` builtin) -- confirm before refactoring.
    fw = open("MetaData.txt", "w")
    print >> fw, PublicationTemplate.format(**vars)
    print >> fw, LibraryTemplate.format(**vars)
    print >> fw, ContactTemplate.format(**vars)
    logging.debug("Meta data written to `{0}`".format(fw.name))

    fw = open("GSS.txt", "w")
    fw_log = open("GSS.log", "w")
    for rec in SeqIO.parse(fastafile, "fasta"):
        # First pass just check well number matchings and populate sequences in
        # the same clone
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        sequencer_plate_96well_quadrant = \
            a["sequencer_plate_96well_quadrant"][0]
        sequencer_plate_96well_coordinates = \
            a["sequencer_plate_96well_coordinates"][0]

        # Check the 96-well ID is correctly converted to 384-well ID
        w96 = sequencer_plate_96well_coordinates
        w96quad = int(sequencer_plate_96well_quadrant)
        w384 = sequencer_plate_well_coordinates
        assert convert_96_to_384(w96, w96quad) == w384

        # Map the internal JTC plate barcode to the external plate ID
        plate = sequencer_plate_barcode
        assert plate in plateMapping, \
            "{0} not found in `{1}` !".format(plate, mappingfile)

        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            # Disambiguate repeated reads of the same clone/direction by
            # appending the occurrence count
            gssID = "{0}{1}".format(gssID, seen[gssID])

        seen[gssID] += 1
        clone[cloneID].add(gssID)

    # Second pass: reset the counter and emit one GSS record per read,
    # reproducing the same gssID assignment as the first pass
    seen = defaultdict(int)
    for rec in SeqIO.parse(fastafile, "fasta"):
        # need to populate gssID, mateID, cloneID, seq, plate, row, column
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        w384 = sequencer_plate_well_coordinates

        plate = sequencer_plate_barcode
        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            logging.error("duplicate key {0} found".format(gssID))
            gssID = "{0}{1}".format(gssID, seen[gssID])

        # Sibling reads of the same clone, referenced in the GSS template
        othergss = clone[cloneID] - set([gssID])
        othergss = ", ".join(sorted(othergss))
        vars.update(locals())

        print >> fw, GSSTemplate.format(**vars)

        # Write conversion logs to log file
        print >> fw_log, "{0}\t{1}".format(gssID, description)
        print >> fw_log, "=" * 60

    logging.debug("A total of {0} seqs written to `{1}`".\
            format(len(seen), fw.name))
    fw.close()
    fw_log.close()
def pairinplace(args): """ %prog pairinplace bulk.fastq Pair up the records in bulk.fastq by comparing the names for adjancent records. If they match, print to bulk.pairs.fastq, else print to bulk.frags.fastq. """ from jcvi.utils.iter import pairwise p = OptionParser(pairinplace.__doc__) p.set_rclip() p.set_tag() p.add_option("--base", help="Base name for the output files [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastqfile, = args base = opts.base or op.basename(fastqfile).split(".")[0] frags = base + ".frags.fastq" pairs = base + ".pairs.fastq" if fastqfile.endswith(".gz"): frags += ".gz" pairs += ".gz" fragsfw = must_open(frags, "w") pairsfw = must_open(pairs, "w") N = opts.rclip tag = opts.tag strip_name = (lambda x: x[:-N]) if N else None fh_iter = iter_fastq(fastqfile, key=strip_name) skipflag = False # controls the iterator skip for a, b in pairwise(fh_iter): if b is None: # hit the eof break if skipflag: skipflag = False continue if a.name == b.name: if tag: a.name += "/1" b.name += "/2" print >> pairsfw, a print >> pairsfw, b skipflag = True else: print >> fragsfw, a # don't forget the last one, when b is None if not skipflag: print >> fragsfw, a logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags)) return pairs
def htgnew(args): """ %prog htgnew fastafile phasefile template.sbt Prepare sqnfiles for submitting new Genbank HTG records. `fastafile` contains the sequences. `phasefile` contains the phase information, it is a two column file: mth2-45h12 3 `template.sbt` is the Genbank submission template. This function is simpler than htg, since the record names have not be assigned yet (so less bookkeeping). """ from jcvi.formats.fasta import sequin p = OptionParser(htgnew.__doc__) p.add_option("--comment", default="", help="Comments for this submission [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) fastafile, phasefile, sbtfile = args comment = opts.comment fastadir = "fasta" sqndir = "sqn" mkdir(fastadir) mkdir(sqndir) cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir) sh(cmd, outfile="/dev/null", errfile="/dev/null") acmd = 'tbl2asn -a z -p fasta -r {sqndir}' acmd += ' -i {splitfile} -t {sbtfile} -C tigr' acmd += ' -j "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"' acmd += ' -o {sqndir}/{accession_nv}.sqn -V Vbr' acmd += ' -y "{comment}" -W T -T T' nupdated = 0 for row in open(phasefile): name, phase = row.split()[:2] fafile = op.join(fastadir, name + ".fa") cloneopt = "--clone={0}".format(name) splitfile, gaps = sequin([fafile, cloneopt]) splitfile = op.basename(splitfile) accession = accession_nv = name phase = int(phase) assert phase in (1, 2, 3) cmd = acmd.format(accession_nv=accession_nv, sqndir=sqndir, sbtfile=sbtfile, splitfile=splitfile, phase=phase, comment=comment) sh(cmd) verify_sqn(sqndir, accession) nupdated += 1 print >> sys.stderr, "A total of {0} records updated.".format(nupdated)
def phytozome(args): """ %prog phytozome species Retrieve genomes and annotations from phytozome using Globus API. Available species listed below. Use comma to give a list of species to download. For example: $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum The downloader will prompt you to enter Phytozome user name and password during downloading. Please register for a login at: https://phytozome.jgi.doe.gov/pz/portal.html. """ from jcvi.apps.biomart import GlobusXMLParser p = OptionParser(phytozome.__doc__) p.add_option( "--version", default="12", choices=("9", "10", "11", "12", "12_unrestricted"), help="Phytozome version", ) p.add_option( "--assembly", default=False, action="store_true", help="Download assembly [default: %default]", ) p.add_option( "--format", default=False, action="store_true", help="Format to CDS and BED for synteny inference", ) opts, args = p.parse_args(args) cookies = get_cookies() directory_listing = ".phytozome_directory_V{}.xml".format(opts.version) # Get directory listing base_url = "http://genome.jgi.doe.gov" dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format( base_url, opts.version) d = download(dlist, filename=directory_listing, cookies=cookies) g = GlobusXMLParser(directory_listing) genomes = g.get_genomes() valid_species = genomes.keys() species_tile = tile(valid_species) p.set_usage("\n".join((phytozome.__doc__, species_tile))) if len(args) != 1: sys.exit(not p.print_help()) species, = args if species == "all": species = ",".join(valid_species) species = species.split(",") for s in species: res = download_species_phytozome(genomes, s, valid_species, base_url, cookies, assembly=opts.assembly) if not res: logging.error("No files downloaded") gff, fa = res.get("gff"), res.get("cds") if opts.format: format_bed_and_cds(s, gff, fa)
def main(): p = OptionParser(__doc__) p.add_option("--switch", help="Rename the seqid with two-column file") p.add_option("--tree", help="Display trees on the bottom of the figure") p.add_option("--extra", help="Extra features in BED format") p.add_option( "--genelabelsize", default=0, type="int", help="Show gene labels at this font size, useful for debugging. " + "However, plot may appear visually crowded. " + "Reasonably good values are 2 to 6 [Default: disabled]", ) p.add_option( "--scalebar", default=False, action="store_true", help="Add scale bar to the plot", ) p.add_option( "--glyphstyle", default="box", choices=Glyph.Styles, help="Style of feature glyphs", ) p.add_option( "--shadestyle", default="curve", choices=Shade.Styles, help="Style of syntenic wedges", ) opts, args, iopts = p.set_image_options(figsize="8x7") if len(args) != 3: sys.exit(not p.print_help()) datafile, bedfile, layoutfile = args switch = opts.switch tree = opts.tree pf = datafile.rsplit(".", 1)[0] fig = plt.figure(1, (iopts.w, iopts.h)) root = fig.add_axes([0, 0, 1, 1]) Synteny( fig, root, datafile, bedfile, layoutfile, switch=switch, tree=tree, extra_features=opts.extra, genelabelsize=opts.genelabelsize, scalebar=opts.scalebar, shadestyle=opts.shadestyle, glyphstyle=opts.glyphstyle, ) root.set_xlim(0, 1) root.set_ylim(0, 1) root.set_axis_off() image_name = pf + "." + iopts.format savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def astat(args): """ %prog astat coverage.log Create coverage-rho scatter plot. """ p = OptionParser(astat.__doc__) p.add_option("--cutoff", default=1000, type="int", help="Length cutoff [default: %default]") p.add_option("--genome", default="", help="Genome name [default: %default]") p.add_option("--arrDist", default=False, action="store_true", help="Use arrDist instead [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) covfile, = args cutoff = opts.cutoff genome = opts.genome plot_arrDist = opts.arrDist suffix = ".{0}".format(cutoff) small_covfile = covfile + suffix update_covfile = need_update(covfile, small_covfile) if update_covfile: fw = open(small_covfile, "w") else: logging.debug("Found `{0}`, will use this one".format(small_covfile)) covfile = small_covfile fp = open(covfile) header = fp.next() if update_covfile: fw.write(header) data = [] msg = "{0} tigs scanned ..." for row in fp: tigID, rho, covStat, arrDist = row.split() tigID = int(tigID) if tigID % 1000000 == 0: sys.stderr.write(msg.format(tigID) + "\r") rho, covStat, arrDist = [float(x) for x in (rho, covStat, arrDist)] if rho < cutoff: continue if update_covfile: fw.write(row) data.append((tigID, rho, covStat, arrDist)) print >> sys.stderr, msg.format(tigID) from jcvi.graphics.base import plt, savefig logging.debug("Plotting {0} data points.".format(len(data))) tigID, rho, covStat, arrDist = zip(*data) y = arrDist if plot_arrDist else covStat ytag = "arrDist" if plot_arrDist else "covStat" fig = plt.figure(1, (7, 7)) ax = fig.add_axes([.12, .1, .8, .8]) ax.plot(rho, y, ".", color="lightslategrey") xtag = "rho" info = (genome, xtag, ytag) title = "{0} {1} vs. {2}".format(*info) ax.set_title(title) ax.set_xlabel(xtag) ax.set_ylabel(ytag) if plot_arrDist: ax.set_yscale('log') imagename = "{0}.png".format(".".join(info)) savefig(imagename, dpi=150)
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed the download.
    """
    p = OptionParser(entrez.__doc__)

    # Each output format is only valid against a subset of Entrez databases
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein",
                       "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format [default: %default]",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database [default: %default]",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return [default: %default]",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence [default: %default]",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up [default: %default]",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        # A file of terms, one per line
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (database in allowed_databases[fmt]
            ), "For output format '{0}', allowed databases are: {1}".format(
                fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True,
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
    ):
        if outdir:
            # FIX: join with os.path, not urljoin -- urljoin() on a plain
            # directory name silently drops the directory component, so the
            # files landed in the current directory instead of outdir
            outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True,
                           skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            # FIX: report the duplicate accession id, not the whole record
            logging.error("Duplicate key ({0}) found".format(id))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        print(
            "A total of {0} {1} records downloaded.".format(
                totalsize, fmt.upper()),
            file=sys.stderr,
        )

    return outfile
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
            help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
            help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int",
            help="Overlap between reads must be at least [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
            help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    # Output is FRG by default; FASTA when --fasta is given
    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        # Read length is capped so at least `shift` bp remains for sliding
        shredlen = min(seqlen - shift, readlen)
        # NOTE: Python-2 integer division -- numreads is an int read count
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            # Depth 1: simple tiling with fixed overlap of `shift` bp
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                # Spread read start positions evenly over the sliding range;
                # float step, truncated per-read, skipping exact duplicates
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
def wgsim(args): """ %prog wgsim fastafile Run dwgsim on fastafile. """ p = OptionParser(wgsim.__doc__) p.add_option("--erate", default=.02, type="float", help="Base error rate of the read [default: %default]") p.add_option( "--distance", default=500, type="int", help="Outer distance between the two ends [default: %default]") p.add_option("--genomesize", type="int", help="Genome size in Mb [default: estimate from data]") p.add_option("--readlen", default=100, type="int", help="Length of the read [default: %default]") p.add_option("--noerrors", default=False, action="store_true", help="Simulate reads with no errors [default: %default]") p.set_depth(depth=10) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastafile, = args pf = fastafile.split(".")[0] genomesize = opts.genomesize size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize depth = opts.depth readlen = opts.readlen readnum = size * depth / (2 * readlen) distance = opts.distance stdev = distance / 5 outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth) distance -= 2 * readlen # Outer distance => Inner distance assert distance >= 0, "Outer distance must be >= 2 * readlen" logging.debug("Total genome size: {0} bp".format(size)) logging.debug("Target depth: {0}x".format(depth)) logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum)) if opts.noerrors: opts.erate = 0 cmd = "dwgsim -e {0} -E {0}".format(opts.erate) if opts.noerrors: cmd += " -r 0 -R 0 -X 0 -y 0" cmd += " -d {0} -s {1}".format(distance, stdev) cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen) cmd += " {0} {1}".format(fastafile, outpf) sh(cmd)
def simulate(args): """ %prog simulate run_dir 1 300 Simulate BAMs with varying inserts with dwgsim. The above command will simulate between 1 to 300 CAGs in the HD region, in a directory called `run_dir`. """ p = OptionParser(simulate.__doc__) p.add_option("--ref", default="/Users/htang/projects/ref/hg38.upper.fa", help="Reference genome sequence") add_simulate_options(p) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) rundir, startunits, endunits = args startunits, endunits = int(startunits), int(endunits) basecwd = os.getcwd() mkdir(rundir) os.chdir(rundir) cwd = os.getcwd() # Huntington region pad_left, pad_right = 1000, 10000 chr, start, end = 'chr4', 3074877, 3074933 fasta = Fasta(opts.ref) seq_left = fasta[chr][start - pad_left:start - 1] seq_right = fasta[chr][end:end + pad_right] motif = 'CAG' reffastafile = "ref.fasta" seq = str(fasta[chr][start - pad_left:end + pad_right]) make_fasta(seq, reffastafile, id=chr.upper()) # Write fake sequence for units in range(startunits, endunits + 1): pf = str(units) mkdir(pf) os.chdir(pf) seq = str(seq_left) + motif * units + str(seq_right) fastafile = pf + ".fasta" make_fasta(seq, fastafile, id=chr.upper()) # Simulate reads on it wgsim([ fastafile, "--depth={}".format(opts.depth), "--readlen={}".format(opts.readlen), "--distance={}".format(opts.distance), "--outfile={}".format(pf) ]) read1 = pf + ".bwa.read1.fastq" read2 = pf + ".bwa.read2.fastq" samfile, _ = align(["../{}".format(reffastafile), read1, read2]) indexed_samfile = index([samfile]) sh("mv {} ../{}.bam".format(indexed_samfile, pf)) sh("mv {}.bai ../{}.bam.bai".format(indexed_samfile, pf)) os.chdir(cwd) shutil.rmtree(pf) os.chdir(basecwd)
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=0, type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
                 help="Minimum read length allowed")
    p.add_option("--readname", default=False, action="store_true",
                 help="Keep read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        # If any record exceeds maxreadlen, split the FASTA into chunks and
        # recurse on each with splitting disabled
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".\
                        format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    # Non-zero --size means a mated (paired) library
    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate[:2].upper()

    frgfile = libname + ".frg"

    if opts.clean:
        # Optionally canonicalize sequence characters before conversion
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated path: delegate to convert-fasta-to-v2.pl with a qual file
        # and a matepairs file, then return
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname,
                        fastafile, qualfile)
        # NOTE(review): this inner `if mated:` is always true inside the
        # outer mated branch -- likely a leftover; confirm intended nesting
        if mated:
            cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated path: emit fragments directly into the frg file
    fw = must_open(frgfile, "w")
    print >> fw, headerTemplate.format(libID=libname)

    sequential = not opts.readname  # renumber reads unless --readname
    f = Fasta(fastafile, lazy=True)
    i = j = 0  # i = written, j = discarded (too short)
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".\
            format(i, frgfile, j))
def diagram(args):
    """
    %prog diagram

    Plot the predictive power of various evidences.
    """
    p = OptionParser(diagram.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x4")

    if len(args) != 0:
        sys.exit(not p.print_help())

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge on top, this is log-scale
    yy = .7
    yinterval = .1
    height = .05
    yp = yy - yinterval - height
    canvas = .95
    xstart = .025
    # Map a repeat length in bp (0..1200) onto the horizontal canvas
    convert = lambda x: xstart + x * canvas / 1200

    # Symbols
    root.text(.5, .9, r"$L$: Read length, $F$: Flank size, $V$: Pair distance",
              ha="center")
    root.text(.5, .85, r"ex. $L=150bp, F=9bp, V=500bp$", ha="center")
    root.text(xstart + canvas, yy - height, "STR repeat length", ha="center",
              color=lsg, size=10)

    # Draw the gauge arrow across the top
    pad = .02
    arrowlen = canvas * 1.05
    arrowprops = dict(length_includes_head=True, width=.01, fc=lsg, lw=0,
                      head_length=arrowlen * .12, head_width=.04)
    arrow = FancyArrow(xstart, yy, arrowlen, 0, shape="right", **arrowprops)
    root.add_patch(arrow)

    # Mark the key events: (tick position, label position, row index, label);
    # a negative row index means tick + label only, no drop line
    ppad = 30
    keyevents = (
        (0, 0, -1, r"$0$"),
        (150 - 18, 150 - 18 - ppad, 0, r"$L - 2F$"),
        (150 - 9, 150 - 9, 1, r"$L - F$"),
        (150, 150 + ppad, 2, r"$L$"),
        (500 - 9, 500 - 9, 3, r"$V - F$"),
        (500 * 2 - 18, 500 * 2 - 18, 2, r"$2(V - F)$"),
    )
    for event, pos, i, label in keyevents:
        ex = convert(event)
        px = convert(pos)
        root.plot((ex, ex), (yy - height / 4, yy + height / 4),
                  '-', color='k')
        root.text(px, yy + pad, label, rotation=45, va="bottom", size=8)
        if i < 0:
            continue
        # Dotted drop line down to the evidence row this event bounds
        ystart = yp - i * yinterval
        root.plot((ex, ex), (ystart, yy - height / 4), ':', color=lsg)

    # Range on bottom. These are simple 4 rectangles, with the range indicating
    # the predictive range.
    CLOSED, OPEN = range(2)
    ranges = (
        (0, 150 - 18, CLOSED, "Spanning reads"),
        (9, 150 - 9, OPEN, "Partial reads"),
        (150, 500 * 2 - 18, CLOSED, "Repeat reads"),
        (0, 500 - 9, CLOSED, "Paired-end reads"),
    )
    for start, end, starttag, label in ranges:
        x0 = convert(start)
        x1 = convert(end)
        # Gradient direction encodes whether the left edge is open or closed
        if starttag == OPEN:
            grad = [[0., 1.], [0., 1.]]
        else:
            grad = [[1., 0.], [1., 0.]]
        root.imshow(grad, interpolation='bicubic', cmap=plt.cm.Greens,
                    extent=[x0, x1, yp, yp + height])
        root.text(x1 + pad, yp + height / 2, label, va="center")
        yp -= yinterval

    normalize_axes(root)

    image_name = "diagram." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def evidences(args): """ %prog evidences Plot distribution of evidences against two factors: - Sample mean coverage - Longer allele """ p = OptionParser(evidences.__doc__) p.add_option("--csv", default="hli.20170328.tred.tsv", help="TRED csv output to plot") opts, args, iopts = p.set_image_options(args, format="pdf") if len(args) != 0: sys.exit(not p.print_help()) format = iopts.format # Extract sample coverage first df = pd.read_csv("qc-export-MeanCoverage.csv", header=None, names=["Samplekey", "MeanCoverage"], index_col=0) # Find coverage for HD xf = pd.read_csv(opts.csv, sep="\t", index_col=0) dp = {} tred = "HD" for sk, row in xf.iterrows(): sk = str(sk) a1 = row[tred + ".1"] a2 = row[tred + ".2"] fdp = row[tred + ".FDP"] pdp = row[tred + ".PDP"] pedp = row[tred + ".PEDP"] dp[sk] = (a1, a2, fdp, pdp, pedp) # Build a consolidated dataframe ef = pd.DataFrame.from_dict(dp, orient="index") ef.columns = [ tred + ".1", tred + ".2", tred + ".FDP", tred + ".PDP", tred + ".PEDP" ] ef.index.name = "SampleKey" mf = df.merge(ef, how="right", left_index=True, right_index=True) # Plot a bunch of figures outdir = "output" mkdir(outdir) xlim = ylim = (0, 100) draw_jointplot(outdir + "/A", "MeanCoverage", "HD.FDP", data=mf, xlim=xlim, ylim=ylim, format=format) draw_jointplot(outdir + "/B", "MeanCoverage", "HD.PDP", data=mf, color='g', xlim=xlim, ylim=ylim, format=format) draw_jointplot(outdir + "/C", "MeanCoverage", "HD.PEDP", data=mf, color='m', xlim=xlim, ylim=ylim, format=format) xlim = (0, 50) draw_jointplot(outdir + "/D", "HD.2", "HD.FDP", data=mf, xlim=xlim, ylim=ylim, format=format) draw_jointplot(outdir + "/E", "HD.2", "HD.PDP", data=mf, color='g', xlim=xlim, ylim=ylim, format=format) draw_jointplot(outdir + "/F", "HD.2", "HD.PEDP", data=mf, color='m', xlim=xlim, ylim=ylim, format=format)