def draw_cytoband(
    ax, chrom, filename=datafile("hg38.band.txt"), ymid=0.5, width=0.99, height=0.11
):
    """
    Draw a chromosome ideogram for `chrom` on matplotlib axes `ax`.

    Bands are read from `filename` (UCSC-style cytoband table with columns
    #chrom, start, end, name, gieStain) and colored via get_color(). The two
    "acen" (centromere) bands are drawn as triangles that meet at the
    chromosome midline `ymid`; all other bands are rectangles. Band names are
    printed above the ideogram and the chromosome name below it. The axes are
    set to a unit coordinate system and turned off.
    """
    import pandas as pd

    bands = pd.read_csv(filename, sep="\t")
    chrombands = bands[bands["#chrom"] == chrom]
    # Each record: (seqid, start, end, band_name, gie_stain).
    # itertuples avoids the fragile Series-unpacking of iterrows and does
    # not shadow the builtin `chr`.
    data = [tuple(rec) for rec in chrombands.itertuples(index=False)]
    chromsize = max(x[2] for x in data)  # chromosome length = max band end
    scale = width * 1.0 / chromsize
    xstart, ystart = (1 - width) / 2, ymid - height / 2

    def bp_to_pos(bp):
        # Convert a base-pair coordinate into an axes x-position.
        return xstart + bp * scale

    in_acen = False
    for seqid, start, end, name, gie in data:
        color, alpha = get_color(gie)
        bplen = end - start
        if "acen" in gie:
            # First acen band points right, second points left, so the two
            # triangles meet at the centromere midline.
            if in_acen:
                xys = [
                    (bp_to_pos(start), ymid),
                    (bp_to_pos(end), ystart),
                    (bp_to_pos(end), ystart + height),
                ]
            else:
                xys = [
                    (bp_to_pos(start), ystart),
                    (bp_to_pos(start), ystart + height),
                    (bp_to_pos(end), ymid),
                ]
            p = Polygon(xys, closed=True, ec="k", fc=color, alpha=alpha)
            in_acen = True
        else:
            p = Rectangle(
                (bp_to_pos(start), ystart),
                bplen * scale,
                height,
                ec="k",
                fc=color,
                alpha=alpha,
            )
        ax.add_patch(p)
        ax.text(
            bp_to_pos((start + end) / 2),
            ymid + height * 0.8,
            name,
            rotation=40,
            color="lightslategray",
        )

    ax.text(0.5, ystart - height, chrom, size=16, ha="center", va="center")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_axis_off()
def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    from jcvi.apps.base import datafile

    p = OptionParser(treds.__doc__)
    p.add_option(
        "--csv", default=False, action="store_true", help="Also write `meta.csv`"
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (tredresults,) = args
    df = pd.read_csv(tredresults, sep="\t")

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    ids = list(tf["id"])
    tags = ["SampleKey"]
    final_columns = ["SampleKey"]
    afs = []
    for td, id in zip(tds, ids):
        tag1 = "{}.1".format(td)
        tag2 = "{}.2".format(td)
        # TREDs absent from the results table get an empty frequency dict
        # so `allele_frequency` stays aligned with the metadata rows.
        if tag2 not in df:
            afs.append("{}")
            continue
        tags.append(tag2)
        final_columns.append(id)
        a = np.array(list(df[tag1]) + list(df[tag2]))
        counts = alleles_to_counts(a)
        af = counts_to_af(counts)
        afs.append(af)

    tf["allele_frequency"] = afs
    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    tf.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))
    if opts.csv:
        metacsvfile = metafile.rsplit(".", 1)[0] + ".csv"
        tf.to_csv(metacsvfile, index=False)
        logging.debug("File `{}` written.".format(metacsvfile))

    pp = df[tags]
    pp.columns = final_columns
    # Renamed from `datafile` to avoid shadowing the imported datafile()
    # helper used above.
    datatsv = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
    pp.to_csv(datatsv, sep="\t", index=False)
    logging.debug("File `{}` written.".format(datatsv))

    mask([datatsv, metafile])
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = ["HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2", "FXTAS"]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.add_option("--ref", choices=db_choices, default="hg38", help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile,) = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # .ix was removed in pandas 1.0; .loc is the label-based replacement.
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Pad the repeat locus on both sides when extracting the mini-BAM.
    PAD = 1000
    start, end = start_end.split("-")
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    # Materialize once: dict views are not subscriptable in Python 3, and
    # this avoids calling items() twice.
    items = list(parser.items())
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    k, v = items[0]
    print("{} => {}".format(tred, v.replace(",", "/")), file=sys.stderr)
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = ["HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2", "FXTAS"]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.add_option("--ref", choices=db_choices, default="hg38", help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile,) = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # .ix was removed in pandas 1.0; .loc is the label-based replacement.
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Pad the repeat locus on both sides when extracting the mini-BAM.
    PAD = 1000
    start, end = start_end.split("-")
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    # Materialize once: dict views are not subscriptable in Python 3, and
    # this avoids calling items() twice.
    items = list(parser.items())
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    k, v = items[0]
    print("{} => {}".format(tred, v.replace(",", "/")), file=sys.stderr)
def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    p = OptionParser(treds.__doc__)
    p.add_option(
        "--csv", default=False, action="store_true", help="Also write `meta.csv`"
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (tredresults,) = args
    df = pd.read_csv(tredresults, sep="\t")

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    ids = list(tf["id"])
    tags = ["SampleKey"]
    final_columns = ["SampleKey"]
    afs = []
    for td, id in zip(tds, ids):
        tag1 = "{}.1".format(td)
        tag2 = "{}.2".format(td)
        # TREDs absent from the results table get an empty frequency dict
        # so `allele_frequency` stays aligned with the metadata rows.
        if tag2 not in df:
            afs.append("{}")
            continue
        tags.append(tag2)
        final_columns.append(id)
        a = np.array(list(df[tag1]) + list(df[tag2]))
        counts = alleles_to_counts(a)
        af = counts_to_af(counts)
        afs.append(af)

    tf["allele_frequency"] = afs
    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    tf.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))
    if opts.csv:
        metacsvfile = metafile.rsplit(".", 1)[0] + ".csv"
        tf.to_csv(metacsvfile, index=False)
        logging.debug("File `{}` written.".format(metacsvfile))

    pp = df[tags]
    pp.columns = final_columns
    # BUGFIX: the original assigned to a local named `datafile`, which made
    # `datafile` local for the whole function and caused the earlier
    # datafile("TREDs.meta.csv") call to raise UnboundLocalError (the
    # module-level datafile() helper was shadowed). Use a distinct name.
    datatsv = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
    pp.to_csv(datatsv, sep="\t", index=False)
    logging.debug("File `{}` written.".format(datatsv))

    mask([datatsv, metafile])
def read_treds(tredsfile=datafile("TREDs.meta.csv")):
    """
    Load the TREDs metadata table.

    A `.csv` file is keyed by the "id" column; any other extension is read
    as tab-separated and keyed by "abbreviation". Returns a tuple of
    (set of keys, full DataFrame).
    """
    is_csv = tredsfile.endswith(".csv")
    sep = "," if is_csv else "\t"
    key_column = "id" if is_csv else "abbreviation"
    df = pd.read_csv(tredsfile, sep=sep)
    treds = set(df[key_column])
    logging.debug("Loaded {} treds from `{}`".format(len(treds), tredsfile))
    return treds, df
def make_STR_bed(filename="STR.bed", pad=0, treds=None):
    """
    Write a BED file of TRED repeat loci, each padded by `pad` bp.

    If `treds` is given, only those abbreviations are kept. When any chrY
    locus is included, up to five unique chrY regions are appended (used
    downstream for copy-number normalization). Returns the output filename.
    """
    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile)
    tds = list(tf["abbreviation"])
    regions = list(tf["repeat_location"])
    fw = must_open(filename, "w")
    extract_Y = False
    for td, region in zip(tds, regions):
        if treds and (td not in treds):
            continue
        c, startend = region.split(":")
        extract_Y = extract_Y or (c == "chrY")
        start, end = startend.split("-")
        start, end = int(start), int(end)
        # Python 3: the `print >> fw` statement is a syntax error; use the
        # print() function with file=.
        print("\t".join(str(x) for x in (c, start - pad, end + pad, td)), file=fw)

    if not extract_Y:
        fw.close()
        return filename

    UNIQY = datafile("chrY.hg38.unique_ccn.gc")
    nregions = 0
    # Context manager so the region file is not leaked.
    with open(UNIQY) as fp:
        for i, row in enumerate(fp):
            # Some regions still have mapped reads, exclude a few
            if i in (1, 4, 6, 7, 10, 11, 13, 16, 18, 19):
                continue
            if nregions >= 5:
                break
            c, start, end, gc = row.split()
            start, end = int(start), int(end)
            print(
                "\t".join(
                    str(x)
                    for x in (
                        c,
                        start - pad,
                        end + pad,
                        "chrY.unique_ccn.{}".format(nregions),
                    )
                ),
                file=fw,
            )
            nregions += 1

    fw.close()
    return filename
def draw_cytoband(ax, chrom, filename=datafile("hg38.band.txt"),
                  ymid=.5, width=.99, height=.11):
    """
    Draw a chromosome ideogram for `chrom` on matplotlib axes `ax`.

    Bands are read from `filename` (UCSC-style cytoband table with columns
    #chrom, start, end, name, gieStain) and colored via get_color(). The two
    "acen" (centromere) bands are drawn as triangles meeting at the midline
    `ymid`; all other bands are rectangles. Band names are printed above the
    ideogram and the chromosome name below it.
    """
    import pandas as pd
    bands = pd.read_csv(filename, sep="\t")
    chrombands = bands[bands["#chrom"] == chrom]
    data = []
    # NOTE(review): unpacking the iterrows() row Series assumes the band
    # file has exactly five columns -- confirm against the data file.
    for i, (chr, start, end, name, gie) in chrombands.iterrows():
        data.append((chr, start, end, name, gie))
    # Chromosome length taken as the largest band end coordinate
    chromsize = max(x[2] for x in data)
    # Scale converting a bp coordinate to an axes-fraction position
    scale = width * 1. / chromsize
    xstart, ystart = (1 - width) / 2, ymid - height / 2
    bp_to_pos = lambda x: xstart + x * scale
    in_acen = False
    for chr, start, end, name, gie in data:
        color, alpha = get_color(gie)
        bplen = end - start
        if "acen" in gie:
            # First acen band points right, second left, so the two
            # triangles meet at the centromere midline
            if in_acen:
                xys = [(bp_to_pos(start), ymid),
                       (bp_to_pos(end), ystart),
                       (bp_to_pos(end), ystart + height)]
            else:
                xys = [(bp_to_pos(start), ystart),
                       (bp_to_pos(start), ystart + height),
                       (bp_to_pos(end), ymid)]
            p = Polygon(xys, closed=True, ec='k', fc=color, alpha=alpha)
            in_acen = True
        else:
            p = Rectangle((bp_to_pos(start), ystart), bplen * scale, height,
                          ec='k', fc=color, alpha=alpha)
        #print bp_to_pos(end)
        ax.add_patch(p)
        # Band label above the ideogram, centered on the band
        ax.text(bp_to_pos((start + end) / 2), ymid + height * .8, name,
                rotation=40, color="lightslategray")
    # Chromosome name below the ideogram
    ax.text(.5, ystart - height, chrom, size=16,
            ha="center", va="center")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_axis_off()
def __init__(self, filename=datafile("instance.json")):
    """Initialize from `filename` and parse it as JSON into self.spec."""
    super(InstanceSkeleton, self).__init__(filename)
    # Context manager closes the handle promptly; the original
    # json.load(open(filename)) leaked it until garbage collection.
    with open(filename) as fp:
        self.spec = json.load(fp)
from math import log, ceil
from collections import Counter, defaultdict
from multiprocessing import Pool

from jcvi.utils.cbook import percentage, uniqify
from jcvi.formats.base import timestamp
from jcvi.formats.bed import natsorted
from jcvi.apps.grid import MakeManager
from jcvi.formats.base import LineFile, must_open
from jcvi.utils.aws import push_to_s3, pull_from_s3, check_exists_s3, ls_s3
from jcvi.apps.base import OptionParser, ActionDispatcher, mkdir, need_update, \
    datafile, sh

# Reference genome build used throughout this module
REF = "hg38"
# Bundled TREDs metadata table (abbreviations, ids, repeat locations, ...)
REPO = datafile("TREDs.meta.csv")
# Sequencing read length in bp
READLEN = 150
# Minimum score threshold -- presumably a quality cutoff used by callers
# in this module; TODO confirm its exact semantics
MINSCORE = 36
# Ordered Y-STR marker names matching the YSEARCH haplotype column layout;
# multi-copy markers (e.g. DYS385a/b, DYS464a-g) appear once per copy
YSEARCH_HAPLOTYPE = """
DYS393 DYS390 DYS19/DYS394 DYS19b DYS391 DYS385a DYS385b DYS426 DYS388 DYS439
DYS389I DYS392 DYS389B DYS458 DYS459a/b DYS459a/b DYS455 DYS454 DYS447 DYS437
DYS448 DYS449 DYS464a/b/c/d DYS464a/b/c/d DYS464a/b/c/d DYS464a/b/c/d DYS464e
DYS464f DYS464g DYS460 GATA-H4 YCAIIa YCAIIb DYS456 DYS607 DYS576 DYS570 CDYa
CDYb DYS442 DYS438 DYS531 DYS578 DYS395S1a/b DYS395S1a/b DYS590 DYS537 DYS641
DYS472 DYS406S1 DYS511 DYS425 DYS413a DYS413b DYS557 DYS594 DYS436 DYS490
DYS534 DYS450 DYS444 DYS481 DYS520 DYS446 DYS617 DYS568 DYS487 DYS572 DYS640
DYS492 DYS565 DYS461 DYS462 GATA-A10 DYS635 GAAT1B07 DYS441 DYS445 DYS452
DYS463 DYS434 DYS435 DYS485 DYS494 DYS495 DYS505 DYS522 DYS533 DYS549 DYS556
DYS575 DYS589 DYS636 DYS638 DYS643 DYS714 DYS716 DYS717 DYS726 DXYS156-Y
""".split()
def get_hg38_chromsizes(filename=datafile("hg38.chrom.sizes")):
    """
    Return a dict mapping hg38 chromosome names to their sizes as ints,
    loaded from the bundled chrom.sizes table.
    """
    raw = DictFile(filename)
    return {name: int(size) for name, size in raw.items()}
def read_treds(tredsfile=datafile("TREDs.meta.csv")):
    """
    Load the TREDs metadata table from `tredsfile`.

    Returns a tuple of (set of values in the "id" column, full DataFrame).
    """
    meta = pd.read_csv(tredsfile)
    tred_ids = set(meta["id"])
    logging.debug("Loaded {} treds from `{}`".format(len(tred_ids), tredsfile))
    return tred_ids, meta