Пример #1
0
def draw_cytoband(ax,
                  chrom,
                  filename=datafile("hg38.band.txt"),
                  ymid=0.5,
                  width=0.99,
                  height=0.11):
    """Draw the band track of one chromosome onto a matplotlib axes.

    The tab-separated table in `filename` is filtered on its `#chrom` column;
    every remaining row must unpack into five fields (chromosome, start, end,
    band name, stain label).  Each band becomes a patch on `ax`, with the band
    name written above it and `chrom` written once below the track.

    Args:
        ax: axes object receiving the patches and text; its limits are reset
            to the unit square and its frame is switched off.
        chrom: value matched against the `#chrom` column.
        filename: path of the band table.
        ymid: vertical centre of the track, in axes units.
        width: horizontal extent of the track, in axes units.
        height: thickness of the track, in axes units.
    """
    import pandas as pd

    table = pd.read_csv(filename, sep="\t")
    selected = table[table["#chrom"] == chrom]
    records = [
        (seqid, start, end, name, gie)
        for _, (seqid, start, end, name, gie) in selected.iterrows()
    ]

    # The largest end coordinate defines the full length of the track.
    chromsize = max(record[2] for record in records)
    scale = width * 1.0 / chromsize
    xstart = (1 - width) / 2
    ystart = ymid - height / 2

    def bp_to_pos(bp):
        # Map a base-pair coordinate onto the horizontal axis.
        return xstart + bp * scale

    seen_acen = False
    for _, start, end, name, gie in records:
        color, alpha = get_color(gie)
        if "acen" not in gie:
            patch = Rectangle(
                (bp_to_pos(start), ystart),
                (end - start) * scale,
                height,
                ec="k",
                fc=color,
                alpha=alpha,
            )
        else:
            if seen_acen:
                # Later "acen" bands: triangle whose tip points left.
                corners = [
                    (bp_to_pos(start), ymid),
                    (bp_to_pos(end), ystart),
                    (bp_to_pos(end), ystart + height),
                ]
            else:
                # First "acen" band: triangle whose tip points right.
                corners = [
                    (bp_to_pos(start), ystart),
                    (bp_to_pos(start), ystart + height),
                    (bp_to_pos(end), ymid),
                ]
            patch = Polygon(corners, closed=True, ec="k", fc=color, alpha=alpha)
            seen_acen = True

        ax.add_patch(patch)
        ax.text(
            bp_to_pos((start + end) / 2),
            ymid + height * 0.8,
            name,
            rotation=40,
            color="lightslategray",
        )

    ax.text(0.5, ystart - height, chrom, size=16, ha="center", va="center")

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_axis_off()
Пример #2
0
def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.base import datafile

    p = OptionParser(treds.__doc__)
    p.add_option(
        "--csv", default=False, action="store_true", help="Also write `meta.csv`"
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    tredresults = args[0]
    results = pd.read_csv(tredresults, sep="\t")
    meta = pd.read_csv(datafile("TREDs.meta.csv"))

    abbreviations = list(meta["abbreviation"])
    identifiers = list(meta["id"])
    source_columns = ["SampleKey"]
    output_columns = ["SampleKey"]
    frequencies = []
    for abbr, tred_id in zip(abbreviations, identifiers):
        first = "{}.1".format(abbr)
        second = "{}.2".format(abbr)
        if second in results:
            source_columns.append(second)
            output_columns.append(tred_id)
            # Pool both allele columns before counting.
            pooled = np.array(list(results[first]) + list(results[second]))
            frequencies.append(counts_to_af(alleles_to_counts(pooled)))
        else:
            # Locus absent from the results: store an empty placeholder.
            frequencies.append("{}")

    meta["allele_frequency"] = frequencies

    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    meta.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))
    if opts.csv:
        stem = metafile.rsplit(".", 1)[0]
        metacsvfile = stem + ".csv"
        meta.to_csv(metacsvfile, index=False)
        logging.debug("File `{}` written.".format(metacsvfile))

    subset = results[source_columns]
    subset.columns = output_columns
    data_tsv = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
    subset.to_csv(data_tsv, sep="\t", index=False)
    logging.debug("File `{}` written.".format(data_tsv))

    mask([data_tsv, metafile])
Пример #3
0
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.sam import get_minibam

    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = [
        "HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2", "FXTAS"
    ]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.add_option("--ref",
                 choices=db_choices,
                 default="hg38",
                 help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    # `--tred` has no default; without it the table lookup below would fail
    # on a `None` label, so treat it like a usage error.
    if len(args) != 1 or opts.tred is None:
        sys.exit(not p.print_help())

    (bamfile, ) = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # Label-based row lookup; `DataFrame.ix` no longer exists in pandas >= 1.0.
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        # hg19 uses a suffixed column name and a suffixed database name.
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Widen the repeat interval by PAD on each side before extracting reads.
    PAD = 1000
    start, end = start_end.split("-")
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    # Materialise so indexing works whether `items()` yields a list or a view.
    items = list(parser.items())
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    _, v = items[0]
    print("{} => {}".format(tred, v.replace(",", "/")), file=sys.stderr)
Пример #4
0
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.sam import get_minibam
    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = ["HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2",
               "FXTAS"]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE,
                 help="TRED name")
    p.add_option("--ref", choices=db_choices, default="hg38",
                 help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    # `--tred` has no default; without it the table lookup below would fail
    # on a `None` label, so treat it like a usage error.
    if len(args) != 1 or opts.tred is None:
        sys.exit(not p.print_help())

    bamfile, = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # Label-based row lookup; `DataFrame.ix` no longer exists in pandas >= 1.0.
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        # hg19 uses a suffixed column name and a suffixed database name.
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Widen the repeat interval by PAD on each side before extracting reads.
    PAD = 1000
    start, end = start_end.split('-')
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    # Materialise so indexing works whether `items()` yields a list or a view.
    items = list(parser.items())
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    _, v = items[0]
    print("{} => {}".format(tred, v.replace(',', '/')), file=sys.stderr)
Пример #5
0
def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(treds.__doc__)
    p.add_option("--csv", default=False, action="store_true",
                 help="Also write `meta.csv`")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    tredresults, = args
    df = pd.read_csv(tredresults, sep="\t")

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    ids = list(tf["id"])
    tags = ["SampleKey"]
    final_columns = ["SampleKey"]
    afs = []
    for td, tred_id in zip(tds, ids):
        tag1 = "{}.1".format(td)
        tag2 = "{}.2".format(td)
        if tag2 not in df:
            # Locus absent from the results: store an empty placeholder.
            afs.append("{}")
            continue
        tags.append(tag2)
        final_columns.append(tred_id)
        # Pool both allele columns before counting.
        a = np.array(list(df[tag1]) + list(df[tag2]))
        counts = alleles_to_counts(a)
        af = counts_to_af(counts)
        afs.append(af)

    tf["allele_frequency"] = afs

    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    tf.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))
    if opts.csv:
        metacsvfile = metafile.rsplit(".", 1)[0] + ".csv"
        tf.to_csv(metacsvfile, index=False)
        logging.debug("File `{}` written.".format(metacsvfile))

    pp = df[tags]
    pp.columns = final_columns
    # Deliberately NOT named `datafile`: assigning that name anywhere in this
    # function would make it a local variable for the whole body, so the
    # `datafile("TREDs.meta.csv")` call above would raise UnboundLocalError.
    datatsv = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
    pp.to_csv(datatsv, sep="\t", index=False)
    logging.debug("File `{}` written.".format(datatsv))

    mask([datatsv, metafile])
Пример #6
0
def read_treds(tredsfile=datafile("TREDs.meta.csv")):
    """Load a TREDs table and collect the set of TRED names it lists.

    A path ending in `.csv` is read as comma-separated and names come from the
    `id` column; any other path is read as tab-separated and names come from
    the `abbreviation` column.

    Returns:
        A `(names, table)` pair: the set of names and the loaded DataFrame.
    """
    is_csv = tredsfile.endswith(".csv")
    if is_csv:
        table = pd.read_csv(tredsfile)
    else:
        table = pd.read_csv(tredsfile, sep="\t")
    name_column = "id" if is_csv else "abbreviation"
    names = set(table[name_column])

    logging.debug("Loaded {} treds from `{}`".format(len(names), tredsfile))
    return names, table
Пример #7
0
def read_treds(tredsfile=datafile("TREDs.meta.csv")):
    """Read the TREDs table at `tredsfile` and return `(names, DataFrame)`.

    For a `.csv` path the file is parsed with the default separator and the
    `id` column supplies the names.  For every other path the file is parsed
    as tab-separated and the `abbreviation` column supplies the names.
    """
    if not tredsfile.endswith(".csv"):
        frame = pd.read_csv(tredsfile, sep="\t")
        column = "abbreviation"
    else:
        frame = pd.read_csv(tredsfile)
        column = "id"
    found = set(frame[column])

    message = "Loaded {} treds from `{}`".format(len(found), tredsfile)
    logging.debug(message)
    return found, frame
Пример #8
0
def make_STR_bed(filename="STR.bed", pad=0, treds=None):
    """Write a BED-style file with one line per TRED repeat location.

    Each line holds four tab-separated fields: chromosome, start - pad,
    end + pad, and the TRED abbreviation, parsed from the `repeat_location`
    column (`chrom:start-end`) of `TREDs.meta.csv`.  If at least one written
    TRED lies on `chrY`, up to five extra regions taken from
    `chrY.hg38.unique_ccn.gc` are appended, named `chrY.unique_ccn.<n>`.

    Args:
        filename: output path, opened through `must_open`.
        pad: number of bases subtracted from each start and added to each end.
        treds: optional collection of abbreviations to keep; when empty or
            None every TRED in the table is written.

    Returns:
        `filename`, unchanged.
    """
    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    regions = list(tf["repeat_location"])
    fw = must_open(filename, "w")
    # try/finally so the output handle is closed on every exit path,
    # including the early return when no chrY region is needed.
    try:
        extract_Y = False
        for td, region in zip(tds, regions):
            if treds and (td not in treds):
                continue
            c, startend = region.split(":")
            extract_Y = extract_Y or (c == "chrY")
            start, end = startend.split("-")
            start, end = int(start), int(end)
            # `write` rather than the Python 2-only `print >> fw` statement.
            fw.write(
                "\t".join(str(x) for x in (c, start - pad, end + pad, td))
                + "\n")

        if not extract_Y:
            return filename

        UNIQY = datafile("chrY.hg38.unique_ccn.gc")
        nregions = 0
        with open(UNIQY) as fp:
            for i, row in enumerate(fp):
                # Some regions still have mapped reads, exclude a few
                if i in (1, 4, 6, 7, 10, 11, 13, 16, 18, 19):
                    continue
                if nregions >= 5:
                    break
                c, start, end, gc = row.split()
                start, end = int(start), int(end)
                fw.write(
                    "\t".join(
                        str(x) for x in (c, start - pad, end + pad,
                                         "chrY.unique_ccn.{}".format(nregions)))
                    + "\n")
                nregions += 1
    finally:
        fw.close()

    return filename
Пример #9
0
def draw_cytoband(ax, chrom, filename=datafile("hg38.band.txt"),
                  ymid=.5, width=.99, height=.11):
    """Render the bands of chromosome `chrom` as patches on `ax`.

    Rows of the tab-separated `filename` whose `#chrom` column equals `chrom`
    are drawn left to right, scaled so that the largest end coordinate spans
    `width` axes units.  Bands whose stain label contains "acen" are drawn as
    triangles; all others as rectangles of thickness `height` centred on
    `ymid`.  Band names are written above the track and `chrom` below it.
    The axes limits are set to the unit square and the frame is hidden.
    """
    import pandas as pd

    all_bands = pd.read_csv(filename, sep="\t")
    matching = all_bands[all_bands["#chrom"] == chrom]
    # Every row is expected to unpack into exactly five fields.
    bands = [
        (seqid, start, end, name, gie)
        for _, (seqid, start, end, name, gie) in matching.iterrows()
    ]

    chromsize = max(band[2] for band in bands)
    scale = width * 1. / chromsize
    xstart = (1 - width) / 2
    ystart = ymid - height / 2

    def bp_to_pos(bp):
        # Convert a base-pair coordinate into an x position on the axes.
        return xstart + bp * scale

    past_first_acen = False
    for _, start, end, name, gie in bands:
        color, alpha = get_color(gie)
        if "acen" in gie:
            if past_first_acen:
                # Tip on the left, base on the right.
                outline = [
                    (bp_to_pos(start), ymid),
                    (bp_to_pos(end), ystart),
                    (bp_to_pos(end), ystart + height),
                ]
            else:
                # Base on the left, tip on the right.
                outline = [
                    (bp_to_pos(start), ystart),
                    (bp_to_pos(start), ystart + height),
                    (bp_to_pos(end), ymid),
                ]
            shape = Polygon(outline, closed=True, ec='k', fc=color,
                            alpha=alpha)
            past_first_acen = True
        else:
            span = end - start
            shape = Rectangle((bp_to_pos(start), ystart), span * scale,
                              height, ec='k', fc=color, alpha=alpha)

        ax.add_patch(shape)
        label_x = bp_to_pos((start + end) / 2)
        ax.text(label_x, ymid + height * .8, name,
                rotation=40, color="lightslategray")

    ax.text(.5, ystart - height, chrom, size=16, ha="center", va="center")

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_axis_off()
Пример #10
0
 def __init__(self, filename=datafile("instance.json")):
     """Initialise the base class with `filename` and load its JSON content.

     The parsed JSON document is stored on `self.spec`.
     """
     super(InstanceSkeleton, self).__init__(filename)
     # Context manager so the file handle is closed right after parsing.
     with open(filename) as fp:
         self.spec = json.load(fp)
Пример #11
0
from math import log, ceil
from collections import Counter, defaultdict
from multiprocessing import Pool

from jcvi.utils.cbook import percentage, uniqify
from jcvi.formats.base import timestamp
from jcvi.formats.bed import natsorted
from jcvi.apps.grid import MakeManager
from jcvi.formats.base import LineFile, must_open
from jcvi.utils.aws import push_to_s3, pull_from_s3, check_exists_s3, ls_s3
from jcvi.apps.base import OptionParser, ActionDispatcher, mkdir, need_update, \
            datafile, sh

# Reference genome label.
# NOTE(review): presumably the default assembly for this module - confirm.
REF = "hg38"
# Location of `TREDs.meta.csv` as resolved by `datafile`.
REPO = datafile("TREDs.meta.csv")

# NOTE(review): presumably the sequencing read length in bases - confirm.
READLEN = 150
# NOTE(review): presumably a minimum score cutoff; its scale is not visible
# in this part of the file - confirm against the code that uses it.
MINSCORE = 36
# Marker names obtained by whitespace-splitting the 10 x 10 table below, read
# row by row, which yields a flat list of 100 strings.  Names that appear more
# than once in the table (e.g. `DYS459a/b`, `DYS464a/b/c/d`) are kept as
# separate list entries.
YSEARCH_HAPLOTYPE = """
DYS393  DYS390 DYS19/DYS394  DYS19b        DYS391        DYS385a       DYS385b DYS426  DYS388  DYS439
DYS389I DYS392 DYS389B       DYS458        DYS459a/b     DYS459a/b     DYS455  DYS454  DYS447  DYS437
DYS448  DYS449 DYS464a/b/c/d DYS464a/b/c/d DYS464a/b/c/d DYS464a/b/c/d DYS464e DYS464f DYS464g DYS460
GATA-H4 YCAIIa YCAIIb        DYS456        DYS607        DYS576        DYS570  CDYa    CDYb    DYS442
DYS438  DYS531 DYS578        DYS395S1a/b   DYS395S1a/b   DYS590        DYS537  DYS641  DYS472  DYS406S1
DYS511  DYS425 DYS413a       DYS413b       DYS557        DYS594        DYS436  DYS490  DYS534  DYS450
DYS444  DYS481 DYS520        DYS446        DYS617        DYS568        DYS487  DYS572  DYS640  DYS492
DYS565  DYS461 DYS462        GATA-A10      DYS635        GAAT1B07      DYS441  DYS445  DYS452  DYS463
DYS434  DYS435 DYS485        DYS494        DYS495        DYS505        DYS522  DYS533  DYS549  DYS556
DYS575  DYS589 DYS636        DYS638        DYS643        DYS714        DYS716  DYS717  DYS726  DXYS156-Y
""".split()
Пример #12
0
from math import log, ceil
from collections import Counter, defaultdict
from multiprocessing import Pool

from jcvi.utils.cbook import percentage, uniqify
from jcvi.formats.base import timestamp
from jcvi.formats.bed import natsorted
from jcvi.apps.grid import MakeManager
from jcvi.formats.base import LineFile, must_open
from jcvi.utils.aws import push_to_s3, pull_from_s3, check_exists_s3, ls_s3
from jcvi.apps.base import OptionParser, ActionDispatcher, mkdir, need_update, \
            datafile, sh


# Reference genome label.
# NOTE(review): presumably the default assembly for this module - confirm.
REF = "hg38"
# Location of `TREDs.meta.csv` as resolved by `datafile`.
REPO = datafile("TREDs.meta.csv")

# NOTE(review): presumably the sequencing read length in bases - confirm.
READLEN = 150
# NOTE(review): presumably a minimum score cutoff; its scale is not visible
# in this part of the file - confirm against the code that uses it.
MINSCORE = 36
# Marker names obtained by whitespace-splitting the 10 x 10 table below, read
# row by row, which yields a flat list of 100 strings.  Names that appear more
# than once in the table (e.g. `DYS459a/b`, `DYS464a/b/c/d`) are kept as
# separate list entries.
YSEARCH_HAPLOTYPE = """
DYS393  DYS390 DYS19/DYS394  DYS19b        DYS391        DYS385a       DYS385b DYS426  DYS388  DYS439
DYS389I DYS392 DYS389B       DYS458        DYS459a/b     DYS459a/b     DYS455  DYS454  DYS447  DYS437
DYS448  DYS449 DYS464a/b/c/d DYS464a/b/c/d DYS464a/b/c/d DYS464a/b/c/d DYS464e DYS464f DYS464g DYS460
GATA-H4 YCAIIa YCAIIb        DYS456        DYS607        DYS576        DYS570  CDYa    CDYb    DYS442
DYS438  DYS531 DYS578        DYS395S1a/b   DYS395S1a/b   DYS590        DYS537  DYS641  DYS472  DYS406S1
DYS511  DYS425 DYS413a       DYS413b       DYS557        DYS594        DYS436  DYS490  DYS534  DYS450
DYS444  DYS481 DYS520        DYS446        DYS617        DYS568        DYS487  DYS572  DYS640  DYS492
DYS565  DYS461 DYS462        GATA-A10      DYS635        GAAT1B07      DYS441  DYS445  DYS452  DYS463
DYS434  DYS435 DYS485        DYS494        DYS495        DYS505        DYS522  DYS533  DYS549  DYS556
DYS575  DYS589 DYS636        DYS638        DYS643        DYS714        DYS716  DYS717  DYS726  DXYS156-Y
""".split()
Пример #13
0
 def __init__(self, filename=datafile("instance.json")):
     """Initialise the base class with `filename` and load its JSON content.

     The parsed JSON document is stored on `self.spec`.
     """
     super(InstanceSkeleton, self).__init__(filename)
     # Context manager so the file handle is closed right after parsing.
     with open(filename) as fp:
         self.spec = json.load(fp)
Пример #14
0
def get_hg38_chromsizes(filename=datafile("hg38.chrom.sizes")):
    """Return a plain dict built from `DictFile(filename)` with int values.

    Keys are taken as-is from the `DictFile`; every value is passed through
    `int`.
    """
    raw_sizes = DictFile(filename)
    return {name: int(size) for name, size in raw_sizes.items()}
Пример #15
0
def read_treds(tredsfile=datafile("TREDs.meta.csv")):
    """Load the TREDs CSV table and return `(ids, DataFrame)`.

    `ids` is the set of values found in the table's `id` column.
    """
    table = pd.read_csv(tredsfile)
    ids = set(table["id"])
    message = "Loaded {} treds from `{}`".format(len(ids), tredsfile)
    logging.debug(message)
    return ids, table