Example #1
def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    from jcvi.apps.base import datafile

    p = OptionParser(treds.__doc__)
    p.add_option("--csv",
                 default=False,
                 action="store_true",
                 help="Also write `meta.csv`")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (tredresults, ) = args
    df = pd.read_csv(tredresults, sep="\t")

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    ids = list(tf["id"])
    tags = ["SampleKey"]
    final_columns = ["SampleKey"]
    afs = []
    for td, id in zip(tds, ids):
        tag1 = "{}.1".format(td)
        tag2 = "{}.2".format(td)
        if tag2 not in df:
            afs.append("{}")
            continue
        tags.append(tag2)
        final_columns.append(id)
        a = np.array(list(df[tag1]) + list(df[tag2]))
        counts = alleles_to_counts(a)
        af = counts_to_af(counts)
        afs.append(af)

    tf["allele_frequency"] = afs

    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    tf.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))
    if opts.csv:
        metacsvfile = metafile.rsplit(".", 1)[0] + ".csv"
        tf.to_csv(metacsvfile, index=False)
        logging.debug("File `{}` written.".format(metacsvfile))

    pp = df[tags]
    pp.columns = final_columns
    datafile = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
    pp.to_csv(datafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(datafile))

    mask([datafile, metafile])
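
The two helpers used above come from jcvi and are not shown here. As a rough sketch of what the counts-to-allele_frequency step could look like (an assumption for illustration, not the jcvi implementation; missing calls are assumed to be encoded as -1):

from collections import Counter

import numpy as np


def alleles_to_counts(a):
    """Tally observed alleles, ignoring missing calls (assumed encoded as -1)."""
    a = np.asarray(a)
    return Counter(a[a >= 0].tolist())


def counts_to_af(counts):
    """Serialize allele counts as `{allele:count,...}`, sorted by allele."""
    return "{" + ",".join(
        "{}:{}".format(k, v) for k, v in sorted(counts.items())
    ) + "}"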
Example #2
def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    p = OptionParser(treds.__doc__)
    p.add_option("--csv", default=False, action="store_true",
                 help="Also write `meta.csv`")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    tredresults, = args
    df = pd.read_csv(tredresults, sep="\t")

    tredsfile = op.join(datadir, "TREDs.meta.hg38.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    ids = list(tf["id"])
    tags = ["SampleKey"]
    final_columns = ["SampleKey"]
    afs = []
    for td, id in zip(tds, ids):
        tag1 = "{}.1".format(td)
        tag2 = "{}.2".format(td)
        if tag2 not in df:
            afs.append("{}")
            continue
        tags.append(tag2)
        final_columns.append(id)
        a = np.array(list(df[tag1]) + list(df[tag2]))
        counts = alleles_to_counts(a)
        af = counts_to_af(counts)
        afs.append(af)

    tf["allele_frequency"] = afs

    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    tf.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))
    if opts.csv:
        metacsvfile = metafile.rsplit(".", 1)[0] + ".csv"
        tf.to_csv(metacsvfile, index=False)
        logging.debug("File `{}` written.".format(metacsvfile))

    pp = df[tags]
    pp.columns = final_columns
    datafile = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
    pp.to_csv(datafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(datafile))

    mask([datafile, metafile])
Example #3
def stop(args):
    """
    %prog stop

    Stop EC2 instance.
    """
    p = OptionParser(stop.__doc__)
    p.add_option("--profile", default="mvrad-datasci-role", help="Profile name")
    opts, args = p.parse_args(args)

    if len(args) != 0:
        sys.exit(not p.print_help())

    role(["htang"])
    session = boto3.Session(profile_name=opts.profile)
    client = session.client("ec2")
    s = InstanceSkeleton()

    # Make sure the instance id is NOT empty
    instance_id = s.instance_id
    if instance_id == "":
        logging.error("Cannot find instance_id {}".format(instance_id))
        sys.exit(1)

    block_device_mappings = []
    for volume in s.volumes:
        block_device_mappings.append({"DeviceName": volume["Device"], "NoDevice": ""})

    new_image_name = "htang-dev-{}-{}".format(timestamp(), int(time.time()))
    response = client.create_image(
        InstanceId=instance_id,
        Name=new_image_name,
        BlockDeviceMappings=block_device_mappings,
    )
    print(response, file=sys.stderr)
    new_image_id = response["ImageId"]

    image_status = ""
    while image_status != "available":
        logging.debug("Waiting for image to be ready")
        time.sleep(10)
        response = client.describe_images(ImageIds=[new_image_id])
        image_status = response["Images"][0]["State"]

    # Delete old image, snapshot and shut down instance
    old_image_id = s.image_id
    response = client.describe_images(ImageIds=[old_image_id])
    old_snapshot_id = response["Images"][0]["BlockDeviceMappings"][0]["Ebs"][
        "SnapshotId"
    ]
    response = client.deregister_image(ImageId=old_image_id)
    print(response, file=sys.stderr)
    response = client.delete_snapshot(SnapshotId=old_snapshot_id)
    print(response, file=sys.stderr)
    response = client.terminate_instances(InstanceIds=[instance_id])
    print(response, file=sys.stderr)

    # Save new image id
    s.save_image_id(new_image_id)
    s.save_instance_id("", "")
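
The manual sleep/describe_images polling loop above can also be expressed with boto3's built-in waiter for the EC2 client, which retries describe_images until the AMI reaches the "available" state. A minimal sketch (the profile name and AMI ID below are placeholders):

import boto3

session = boto3.Session(profile_name="mvrad-datasci-role")
client = session.client("ec2")
new_image_id = "ami-0123456789abcdef0"  # placeholder: ID returned by create_image
# Equivalent to the manual polling loop in stop() above
client.get_waiter("image_available").wait(ImageIds=[new_image_id])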
Example #4
def stop(args):
    """
    %prog stop

    Stop EC2 instance.
    """
    p = OptionParser(stop.__doc__)
    p.add_option("--profile", default="mvrad-datasci-role", help="Profile name")
    opts, args = p.parse_args(args)

    if len(args) != 0:
        sys.exit(not p.print_help())

    role(["205134639408", "htang", "114692162163", "mvrad-datasci-role"])
    session = boto3.Session(profile_name=opts.profile)
    client = session.client('ec2')
    s = InstanceSkeleton()

    # Create image
    instance_id = s.instance_id
    block_device_mappings = []
    for volume in s.volumes:
        block_device_mappings.append(
            {
                "DeviceName": volume["Device"],
                "NoDevice": ""
            }
        )

    new_image_name = "htang-dev-{}-{}".format(timestamp(), int(time.time()))
    response = client.create_image(
        InstanceId=instance_id,
        Name=new_image_name,
        BlockDeviceMappings=block_device_mappings
    )
    print(response, file=sys.stderr)
    new_image_id = response["ImageId"]

    image_status = ""
    while image_status != "available":
        logging.debug("Waiting for image to be ready")
        time.sleep(10)
        response = client.describe_images(ImageIds=[new_image_id])
        image_status = response["Images"][0]["State"]

    # Delete old image, snapshot and shut down instance
    old_image_id = s.image_id
    response = client.describe_images(ImageIds=[old_image_id])
    old_snapshot_id = response["Images"][0]["BlockDeviceMappings"][0]["Ebs"][
        "SnapshotId"
    ]
    response = client.deregister_image(ImageId=old_image_id)
    print(response, file=sys.stderr)
    response = client.delete_snapshot(SnapshotId=old_snapshot_id)
    print(response, file=sys.stderr)
    response = client.terminate_instances(InstanceIds=[instance_id])
    print(response, file=sys.stderr)

    # Save new image id
    s.save_image_id(new_image_id)
    s.save_instance_id("")
Example #5
def treds(args):
    """
    %prog treds hli.tred.tsv

    Compile allele_frequency for TREDs results. Write data.tsv, meta.tsv and
    mask.tsv in one go.
    """
    p = OptionParser(treds.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    tredresults, = args
    df = pd.read_csv(tredresults, sep="\t")

    tredsfile = op.join(datadir, "TREDs.meta.hg38.csv")
    tf = pd.read_csv(tredsfile)

    tds = list(tf["abbreviation"])
    ids = list(tf["id"])
    tags = ["SampleKey"]
    final_columns = ["SampleKey"]
    afs = []
    for td, id in zip(tds, ids):
        tag = "{}.2".format(td)
        tags.append(tag)
        a = df["{}.2".format(td)]
        final_columns.append(id)
        counts = alleles_to_counts(a)
        af = counts_to_af(counts)
        afs.append(af)

    tf["allele_frequency"] = afs

    metafile = "TREDs_{}_SEARCH.meta.tsv".format(timestamp())
    tf.to_csv(metafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(metafile))

    pp = df[tags]
    pp.columns = final_columns
    datafile = "TREDs_{}_SEARCH.data.tsv".format(timestamp())
    pp.to_csv(datafile, sep="\t", index=False)
    logging.debug("File `{}` written.".format(datafile))

    mask([datafile, metafile])
Example #6
def data(args):
    """
    %prog data data.bin samples.ids STR.ids meta.tsv

    Make data.tsv based on meta.tsv.
    """
    p = OptionParser(data.__doc__)
    p.add_option(
        "--notsv", default=False, action="store_true", help="Do not write data.tsv"
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    databin, sampleids, strids, metafile = args
    final_columns, percentiles = read_meta(metafile)
    df, m, samples, loci = read_binfile(databin, sampleids, strids)

    # Clean the data
    m %= 1000  # Get the larger of the two alleles
    m[m == 999] = -1  # Missing data

    final = set(final_columns)
    remove = [locus for locus in loci if locus not in final]

    pf = "STRs_{}_SEARCH".format(timestamp())
    filteredstrids = "{}.STR.ids".format(pf)
    fw = open(filteredstrids, "w")
    print("\n".join(final_columns), file=fw)
    fw.close()
    logging.debug(
        "Dropped {} columns; Retained {} columns (`{}`)".format(
            len(remove), len(final_columns), filteredstrids
        )
    )

    # Remove low-quality columns!
    df.drop(remove, inplace=True, axis=1)
    df.columns = final_columns

    filtered_bin = "{}.data.bin".format(pf)
    if need_update(databin, filtered_bin):
        m = df.to_numpy()  # as_matrix() was removed in pandas 1.0
        m.tofile(filtered_bin)
        logging.debug("Filtered binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    filtered_tsv = "{}.data.tsv".format(pf)
    if not opts.notsv and need_update(databin, filtered_tsv):
        df.to_csv(filtered_tsv, sep="\t", index_label="SampleKey")
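
read_binfile() is not shown here. A plausible sketch, assuming data.bin stores a samples x loci integer matrix in row-major order and the two .ids files hold one identifier per line (the dtype and layout are assumptions, not the jcvi definition):

import numpy as np
import pandas as pd


def read_binfile(databin, sampleids, strids, dtype=np.int32):
    samples = open(sampleids).read().splitlines()
    loci = open(strids).read().splitlines()
    # One row per sample, one column per locus
    m = np.fromfile(databin, dtype=dtype).reshape(len(samples), len(loci))
    df = pd.DataFrame(m, index=samples, columns=loci)
    return df, m, samples, loci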
Example #7
def data(args):
    """
    %prog data data.bin samples.ids STR.ids meta.tsv

    Make data.tsv based on meta.tsv.
    """
    p = OptionParser(data.__doc__)
    p.add_option("--notsv", default=False, action="store_true",
                 help="Do not write data.tsv")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    databin, sampleids, strids, metafile = args
    final_columns, percentiles = read_meta(metafile)
    df, m, samples, loci = read_binfile(databin, sampleids, strids)

    # Clean the data
    m %= 1000  # Get the larger of the two alleles
    m[m == 999] = -1  # Missing data

    final = set(final_columns)
    remove = [locus for locus in loci if locus not in final]

    pf = "STRs_{}_SEARCH".format(timestamp())
    filteredstrids = "{}.STR.ids".format(pf)
    fw = open(filteredstrids, "w")
    print("\n".join(final_columns), file=fw)
    fw.close()
    logging.debug("Dropped {} columns; Retained {} columns (`{}`)".\
                    format(len(remove), len(final_columns), filteredstrids))

    # Remove low-quality columns!
    df.drop(remove, inplace=True, axis=1)
    df.columns = final_columns

    filtered_bin = "{}.data.bin".format(pf)
    if need_update(databin, filtered_bin):
        m = df.to_numpy()  # as_matrix() was removed in pandas 1.0
        m.tofile(filtered_bin)
        logging.debug("Filtered binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    filtered_tsv = "{}.data.tsv".format(pf)
    if not opts.notsv and need_update(databin, filtered_tsv):
        df.to_csv(filtered_tsv, sep="\t", index_label="SampleKey")
Example #8
def get_vcfstanza(fastafile, fasta, sampleid="SAMP_001"):
    from jcvi.formats.base import timestamp
    # VCF spec
    m = "##fileformat=VCFv4.1\n"
    m += "##fileDate={0}\n".format(timestamp())
    m += "##source={0}\n".format(__file__)
    m += "##reference=file://{0}\n".format(op.abspath(fastafile).strip("/"))
    m += '##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional genotype">\n'
    m += '##INFO=<ID=IM,Number=0,Type=Flag,Description="Imputed genotype">\n'
    m += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
    m += '##FORMAT=<ID=GP,Number=3,Type=Float,Description="Estimated Genotype Probability">\n'
    header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n".split() + [sampleid]
    m += "#" + "\t".join(header)
    return m
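
A hypothetical usage of this stanza builder (the `fasta` argument is unused in the snippet above, so None is passed here):

print(get_vcfstanza("ref.fa", None, sampleid="SAMP_001"))
# ##fileformat=VCFv4.1
# ##fileDate=...
# ...
# #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  SAMP_001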
Example #9
def get_vcfstanza(fastafile, fasta, sampleid="SAMP_001"):
    from jcvi.formats.base import timestamp
    # VCF spec
    m = "##fileformat=VCFv4.1\n"
    m += "##fileDate={0}\n".format(timestamp())
    m += "##source={0}\n".format(__file__)
    m += "##reference=file://{0}\n".format(op.abspath(fastafile).strip("/"))
    m += '##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional genotype">\n'
    m += '##INFO=<ID=IM,Number=0,Type=Flag,Description="Imputed genotype">\n'
    m += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
    m += '##FORMAT=<ID=GP,Number=3,Type=Float,Description="Estimated Genotype Probability">\n'
    header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n".split() + [
        sampleid
    ]
    m += "#" + "\t".join(header)
    return m
Example #10
def mask(args):
    """
    %prog mask data.bin samples.ids STR.ids meta.tsv

    OR

    %prog mask data.tsv meta.tsv

    Compute P-values based on meta and data. The `data.bin` should be the matrix
    containing filtered loci and the output mask.tsv will have the same
    dimension.
    """
    p = OptionParser(mask.__doc__)
    opts, args = p.parse_args(args)

    if len(args) not in (2, 4):
        sys.exit(not p.print_help())

    if len(args) == 4:
        databin, sampleids, strids, metafile = args
        df, m, samples, loci = read_binfile(databin, sampleids, strids)
        mode = "STRs"
    elif len(args) == 2:
        databin, metafile = args
        df = pd.read_csv(databin, sep="\t", index_col=0)
        m = df.to_numpy()  # as_matrix() was removed in pandas 1.0
        samples = df.index
        loci = list(df.columns)
        mode = "TREDs"

    pf = "{}_{}_SEARCH".format(mode, timestamp())
    final_columns, percentiles = read_meta(metafile)

    maskfile = pf + ".mask.tsv"
    run_args = []
    for i, locus in enumerate(loci):
        a = m[:, i]
        percentile = percentiles[locus]
        run_args.append((i, a, percentile))

    if mode == "TREDs" or need_update(databin, maskfile):
        cpus = min(8, len(run_args))
        write_mask(cpus, samples, final_columns, run_args, filename=maskfile)
        logging.debug("File `{}` written.".format(maskfile))
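
write_mask() itself is not shown. The per-locus P-value idea it parallelizes can be sketched as an empirical right-tail test, under the assumption that missing calls are encoded as -1 (the real jcvi code works from the percentiles stored in meta.tsv):

import numpy as np


def empirical_pvalue(a, x):
    """Fraction of observed alleles in locus column `a` that are >= x."""
    a = np.asarray(a)
    a = a[a >= 0]  # drop missing calls
    return float((a >= x).sum()) / len(a) if len(a) else 1.0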
Example #11
def mask(args):
    """
    %prog mask data.bin samples.ids STR.ids meta.tsv

    OR

    %prog mask data.tsv meta.tsv

    Compute P-values based on meta and data. The `data.bin` should be the matrix
    containing filtered loci and the output mask.tsv will have the same
    dimension.
    """
    p = OptionParser(mask.__doc__)
    opts, args = p.parse_args(args)

    if len(args) not in (2, 4):
        sys.exit(not p.print_help())

    if len(args) == 4:
        databin, sampleids, strids, metafile = args
        df, m, samples, loci = read_binfile(databin, sampleids, strids)
        mode = "STRs"
    elif len(args) == 2:
        databin, metafile = args
        df = pd.read_csv(databin, sep="\t", index_col=0)
        m = df.to_numpy()  # as_matrix() was removed in pandas 1.0
        samples = df.index
        loci = list(df.columns)
        mode = "TREDs"

    pf = "{}_{}_SEARCH".format(mode, timestamp())
    final_columns, percentiles = read_meta(metafile)

    maskfile = pf + ".mask.tsv"
    run_args = []
    for i, locus in enumerate(loci):
        a = m[:, i]
        percentile = percentiles[locus]
        run_args.append((i, a, percentile))

    if mode == "TREDs" or need_update(databin, maskfile):
        cpus = min(8, len(run_args))
        write_mask(cpus, samples, final_columns, run_args, filename=maskfile)
        logging.debug("File `{}` written.".format(maskfile))
Example #12
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff",
                 default=.5,
                 type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    TREDS, df = read_treds()

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))
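
counts_filter() encodes the variability and missingness rules from the docstring (the uniqueness rule is positional and handled elsewhere). A sketch of the decision logic, assuming `counts` maps allele -> count and a seqid like "chrY" marks the Y chromosome; the return labels here are illustrative, not the jcvi values:

def counts_filter(counts, nalleles, seqid, cutoff=0.5):
    # Rule 2: site must be variable (more than one distinct allele)
    if len(counts) <= 1:
        return "INVARIANT"
    # Rule 3: enough observed data; chrY only needs half the cutoff
    required = cutoff / 2 if seqid == "chrY" else cutoff
    if sum(counts.values()) < required * nalleles:
        return "MISSING"
    return "PASS"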
Example #13
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff", default=.5, type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    tredsfile = op.join(datadir, "TREDs.meta.hg38.csv")
    TREDS = read_treds(tredsfile)

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))