# Build the `main_transcript` struct from the row's sorted transcript
# consequences, then store it as a row field.
main_transcript_expr = get_expr_for_worst_transcript_consequence_annotations_struct(
    vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
)
mt = mt.annotate_rows(main_transcript=main_transcript_expr)

# Store the gene IDs computed from the sorted transcript consequences as the
# `gene_ids` row field.
gene_ids_expr = get_expr_for_vep_gene_ids_set(
    vep_transcript_consequences_root=mt.sortedTranscriptConsequences,
)
mt = mt.annotate_rows(gene_ids=gene_ids_expr)

# Collapse CLNREVSTAT to its distinct values, order them, and join them into a
# single string using hl.delimit's default delimiter.
distinct_review_statuses = hl.array(hl.set(mt.info.CLNREVSTAT))
ordered_review_statuses = hl.sorted(
    distinct_review_statuses,
    # Sort on a copy of each value in which the pattern "^_" is replaced by
    # "z"; the stored values themselves are not modified.
    key=lambda status: status.replace("^_", "z"),
)
review_status_str = hl.delimit(ordered_review_statuses)

mt = mt.select_rows(
    allele_id=mt.info.ALLELEID,
    alt=get_expr_for_alt_allele(mt),
    chrom=get_expr_for_contig(mt.locus),
    clinical_significance=hl.delimit(
        hl.sorted(hl.array(hl.set(mt.info.CLNSIG)),
                  key=lambda s: s.replace("^_", "z"))),
    domains=get_expr_for_vep_protein_domains_set(
        vep_transcript_consequences_root=mt.vep.transcript_consequences),
    gene_ids=mt.gene_ids,
    gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
        vep_sorted_transcript_consequences_root=mt.
        sortedTranscriptConsequences,
        gene_ids=mt.gene_ids),
    gold_stars=CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
    **{
        f"main_transcript_{field}": mt.main_transcript[field]
        for field in mt.main_transcript.dtype.fields
Пример #2
0
        )
        for histogram in [
            "ab_hist_alt",
            "dp_hist_all",
            "dp_hist_alt",
            "gq_hist_all",
            "gq_hist_alt",
            "gnomad_age_hist_het",
            "gnomad_age_hist_hom",
        ]
    }
)

# Derived top level fields.
# Every expression is produced by a helper that is handed the dataset itself;
# the mapping preserves the order in which the fields are added.
derived_top_level_fields = {
    "alt": get_expr_for_alt_allele(ds),
    "chrom": get_expr_for_contig(ds),
    "pos": get_expr_for_start_pos(ds),
    "ref": get_expr_for_ref_allele(ds),
    "variant_id": get_expr_for_variant_id(ds),
    "xpos": get_expr_for_xpos(ds),
}
ds = ds.annotate(**derived_top_level_fields)

###########
# Subsets #
###########

all_subsets = ["gnomad", "controls", "non_cancer", "non_neuro", "non_topmed"]

# Keep only the subsets that actually have an "<subset>_AC_adj" row field in
# this dataset. There is no separate non-cancer subset for genome data: all
# genome samples are non-cancer.
row_field_names = ds.row_value.dtype.fields
subsets = [
    subset_name
    for subset_name in all_subsets
    if f"{subset_name}_AC_adj" in row_field_names
]
Пример #3
0
# Command-line interface: three required URL options, registered in the same
# order as before so the generated usage text is unchanged.
p = argparse.ArgumentParser()
for required_option in (
    "--variant-results-url",
    "--variant-annotations-url",
    "--output-url",
):
    p.add_argument(required_option, required=True)
args = p.parse_args()

# Initialize Hail, directing its log output to /tmp/hail.log.
hl.init(log="/tmp/hail.log")

def _join_keyed_entries(entries_by_key):
    """Return the entries of a keyed expression as comma-joined "key:value" text."""
    labelled = entries_by_key.keys().map(
        lambda key: key + ":" + entries_by_key[key]
    )
    return hl.delimit(labelled, ",")


# Load the variant annotations table given on the command line.
variants = hl.read_table(args.variant_annotations_url)

# Add identifier and coordinate fields computed from the table and its locus.
variant_locus = variants.locus
variants = variants.annotate(
    variant_id=get_expr_for_variant_id(variants),
    chrom=get_expr_for_contig(variant_locus),
    pos=variant_locus.position,
    xpos=get_expr_for_xpos(variant_locus),
    alt=get_expr_for_alt_allele(variants),
    ref=get_expr_for_ref_allele(variants),
)

# Flatten the collection-valued fields into comma-separated strings, replacing
# the original fields of the same names.
variants = variants.transmute(
    transcript_id=hl.delimit(variants.transcript_id, ","),
    hgvsc=_join_keyed_entries(variants.hgvsc),
    hgvsp=_join_keyed_entries(variants.hgvsp),
)

# Give every row the constant flag "PASS" and remove the `v` field.
variants = variants.annotate(flags="PASS").drop("v")

# Load the variant results table given on the command line.
results = hl.read_table(args.variant_results_url)

# Normalize `analysis_group`: lower-case it, replace each match of
# "[^a-z0-9]+" with "_", then remove any match of "_+$".
analysis_group_expr = results.analysis_group.lower()
analysis_group_expr = analysis_group_expr.replace("[^a-z0-9]+", "_")
analysis_group_expr = analysis_group_expr.replace("_+$", "")
results = results.annotate(analysis_group=analysis_group_expr)