# Add a `main_transcript` row field built from the sorted transcript
# consequences by get_expr_for_worst_transcript_consequence_annotations_struct.
mt = mt.annotate_rows(
    main_transcript=
    get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences
    ))

# Add a `gene_ids` row field derived from the same sorted transcript consequences.
mt = mt.annotate_rows(gene_ids=get_expr_for_vep_gene_ids_set(
    vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
)

# De-duplicate the CLNREVSTAT values (hl.set), sort them, and join them into a
# single string with hl.delimit. The sort key rewrites a leading "_" to "z"
# (the pattern "^_" is a regex), so underscore-prefixed values are ordered as
# if they started with "z".
review_status_str = hl.delimit(
    hl.sorted(hl.array(hl.set(mt.info.CLNREVSTAT)),
              key=lambda s: s.replace("^_", "z")))

# Replace the row schema with the fields listed below.
mt = mt.select_rows(
    allele_id=mt.info.ALLELEID,
    alt=get_expr_for_alt_allele(mt),
    chrom=get_expr_for_contig(mt.locus),
    # Same de-duplicate / sort / join treatment as review_status_str, applied to CLNSIG.
    clinical_significance=hl.delimit(
        hl.sorted(hl.array(hl.set(mt.info.CLNSIG)),
                  key=lambda s: s.replace("^_", "z"))),
    domains=get_expr_for_vep_protein_domains_set(
        vep_transcript_consequences_root=mt.vep.transcript_consequences),
    gene_ids=mt.gene_ids,
    gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
        gene_ids=mt.gene_ids),
    # Looked up by the joined review-status string computed above.
    gold_stars=CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
    # Expose every field of `main_transcript` as a `main_transcript_<field>` row field.
    **{
        f"main_transcript_{field}": mt.main_transcript[field]
        for field in mt.main_transcript.dtype.fields
) for histogram in [
    # Histogram names iterated by the enclosing comprehension.
    "ab_hist_alt",
    "dp_hist_all",
    "dp_hist_alt",
    "gq_hist_all",
    "gq_hist_alt",
    "gnomad_age_hist_het",
    "gnomad_age_hist_hom",
] } )

# Derived top level fields
# Each value is produced by a get_expr_for_* helper that is given the dataset itself.
ds = ds.annotate(
    alt=get_expr_for_alt_allele(ds),
    chrom=get_expr_for_contig(ds),
    pos=get_expr_for_start_pos(ds),
    ref=get_expr_for_ref_allele(ds),
    variant_id=get_expr_for_variant_id(ds),
    xpos=get_expr_for_xpos(ds),
)

###########
# Subsets #
###########

all_subsets = ["gnomad", "controls", "non_cancer", "non_neuro", "non_topmed"]

# There is no separate non-cancer subset for genome data. All genome samples are non-cancer.
# Keep only the subsets whose "<subset>_AC_adj" field exists in the row schema.
subsets = [s for s in all_subsets if f"{s}_AC_adj" in ds.row_value.dtype.fields]
# Command-line interface: three URLs, all mandatory.
p = argparse.ArgumentParser()
for cli_flag in (
    "--variant-results-url",
    "--variant-annotations-url",
    "--output-url",
):
    p.add_argument(cli_flag, required=True)
args = p.parse_args()

hl.init(log="/tmp/hail.log")

# ---- Variant annotations -------------------------------------------------
variants = hl.read_table(args.variant_annotations_url)

# Derived identifier / position fields.
variant_locus = variants.locus
variants = variants.annotate(
    variant_id=get_expr_for_variant_id(variants),
    chrom=get_expr_for_contig(variant_locus),
    pos=variant_locus.position,
    xpos=get_expr_for_xpos(variant_locus),
    alt=get_expr_for_alt_allele(variants),
    ref=get_expr_for_ref_allele(variants),
)

# Flatten collection-valued fields into comma-separated strings. The two HGVS
# dicts become "key:value" pairs before joining.
hgvsc_by_key = variants.hgvsc
hgvsp_by_key = variants.hgvsp
hgvsc_pairs = hgvsc_by_key.keys().map(lambda key: key + ":" + hgvsc_by_key[key])
hgvsp_pairs = hgvsp_by_key.keys().map(lambda key: key + ":" + hgvsp_by_key[key])
variants = variants.transmute(
    transcript_id=hl.delimit(variants.transcript_id, ","),
    hgvsc=hl.delimit(hgvsc_pairs, ","),
    hgvsp=hl.delimit(hgvsp_pairs, ","),
)

# Every variant gets the constant flag "PASS"; the "v" field is dropped.
variants = variants.annotate(flags="PASS").drop("v")

# ---- Variant results -----------------------------------------------------
results = hl.read_table(args.variant_results_url)

# Normalise the analysis group name: lower-case it, turn each run of
# characters outside [a-z0-9] into one underscore, then strip trailing
# underscores. Both replace() patterns are regular expressions.
normalized_group = results.analysis_group.lower()
normalized_group = normalized_group.replace("[^a-z0-9]+", "_")
normalized_group = normalized_group.replace("_+$", "")
results = results.annotate(analysis_group=normalized_group)