def prepare_variant_results(results_url, annotations_url): variant_results = hl.read_table(results_url) # Get unique variants from results table variants = variant_results.group_by(variant_results.locus, variant_results.alleles).aggregate() # Select AC/AF numbers for the alternate allele variant_results = variant_results.annotate( ac_case=variant_results.ac_case[1], af_case=variant_results.af_case[1], ac_ctrl=variant_results.ac_ctrl[1], af_ctrl=variant_results.af_ctrl[1], ) # Rename analysis groups to be Elasticsearch-friendly variant_results = variant_results.annotate( analysis_group=GROUP_NAMES[variant_results.analysis_group]) # Annotate variants with a struct for each analysis group variants = variants.annotate(groups=hl.struct()) analysis_groups = variant_results.aggregate( hl.agg.collect_as_set(variant_results.analysis_group)) for group in analysis_groups: group_results = variant_results.filter( variant_results.analysis_group == group).drop( "analysis_group", "variant_id") variants = variants.annotate(groups=variants.groups.annotate( **{group: group_results[variants.locus, variants.alleles]})) # Merge variant annotations for canonical transcripts variant_annotations = hl.read_table(annotations_url) variant_annotations = variant_annotations.drop("variant_id") variant_annotations = variant_annotations.filter( variant_annotations.transcript_id == variant_annotations.canonical_transcript_id) variants = variants.annotate(**variant_annotations[variants.locus, variants.alleles]) variants = variants.annotate( chrom=variants.locus.contig[3:], pos=variants.locus.position, xpos=x_position(variants.locus), ) variants = variants.annotate( variant_id=variants.chrom + "-" + hl.str(variants.pos) + "-" + variants.alleles[0] + "-" + variants.alleles[1]) return variants
def format_coverage_table(ds):
    """Flatten a coverage table into browser-friendly columns.

    Derives chrom/pos/xpos from the locus, renames the ``over_N`` coverage
    fields to ``overN``, and removes the locus key field.
    """
    # Depth thresholds corresponding to the over_N input fields.
    depth_thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]
    ds = ds.select(
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        mean=ds.mean,
        median=ds.median,
        **{f"over{depth}": ds[f"over_{depth}"] for depth in depth_thresholds},
    )
    # Unkey the table, then drop the now-unused locus field.
    ds = ds.key_by().drop("locus")
    return ds
def format_variants_table(ds):
    """Reshape a gnomAD (v2-style) sites table for the browser.

    Flattens frequency/popmax/faf arrays into per-subset structs using the
    index dictionaries stored in the table's globals, converts histograms to
    pipe-delimited strings, regroups quality metrics, attaches sorted VEP
    transcript consequences and LoF flags, and drops fields the browser does
    not use.

    Args:
        ds: Hail Table keyed by (locus, alleles) with gnomAD-style globals
            (freq_index_dict, popmax_index_dict, faf_index_dict,
            age_index_dict).

    Returns:
        Unkeyed Hail Table with browser-shaped row fields.
    """
    ############################
    # Derived top level fields #
    ############################
    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ###############
    # Frequencies #
    ###############
    # Evaluate globals locally so index dicts can drive Python-side loops.
    g = hl.eval(ds.globals)
    freq_index_tree = get_freq_index_tree(g.freq_index_dict)
    subsets = list(freq_index_tree.keys())
    ds = ds.annotate(
        **{
            subset: hl.struct(
                # Adjusted frequencies
                AC_adj=freq_expression(ds, "AC", freq_index_tree[subset]),
                AN_adj=freq_expression(ds, "AN", freq_index_tree[subset]),
                AF_adj=freq_expression(ds, "AF", freq_index_tree[subset]),
                nhomalt_adj=freq_expression(ds, "homozygote_count", freq_index_tree[subset]),
                # Raw frequencies
                AC_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AC,
                AN_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AN,
                AF_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AF,
                nhomalt_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].homozygote_count,
                # Popmax
                popmax=ds.popmax[g.popmax_index_dict[subset]].pop,
                AC_popmax=ds.popmax[g.popmax_index_dict[subset]].AC,
                AN_popmax=ds.popmax[g.popmax_index_dict[subset]].AN,
                AF_popmax=ds.popmax[g.popmax_index_dict[subset]].AF,
                nhomalt_popmax=ds.popmax[g.popmax_index_dict[subset]].homozygote_count,
            )
            for subset in subsets
        }
    )

    ##############################
    # Filtering allele frequency #
    ##############################
    # Build {subset: {population: faf array index}} from the flat label->index
    # dict in globals. Labels are "subset", "subset_pop", or just "pop".
    faf_index_tree = collections.defaultdict(dict)
    for labels_combo, index in g.faf_index_dict.items():
        labels = labels_combo.split("_")
        # Subset labels contain an _, so rebuild those after splitting them
        if labels[0] == "non":
            labels = ["_".join(labels[0:2])] + labels[2:]
        if len(labels) == 2:
            [subset, pop] = labels
            faf_index_tree[subset][pop] = index
        else:
            assert len(labels) == 1
            subset = labels[0]
            # A bare subset label is the subset-wide ("total") FAF.
            faf_index_tree[subset]["total"] = index
    ds = ds.annotate(
        **{
            subset: ds[subset].annotate(
                faf95_adj=hl.struct(**{pop: ds.faf[index].faf95
                                       for pop, index in faf_index_tree[subset].items()}),
                faf99_adj=hl.struct(**{pop: ds.faf[index].faf99
                                       for pop, index in faf_index_tree[subset].items()}),
            )
            for subset in subsets
        }
    )
    # The arrays have been flattened into per-subset structs; drop originals.
    ds = ds.drop("freq", "popmax", "faf")

    ##############
    # Histograms #
    ##############
    # Extract overall age distribution
    ds = ds.transmute(
        gnomad_age_hist_het=ds.age_hist_het[g.age_index_dict["gnomad"]],
        gnomad_age_hist_hom=ds.age_hist_hom[g.age_index_dict["gnomad"]],
    )
    # Convert lists of numbers in histograms into pipe delimited strings
    ds = ds.annotate(
        **{
            field: ds[field].annotate(
                bin_freq=hl.delimit(ds[field].bin_freq, "|"),
                bin_edges=hl.delimit(ds[field].bin_edges, "|")
            )
            for field in [
                "ab_hist_alt",
                "dp_hist_all",
                "dp_hist_alt",
                "gq_hist_all",
                "gq_hist_alt",
                "gnomad_age_hist_het",
                "gnomad_age_hist_hom",
            ]
        }
    )

    ###########################
    # Quality metrics / flags #
    ###########################
    # Use the same fields as the VCFs
    # Based https://github.com/macarthur-lab/gnomad_qc/blob/25a81bc2166fbe4ccbb2f7a87d36aba661150413/variant_qc/prepare_data_release.py#L128-L159
    ds = ds.transmute(
        BaseQRankSum=ds.allele_info.BaseQRankSum,
        ClippingRankSum=ds.allele_info.ClippingRankSum,
        DP=ds.allele_info.DP,
        FS=ds.info_FS,
        InbreedingCoeff=ds.info_InbreedingCoeff,
        MQ=ds.info_MQ,
        MQRankSum=ds.info_MQRankSum,
        QD=ds.info_QD,
        ReadPosRankSum=ds.info_ReadPosRankSum,
        rf_negative_label=ds.fail_hard_filters,
        rf_positive_label=ds.tp,
        rf_tp_probability=ds.rf_probability,
        SOR=ds.info_SOR,
        VQSLOD=ds.allele_info.VQSLOD,
        VQSR_culprit=ds.allele_info.culprit,
        VQSR_NEGATIVE_TRAIN_SITE=ds.info_NEGATIVE_TRAIN_SITE,
        VQSR_POSITIVE_TRAIN_SITE=ds.info_POSITIVE_TRAIN_SITE,
    )
    # These fields are left unaltered at the top level
    #
    # allele_type
    # decoy
    # has_star
    # lcr
    # n_alt_alleles
    # nonpar
    # pab_max
    # rf_label
    # rf_train
    # segdup
    # transmitted_singleton
    # variant_type
    # was_mixed

    # TODO: Remove this, leave these at top level
    ds = ds.transmute(
        allele_info=hl.struct(
            BaseQRankSum=ds.BaseQRankSum,
            ClippingRankSum=ds.ClippingRankSum,
            DP=ds.DP,
            FS=ds.FS,
            InbreedingCoeff=ds.InbreedingCoeff,
            MQ=ds.MQ,
            MQRankSum=ds.MQRankSum,
            QD=ds.QD,
            ReadPosRankSum=ds.ReadPosRankSum,
            SOR=ds.SOR,
            VQSLOD=ds.VQSLOD,
            VQSR_culprit=ds.VQSR_culprit,
            VQSR_NEGATIVE_TRAIN_SITE=ds.VQSR_NEGATIVE_TRAIN_SITE,
            VQSR_POSITIVE_TRAIN_SITE=ds.VQSR_POSITIVE_TRAIN_SITE,
        )
    )

    ###################
    # VEP annotations #
    ###################
    ds = ds.annotate(sortedTranscriptConsequences=sorted_transcript_consequences_v2(ds.vep))
    ds = ds.drop("vep")

    #########
    # Flags #
    #########
    # TODO: Leave these at the top level
    ds = ds.transmute(flags=hl.struct(lcr=ds.lcr, segdup=ds.segdup))
    # TODO: Remove this, these flags are calculated on the fly
    ds = ds.annotate(
        flags=ds.flags.annotate(
            lc_lof=get_expr_for_variant_lc_lof_flag(ds.sortedTranscriptConsequences),
            lof_flag=get_expr_for_variant_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
        # hl.bind evaluates the two gene-set expressions once and passes them
        # into the lambda that flags each consequence.
        sortedTranscriptConsequences=hl.bind(
            lambda genes_with_lc_lof_flag, genes_with_loftee_flag_flag: ds.sortedTranscriptConsequences.map(
                lambda csq: csq.annotate(
                    flags=hl.struct(
                        lc_lof=get_expr_for_consequence_lc_lof_flag(csq),
                        lc_lof_in_gene=genes_with_lc_lof_flag.contains(csq.gene_id),
                        lof_flag=get_expr_for_consequence_loftee_flag_flag(csq),
                        lof_flag_in_gene=genes_with_loftee_flag_flag.contains(csq.gene_id),
                        nc_transcript=(csq.category == "lof") & (csq.lof == ""),
                    )
                )
            ),
            get_expr_for_genes_with_lc_lof_flag(ds.sortedTranscriptConsequences),
            get_expr_for_genes_with_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
    )

    #################
    # Unused fields #
    #################
    # These fields were not in the 2.1.1 browser Hail table
    ds = ds.drop(
        "adj_biallelic_rank",
        "adj_biallelic_singleton_rank",
        "adj_rank",
        "adj_singleton_rank",
        "biallelic_rank",
        "biallelic_singleton_rank",
        "info_DP",
        "mills",
        "n_nonref",
        "omni",
        "qd",
        "rank",
        "score",
        "singleton_rank",
        "singleton",
        "was_split",
    )
    # These two fields appear only in the genomes table
    if "_score" in ds.row_value.dtype.fields:
        ds = ds.drop("_score", "_singleton")

    ########
    # Keys #
    ########
    # Drop key fields
    ds = ds.key_by().drop("locus", "alleles")
    return ds
def import_mnv_file(path, **kwargs):
    """Import a multi-nucleotide variant (MNV) TSV into a browser-shaped table.

    Builds structs for the two constituent SNVs of each MNV, per-dataset
    (exome/genome) counts, and a per-MNV array of consequences sorted by
    severity.

    Args:
        path: URL/path of the tab-separated MNV file.
        **kwargs: Extra keyword arguments forwarded to ``hl.import_table``.

    Returns:
        Hail Table keyed by ``mnv`` with one row per MNV.
    """
    # Explicit column types for import; missing values are empty strings.
    column_types = {
        "AC_mnv_ex": hl.tint,
        "AC_mnv_gen": hl.tint,
        "AC_mnv": hl.tint,
        "AC_snp1_ex": hl.tint,
        "AC_snp1_gen": hl.tint,
        "AC_snp1": hl.tint,
        "AC_snp2_ex": hl.tint,
        "AC_snp2_gen": hl.tint,
        "AC_snp2": hl.tint,
        "AN_snp1_ex": hl.tfloat,
        "AN_snp1_gen": hl.tfloat,
        "AN_snp2_ex": hl.tfloat,
        "AN_snp2_gen": hl.tfloat,
        "categ": hl.tstr,
        "filter_snp1_ex": hl.tarray(hl.tstr),
        "filter_snp1_gen": hl.tarray(hl.tstr),
        "filter_snp2_ex": hl.tarray(hl.tstr),
        "filter_snp2_gen": hl.tarray(hl.tstr),
        "gene_id": hl.tstr,
        "gene_name": hl.tstr,
        "locus.contig": hl.tstr,
        "locus.position": hl.tint,
        "mnv_amino_acids": hl.tstr,
        "mnv_codons": hl.tstr,
        "mnv_consequence": hl.tstr,
        "mnv_lof": hl.tstr,
        "mnv": hl.tstr,
        "n_homhom_ex": hl.tint,
        "n_homhom_gen": hl.tint,
        "n_homhom": hl.tint,
        "n_indv_ex": hl.tint,
        "n_indv_gen": hl.tint,
        "n_indv": hl.tint,
        "snp1_amino_acids": hl.tstr,
        "snp1_codons": hl.tstr,
        "snp1_consequence": hl.tstr,
        "snp1_lof": hl.tstr,
        "snp1": hl.tstr,
        "snp2_amino_acids": hl.tstr,
        "snp2_codons": hl.tstr,
        "snp2_consequence": hl.tstr,
        "snp2_lof": hl.tstr,
        "snp2": hl.tstr,
        "transcript_id": hl.tstr,
    }
    ds = hl.import_table(path, key="mnv", missing="", types=column_types, **kwargs)

    # Rebuild a locus from the dotted "locus.contig"/"locus.position" columns,
    # then derive contig/pos/xpos from it.
    ds = ds.transmute(locus=hl.locus(ds["locus.contig"], ds["locus.position"]))
    ds = ds.transmute(
        contig=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
    )
    # The mnv field is a "chrom-pos-ref-alt" style ID; extract ref/alt from it.
    ds = ds.annotate(ref=ds.mnv.split("-")[2], alt=ds.mnv.split("-")[3], variant_id=ds.mnv)

    # Copies of snp1/snp2 survive the transmute below, which consumes the
    # per-SNV count/filter columns.
    ds = ds.annotate(snp1_copy=ds.snp1, snp2_copy=ds.snp2)
    ds = ds.transmute(constituent_snvs=[
        hl.bind(
            # variant_id_parts is the SNV ID split on "-" (chrom, pos, ref, alt)
            lambda variant_id_parts: hl.struct(
                variant_id=ds[f"{snp}_copy"],
                chrom=variant_id_parts[0],
                pos=hl.int(variant_id_parts[1]),
                ref=variant_id_parts[2],
                alt=variant_id_parts[3],
                # exome/genome structs are missing when the SNV has no AN in
                # that dataset.
                exome=hl.or_missing(
                    hl.is_defined(ds[f"AN_{snp}_ex"]),
                    hl.struct(
                        filters=ds[f"filter_{snp}_ex"],
                        ac=ds[f"AC_{snp}_ex"],
                        an=hl.int(ds[f"AN_{snp}_ex"]),
                    ),
                ),
                genome=hl.or_missing(
                    hl.is_defined(ds[f"AN_{snp}_gen"]),
                    hl.struct(
                        filters=ds[f"filter_{snp}_gen"],
                        ac=ds[f"AC_{snp}_gen"],
                        an=hl.int(ds[f"AN_{snp}_gen"]),
                    ),
                ),
            ),
            ds[f"{snp}_copy"].split("-"),
        )
        for snp in ["snp1", "snp2"]
    ])
    ds = ds.annotate(constituent_snv_ids=[ds.snp1, ds.snp2])

    # An MNV is "in" a dataset only if both constituent SNVs are.
    ds = ds.annotate(
        mnv_in_exome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.exome)),
        mnv_in_genome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.genome)),
    )
    ds = ds.transmute(
        n_individuals=ds.n_indv,
        ac=ds.AC_mnv,
        ac_hom=ds.n_homhom,
        exome=hl.or_missing(
            ds.mnv_in_exome,
            hl.struct(n_individuals=ds.n_indv_ex, ac=ds.AC_mnv_ex, ac_hom=ds.n_homhom_ex),
        ),
        genome=hl.or_missing(
            ds.mnv_in_genome,
            hl.struct(n_individuals=ds.n_indv_gen, ac=ds.AC_mnv_gen, ac_hom=ds.n_homhom_gen),
        ),
    )
    ds = ds.drop("AC_snp1", "AC_snp2")

    # Collect the MNV consequence and the two SNV consequences in one struct.
    ds = ds.transmute(consequence=hl.struct(
        category=ds.categ,
        gene_id=ds.gene_id,
        gene_name=ds.gene_name,
        transcript_id=ds.transcript_id,
        consequence=ds.mnv_consequence,
        codons=ds.mnv_codons,
        amino_acids=ds.mnv_amino_acids,
        lof=ds.mnv_lof,
        snv_consequences=[
            hl.struct(
                variant_id=ds[f"{snp}"],
                amino_acids=ds[f"{snp}_amino_acids"],
                codons=ds[f"{snp}_codons"],
                consequence=ds[f"{snp}_consequence"],
                lof=ds[f"{snp}_lof"],
            )
            for snp in ["snp1", "snp2"]
        ],
    ))

    # Collapse table to one row per MNV, with all consequences for the MNV collected into an array
    consequences = ds.group_by(ds.mnv).aggregate(consequences=hl.agg.collect(ds.consequence))
    ds = ds.drop("consequence")
    ds = ds.distinct()
    ds = ds.join(consequences)

    # Sort consequences by severity
    ds = ds.annotate(consequences=hl.sorted(
        ds.consequences,
        key=lambda c: consequence_term_rank(c.consequence),
    ))

    # For each constituent SNV (index 0/1), record its ID if any consequence's
    # amino acid change differs from the MNV's (case-insensitive compare).
    ds = ds.annotate(changes_amino_acids_for_snvs=hl.literal([0, 1]).filter(
        lambda idx: ds.consequences.any(lambda csq: csq.snv_consequences[
            idx].amino_acids.lower() != csq.amino_acids.lower())).map(
                lambda idx: ds.constituent_snv_ids[idx]))
    return ds
def main():
    """Join SCHEMA-style per-group results onto variant annotations and write
    the combined table.

    Command-line arguments:
        --results: URL of the results Hail Table (keyed like annotations).
        --annotations: URL of the annotations Hail Table.
        --output: URL to write the combined Hail Table to.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results", required=True)
    parser.add_argument("--annotations", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    variants = hl.read_table(args.annotations)
    variants = variants.annotate(
        variant_id=variant_id(variants.locus, variants.alleles),
        chrom=variants.locus.contig,
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
        alt=variants.alleles[1],
        ref=variants.alleles[0],
    )
    # Flatten transcript_id array and hgvsc/hgvsp dicts into comma-delimited
    # "key:value" strings.
    variants = variants.transmute(
        transcript_id=hl.delimit(variants.transcript_id, ","),
        hgvsc=hl.delimit(
            variants.hgvsc.keys().map(lambda k: k + ":" + variants.hgvsc[k]), ","),
        hgvsp=hl.delimit(
            variants.hgvsp.keys().map(lambda k: k + ":" + variants.hgvsp[k]), ","),
    )
    # Split missense into mis2/mis3 bins by MPC score (checked most-severe first).
    variants = variants.annotate(
        csq_canonical=hl.case().when(
            (variants.csq_canonical == "mis") & (variants.mpc >= 3), "mis3").when(
                (variants.csq_canonical == "mis") & (variants.mpc >= 2), "mis2").default(
                    variants.csq_canonical))
    variants = variants.annotate(flags="PASS")
    variants = variants.drop("v")

    results = hl.read_table(args.results)
    # Make group names lowercase alphanumeric-with-underscores
    # (StringExpression.replace treats the pattern as a regex).
    results = results.annotate(
        analysis_group=results.analysis_group.lower().replace(
            "[^a-z0-9]+", "_").replace("_+$", ""))
    results = results.drop("v")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))
    # Guard against division by zero when an_case is 0.
    results = results.annotate(
        af_case=hl.cond(results.an_case == 0, 0, results.ac_case / results.an_case))

    # Keep only variants that appear in the results table.
    variants = variants.filter(hl.is_defined(results[variants.key]))

    analysis_groups = results.aggregate(
        hl.agg.collect_as_set(results.analysis_group))
    variants = variants.annotate(groups=hl.struct())
    for group in analysis_groups:
        group_results = results.filter(
            results.analysis_group == group).drop("analysis_group")
        variants = variants.annotate(groups=variants.groups.annotate(
            **{group: group_results[variants.key]}))

    # The latest (2019/04/15) SCHEMA dataset moved the source and in_analysis
    # field from variant level to group level
    # in_analysis is the same for all groups within a variant, but source is not
    # NOTE(review): this reads the values from the "meta" group specifically —
    # assumes a group named "meta" always exists; confirm against the dataset.
    variants = variants.annotate(in_analysis=variants.groups.meta.in_analysis,
                                 source=variants.groups.meta.source)

    variants.write(args.output)
def format_clinvar_variants(ds):
    """Run VEP on a ClinVar sites table and reshape it for the browser."""

    def _sort_and_strip(terms):
        # Order terms so those with a leading underscore sort last, then
        # strip that leading underscore from every term.
        ordered = hl.sorted(terms, key=lambda term: term.replace("^_", "z"))
        return ordered.map(lambda term: term.replace("^_", ""))

    # There are some variants with only one entry in alleles, ignore them for now.
    # TODO: These could be displayed in the ClinVar track even though they will never match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    # When a cluster is started with hailctl dataproc start cluster_name --vep, the init script for the
    # selected version of VEP links the appropriate configuration file to /vep_data/vep-gcloud.json
    ds = hl.vep(ds, "file:///vep_data/vep-gcloud.json", name="vep", block_size=1000)
    ds = ds.annotate(sorted_transcript_consequences=sorted_transcript_consequences_v3(ds.vep))
    ds = ds.drop("vep")

    ds = ds.select(
        clinical_significance=_sort_and_strip(ds.info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=_sort_and_strip(ds.info.CLNREVSTAT),
        sorted_transcript_consequences=ds.sorted_transcript_consequences,
    )

    ds = ds.annotate(
        chrom=normalized_contig(ds.locus),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )
    return ds
def main():
    """Import the ExAC sites VCF, clean its INFO fields, reconstruct VEP-style
    annotations from the CSQ string, and write a Hail Table.

    Command-line arguments:
        --input-url: URL of the ExAC sites VCF (defaults to the public copy).
        --output-url: URL to write the Hail Table to (required).
        --subset: Optional chrom:start-end interval to restrict variants to.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-url",
        help="URL of ExAC sites VCF",
        default="gs://gnomad-public/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz")
    parser.add_argument("--output-url", help="URL to write Hail table to", required=True)
    parser.add_argument("--subset", help="Filter variants to this chrom:start-end range")
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    print("\n=== Importing VCF ===")
    ds = hl.import_vcf(
        args.input_url, force_bgz=True, min_partitions=2000,
        skip_invalid_loci=True).rows()

    if args.subset:
        print(f"\n=== Filtering to interval {args.subset} ===")
        subset_interval = hl.parse_locus_interval(args.subset)
        ds = ds.filter(subset_interval.contains(ds.locus))

    print("\n=== Splitting multiallelic variants ===")
    ds = hl.split_multi(ds)
    ds = ds.repartition(2000, shuffle=True)

    # Get value corresponding to the split variant
    # (a_index is 1-based; per-allele INFO arrays are 0-based per alt allele)
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.or_missing(
                hl.is_defined(ds.info[field]), ds.info[field][ds.a_index - 1])
            for field in PER_ALLELE_FIELDS
        }))

    # For DP_HIST and GQ_HIST, the first value in the array contains the histogram for all individuals,
    # which is the same in each alt allele's variant.
    ds = ds.annotate(info=ds.info.annotate(
        DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
        GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
    ))

    ds = ds.cache()

    print("\n=== Munging data ===")

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(info=ds.info.annotate(
            **{
                field: hl.or_missing(
                    hl.is_defined(ds.info[field]),
                    hl.bind(
                        lambda value: hl.cond(
                            (value == "") | (value == "NA"),
                            hl.null(ds.info[field].dtype), ds.info[field]),
                        hl.str(ds.info[field]),
                    ),
                )
                for field in SELECT_INFO_FIELDS[i:i + 10]
            }))

    # Convert field types
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tint),
                           hl.int(ds.info[field]))
            for field in CONVERT_TO_INT_FIELDS
        }))
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tfloat),
                           hl.float(ds.info[field]))
            for field in CONVERT_TO_FLOAT_FIELDS
        }))

    # Format VEP annotations to mimic the output of hail.vep
    # (undo VCF percent-encoding of special characters in the CSQ string)
    ds = ds.annotate(info=ds.info.annotate(CSQ=ds.info.CSQ.map(
        lambda s: s.replace("%3A", ":").replace("%3B", ";").replace(
            "%3D", "=").replace("%25", "%").replace("%2C", ","))))
    ds = ds.annotate(vep=hl.struct(
        transcript_consequences=ds.info.CSQ.map(lambda csq_str: hl.bind(
            # Split one CSQ entry into its pipe-delimited VEP_FIELDS values.
            lambda csq_values: hl.struct(
                **{
                    field: hl.cond(csq_values[index] == "", hl.null(hl.tstr),
                                   csq_values[index])
                    for index, field in enumerate(VEP_FIELDS)
                }),
            csq_str.split("\\|"),
        )).filter(lambda annotation: annotation.Feature.startswith("ENST")).
        filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index).
        map(lambda annotation: annotation.select(
            amino_acids=annotation.Amino_acids,
            biotype=annotation.BIOTYPE,
            canonical=annotation.CANONICAL == "YES",
            # cDNA_position may contain either "start-end" or, when start == end, "start"
            cdna_start=split_position_start(annotation.cDNA_position),
            cdna_end=split_position_end(annotation.cDNA_position),
            codons=annotation.Codons,
            consequence_terms=annotation.Consequence.split("&"),
            distance=hl.int(annotation.DISTANCE),
            domains=hl.or_missing(
                hl.is_defined(annotation.DOMAINS),
                annotation.DOMAINS.split("&").map(lambda d: hl.struct(
                    db=d.split(":")[0], name=d.split(":")[1])),
            ),
            exon=annotation.EXON,
            gene_id=annotation.Gene,
            gene_symbol=annotation.SYMBOL,
            gene_symbol_source=annotation.SYMBOL_SOURCE,
            hgnc_id=annotation.HGNC_ID,
            hgvsc=annotation.HGVSc,
            hgvsp=annotation.HGVSp,
            lof=annotation.LoF,
            lof_filter=annotation.LoF_filter,
            lof_flags=annotation.LoF_flags,
            lof_info=annotation.LoF_info,
            # PolyPhen field contains "polyphen_prediction(polyphen_score)"
            polyphen_prediction=hl.or_missing(
                hl.is_defined(annotation.PolyPhen),
                annotation.PolyPhen.split("\\(")[0]),
            protein_id=annotation.ENSP,
            # Protein_position may contain either "start-end" or, when start == end, "start"
            protein_start=split_position_start(annotation.Protein_position),
            protein_end=split_position_end(annotation.Protein_position),
            # SIFT field contains "sift_prediction(sift_score)"
            sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT),
                                          annotation.SIFT.split("\\(")[0]),
            transcript_id=annotation.Feature,
        ))))

    # Most severe consequence across all transcript consequences, or missing
    # when there are none.
    ds = ds.annotate(vep=ds.vep.annotate(most_severe_consequence=hl.bind(
        lambda all_consequence_terms: hl.or_missing(
            all_consequence_terms.size() != 0,
            hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]),
        ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
    )))

    ds = ds.cache()

    print("\n=== Adding derived fields ===")

    ds = ds.annotate(
        sorted_transcript_consequences=sorted_transcript_consequences_v3(ds.vep))

    ds = ds.select(
        "filters",
        "qual",
        "rsid",
        "sorted_transcript_consequences",
        AC=ds.info.AC,
        AC_Adj=ds.info.AC_Adj,
        AC_Hemi=ds.info.AC_Hemi,
        AC_Hom=ds.info.AC_Hom,
        AF=ds.info.AF,
        AN=ds.info.AN,
        AN_Adj=ds.info.AN_Adj,
        BaseQRankSum=ds.info.BaseQRankSum,
        CCC=ds.info.CCC,
        ClippingRankSum=ds.info.ClippingRankSum,
        DB=ds.info.DB,
        DP=ds.info.DP,
        DS=ds.info.DS,
        END=ds.info.END,
        FS=ds.info.FS,
        GQ_MEAN=ds.info.GQ_MEAN,
        GQ_STDDEV=ds.info.GQ_STDDEV,
        HWP=ds.info.HWP,
        HaplotypeScore=ds.info.HaplotypeScore,
        InbreedingCoeff=ds.info.InbreedingCoeff,
        MLEAC=ds.info.MLEAC,
        MLEAF=ds.info.MLEAF,
        MQ=ds.info.MQ,
        MQ0=ds.info.MQ0,
        MQRankSum=ds.info.MQRankSum,
        NCC=ds.info.NCC,
        NEGATIVE_TRAIN_SITE=ds.info.NEGATIVE_TRAIN_SITE,
        POSITIVE_TRAIN_SITE=ds.info.POSITIVE_TRAIN_SITE,
        QD=ds.info.QD,
        ReadPosRankSum=ds.info.ReadPosRankSum,
        VQSLOD=ds.info.VQSLOD,
        culprit=ds.info.culprit,
        DP_HIST=ds.info.DP_HIST,
        GQ_HIST=ds.info.GQ_HIST,
        DOUBLETON_DIST=ds.info.DOUBLETON_DIST,
        AC_CONSANGUINEOUS=ds.info.AC_CONSANGUINEOUS,
        AN_CONSANGUINEOUS=ds.info.AN_CONSANGUINEOUS,
        Hom_CONSANGUINEOUS=ds.info.Hom_CONSANGUINEOUS,
        AGE_HISTOGRAM_HET=ds.info.AGE_HISTOGRAM_HET,
        AGE_HISTOGRAM_HOM=ds.info.AGE_HISTOGRAM_HOM,
        AC_POPMAX=ds.info.AC_POPMAX,
        AN_POPMAX=ds.info.AN_POPMAX,
        POPMAX=ds.info.POPMAX,
        K1_RUN=ds.info.K1_RUN,
        K2_RUN=ds.info.K2_RUN,
        K3_RUN=ds.info.K3_RUN,
        ESP_AF_POPMAX=ds.info.ESP_AF_POPMAX,
        ESP_AF_GLOBAL=ds.info.ESP_AF_GLOBAL,
        ESP_AC=ds.info.ESP_AC,
        KG_AF_POPMAX=ds.info.KG_AF_POPMAX,
        KG_AF_GLOBAL=ds.info.KG_AF_GLOBAL,
        KG_AC=ds.info.KG_AC,
        AC_FEMALE=ds.info.AC_FEMALE,
        AN_FEMALE=ds.info.AN_FEMALE,
        AC_MALE=ds.info.AC_MALE,
        AN_MALE=ds.info.AN_MALE,
        # Nest per-population AC/AN/hemi/hom counts under one struct.
        populations=hl.struct(
            **{
                pop_id: hl.struct(
                    AC=ds.info[f"AC_{pop_id}"],
                    AN=ds.info[f"AN_{pop_id}"],
                    hemi=ds.info[f"Hemi_{pop_id}"],
                    hom=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            }),
        # Other variants that came from the same pre-split multiallelic site,
        # excluding this variant's own ID.
        colocated_variants=hl.bind(
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).
            filter(lambda v_id: v_id != this_variant_id),
            variant_id(ds.locus, ds.alleles),
        ),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )

    print("\n=== Writing table ===")
    ds.write(args.output_url)
def prepare_variant_results(results_url, annotations_url):
    """Import variant annotation and per-group result TSVs and combine them
    into one table keyed by variant ID.

    Args:
        results_url: URL of a TSV of per-analysis-group association results.
        annotations_url: URL of a TSV of per-variant annotations.

    Returns:
        Hail Table keyed by ``variant_id`` ("chrom-pos-ref-alt") with a
        ``groups`` struct of per-analysis-group results and derived
        chrom/pos/xpos fields.
    """
    # find_replace rewrites "chrom:pos:ref:alt" IDs to "chrom-pos-ref-alt"
    # at import time.
    variant_annotations = hl.import_table(
        annotations_url,
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        find_replace=(r"^([\dXY]+):(\d+):([ACTG]+):([ACTG]+)", "$1-$2-$3-$4"),
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "CADD": hl.tfloat,
            "Comment": hl.tstr,
            "Consequence (canonical)": hl.tstr,
            "Consequence (for analysis)": hl.tstr,
            "Consequence (worst)": hl.tstr,
            "Flags": hl.tstr,
            "Gene ID": hl.tstr,
            "Gene name": hl.tstr,
            "HGVSc (canonical)": hl.tstr,
            "HGVSc": hl.tstr,
            "HGVSp (canonical)": hl.tstr,
            "HGVSp": hl.tstr,
            "In analysis": hl.tbool,
            "MPC": hl.tfloat,
            "Polyphen": hl.tstr,
            "Source": hl.tstr,
            "Transcript ID (canonical)": hl.tstr,
            "Transcript ID(s)": hl.tstr,
        },
    )
    # Rename human-readable column headers to snake_case field names.
    variant_annotations = variant_annotations.rename({
        "Variant ID": "variant_id",
        "CADD": "cadd",
        "Comment": "comment",
        "Consequence (canonical)": "csq_canonical",
        "Consequence (for analysis)": "csq_analysis",
        "Consequence (worst)": "csq_worst",
        "Flags": "flags",
        "Gene ID": "gene_id",
        "Gene name": "gene_name",
        "HGVSc (canonical)": "hgvsc_canonical",
        "HGVSc": "hgvsc",
        "HGVSp (canonical)": "hgvsp_canonical",
        "HGVSp": "hgvsp",
        "In analysis": "in_analysis",
        "MPC": "mpc",
        "Polyphen": "polyphen",
        "Source": "source",
        "Transcript ID (canonical)": "canonical_transcript_id",
        "Transcript ID(s)": "transcript_id",
    })

    variant_results = hl.import_table(
        results_url,
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        find_replace=(r"^([\dXY]+):(\d+):([ACTG]+):([ACTG]+)", "$1-$2-$3-$4"),
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "AC case": hl.tint,
            "AC control": hl.tint,
            "AF case": hl.tfloat,
            "AF control": hl.tfloat,
            "AN case": hl.tint,
            "AN control": hl.tint,
            "Analysis group": hl.tstr,
            "Estimate": hl.tfloat,
            "I2": hl.tfloat,
            "N denovos": hl.tint,
            "P-value": hl.tfloat,
            "Qp": hl.tfloat,
            "SE": hl.tfloat,
        },
    )
    variant_results = variant_results.rename(
        {
            "Variant ID": "variant_id",
            "AC case": "ac_case",
            "AC control": "ac_ctrl",
            "AF case": "af_case",
            "AF control": "af_ctrl",
            "AN case": "an_case",
            "AN control": "an_ctrl",
            "Analysis group": "analysis_group",
            "Estimate": "est",
            "I2": "i2",
            "N denovos": "n_denovos",
            "P-value": "p",
            "Qp": "qp",
            "SE": "se",
        },
    )

    # Rename "EE" analysis group to "DEE"
    variant_results = variant_results.annotate(
        analysis_group=hl.cond(variant_results.analysis_group == "EE", "DEE",
                               variant_results.analysis_group))

    # Nest each analysis group's results into a struct field per group.
    variants = variant_annotations.annotate(groups=hl.struct())
    analysis_groups = variant_results.aggregate(
        hl.agg.collect_as_set(variant_results.analysis_group))
    for group in analysis_groups:
        group_results = variant_results.filter(
            variant_results.analysis_group == group).drop("analysis_group")
        variants = variants.annotate(groups=variants.groups.annotate(
            **{group: group_results[variants.variant_id]}))

    # Derive chrom/pos from the "chrom-pos-ref-alt" variant ID, then xpos.
    variants = variants.annotate(
        chrom=variants.variant_id.split("-")[0],
        pos=hl.int(variants.variant_id.split("-")[1]),
    )
    variants = variants.annotate(
        xpos=x_position(hl.locus(variants.chrom, variants.pos)))
    return variants
def prepare_variant_results(table_urls):
    """Combine per-analysis-group annotation/result TSV pairs into one
    variants table.

    Per-group results are checkpointed to local ``temp_<group>.ht`` tables and
    then joined back under a ``groups`` struct; annotations are unioned and
    de-duplicated across groups.

    Args:
        table_urls: Iterable of (annotations_table_url, results_table_url)
            pairs, one per analysis group. Each results table must contain
            exactly one analysis group.

    Returns:
        Unkeyed Hail Table with one row per variant: derived
        variant_id/chrom/pos/xpos/ref/alt fields, most-severe consequence
        fields, and a ``groups`` struct of per-group results.
    """
    annotations = None
    analysis_groups = []

    for annotations_table_url, results_table_url in table_urls:
        group_annotations = hl.import_table(
            annotations_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "in_analysis": hl.tbool,
                "gene_id": hl.tstr,
                "gene_name": hl.tstr,
                "transcript_id": hl.tstr,
                "hgvsc": hl.tstr,
                "hgvsp": hl.tstr,
                "csq_analysis": hl.tstr,
                "csq_worst": hl.tstr,
                "mpc": hl.tfloat,
                "polyphen": hl.tstr,
            },
        )
        # an_* / af_* columns are imported as strings and converted below.
        group_results = hl.import_table(
            results_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "analysis_group": hl.tstr,
                "ac_case": hl.tint,
                "an_case": hl.tstr,
                "af_case": hl.tstr,
                "ac_ctrl": hl.tint,
                "an_ctrl": hl.tstr,
                "af_ctrl": hl.tstr,
            },
        )

        # Each results table is expected to contain exactly one analysis group.
        groups_in_table = group_results.aggregate(
            hl.agg.collect_as_set(group_results.analysis_group))
        assert len(groups_in_table) == 1, groups_in_table
        group_name = groups_in_table.pop()
        analysis_groups.append(group_name)

        group_results = group_results.annotate(
            an_case=hl.int(group_results.an_case),
            af_case=hl.float(group_results.af_case),
            an_ctrl=hl.int(group_results.an_ctrl),
            af_ctrl=hl.float(group_results.af_ctrl),
            # in_analysis comes from the annotations table for the same variant.
            in_analysis=group_annotations[group_results.v].in_analysis,
        )
        # Checkpoint per-group results. overwrite=True so re-running the
        # pipeline does not fail when the temp table already exists.
        group_results.drop("analysis_group").write(
            f"temp_{group_name}.ht", overwrite=True)

        # in_analysis was copied into results; keep annotations group-agnostic.
        group_annotations = group_annotations.drop("in_analysis")
        if annotations is None:
            annotations = group_annotations
        else:
            annotations = annotations.union(group_annotations)

    annotations = annotations.distinct()
    annotations = annotations.annotate(
        filters="PASS",
        # Keep only the most severe term of each comma-separated consequence list.
        csq_analysis=hl.sorted(annotations.csq_analysis.split(","),
                               lambda c: consequence_term_rank(c))[0],
        csq_worst=hl.sorted(annotations.csq_worst.split(","),
                            lambda c: consequence_term_rank(c))[0],
        canonical_transcript_id=annotations.transcript_id,
        hgvsc_canonical=annotations.hgvsc,
        hgvsp_canonical=annotations.hgvsp,
    )

    # "v" is formatted chrom:pos:ref:alt
    annotations = annotations.annotate(
        locus=hl.locus(
            annotations.v.split(":")[0], hl.int(annotations.v.split(":")[1])),
        alleles=annotations.v.split(":")[2:4],
    )
    annotations = annotations.annotate(
        variant_id=variant_id(annotations.locus, annotations.alleles),
        chrom=annotations.locus.contig,
        pos=annotations.locus.position,
        xpos=x_position(annotations.locus),
        alt=annotations.alleles[1],
        ref=annotations.alleles[0],
    )
    annotations = annotations.drop("locus", "alleles")

    # Nest each group's checkpointed results under a struct field per group.
    annotations = annotations.annotate(groups=hl.struct())
    for group_name in analysis_groups:
        results = hl.read_table(f"temp_{group_name}.ht")
        annotations = annotations.annotate(groups=annotations.groups.annotate(
            **{group_name: results[annotations.key]}))

    annotations = annotations.key_by().drop("v")
    return annotations
def format_variants_table(ds):
    """Reshape a gnomAD (v3-style) sites table for the browser.

    Nests the flat freq/faf arrays into trees using the index metadata in the
    table's globals, converts histograms to pipe-delimited strings, cleans NaN
    quality metrics, attaches sorted VEP transcript consequences, and drops
    split-multiallelic bookkeeping fields.

    Args:
        ds: Hail Table keyed by (locus, alleles) with gnomAD-style globals
            (freq_meta, faf_index_dict).

    Returns:
        Hail Table with browser-shaped row fields.
    """
    # Evaluate globals locally so index metadata can drive the reshaping.
    g = hl.eval(ds.globals)

    ############################
    # Derived top level fields #
    ############################
    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles),
                     xpos=x_position(ds.locus))
    # Other variants that came from the same pre-split multiallelic site,
    # excluding this variant's own ID.
    ds = ds.annotate(multiallelic_variants=variant_ids(
        ds.old_locus, ds.old_alleles).filter(lambda vid: vid != ds.variant_id))

    ###############
    # Frequencies #
    ###############
    freq_index_tree = get_freq_index_tree(g.freq_meta)
    ds = ds.annotate(freq=array_to_tree(ds.freq, freq_index_tree))

    ##############################
    # Filtering allele frequency #
    ##############################
    faf_index_tree = get_faf_index_tree(g.faf_index_dict)
    ds = ds.annotate(faf=array_to_tree(
        ds.faf, faf_index_tree, lambda faf: faf.select("faf95", "faf99")))

    ##############
    # Histograms #
    ##############
    # Convert lists of numbers in histograms into pipe delimited strings
    ds = ds.annotate(
        **{
            field: ds[field].annotate(
                bin_freq=hl.delimit(ds[field].bin_freq, "|"),
                bin_edges=hl.delimit(ds[field].bin_edges, "|"))
            for field in [
                "ab_hist_alt", "dp_hist_all", "dp_hist_alt", "gq_hist_all",
                "gq_hist_alt"
            ]
        })

    ###########################
    # Quality metrics / flags #
    ###########################
    # These fields are nested under `info`
    #
    # AS_VQSLOD
    # culprit
    # DP
    # FS
    # InbreedingCoeff
    # MQ
    # MQ_DP
    # MQRankSum
    # NEGATIVE_TRAIN_SITE
    # POSITIVE_TRAIN_SITE
    # QD
    # QUALapprox
    # RAW_MQ
    # ReadPosRankSum
    # SB
    # SOR
    # VarDP

    # Remove NaN values
    ds = ds.annotate(info=ds.info.annotate(FS=nullify_nan(ds.info.FS),
                                           InbreedingCoeff=nullify_nan(
                                               ds.info.InbreedingCoeff),
                                           MQ=nullify_nan(ds.info.MQ)))

    ###################
    # VEP annotations #
    ###################
    ds = ds.annotate(
        sorted_transcript_consequences=sorted_transcript_consequences_v3(
            ds.vep))
    ds = ds.drop("vep")

    ################
    # Other fields #
    ################
    # These fields are left unaltered at the top level
    #
    # decoy
    # filters
    # info
    # lcr
    # nonpar
    # popmax
    # qual
    # rsid

    # Drop fields created by splitting multi-allelic variants
    # This information is captured in the multiallelic_variants derived field
    ds = ds.drop("a_index", "old_locus", "old_alleles", "was_split")

    # Internal only
    # TODO: Remove line, this field won't be in the final table
    if "project_max" in ds.row_value.dtype.fields:
        ds = ds.drop("project_max")

    return ds