# Exemplo n.º 1
# 0
def prepare_coverage(coverage_path):
    """Load a gnomAD coverage Hail table and normalize it to the browser schema.

    Adds an ``xpos`` field, aliases ``median_approx`` to ``median`` for tables
    that lack a ``median`` field (v3.0.1), and keeps only the fields shared
    across gnomAD versions.

    Args:
        coverage_path: Path to a coverage Hail table readable by hl.read_table.

    Returns:
        Hail table with xpos, mean, median, and over_N coverage fields.
    """
    ds = hl.read_table(coverage_path)

    ds = ds.annotate(xpos=x_position(ds.locus))

    # v3.0.1 exposes median_approx instead of median; alias it so downstream
    # code can always read `median`.
    if "median" not in ds.row.dtype.fields:
        ds = ds.annotate(median=ds.median_approx)

    # Keep only the fields common to v2 and v3 (drops extra v3 fields).
    depth_thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]
    keep_fields = ["xpos", "mean", "median"] + [f"over_{t}" for t in depth_thresholds]
    ds = ds.select(*keep_fields)

    return ds
# Exemplo n.º 2
# 0
def prepare_mitochondrial_coverage(coverage_path):
    """Load a mitochondrial coverage Hail table and keep only browser fields.

    Args:
        coverage_path: Path to a coverage Hail table readable by hl.read_table.

    Returns:
        Hail table with xpos, mean, median, over_100, and over_1000 fields.
    """
    ds = hl.read_table(coverage_path)
    ds = ds.annotate(xpos=x_position(ds.locus))
    return ds.select("xpos", "mean", "median", "over_100", "over_1000")
def prepare_gnomad_v2_variants(exome_variants_path, genome_variants_path):
    """Combine gnomAD v2 exome and genome variant tables into one browser table.

    Outer-joins the two tables, merges the annotation fields they share,
    adds derived identifier fields (variant_id, xpos, chrom, ...), computes
    region flags, and annotates each variant with the IDs of other variants
    at the same locus ("colocated variants") per subset.

    Args:
        exome_variants_path: Path to the exome variants table.
        genome_variants_path: Path to the genome variants table.

    Returns:
        Hail table keyed by locus/alleles with merged exome/genome data.
    """
    exome_variants = prepare_gnomad_v2_variants_helper(exome_variants_path, "exome")
    genome_variants = prepare_gnomad_v2_variants_helper(genome_variants_path, "genome")

    # Fields that appear (with the same meaning) in both input tables.
    shared_fields = [
        "lcr",
        "nonpar",
        "rsid",
        "segdup",
        "vep",
    ]

    # Outer join keeps variants present in only one of the two datasets.
    variants = exome_variants.join(genome_variants, "outer")

    # For each shared field, prefer the exome value, falling back to genome.
    variants = variants.annotate(
        **{field: hl.or_else(variants.exome[field], variants.genome[field]) for field in shared_fields}
    )

    # The shared fields now live at the top level; drop the nested copies.
    variants = variants.annotate(exome=variants.exome.drop(*shared_fields), genome=variants.genome.drop(*shared_fields))

    # Derived identifier fields used throughout the browser.
    variants = variants.annotate(
        variant_id=variant_id(variants.locus, variants.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(variants.locus.contig),
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
        ref=variants.alleles[0],
        alt=variants.alleles[1],
    )

    # Browser schema expects a set of rsIDs; wrap the single rsid when defined.
    variants = variants.transmute(rsids=hl.or_missing(hl.is_defined(variants.rsid), hl.set([variants.rsid])))

    # Variant is in a subset if it is in the subset in either exome or genome samples
    variants = variants.annotate(subsets=variants.exome.subsets.union(variants.genome.subsets))

    # Flags
    # "par" flags X/Y variants where nonpar is False — presumably variants in
    # the pseudoautosomal regions; confirm against the input tables' nonpar
    # semantics. hl.or_missing yields missing entries that filter() removes.
    variants = variants.annotate(
        flags=hl.set(
            [
                hl.or_missing(variants.lcr, "lcr"),
                hl.or_missing(((variants.chrom == "X") | (variants.chrom == "Y")) & ~variants.nonpar, "par"),
            ]
        ).filter(hl.is_defined)
    )

    # Colocated variants
    # Cache before the self-join below so the table is not recomputed.
    variants = variants.cache()
    # NOTE(review): genome freq presumably has no non_cancer field (the struct
    # spread below would otherwise raise a duplicate-field error); the full
    # gnomAD genome AC is used for non_cancer instead — confirm upstream.
    variants_by_locus = variants.select(
        variants.variant_id,
        exome_ac_raw=hl.struct(**{f: variants.exome.freq[f].ac_raw for f in variants.exome.freq.dtype.fields}),
        genome_ac_raw=hl.struct(
            non_cancer=variants.genome.freq.gnomad.ac_raw,
            **{f: variants.genome.freq[f].ac_raw for f in variants.genome.freq.dtype.fields},
        ),
    )
    # Collect all variants sharing a locus into one row.
    variants_by_locus = variants_by_locus.group_by("locus").aggregate(
        variants=hl.agg.collect(variants_by_locus.row_value)
    )

    def subset_filter(subset):
        # A variant belongs to a subset if its raw AC is nonzero in either
        # the exome or the genome samples of that subset.
        return lambda variant: (variant.exome_ac_raw[subset] > 0) | (variant.genome_ac_raw[subset] > 0)

    variants_by_locus = variants_by_locus.annotate(
        variant_ids=hl.struct(
            **{
                subset: variants_by_locus.variants.filter(subset_filter(subset)).map(lambda variant: variant.variant_id)
                for subset in ["gnomad", "controls", "non_cancer", "non_neuro", "non_topmed"]
            }
        )
    )

    variants = variants.annotate(colocated_variants=variants_by_locus[variants.locus].variant_ids)
    # Exclude the variant itself from its own colocated-variants lists.
    variants = variants.annotate(
        colocated_variants=hl.struct(
            **{
                subset: variants.colocated_variants[subset].filter(lambda variant_id: variant_id != variants.variant_id)
                for subset in ["gnomad", "controls", "non_cancer", "non_neuro", "non_topmed"]
            }
        )
    )

    return variants
# Exemplo n.º 4
# 0
def import_exac_coverage():
    """Import per-base ExAC coverage from the legacy browser coverage files.

    Reads the per-chromosome coverage text files, renames columns to the
    browser's ``over_N`` convention, keys the table by GRCh37 locus, and adds
    an ``xpos`` field.

    Returns:
        Hail table keyed by locus with mean/median depth and over_N fractions.
    """
    base_url = "gs://gnomad-public/legacy/exac_browser/coverage"
    contigs = [str(n) for n in range(1, 23)] + ["X", "Y"]
    # sorted() reproduces the lexicographic path order of the previous
    # hard-coded list (chr1, chr10, ..., chr19, chr2, chr20, ...).
    paths = sorted(f"{base_url}/Panel.chr{contig}.coverage.txt.gz" for contig in contigs)

    # Depth thresholds whose columns hold the fraction of samples covered at
    # least that deep; column names in the source files are the bare numbers.
    depth_thresholds = ["1", "5", "10", "15", "20", "25", "30", "50", "100"]

    column_types = {
        "#chrom": hl.tstr,
        "pos": hl.tint,
        "mean": hl.tfloat,
        "median": hl.tfloat,
        **{threshold: hl.tfloat for threshold in depth_thresholds},
    }

    # force_bgz: treat the .gz files as block-gzipped so Hail can read them.
    ds = hl.import_table(paths, types=column_types, force_bgz=True)
    ds = ds.rename({
        "#chrom": "chrom",
        **{threshold: f"over_{threshold}" for threshold in depth_thresholds},
    })

    ds = ds.transmute(
        locus=hl.locus(ds.chrom, ds.pos, reference_genome="GRCh37"))

    ds = ds.key_by(ds.locus)

    ds = ds.annotate(xpos=x_position(ds.locus))

    ds = ds.repartition(1000, shuffle=True)

    return ds
# Exemplo n.º 5
# 0
def prepare_clinvar_variants(vcf_path, reference_genome):
    """Import a ClinVar VCF, run VEP, and shape rows for the browser.

    Args:
        vcf_path: Path to the ClinVar VCF.
        reference_genome: Reference genome name passed to the importer.

    Returns:
        Hail table with clinical significance, review status, gold stars,
        VEP annotations, and derived identifier fields.
    """
    ds = import_clinvar_vcf(vcf_path, reference_genome)

    # There are some variants with only one entry in alleles, ignore them for now.
    # These could be displayed in the ClinVar track even though they will never match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    ds = hl.vep(ds)

    # Hail's StringExpression.replace is regex-based, so "^_" matches a
    # leading underscore. Sorting with "_" mapped to "z" pushes
    # underscore-prefixed values last; the underscore is then stripped.
    ds = ds.select(
        clinical_significance=hl.sorted(ds.info.CLNSIG, key=lambda s: s.replace("^_", "z")).map(
            lambda s: s.replace("^_", "")
        ),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=hl.sorted(ds.info.CLNREVSTAT, key=lambda s: s.replace("^_", "z")).map(
            lambda s: s.replace("^_", "")
        ),
        vep=ds.vep,
    )

    # Derived identifier fields used throughout the browser.
    ds = ds.annotate(
        chrom=normalized_contig(ds.locus.contig), variant_id=variant_id(ds.locus, ds.alleles), xpos=x_position(ds.locus)
    )

    return ds
# Exemplo n.º 6
# 0
def import_exac_vcf(path):
    """Import the ExAC sites VCF and reshape it into the browser variant schema.

    Splits multi-allelic sites, selects the per-allele INFO values for each
    split variant, cleans string-encoded values ("", "NA") into nulls,
    converts string fields to int/float, parses VEP CSQ annotations into
    structs mimicking hail.vep output, and builds the exome frequency and
    quality-metric structs expected by the browser.

    Args:
        path: Path to the ExAC sites VCF (bgzipped).

    Returns:
        Hail table keyed by locus/alleles; `genome` is always null since
        ExAC contains exomes only.
    """
    ds = hl.import_vcf(path, force_bgz=True, skip_invalid_loci=True).rows()

    ds = hl.split_multi(ds)

    ds = ds.repartition(5000, shuffle=True)

    # Get value corresponding to the split variant
    # Per-allele INFO arrays are indexed by alt allele; a_index is 1-based.
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.or_missing(hl.is_defined(ds.info[field]), ds.info[field][ds.a_index - 1])
                for field in PER_ALLELE_FIELDS
            }
        )
    )

    # For DP_HIST and GQ_HIST, the first value in the array contains the histogram for all individuals,
    # which is the same in each alt allele's variant.
    ds = ds.annotate(
        info=ds.info.annotate(
            DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
            GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
        )
    )

    ds = ds.cache()

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(
            info=ds.info.annotate(
                **{
                    field: hl.or_missing(
                        hl.is_defined(ds.info[field]),
                        hl.if_else(
                            (hl.str(ds.info[field]) == "") | (hl.str(ds.info[field]) == "NA"),
                            hl.null(ds.info[field].dtype),
                            ds.info[field],
                        ),
                    )
                    for field in SELECT_INFO_FIELDS[i : i + 10]
                }
            )
        )

    # Convert field types
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tint), hl.int(ds.info[field]))
                for field in CONVERT_TO_INT_FIELDS
            }
        )
    )
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tfloat), hl.float(ds.info[field]))
                for field in CONVERT_TO_FLOAT_FIELDS
            }
        )
    )

    # Format VEP annotations to mimic the output of hail.vep
    # Undo VCF percent-encoding of reserved characters in CSQ strings.
    ds = ds.annotate(
        info=ds.info.annotate(
            CSQ=ds.info.CSQ.map(
                lambda s: s.replace("%3A", ":")
                .replace("%3B", ";")
                .replace("%3D", "=")
                .replace("%25", "%")
                .replace("%2C", ",")
            )
        )
    )
    # Split each pipe-delimited CSQ string into a struct keyed by VEP_FIELDS,
    # keeping only Ensembl transcripts ("ENST") for this variant's alt allele.
    ds = ds.annotate(
        vep=hl.struct(
            transcript_consequences=ds.info.CSQ.map(
                lambda csq_str: hl.bind(
                    lambda csq_values: hl.struct(
                        **{
                            field: hl.if_else(csq_values[index] == "", hl.null(hl.tstr), csq_values[index])
                            for index, field in enumerate(VEP_FIELDS)
                        }
                    ),
                    csq_str.split(r"\|"),
                )
            )
            .filter(lambda annotation: annotation.Feature.startswith("ENST"))
            .filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index)
            .map(
                lambda annotation: annotation.select(
                    amino_acids=annotation.Amino_acids,
                    biotype=annotation.BIOTYPE,
                    canonical=annotation.CANONICAL == "YES",
                    # cDNA_position may contain either "start-end" or, when start == end, "start"
                    cdna_start=split_position_start(annotation.cDNA_position),
                    cdna_end=split_position_end(annotation.cDNA_position),
                    codons=annotation.Codons,
                    consequence_terms=annotation.Consequence.split("&"),
                    distance=hl.int(annotation.DISTANCE),
                    # DOMAINS is "&"-separated "db:name" pairs.
                    domains=hl.or_missing(
                        hl.is_defined(annotation.DOMAINS),
                        annotation.DOMAINS.split("&").map(
                            lambda d: hl.struct(db=d.split(":")[0], name=d.split(":")[1])
                        ),
                    ),
                    exon=annotation.EXON,
                    gene_id=annotation.Gene,
                    gene_symbol=annotation.SYMBOL,
                    gene_symbol_source=annotation.SYMBOL_SOURCE,
                    hgnc_id=annotation.HGNC_ID,
                    hgvsc=annotation.HGVSc,
                    hgvsp=annotation.HGVSp,
                    lof=annotation.LoF,
                    lof_filter=annotation.LoF_filter,
                    lof_flags=annotation.LoF_flags,
                    lof_info=annotation.LoF_info,
                    # PolyPhen field contains "polyphen_prediction(polyphen_score)"
                    polyphen_prediction=hl.or_missing(
                        hl.is_defined(annotation.PolyPhen), annotation.PolyPhen.split(r"\(")[0]
                    ),
                    protein_id=annotation.ENSP,
                    # Protein_position may contain either "start-end" or, when start == end, "start"
                    protein_start=split_position_start(annotation.Protein_position),
                    protein_end=split_position_end(annotation.Protein_position),
                    # SIFT field contains "sift_prediction(sift_score)"
                    sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT), annotation.SIFT.split(r"\(")[0]),
                    transcript_id=annotation.Feature,
                )
            )
        )
    )

    # Pick the single most severe consequence term across all transcripts
    # (missing when there are no transcript consequences).
    ds = ds.annotate(
        vep=ds.vep.annotate(
            most_severe_consequence=hl.bind(
                lambda all_consequence_terms: hl.or_missing(
                    all_consequence_terms.size() != 0, hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]
                ),
                ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
            )
        )
    )

    ds = ds.cache()

    # Shared bin edges (0, 5, ..., 100) for the depth/quality histograms below.
    QUALITY_METRIC_HISTOGRAM_BIN_EDGES = [i * 5 for i in range(21)]

    # Assemble the final browser schema.
    ds = ds.select(
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        exome=hl.struct(
            ac=ds.info.AC_Adj,
            an=ds.info.AN_Adj,
            homozygote_count=ds.info.AC_Hom,
            hemizygote_count=hl.or_else(ds.info.AC_Hemi, 0),
            # Variants with zero adjusted AC get an extra "AC0" filter.
            filters=hl.set(hl.if_else(ds.info.AC_Adj == 0, ds.filters.add("AC0"), ds.filters)),
            populations=[
                hl.struct(
                    id=pop_id,
                    ac=ds.info[f"AC_{pop_id}"],
                    an=ds.info[f"AN_{pop_id}"],
                    hemizygote_count=hl.or_else(ds.info[f"Hemi_{pop_id}"], 0),
                    homozygote_count=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            ],
            # Age histograms are pipe-delimited: bins[0] is n_smaller,
            # bins[1:11] are the 10 bin frequencies, bins[11] is n_larger.
            age_distribution=hl.struct(
                het=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HET, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
                hom=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HOM, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
            ),
            quality_metrics=hl.struct(
                genotype_depth=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                genotype_quality=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                site_quality_metrics=[
                    hl.struct(metric="BaseQRankSum", value=hl.float(ds.info.BaseQRankSum)),
                    hl.struct(metric="ClippingRankSum", value=hl.float(ds.info.ClippingRankSum)),
                    hl.struct(metric="DP", value=hl.float(ds.info.DP)),
                    hl.struct(metric="FS", value=hl.float(ds.info.FS)),
                    hl.struct(metric="InbreedingCoeff", value=hl.float(ds.info.InbreedingCoeff)),
                    hl.struct(metric="MQ", value=hl.float(ds.info.MQ)),
                    hl.struct(metric="MQRankSum", value=hl.float(ds.info.MQRankSum)),
                    hl.struct(metric="QD", value=hl.float(ds.info.QD)),
                    hl.struct(metric="ReadPosRankSum", value=hl.float(ds.info.ReadPosRankSum)),
                    hl.struct(metric="SiteQuality", value=hl.float(ds.qual)),
                    hl.struct(metric="VQSLOD", value=hl.float(ds.info.VQSLOD)),
                ],
            ),
        ),
        # Other alt alleles from the same original multi-allelic site,
        # excluding this variant itself.
        colocated_variants=hl.rbind(
            variant_id(ds.locus, ds.alleles),
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).filter(
                lambda v_id: v_id != this_variant_id
            ),
        ),
        vep=ds.vep,
    )

    # ExAC has no genome samples; keep a null genome struct with the same
    # dtype as exome so the schema matches other datasets.
    ds = ds.annotate(genome=hl.null(ds.exome.dtype))

    return ds
# Exemplo n.º 7
# 0
def import_mnv_file(path, **kwargs):
    """Import a gnomAD multi-nucleotide variant (MNV) file into a Hail table.

    Parses the tab-delimited MNV data, builds structs describing the two
    constituent SNVs, collapses the table to one row per MNV with all
    consequences collected and sorted by severity, and records which
    constituent SNVs have their amino acid change altered by the MNV.

    Args:
        path: Path to the MNV file (passed to hl.import_table).
        **kwargs: Additional keyword arguments forwarded to hl.import_table.

    Returns:
        Hail table keyed by variant_id with one row per MNV.
    """
    # Explicit column types for hl.import_table.
    column_types = {
        "AC_mnv_ex": hl.tint,
        "AC_mnv_gen": hl.tint,
        "AC_mnv": hl.tint,
        "AC_snp1_ex": hl.tint,
        "AC_snp1_gen": hl.tint,
        "AC_snp1": hl.tint,
        "AC_snp2_ex": hl.tint,
        "AC_snp2_gen": hl.tint,
        "AC_snp2": hl.tint,
        "AN_snp1_ex": hl.tfloat,
        "AN_snp1_gen": hl.tfloat,
        "AN_snp2_ex": hl.tfloat,
        "AN_snp2_gen": hl.tfloat,
        "categ": hl.tstr,
        "filter_snp1_ex": hl.tarray(hl.tstr),
        "filter_snp1_gen": hl.tarray(hl.tstr),
        "filter_snp2_ex": hl.tarray(hl.tstr),
        "filter_snp2_gen": hl.tarray(hl.tstr),
        "gene_id": hl.tstr,
        "gene_name": hl.tstr,
        "locus.contig": hl.tstr,
        "locus.position": hl.tint,
        "mnv_amino_acids": hl.tstr,
        "mnv_codons": hl.tstr,
        "mnv_consequence": hl.tstr,
        "mnv_lof": hl.tstr,
        "mnv": hl.tstr,
        "n_homhom_ex": hl.tint,
        "n_homhom_gen": hl.tint,
        "n_homhom": hl.tint,
        "n_indv_ex": hl.tint,
        "n_indv_gen": hl.tint,
        "n_indv": hl.tint,
        "snp1_amino_acids": hl.tstr,
        "snp1_codons": hl.tstr,
        "snp1_consequence": hl.tstr,
        "snp1_lof": hl.tstr,
        "snp1": hl.tstr,
        "snp2_amino_acids": hl.tstr,
        "snp2_codons": hl.tstr,
        "snp2_consequence": hl.tstr,
        "snp2_lof": hl.tstr,
        "snp2": hl.tstr,
        "transcript_id": hl.tstr,
    }

    ds = hl.import_table(path, key="mnv", missing="", types=column_types, **kwargs)

    # The "mnv" column holds the MNV's "chrom-pos-ref-alt" variant ID.
    ds = ds.rename({"mnv": "variant_id"})

    # NOTE(review): hl.locus is called without reference_genome — it uses
    # Hail's default; confirm it matches the file's coordinate system.
    ds = ds.transmute(locus=hl.locus(ds["locus.contig"], ds["locus.position"]))

    ds = ds.transmute(chrom=normalized_contig(ds.locus.contig), pos=ds.locus.position, xpos=x_position(ds.locus),)

    # ref/alt are the 3rd and 4th components of the "chrom-pos-ref-alt" ID.
    ds = ds.annotate(ref=ds.variant_id.split("-")[2], alt=ds.variant_id.split("-")[3])

    # Copy snp1/snp2 so the originals survive the transmute below
    # (transmute drops the fields it references).
    ds = ds.annotate(snp1_copy=ds.snp1, snp2_copy=ds.snp2)
    ds = ds.transmute(
        constituent_snvs=[
            hl.bind(
                lambda variant_id_parts: hl.struct(
                    variant_id=ds[f"{snp}_copy"],
                    chrom=variant_id_parts[0],
                    pos=hl.int(variant_id_parts[1]),
                    ref=variant_id_parts[2],
                    alt=variant_id_parts[3],
                    # exome/genome sub-structs are present only when the SNV
                    # has data in that dataset (AN defined).
                    exome=hl.or_missing(
                        hl.is_defined(ds[f"AN_{snp}_ex"]),
                        hl.struct(
                            filters=ds[f"filter_{snp}_ex"], ac=ds[f"AC_{snp}_ex"], an=hl.int(ds[f"AN_{snp}_ex"]),
                        ),
                    ),
                    genome=hl.or_missing(
                        hl.is_defined(ds[f"AN_{snp}_gen"]),
                        hl.struct(
                            filters=ds[f"filter_{snp}_gen"], ac=ds[f"AC_{snp}_gen"], an=hl.int(ds[f"AN_{snp}_gen"]),
                        ),
                    ),
                ),
                ds[f"{snp}_copy"].split("-"),
            )
            for snp in ["snp1", "snp2"]
        ]
    )

    ds = ds.annotate(constituent_snv_ids=[ds.snp1, ds.snp2])

    # The MNV is considered present in a dataset only if both constituent
    # SNVs have data there.
    ds = ds.annotate(
        mnv_in_exome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.exome)),
        mnv_in_genome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.genome)),
    )

    ds = ds.transmute(
        n_individuals=ds.n_indv,
        ac=ds.AC_mnv,
        ac_hom=ds.n_homhom,
        exome=hl.or_missing(
            ds.mnv_in_exome, hl.struct(n_individuals=ds.n_indv_ex, ac=ds.AC_mnv_ex, ac_hom=ds.n_homhom_ex),
        ),
        genome=hl.or_missing(
            ds.mnv_in_genome, hl.struct(n_individuals=ds.n_indv_gen, ac=ds.AC_mnv_gen, ac_hom=ds.n_homhom_gen),
        ),
    )

    # Unused after the constituent-SNV structs were built above.
    ds = ds.drop("AC_snp1", "AC_snp2")

    # Gather per-transcript consequence data into a single struct per row.
    ds = ds.transmute(
        consequence=hl.struct(
            category=ds.categ,
            gene_id=ds.gene_id,
            gene_name=ds.gene_name,
            transcript_id=ds.transcript_id,
            consequence=ds.mnv_consequence,
            codons=ds.mnv_codons,
            amino_acids=ds.mnv_amino_acids,
            lof=ds.mnv_lof,
            snv_consequences=[
                hl.struct(
                    variant_id=ds[f"{snp}"],
                    amino_acids=ds[f"{snp}_amino_acids"],
                    codons=ds[f"{snp}_codons"],
                    consequence=ds[f"{snp}_consequence"],
                    lof=ds[f"{snp}_lof"],
                )
                for snp in ["snp1", "snp2"]
            ],
        )
    )

    # Collapse table to one row per MNV, with all consequences for the MNV collected into an array
    consequences = ds.group_by(ds.variant_id).aggregate(consequences=hl.agg.collect(ds.consequence))
    ds = ds.drop("consequence")
    ds = ds.distinct()
    ds = ds.join(consequences)

    # Sort consequences by severity
    ds = ds.annotate(consequences=hl.sorted(ds.consequences, key=lambda c: consequence_term_rank(c.consequence),))

    # For each constituent SNV (index 0/1), check whether any consequence's
    # SNV amino acid change differs from the MNV's (case-insensitive);
    # collect the variant IDs of those SNVs.
    ds = ds.annotate(
        changes_amino_acids_for_snvs=hl.literal([0, 1])
        .filter(
            lambda idx: ds.consequences.any(
                lambda csq: csq.snv_consequences[idx].amino_acids.lower() != csq.amino_acids.lower()
            )
        )
        .map(lambda idx: ds.constituent_snv_ids[idx])
    )

    return ds