# Exemplo n.º 1
# 0
def format_clinvar_variants(ds):
    """Reshape a ClinVar Hail table into the browser's variant schema."""

    def _sorted_without_leading_underscore(values):
        # The sort key rewrites a leading "_" to "z" (regex "^_") so that
        # underscore-prefixed entries sort last; the prefix is then stripped.
        return hl.sorted(values, key=lambda value: value.replace("^_", "z")).map(
            lambda value: value.replace("^_", "")
        )

    # Variants with a single entry in alleles can never match a gnomAD variant; skip them.
    # TODO: They could still be displayed in the ClinVar track on their own.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    # When a cluster is started with hailctl dataproc start cluster_name --vep, the init
    # script for the selected VEP version links the appropriate configuration file to
    # /vep_data/vep-gcloud.json.
    ds = hl.vep(ds, "file:///vep_data/vep-gcloud.json", name="vep", block_size=1000)
    ds = ds.annotate(sorted_transcript_consequences=sorted_transcript_consequences_v3(ds.vep))
    ds = ds.drop("vep")

    ds = ds.select(
        clinical_significance=_sorted_without_leading_underscore(ds.info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=_sorted_without_leading_underscore(ds.info.CLNREVSTAT),
        sorted_transcript_consequences=ds.sorted_transcript_consequences,
    )

    ds = ds.annotate(
        chrom=normalized_contig(ds.locus),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )

    return ds
def format_coverage_table(ds):
    """Flatten a coverage table's locus into chrom/pos/xpos and rename depth fields."""
    # Depth thresholds present in the source table as over_N columns.
    depth_thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]

    ds = ds.select(
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        mean=ds.mean,
        median=ds.median,
        # Rename over_N -> overN (presumably the fraction of samples covered at
        # depth >= N -- TODO confirm against the coverage pipeline).
        **{f"over{threshold}": ds[f"over_{threshold}"] for threshold in depth_thresholds},
    )

    # chrom/pos/xpos now carry the position; unkey the table and drop the locus.
    ds = ds.key_by().drop("locus")

    return ds
# Exemplo n.º 3
# 0
def format_variants_table(ds):
    """Reshape a gnomAD sites Hail table into the browser's variant schema.

    Flattens the per-subset frequency / popmax / filtering-allele-frequency
    arrays into one named struct per subset, converts histograms to
    pipe-delimited strings, regroups site quality metrics, annotates VEP
    consequences with per-transcript flags, and drops unused and key fields.
    """

    ############################
    # Derived top level fields #
    ############################

    # Add flat chrom/pos/ref/alt copies of the key fields for the browser;
    # the locus/alleles keys themselves are dropped at the end.
    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ###############
    # Frequencies #
    ###############

    # Evaluate globals locally so the index dictionaries can drive Python-side loops.
    g = hl.eval(ds.globals)

    freq_index_tree = get_freq_index_tree(g.freq_index_dict)

    subsets = list(freq_index_tree.keys())

    # One top-level struct per subset holding adj, raw, and popmax values.
    ds = ds.annotate(
        **{
            subset: hl.struct(
                # Adjusted frequencies
                AC_adj=freq_expression(ds, "AC", freq_index_tree[subset]),
                AN_adj=freq_expression(ds, "AN", freq_index_tree[subset]),
                AF_adj=freq_expression(ds, "AF", freq_index_tree[subset]),
                nhomalt_adj=freq_expression(ds, "homozygote_count", freq_index_tree[subset]),
                # Raw frequencies, looked up via the "<subset>_raw" entry of freq_index_dict
                AC_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AC,
                AN_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AN,
                AF_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AF,
                nhomalt_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].homozygote_count,
                # Popmax
                popmax=ds.popmax[g.popmax_index_dict[subset]].pop,
                AC_popmax=ds.popmax[g.popmax_index_dict[subset]].AC,
                AN_popmax=ds.popmax[g.popmax_index_dict[subset]].AN,
                AF_popmax=ds.popmax[g.popmax_index_dict[subset]].AF,
                nhomalt_popmax=ds.popmax[g.popmax_index_dict[subset]].homozygote_count,
            )
            for subset in subsets
        }
    )

    ##############################
    # Filtering allele frequency #
    ##############################

    # Build {subset: {pop or "total": faf array index}} from the flat label -> index dict.
    faf_index_tree = collections.defaultdict(dict)
    for labels_combo, index in g.faf_index_dict.items():
        labels = labels_combo.split("_")
        # Subset labels contain an _, so rebuild those after splitting them
        # (e.g. "non_cancer_afr" -> ["non_cancer", "afr"]).
        if labels[0] == "non":
            labels = ["_".join(labels[0:2])] + labels[2:]

        if len(labels) == 2:
            [subset, pop] = labels
            faf_index_tree[subset][pop] = index
        else:
            assert len(labels) == 1
            subset = labels[0]
            # A bare subset label is the subset-wide value.
            faf_index_tree[subset]["total"] = index

    # Attach faf95/faf99 structs (one field per population) to each subset struct.
    ds = ds.annotate(
        **{
            subset: ds[subset].annotate(
                faf95_adj=hl.struct(**{pop: ds.faf[index].faf95 for pop, index in faf_index_tree[subset].items()}),
                faf99_adj=hl.struct(**{pop: ds.faf[index].faf99 for pop, index in faf_index_tree[subset].items()}),
            )
            for subset in subsets
        }
    )

    # The raw arrays have been fully unpacked into the subset structs above.
    ds = ds.drop("freq", "popmax", "faf")

    ##############
    # Histograms #
    ##############

    # Extract overall age distribution
    ds = ds.transmute(
        gnomad_age_hist_het=ds.age_hist_het[g.age_index_dict["gnomad"]],
        gnomad_age_hist_hom=ds.age_hist_hom[g.age_index_dict["gnomad"]],
    )

    # Convert lists of numbers in histograms into pipe delimited strings
    ds = ds.annotate(
        **{
            field: ds[field].annotate(
                bin_freq=hl.delimit(ds[field].bin_freq, "|"), bin_edges=hl.delimit(ds[field].bin_edges, "|")
            )
            for field in [
                "ab_hist_alt",
                "dp_hist_all",
                "dp_hist_alt",
                "gq_hist_all",
                "gq_hist_alt",
                "gnomad_age_hist_het",
                "gnomad_age_hist_hom",
            ]
        }
    )

    ###########################
    # Quality metrics / flags #
    ###########################

    # Use the same fields as the VCFs
    # Based https://github.com/macarthur-lab/gnomad_qc/blob/25a81bc2166fbe4ccbb2f7a87d36aba661150413/variant_qc/prepare_data_release.py#L128-L159
    # transmute drops the source fields (allele_info.*, info_*, rf fields) as it
    # renames them to the VCF-style names.
    ds = ds.transmute(
        BaseQRankSum=ds.allele_info.BaseQRankSum,
        ClippingRankSum=ds.allele_info.ClippingRankSum,
        DP=ds.allele_info.DP,
        FS=ds.info_FS,
        InbreedingCoeff=ds.info_InbreedingCoeff,
        MQ=ds.info_MQ,
        MQRankSum=ds.info_MQRankSum,
        QD=ds.info_QD,
        ReadPosRankSum=ds.info_ReadPosRankSum,
        rf_negative_label=ds.fail_hard_filters,
        rf_positive_label=ds.tp,
        rf_tp_probability=ds.rf_probability,
        SOR=ds.info_SOR,
        VQSLOD=ds.allele_info.VQSLOD,
        VQSR_culprit=ds.allele_info.culprit,
        VQSR_NEGATIVE_TRAIN_SITE=ds.info_NEGATIVE_TRAIN_SITE,
        VQSR_POSITIVE_TRAIN_SITE=ds.info_POSITIVE_TRAIN_SITE,
    )

    # These fields are left unaltered at the top level
    #
    # allele_type
    # decoy
    # has_star
    # lcr
    # n_alt_alleles
    # nonpar
    # pab_max
    # rf_label
    # rf_train
    # segdup
    # transmitted_singleton
    # variant_type
    # was_mixed

    # TODO: Remove this, leave these at top level
    # Regroup the just-renamed metrics under an allele_info struct to match the
    # schema the browser currently expects.
    ds = ds.transmute(
        allele_info=hl.struct(
            BaseQRankSum=ds.BaseQRankSum,
            ClippingRankSum=ds.ClippingRankSum,
            DP=ds.DP,
            FS=ds.FS,
            InbreedingCoeff=ds.InbreedingCoeff,
            MQ=ds.MQ,
            MQRankSum=ds.MQRankSum,
            QD=ds.QD,
            ReadPosRankSum=ds.ReadPosRankSum,
            SOR=ds.SOR,
            VQSLOD=ds.VQSLOD,
            VQSR_culprit=ds.VQSR_culprit,
            VQSR_NEGATIVE_TRAIN_SITE=ds.VQSR_NEGATIVE_TRAIN_SITE,
            VQSR_POSITIVE_TRAIN_SITE=ds.VQSR_POSITIVE_TRAIN_SITE,
        )
    )

    ###################
    # VEP annotations #
    ###################

    ds = ds.annotate(sortedTranscriptConsequences=sorted_transcript_consequences_v2(ds.vep))

    ds = ds.drop("vep")

    #########
    # Flags #
    #########

    # TODO: Leave these at the top level
    ds = ds.transmute(flags=hl.struct(lcr=ds.lcr, segdup=ds.segdup))

    # TODO: Remove this, these flags are calculated on the fly
    # hl.bind evaluates the per-gene flag sets once and shares them across all
    # consequences of the variant.
    ds = ds.annotate(
        flags=ds.flags.annotate(
            lc_lof=get_expr_for_variant_lc_lof_flag(ds.sortedTranscriptConsequences),
            lof_flag=get_expr_for_variant_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
        sortedTranscriptConsequences=hl.bind(
            lambda genes_with_lc_lof_flag, genes_with_loftee_flag_flag: ds.sortedTranscriptConsequences.map(
                lambda csq: csq.annotate(
                    flags=hl.struct(
                        lc_lof=get_expr_for_consequence_lc_lof_flag(csq),
                        lc_lof_in_gene=genes_with_lc_lof_flag.contains(csq.gene_id),
                        lof_flag=get_expr_for_consequence_loftee_flag_flag(csq),
                        lof_flag_in_gene=genes_with_loftee_flag_flag.contains(csq.gene_id),
                        # LoF consequence with an empty LOFTEE annotation --
                        # presumably a non-coding transcript; confirm against
                        # sorted_transcript_consequences_v2.
                        nc_transcript=(csq.category == "lof") & (csq.lof == ""),
                    )
                )
            ),
            get_expr_for_genes_with_lc_lof_flag(ds.sortedTranscriptConsequences),
            get_expr_for_genes_with_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
    )

    #################
    # Unused fields #
    #################

    # These fields were not in the 2.1.1 browser Hail table

    ds = ds.drop(
        "adj_biallelic_rank",
        "adj_biallelic_singleton_rank",
        "adj_rank",
        "adj_singleton_rank",
        "biallelic_rank",
        "biallelic_singleton_rank",
        "info_DP",
        "mills",
        "n_nonref",
        "omni",
        "qd",
        "rank",
        "score",
        "singleton_rank",
        "singleton",
        "was_split",
    )

    # These two fields appear only in the genomes table
    if "_score" in ds.row_value.dtype.fields:
        ds = ds.drop("_score", "_singleton")

    ########
    # Keys #
    ########

    # Drop key fields
    ds = ds.key_by().drop("locus", "alleles")

    return ds
def import_mnv_file(path, **kwargs):
    """Import a multi-nucleotide variant (MNV) TSV into a Hail table keyed by MNV.

    Parses the flat per-consequence file at ``path``, reshapes constituent-SNV
    columns into structs, and collapses the table to one row per MNV with all
    consequences collected into a severity-sorted array. Extra keyword
    arguments are forwarded to ``hl.import_table``.
    """
    # Explicit column types; the AN_* columns are floats in the source file and
    # are converted to int below.
    column_types = {
        "AC_mnv_ex": hl.tint,
        "AC_mnv_gen": hl.tint,
        "AC_mnv": hl.tint,
        "AC_snp1_ex": hl.tint,
        "AC_snp1_gen": hl.tint,
        "AC_snp1": hl.tint,
        "AC_snp2_ex": hl.tint,
        "AC_snp2_gen": hl.tint,
        "AC_snp2": hl.tint,
        "AN_snp1_ex": hl.tfloat,
        "AN_snp1_gen": hl.tfloat,
        "AN_snp2_ex": hl.tfloat,
        "AN_snp2_gen": hl.tfloat,
        "categ": hl.tstr,
        "filter_snp1_ex": hl.tarray(hl.tstr),
        "filter_snp1_gen": hl.tarray(hl.tstr),
        "filter_snp2_ex": hl.tarray(hl.tstr),
        "filter_snp2_gen": hl.tarray(hl.tstr),
        "gene_id": hl.tstr,
        "gene_name": hl.tstr,
        "locus.contig": hl.tstr,
        "locus.position": hl.tint,
        "mnv_amino_acids": hl.tstr,
        "mnv_codons": hl.tstr,
        "mnv_consequence": hl.tstr,
        "mnv_lof": hl.tstr,
        "mnv": hl.tstr,
        "n_homhom_ex": hl.tint,
        "n_homhom_gen": hl.tint,
        "n_homhom": hl.tint,
        "n_indv_ex": hl.tint,
        "n_indv_gen": hl.tint,
        "n_indv": hl.tint,
        "snp1_amino_acids": hl.tstr,
        "snp1_codons": hl.tstr,
        "snp1_consequence": hl.tstr,
        "snp1_lof": hl.tstr,
        "snp1": hl.tstr,
        "snp2_amino_acids": hl.tstr,
        "snp2_codons": hl.tstr,
        "snp2_consequence": hl.tstr,
        "snp2_lof": hl.tstr,
        "snp2": hl.tstr,
        "transcript_id": hl.tstr,
    }

    ds = hl.import_table(path,
                         key="mnv",
                         missing="",
                         types=column_types,
                         **kwargs)

    # Rebuild a locus from the flattened "locus.contig" / "locus.position" columns.
    ds = ds.transmute(locus=hl.locus(ds["locus.contig"], ds["locus.position"]))

    ds = ds.transmute(
        contig=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
    )

    # MNV IDs have the shape "chrom-pos-ref-alt"; pull ref/alt out of the ID.
    ds = ds.annotate(ref=ds.mnv.split("-")[2],
                     alt=ds.mnv.split("-")[3],
                     variant_id=ds.mnv)

    # transmute drops the fields it references, so work on copies to keep
    # snp1/snp2 available for constituent_snv_ids and snv_consequences below.
    ds = ds.annotate(snp1_copy=ds.snp1, snp2_copy=ds.snp2)
    ds = ds.transmute(constituent_snvs=[
        hl.bind(
            # variant_id_parts is the SNV ID split on "-": [chrom, pos, ref, alt].
            lambda variant_id_parts: hl.struct(
                variant_id=ds[f"{snp}_copy"],
                chrom=variant_id_parts[0],
                pos=hl.int(variant_id_parts[1]),
                ref=variant_id_parts[2],
                alt=variant_id_parts[3],
                # Exome/genome sub-structs are present only where the SNV has
                # data in that dataset (AN defined).
                exome=hl.or_missing(
                    hl.is_defined(ds[f"AN_{snp}_ex"]),
                    hl.struct(
                        filters=ds[f"filter_{snp}_ex"],
                        ac=ds[f"AC_{snp}_ex"],
                        an=hl.int(ds[f"AN_{snp}_ex"]),
                    ),
                ),
                genome=hl.or_missing(
                    hl.is_defined(ds[f"AN_{snp}_gen"]),
                    hl.struct(
                        filters=ds[f"filter_{snp}_gen"],
                        ac=ds[f"AC_{snp}_gen"],
                        an=hl.int(ds[f"AN_{snp}_gen"]),
                    ),
                ),
            ),
            ds[f"{snp}_copy"].split("-"),
        ) for snp in ["snp1", "snp2"]
    ])

    ds = ds.annotate(constituent_snv_ids=[ds.snp1, ds.snp2])

    # An MNV is considered present in a dataset only if both constituent SNVs are.
    ds = ds.annotate(
        mnv_in_exome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.exome)),
        mnv_in_genome=ds.constituent_snvs.all(
            lambda s: hl.is_defined(s.genome)),
    )

    # Rename overall counts and nest the per-dataset counts under exome/genome.
    ds = ds.transmute(
        n_individuals=ds.n_indv,
        ac=ds.AC_mnv,
        ac_hom=ds.n_homhom,
        exome=hl.or_missing(
            ds.mnv_in_exome,
            hl.struct(n_individuals=ds.n_indv_ex,
                      ac=ds.AC_mnv_ex,
                      ac_hom=ds.n_homhom_ex),
        ),
        genome=hl.or_missing(
            ds.mnv_in_genome,
            hl.struct(n_individuals=ds.n_indv_gen,
                      ac=ds.AC_mnv_gen,
                      ac_hom=ds.n_homhom_gen),
        ),
    )

    # Combined per-SNV allele counts are not used in the output schema.
    ds = ds.drop("AC_snp1", "AC_snp2")

    # One consequence struct per input row (the file has one row per
    # MNV/consequence pair).
    ds = ds.transmute(consequence=hl.struct(
        category=ds.categ,
        gene_id=ds.gene_id,
        gene_name=ds.gene_name,
        transcript_id=ds.transcript_id,
        consequence=ds.mnv_consequence,
        codons=ds.mnv_codons,
        amino_acids=ds.mnv_amino_acids,
        lof=ds.mnv_lof,
        snv_consequences=[
            hl.struct(
                variant_id=ds[f"{snp}"],
                amino_acids=ds[f"{snp}_amino_acids"],
                codons=ds[f"{snp}_codons"],
                consequence=ds[f"{snp}_consequence"],
                lof=ds[f"{snp}_lof"],
            ) for snp in ["snp1", "snp2"]
        ],
    ))

    # Collapse table to one row per MNV, with all consequences for the MNV collected into an array
    consequences = ds.group_by(
        ds.mnv).aggregate(consequences=hl.agg.collect(ds.consequence))
    # After dropping the per-row consequence, distinct() keeps one row per key
    # (mnv); the collected consequences are then joined back on.
    ds = ds.drop("consequence")
    ds = ds.distinct()
    ds = ds.join(consequences)

    # Sort consequences by severity
    ds = ds.annotate(consequences=hl.sorted(
        ds.consequences,
        key=lambda c: consequence_term_rank(c.consequence),
    ))

    # IDs of constituent SNVs whose amino acid change differs from the MNV's in
    # at least one consequence (case-insensitive comparison).
    ds = ds.annotate(changes_amino_acids_for_snvs=hl.literal([0, 1]).filter(
        lambda idx: ds.consequences.any(lambda csq: csq.snv_consequences[
            idx].amino_acids.lower() != csq.amino_acids.lower())).map(
                lambda idx: ds.constituent_snv_ids[idx]))

    return ds