Example #1
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, n_alt_alleles, allele_type, and was_mixed)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        "snv").when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    "ins").when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                "del").default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
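A minimal usage sketch with a toy unsplit table (hypothetical input; assumes Hail is initialized and `add_variant_type` from gnomad.utils is in scope):

# Build a tiny unsplit sites table and annotate allele data
raw_ht = hl.Table.parallelize(
    [hl.struct(locus=hl.locus('1', 10000), alleles=['A', 'T', '*'])],
    key=['locus', 'alleles'])
allele_ht = generate_allele_data(raw_ht)
allele_ht.show()  # one row per alt allele, each with an allele_data struct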
Example #2
def add_project_and_family_annotations(ht: hl.Table, seqr_projects: dict,
                                       family_ids: dict) -> hl.Table:
    """
    Add seqr project and family ID annotations to the kinship table.

    :param ht: Hail Table of kinship values
    :param seqr_projects: Dictionary of seqr projects for each sample
    :param family_ids: Dictionary of family ids for each sample
    :return: Table with seqr project and family id annotations added
    """
    # Add annotation for seqr projects of sample i and sample j
    hl_seqr_projects = hl.literal(seqr_projects)
    ht = ht.annotate(
        seqr_proj_i=hl_seqr_projects.get(ht.i),
        seqr_proj_j=hl_seqr_projects.get(ht.j),
    )

    # Add annotation for family ids of sample i and sample j
    hl_family_ids = hl.literal(family_ids)
    ht = ht.annotate(
        fam_id_i=hl_family_ids.get(ht.i),
        fam_id_j=hl_family_ids.get(ht.j),
    )

    return ht
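A minimal usage sketch with toy inputs (here `i` and `j` are assumed to be plain sample ID strings, though in some pipelines they are structs):

kin_ht = hl.Table.parallelize([hl.struct(i='s1', j='s2', pi_hat=0.5)])
kin_ht = add_project_and_family_annotations(
    kin_ht,
    seqr_projects={'s1': 'projA', 's2': 'projB'},
    family_ids={'s1': 'fam1', 's2': 'fam1'})
kin_ht.show()  # adds seqr_proj_i/j and fam_id_i/j columns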
Example #3
def get_known_populations(
        ht: hl.Table,
        pop: str) -> hl.Table:  # TODO: bring data into separate file and load here
    if pop == 'eur' or pop == 'nfe':
        known_pops = hl.literal({
            'ICR1000': 'nwe',  # gb, ICR
            'ICR142': 'nwe',  # gb, ICR
            'C1017': 'seu',  # it, ATVB
            'C1568': 'seu',  # es, Regicor
            'Bulgarian_Trios': 'bgr',  # Bulgarians
            'C533': 'bgr',  # Bulgarians
            'C821': 'bgr',  # Bulgarians
            'C952': 'bgr',  # Bulgarians
            'G89634': 'est',  # Estonians
            'G94980': 'est',  # Estonians
            # 'C1830': 'neu',  # gb, Leicester
            'C1972': 'nwe',  # gb, Tayside region of Scotland
            # 'G94051': 'seu',  # es, "United States and Spain"
            # 'C1708': 'deu',  # Germans?
            'C1508': 'swe',  # Swedes
            'C1509': 'swe',  # Swedes
        })
    elif pop == 'eas':
        known_pops = hl.literal({
            'C1397': 'oea',  # 'twn',  # Taiwanese trios
            'C1443': 'oea',  # 'twn',  # Taiwanese trios
            'C1506': 'oea',  # 'twn',  # Taiwanese trios
            'C1867': 'oea',  # 'twn',  # Taiwanese trios
            'C978': 'oea',  # 'twn',  # Taiwanese trios
            'C774': 'kor',  # Korean T2D project
            'C1982': 'kor',  # Korean
            'C1940': 'oea',  # 'sgp',  # Singapore
            'C1980': 'oea',  # 'hkg',  # Hong Kong
            '1kg_JPT': 'jpn'
        })
    elif pop == 'afr':
        known_pops = hl.literal({
            'C773': 't2d',  # African American T2D
            'C1002': 't2d',  # African American T2D
            'C1567': 'jhs',  # African American JHS
            'C1956': 'biome',  # African American BioMe
            # TODO: Add 1kg populations here
        })
    else:
        raise ValueError('pop must be one of eur, nfe, eas, afr')
    ht = ht.annotate(known_pop=known_pops.get(ht.meta.project_id))
    if pop == 'eur':
        finns = hl.import_table(
            'gs://gnomad/sample_qc/input_meta/source/99percent_finns_plus_AD_IBD_NFID.tsv.bgz',
            impute=True)
        finns = finns.filter(
            finns.percent_finnish > 0.99).key_by('sample_name_in_vcf')
        ht = ht.annotate(
            known_pop=hl.cond(hl.is_defined(finns[ht.s]), 'fin', ht.known_pop))
    return ht
Example #4
def add_release_annotations(ht: hl.Table) -> hl.Table:
    """

    :param Table ht: Table containing meta column annotations for the dataset
    :return: Table containing final 'high_quality' and 'release' sample status annotations
    :rtype: Table
    """
    ht = ht.annotate(high_quality=(hl.len(ht.hard_filters) == 0)
                     & (hl.len(ht.pop_platform_filters) == 0))
    return ht.annotate(release=ht.high_quality & (hl.len(ht.perm_filters) == 0)
                       & ~ht.related)
Example #5
def check_sex(
    sex_ht: hl.Table,
    output_dir: str,
    output_name: str,
) -> None:
    """
    Compare inferred to given sex and output file with column added for discrepancies.

    Output directory and name here are used to locate the functioning pedigree with given sexes.

    :param sex_ht: Table of inferred sexes for each sample
    :param output_dir: Path to directory to output results
    :param output_name: Output prefix to use for results
    :return: None
    """
    # Read in functioning pedigree with given sexes
    ped_ht = hl.import_table(
        f"{output_dir}/{output_name}_functioning_pedigree.ped")
    ped_ht = ped_ht.key_by(s=ped_ht.Individual_ID).select("Sex")

    ped_ht = ped_ht.annotate(
        given_sex=hl.case().when(ped_ht.Sex == "M", "male").when(
            ped_ht.Sex == "F", "female").default(ped_ht.Sex)).drop("Sex")

    sex_ht = sex_ht.join(ped_ht, how="outer")
    sex_ht = sex_ht.annotate(discrepant_sex=sex_ht.sex != sex_ht.given_sex)
    sex_ht.export(f"{output_dir}/{output_name}_sex_check.txt")
Example #6
def annotate_from_dict(ht: hl.Table, dict_field: str,
                       output_field: str) -> hl.Table:
    """
    Expand a dict field and add new fields.

    :param ht: Hail Table
    :param dict_field: The dict field to be expanded
    :param output_field: The output field name (annotated as a struct)
    :return: Annotated Hail Table
    """

    # retrieve dict keys from the first row (assumes all rows share the same keys)
    dict_keys = ht[dict_field].keys().take(1)[0]

    # structure annotation expression
    struct_expr = hl.struct(
        **{
            dict_keys[i]: ht[dict_field].get(dict_keys[i])
            for i in range(len(dict_keys))
        })

    ht = (ht.annotate(_tmp_field_=struct_expr))
    ht = ht.rename({'_tmp_field_': output_field})

    return ht
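A minimal usage sketch with a hypothetical dict-typed field:

ht = hl.Table.parallelize(
    [hl.struct(s='sample1', metrics=hl.dict({'dp': 30.0, 'gq': 99.0}))])
ht = annotate_from_dict(ht, 'metrics', 'metrics_struct')
# ht.metrics_struct is a struct with fields dp and gq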
Example #7
def apply_rf_model(ht: hl.Table,
                   rf_model: pyspark.ml.PipelineModel,
                   features: List[str],
                   label: str,
                   probability_col_name: str = 'rf_probability',
                   prediction_col_name: str = 'rf_prediction') -> hl.Table:
    """
    Applies a Random Forest (RF) pipeline model to a Table and annotates the RF probabilities and predictions.

    :param Table ht: Input HT
    :param PipelineModel rf_model: Random Forest pipeline model
    :param list of str features: List of feature columns in the pipeline. !Should match the model list of features!
    :param str label: Column containing the labels. !Should match the model labels!
    :param str probability_col_name: Name of the column that will store the RF probabilities
    :param str prediction_col_name: Name of the column that will store the RF predictions
    :return: Table with RF columns
    :rtype: Table
    """

    logger.info("Applying RF model.")

    check_ht_fields_for_spark(ht, features + [label])

    index_name = 'rf_idx'
    while index_name in ht.row:
        index_name += '_tmp'
    ht = ht.add_index(name=index_name)

    ht_keys = ht.key
    ht = ht.key_by(index_name)

    df = ht_to_rf_df(ht, features, label, index_name)

    rf_df = rf_model.transform(df)

    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)

    rf_ht = hl.Table.from_spark(
        rf_df.withColumn("probability", to_array(col("probability"))).select(
            [index_name, 'probability', 'predictedLabel'])).persist()

    rf_ht = rf_ht.key_by(index_name)

    ht = ht.annotate(
        **{
            probability_col_name: {
                label: rf_ht[ht[index_name]]["probability"][i]
                for i, label in enumerate(get_labels(rf_model))
            },
            prediction_col_name: rf_ht[ht[index_name]]["predictedLabel"]
        })

    ht = ht.key_by(*ht_keys)
    ht = ht.drop(index_name)

    return ht
Example #8
def create_quantile_bin_ht(ht: hl.Table,
                           model_id: str,
                           n_bins: int,
                           vqsr: bool = False,
                           overwrite: bool = False) -> hl.Table:
    """
    Creates a table with quantile bin annotations added for a RF run.

    :param ht: Input Table with RF training annotations (`tp`, `fp`, `rf_probability`)
    :param model_id: Which data/run hash is being created
    :param n_bins: Number of bins to bin the data into
    :param vqsr: Set True if `model_id` refers to a VQSR filtering model
    :param overwrite: Should output files be overwritten if present
    :return: Table with quantile bin annotations
    """
    logger.info(f"Annotating {model_id} HT with quantile bins using {n_bins}")

    ht = ht.annotate(
        positive_train_site=ht.tp,
        negative_train_site=ht.fp,
        score=ht.rf_probability["TP"],
    )

    #ht = ht.filter(ht.ac_raw > 0)

    bin_ht = create_binned_ht(ht, n_bins)
    return bin_ht
Example #9
def join_tables(ht: hl.Table, exomes: bool) -> hl.Table:
    '''
    Joins seqr variant table to gnomAD table. NOTE code was written assuming most recent gnomAD release is v3

    :param Table ht: Table with variants downloaded from seqr
    :param bool exomes: Whether to join with gnomAD exomes table or genomes table
    :return: seqr variants Table joined with gnomAD table
    :rtype: hl.Table
    '''
    if exomes:
        # read in exomes table
        gnomad_ht = hl.read_table(
            get_gnomad_liftover_data_path('exomes', version='2.1.1'))
        gnomad_ht = gnomad_ht.select('freq', 'popmax')
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht = gnomad_ht.transmute(
            gnomad_exomes_AC=gnomad_ht.freq[0].AC,
            gnomad_exomes_AN=gnomad_ht.freq[0].AN,
            gnomad_exomes_popmax_AF=gnomad_ht.popmax[0].AF,
            gnomad_exomes_popmax_pop=gnomad_ht.popmax[0].pop)
        gnomad_ht.describe()
    else:
        # read in genomes table
        gnomad_ht = hl.read_table(
            'gs://gnomad-public/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
        )
        gnomad_ht = gnomad_ht.select('freq')
        gnomad_ht = gnomad_ht.transmute(gnomad_genomes_AC=gnomad_ht.freq[0].AC,
                                        gnomad_genomes_AN=gnomad_ht.freq[0].AN)
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht.describe()

    ht = ht.annotate(**gnomad_ht[ht.key])
    ht.describe()
    return ht
Example #10
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Explodes the result of `get_duplicated_samples_ht`, so that each line contains a single sample.
    An additional annotation is added: `dup_filtered`, indicating which of the duplicated samples was kept.
    Requires a field `filtered` whose element type should be the same as the input duplicated samples Table key.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    def get_dups_to_keep_expr():
        if dups_ht.filtered.dtype.element_type == dups_ht.key.dtype:
            return (dups_ht.key, False)
        elif (len(dups_ht.key) == 1) & (dups_ht.filtered.dtype.element_type
                                        == dups_ht.key[0].dtype):
            return (dups_ht.key[0], False)
        else:
            raise TypeError(
                f"Cannot explode table as types of the filtered field ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are incompatible."
            )

    dups_ht = dups_ht.annotate(dups=hl.array([get_dups_to_keep_expr()]).extend(
        dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode("dups")
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by("s")
Example #11
def add_global_af(ht: hl.Table, temp: str) -> hl.Table:
    '''
    Adds gnomAD global AF annotation to Table

    :param Table ht: Input Table
    :param str temp: Path to temp bucket (to store intermediary files)
    :return: Table with gnomAD global AF annotation
    :rtype: Table
    '''
    # checkpoint table after completing both gnomAD exomes and gnomAD genomes join
    temp_path = f'{temp}/join.ht'
    ht = ht.checkpoint(temp_path)

    # set gnomAD ACs and ANs to 0 if they are missing after the join
    ht = ht.transmute(
        gnomad_exomes_AC=hl.if_else(hl.is_defined(ht.gnomad_exomes_AC),
                                    ht.gnomad_exomes_AC, 0),
        gnomad_genomes_AC=hl.if_else(hl.is_defined(ht.gnomad_genomes_AC),
                                     ht.gnomad_genomes_AC, 0),
        gnomad_exomes_AN=hl.if_else(hl.is_defined(ht.gnomad_exomes_AN),
                                    ht.gnomad_exomes_AN, 0),
        gnomad_genomes_AN=hl.if_else(hl.is_defined(ht.gnomad_genomes_AN),
                                     ht.gnomad_genomes_AN, 0),
    )

    ht = ht.annotate(gnomad_global_AF=(
        hl.if_else(((ht.gnomad_exomes_AN == 0)
                    & (ht.gnomad_genomes_AN == 0)), 0.0,
                   hl.float((ht.gnomad_exomes_AC + ht.gnomad_genomes_AC) /
                            (ht.gnomad_exomes_AN + ht.gnomad_genomes_AN)))))
    ht.describe()
    return ht
Example #12
def make_sample_rank_table(phe_ht: hl.Table) -> hl.Table:
    """
    Make table with rank of sample sorted by retention priority
    (lower rank has higher priority).
    It mainly uses two bits of information:
      - cases are prioritised over controls
      - samples are preferred based on the cohort info as follows: chd > ddd > ukbb

    :param phe_ht: Table with sample meta-data annotations (e.g. phenotype, cohort info...)
    :return: Hail Table
    """

    phe_ht = (
        phe_ht.annotate(
            case_control_rank=hl.int(
                phe_ht['phe.is_case']),  # 0: control, 1: cases
            cohort_rank=hl.case().when(phe_ht.is_ukbb, 10).when(
                phe_ht.is_ddd, 100).when(phe_ht.is_chd,
                                         1000).or_missing()).key_by())

    phe_ht = (phe_ht.select('ega_id', 'case_control_rank', 'cohort_rank'))

    # sort table (descending)
    tb_rank = (phe_ht.order_by(hl.desc(phe_ht.case_control_rank),
                               hl.desc(phe_ht.cohort_rank)))

    tb_rank = (tb_rank.add_index(name='rank').key_by('ega_id'))

    tb_rank = tb_rank.annotate(rank=tb_rank.rank + 1)

    return tb_rank
Example #13
def filter_kin_ht(
    ht: hl.Table,
    out_summary: io.TextIOWrapper,
    first_degree_pi_hat: float = 0.40,
    grandparent_pi_hat: float = 0.20,
    grandparent_ibd1: float = 0.25,
    grandparent_ibd2: float = 0.15,
) -> hl.Table:
    """
    Filter the kinship table to relationships of grandparents and above.

    :param ht: hl.Table
    :param out_summary: Summary file with a summary statistics and notes
    :param first_degree_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to first degree relatives
    :param grandparent_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd1: Minimum IBD1 threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd2: Maximum IBD2 threshold to use to filter the kinship table to grandparents
    :return: Table containing only relationships of grandparents and above
    """
    # Filter to anything above the relationship of a grandparent
    ht = ht.filter((ht.pi_hat > first_degree_pi_hat)
                   | ((ht.pi_hat > grandparent_pi_hat)
                      & (ht.ibd1 > grandparent_ibd1)
                      & (ht.ibd2 < grandparent_ibd2)))
    ht = ht.annotate(pair=hl.sorted([ht.i, ht.j]))

    out_summary.write(
        f"NOTE: kinship table was filtered to:\n(kin > {first_degree_pi_hat}) or (kin > {grandparent_pi_hat} and IBD1 > {grandparent_ibd1} and IBD2 < {grandparent_ibd2})\n"
    )
    out_summary.write(
        "relationships not meeting these criteria were not evaluated\n\n")

    return ht
Example #14
def annotate_relatedness(
    relatedness_ht: hl.Table,
    first_degree_kin_thresholds: Tuple[float, float] = (0.1767767, 0.4),
    second_degree_kin_cutoff: float = 0.1,
    ibd0_0_max: float = 0.05,
) -> hl.Table:
    """
    Annotate a relatedness Table with inferred relationships and the thresholds used.

    :param relatedness_ht: Table with `kin`, `ibd0`, `ibd1` and `ibd2` annotations for sample pairs
    :param first_degree_kin_thresholds: (min, max) kinship bounds for first-degree relatives
    :param second_degree_kin_cutoff: Minimum kinship for second-degree relatives
    :param ibd0_0_max: Maximum IBD0 threshold used when distinguishing parent-child pairs
    :return: Table annotated with a `relationship` column and threshold globals
    """
    relatedness_ht = relatedness_ht.annotate(
        relationship=get_relationship_expr(
            kin_expr=relatedness_ht.kin,
            ibd0_expr=relatedness_ht.ibd0,
            ibd1_expr=relatedness_ht.ibd1,
            ibd2_expr=relatedness_ht.ibd2,
            first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
            second_degree_min_kin=second_degree_kin_cutoff,
            ibd0_0_max=ibd0_0_max,
        )
    )
    relatedness_ht = relatedness_ht.annotate_globals(
        min_individual_maf=0.01,
        min_emission_kinship=0.05,
        ibd0_0_max=ibd0_0_max,
        second_degree_kin_cutoff=second_degree_kin_cutoff,
        first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
    )
    return relatedness_ht
Example #15
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a key of type Interval.
        The resulting table will have a key of the same type as the `intervals_ht` table and
        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.

    :param mt: Input MT
    :param intervals_ht: Table containing the intervals. This table has to be keyed by an interval.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, returns all intervals in `intervals_ht` that overlap the locus in the input MT.
    :return: Callrate MT
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
            intervals_ht.key[0], hl.expr.IntervalExpression):
        logger.warning(
            "Call rate matrix computation expects `intervals_ht` with a key of type Interval. Found: %s",
            intervals_ht.key,
        )

    callrate_mt = mt
    if autosomes_only:
        callrate_mt = filter_to_autosomes(callrate_mt)

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(_interval_key=intervals_ht.index(
        callrate_mt.locus, all_matches=match)._interval_key)

    if match:
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval))
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct()))
    callrate_mt = callrate_mt.group_rows_by(
        **callrate_mt._interval_key).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT)))
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(interval_info=hl.struct(
        **intervals_ht[callrate_mt.row_key]))
    return callrate_mt
Example #16
def default_generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    sex_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
) -> hl.Table:
    """
    This is meant as a default wrapper for `generate_sib_stats_expr`. It returns a hail table with counts of variants
    shared by pairs of siblings in `relatedness_ht`.

    This function takes a hail Table with a row for each pair of individuals i,j in the data that are related (it's OK to have unrelated samples too).
    The `relationship_col` should be a column specifying the relationship between each pair of samples as defined by
    the constants in `gnomad.utils.relatedness`. This relationship_col will be used to filter to only pairs of
    samples that are annotated as `SIBLINGS`.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param sex_ht: A Table containing sex information for the samples
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as defined in this module constants.
    :return: A Table with the sibling shared variant counts
    """
    sex_ht = sex_ht.annotate(
        is_female=hl.case()
        .when(sex_ht.sex_karyotype == "XX", True)
        .when(sex_ht.sex_karyotype == "XY", False)
        .or_missing()
    )

    # TODO: Change to use SIBLINGS constant when relatedness PR goes in
    sib_ht = relatedness_ht.filter(relatedness_ht[relationship_col] == "Siblings")
    s_to_keep = sib_ht.aggregate(
        hl.agg.explode(
            lambda s: hl.agg.collect_as_set(s), [sib_ht[i_col].s, sib_ht[j_col].s]
        ),
        _localize=False,
    )
    mt = mt.filter_cols(s_to_keep.contains(mt.s))
    mt = annotate_adj(mt)

    mt = mt.annotate_cols(is_female=sex_ht[mt.s].is_female)

    sib_stats_ht = mt.select_rows(
        **generate_sib_stats_expr(
            mt,
            sib_ht,
            i_col=i_col,
            j_col=j_col,
            strata={"raw": True, "adj": mt.adj},
            is_female=mt.is_female,
        )
    ).rows()

    return sib_stats_ht
Example #17
def compute_stratified_metrics_filter(ht: hl.Table,
                                      qc_metrics: List[str],
                                      strata: Optional[List[str]] = None) -> hl.Table:
    """
    Compute median, MAD, and upper and lower thresholds for each metric used in pop- and platform-specific outlier filtering.

    :param Table ht: HT containing relevant sample QC metric annotations
    :param list qc_metrics: list of metrics for which to compute the critical values for filtering outliers
    :param list of str strata: List of annotations used for stratification. These metrics should be discrete types!
    :return: Table with per-metric fail flags collected into `pop_platform_filters` per sample, and per-stratum metric statistics (median, MAD, upper, lower) stored in globals
    :rtype: Table
    """
    def make_pop_filters_expr(ht: hl.Table,
                              qc_metrics: List[str]) -> hl.expr.SetExpression:
        return hl.set(
            hl.filter(lambda x: hl.is_defined(x), [
                hl.or_missing(ht[f'fail_{metric}'], metric)
                for metric in qc_metrics
            ]))

    ht = ht.select(*(strata or []),
                   **ht.sample_qc.select(*qc_metrics)).key_by('s').persist()

    def get_metric_expr(ht, metric):
        metric_values = hl.agg.collect(ht[metric])
        metric_median = hl.median(metric_values)
        # 1.4826 scales the MAD to approximate the standard deviation under normality
        metric_mad = 1.4826 * hl.median(hl.abs(metric_values - metric_median))
        return hl.struct(median=metric_median,
                         mad=metric_mad,
                         upper=metric_median +
                         4 * metric_mad if metric != 'callrate' else 1,
                         lower=metric_median -
                         4 * metric_mad if metric != 'callrate' else 0.99)

    agg_expr = hl.struct(
        **{metric: get_metric_expr(ht, metric)
           for metric in qc_metrics})
    if strata:
        ht = ht.annotate_globals(metrics_stats=ht.aggregate(
            hl.agg.group_by(hl.tuple([ht[x] for x in strata]), agg_expr)))
    else:
        ht = ht.annotate_globals(metrics_stats={(): ht.aggregate(agg_expr)})

    strata_exp = hl.tuple([ht[x] for x in strata]) if strata else hl.tuple([])

    fail_exprs = {
        f'fail_{metric}':
        (ht[metric] >= ht.metrics_stats[strata_exp][metric].upper) |
        (ht[metric] <= ht.metrics_stats[strata_exp][metric].lower)
        for metric in qc_metrics
    }
    ht = ht.transmute(**fail_exprs)
    pop_platform_filters = make_pop_filters_expr(ht, qc_metrics)
    return ht.annotate(pop_platform_filters=pop_platform_filters)
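A minimal usage sketch (hypothetical data; in practice `sample_qc` usually comes from hl.sample_qc and the strata from pop/platform assignment):

ht = hl.Table.parallelize(
    [hl.struct(s=f's{i}', qc_pop='pop1',
               sample_qc=hl.struct(n_snp=hl.float64(1000 + i)))
     for i in range(10)],
    key='s')
out_ht = compute_stratified_metrics_filter(ht, qc_metrics=['n_snp'],
                                           strata=['qc_pop'])
# out_ht.pop_platform_filters holds the set of failed metrics per sample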
Example #18
def add_rank(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    subrank_expr: Optional[Dict[str, hl.expr.BooleanExpression]] = None,
) -> hl.Table:
    """
    Adds rank based on the `score_expr`. Rank is added for SNVs and indels separately.
    If one or more `subrank_expr` are provided, then subrank is added based on all sites for which the boolean expression is true.

    In addition, variant counts (SNV and indel separately) are added as a global (`rank_variant_counts`).

    :param ht: input Hail Table containing variants (with QC annotations) to be ranked
    :param score_expr: the Table annotation by which ranking should be scored
    :param subrank_expr: Any subranking to be added in the form name_of_subrank: subrank_filtering_expr
    :return: Table with rankings added
    """

    key = ht.key
    if subrank_expr is None:
        subrank_expr = {}

    temp_expr = {"_score": score_expr}
    temp_expr.update({f"_{name}": expr for name, expr in subrank_expr.items()})
    rank_ht = ht.select(
        **temp_expr, is_snv=hl.is_snp(ht.alleles[0], ht.alleles[1]))

    rank_ht = rank_ht.key_by("_score").persist()
    scan_expr = {
        "rank": hl.cond(
            rank_ht.is_snv,
            hl.scan.count_where(rank_ht.is_snv),
            hl.scan.count_where(~rank_ht.is_snv),
        )
    }
    scan_expr.update(
        {
            name: hl.or_missing(
                rank_ht[f"_{name}"],
                hl.cond(
                    rank_ht.is_snv,
                    hl.scan.count_where(rank_ht.is_snv & rank_ht[f"_{name}"]),
                    hl.scan.count_where(~rank_ht.is_snv & rank_ht[f"_{name}"]),
                ),
            )
            for name in subrank_expr
        }
    )
    rank_ht = rank_ht.annotate(**scan_expr)

    rank_ht = rank_ht.key_by(*key).persist()
    rank_ht = rank_ht.select(*scan_expr.keys())

    ht = ht.annotate(**rank_ht[key])
    return ht
Example #19
def densify_sites(
    mt: hl.MatrixTable,
    sites_ht: hl.Table,
    last_END_positions_ht: hl.Table,
    semi_join_rows: bool = True,
) -> hl.MatrixTable:
    """
    Creates a dense version of the input sparse MT at the sites in `sites_ht` reading the minimal amount of data required.

    Note that only rows that appear both in `mt` and `sites_ht` are returned.

    :param mt: Input sparse MT
    :param sites_ht: Desired sites to densify
    :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag)
    :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites)
    :return: Dense MT filtered to the sites in `sites_ht`
    """
    logger.info("Computing intervals to densify from sites Table.")
    sites_ht = sites_ht.key_by("locus")
    sites_ht = sites_ht.annotate(
        interval=hl.locus_interval(
            sites_ht.locus.contig,
            last_END_positions_ht[sites_ht.key].last_END_position,
            end=sites_ht.locus.position,
            includes_end=True,
            reference_genome=sites_ht.locus.dtype.reference_genome,
        )
    )
    sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval))

    if semi_join_rows:
        mt = mt.filter_rows(hl.is_defined(sites_ht.key_by("interval")[mt.locus]))
    else:
        logger.info("Collecting intervals to densify.")
        intervals = sites_ht.interval.collect()

        print(
            "Found {0} intervals, totalling {1} bp in the dense Matrix.".format(
                len(intervals),
                sum(
                    [
                        interval_length(interval)
                        for interval in union_intervals(intervals)
                    ]
                ),
            )
        )

        mt = hl.filter_intervals(mt, intervals)

    mt = hl.experimental.densify(mt)

    return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
Example #20
def compute_fisher_exact(tb: hl.Table,
                         n_cases_col: str,
                         n_control_col: str,
                         total_cases_col: str,
                         total_controls_col: str,
                         correct_total_counts: bool,
                         extra_fields: dict,
                         root_col_name: str = 'fet') -> hl.Table:
    """
    Perform two-sided Fisher Exact test. Add extra annotations (if any)

    :param tb: Hail Table
    :param n_cases_col: field name with number of (affected) cases
    :param n_control_col: field name with number of (affected) control
    :param total_cases_col: field name with total number of cases
    :param total_controls_col: field name with total number of controls
    :param correct_total_counts: should the total numbers (case/control) be corrected to avoid duplicated counting?
    :param root_col_name: field to be annotated with test results
    :param extra_fields: Extra filed (must be a dict) to be annotated
    :return: Hail Table with Fisher Exact test results.
    """
    # compute fisher exact
    if correct_total_counts:
        fet = hl.fisher_exact_test(
            c1=hl.int32(tb[n_cases_col]),
            c2=hl.int32(tb[n_control_col]),
            c3=hl.int32(tb[total_cases_col]) - hl.int32(tb[n_cases_col]),
            c4=hl.int32(tb[total_controls_col]) - hl.int32(tb[n_control_col]))
    else:
        fet = hl.fisher_exact_test(c1=hl.int32(tb[n_cases_col]),
                                   c2=hl.int32(tb[n_control_col]),
                                   c3=hl.int32(tb[total_cases_col]),
                                   c4=hl.int32(tb[total_controls_col]))

    tb = (tb.annotate(**{root_col_name: fet}).flatten())

    if len(extra_fields) == 0:
        return tb
    else:
        return tb.annotate(**extra_fields)
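A minimal usage sketch with hypothetical counts (10 of 100 cases vs 5 of 100 controls carrying a variant):

tb = hl.Table.parallelize([hl.struct(n_cases=10, n_controls=5,
                                     total_cases=100, total_controls=100)])
tb = compute_fisher_exact(tb, 'n_cases', 'n_controls', 'total_cases',
                          'total_controls', correct_total_counts=True,
                          extra_fields={})
tb.show()  # flattened fields: fet.p_value, fet.odds_ratio, fet.ci_95_lower, fet.ci_95_upper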
Example #21
def load_covid_data(all_samples_ht: hl.Table, covid_data_path: str, wave: str = '01'):
    print(f'Loading COVID wave {wave}...')
    covid_ht = hl.import_table(covid_data_path, delimiter='\t', missing='', impute=True, key='eid')
    covid_ht = covid_ht.group_by('eid').aggregate(
        origin=hl.agg.any(covid_ht.origin == 1),
        result=hl.agg.any(covid_ht.result == 1),
        inpatient=hl.agg.any(covid_ht.reqorg == 1),
    )

    # TODO: add aoo parse to separate trait_type (covid_quantitative?)
    # dob = load_dob_ht(pre_phesant_tsv_path)[ht.key].date_of_birth
    # ht = ht.annotate(aoo=hl.or_missing(ht.result == 1, hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT') - dob),
    #                  specdate=hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT')).drop('specdate')

    ht = all_samples_ht.annotate(**covid_ht[all_samples_ht.key])
    centers = hl.literal(ENGLAND_RECRUITMENT_CENTERS)

    analyses = {
        'B1_v2': hl.or_missing(ht.result, ht.inpatient),  # fka ANA2
        'B1_v2_origin': hl.or_missing(ht.result, ht.origin),  # fka ANA2
        'C2_v2': hl.or_else(ht.result, False),  # fka ANA5
        'C2_v2_england_controls': hl.or_missing(centers.contains(ht.recruitment_center),  # fka ANA5_england_controls
                                               hl.or_else(ht.result, False)),
        'C1_v2': ht.result,  # fka ANA5_strict
        'B2_v2': hl.or_else(ht.result & ht.inpatient, False),  # fka ANA6
        'B2_v2_origin': hl.or_else(ht.result & ht.origin, False)  # fka ANA6
    }
    analysis_names = {
        'B1_v2': 'Hospitalized vs non-hospitalized (among COVID-19 positive)',  # fka ANA2
        'B1_v2_origin': 'Hospitalized vs non-hospitalized (among COVID-19 positive; old definition using "origin" field)',  # fka ANA2
        'C2_v2': 'COVID-19 positive (controls include untested)',  # fka ANA5
        'C2_v2_england_controls': 'COVID-19 positive (controls include untested), only patients from centers in England',  # fka ANA5_england_controls
        'C1_v2': 'COVID-19 positive (controls only COVID-19 negative)',  # fka ANA5_strict
        'B2_v2': 'Hospitalized vs non-hospitalized (controls include untested)',  # ANA6
        'B2_v2_origin': 'Hospitalized vs non-hospitalized (controls include untested; old definition using "origin" field)'  # ANA6
    }
    assert set(analyses.keys()) == set(analysis_names.keys())

    ht = ht.select(**analyses)
    mt = filter_and_annotate_ukb_data(ht, lambda k, v: True, annotate_with_showcase=False,
                                      format_col_name=lambda x: x)
    mt = mt.key_cols_by(trait_type='categorical', phenocode='COVID19', pheno_sex='both_sexes',
                        coding=mt.phenocode, modifier=wave)
    mt = mt.annotate_cols(description=hl.literal(analysis_names)[mt.coding])

    mt.annotate_cols(
        n_cases=hl.agg.count_where(mt.value == 1.0),
        n_controls=hl.agg.count_where(mt.value == 0.0)
    ).cols().show()

    return mt
Example #22
def prepare_ht_export(ht: hl.Table) -> hl.Table:

    subset_list = ['gnomad']

    for subset in subset_list:
        INFO_DICT.update(make_info_dict(subset, dict(group=GROUPS)))
        INFO_DICT.update(make_info_dict(subset, dict(group=GROUPS, pop=POPS)))

    new_info_dict = {
        i.replace('gnomad_', '').replace('_adj', ''): j
        for i, j in INFO_DICT.items()
    }

    ht = ht.annotate(info=hl.struct(**make_info_expr(ht)))
    ht = ht.annotate(info=ht.info.annotate(**unfurl_nested_annotations(ht)))

    #ht = ht.select('info', 'filters', 'rsid', 'qual','vep')
    ht = ht.select('info', 'filters', 'rsid', 'qual')

    # NOTE: header_dict is built here but not returned or used by this function
    header_dict = {'info': new_info_dict}
    # 'filter': make_filter_dict(ht)

    return ht
Example #23
def pick_transcript(ht: hl.Table, csq_array: str) -> hl.Table:
    # TODO: This function could be improved by scanning the array (just once) and sorting it as suggested here:
    # TODO: https://hail.zulipchat.com/#narrow/stream/123010-Hail-0.2E2.20support/topic/pick.20transcript.20from.20array
    # TODO: /near/190400193
    """
    Annotate an extra field (tx) with the selected transcript.
    This function will pick one transcript per variant/consequence based on the impact of the variant in the transcript
    (from more severe to less severe).

    :param ht: Hail table with VEP annotations
    :param csq_array: Parsed CSQ field name. Expected to be an array of dicts, one dict per transcript.
    :return: Hail table with an annotated extra field (tx): the transcript selected from the array based on a set of
    pre-defined criteria.
    """

    # Set transcript (tx) field initially to missing and update it sequentially based on a set of pre-defined criteria
    # (order matters)
    ht = (ht.annotate(tx=ht[csq_array].find(lambda x: False)))

    # get the dict keys from the first transcript of the first row
    keys = ht[csq_array].take(1)[0][0]

    # select tx if LoF == 'HC'
    if 'LoF' in keys:
        ht = (ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx), ht[csq_array].find(
                lambda x: x['LoF'] == 'HC'), ht.tx)))

    # select transcript based on the consequence impact (high -> moderate -> low)
    if 'IMPACT' in keys:
        # select tx if IMPACT == HIGH
        ht = (ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx), ht[csq_array].find(
                lambda x: x['IMPACT'] == 'HIGH'), ht.tx)))
        # select tx if IMPACT == MODERATE
        ht = (ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx), ht[csq_array].find(
                lambda x: x['IMPACT'] == 'MODERATE'), ht.tx)))
        # select tx if IMPACT == LOW
        ht = (ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx), ht[csq_array].find(
                lambda x: x['IMPACT'] == 'LOW'), ht.tx)))

    # select tx if CANONICAL
    ht = (ht.annotate(
        tx=hl.cond(hl.is_missing(ht.tx), ht[csq_array].find(
            lambda x: x['CANONICAL'] == 'YES'), ht.tx)))

    # if tx is still missing, set tx as the first annotated transcript
    ht = (ht.annotate(
        tx=hl.cond(hl.is_missing(ht.tx), ht[csq_array][0], ht.tx)))
    return ht
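A minimal usage sketch (hypothetical parsed CSQ array with two transcripts):

ht = hl.Table.parallelize([hl.struct(
    csq=[hl.dict({'IMPACT': 'LOW', 'CANONICAL': 'NO'}),
         hl.dict({'IMPACT': 'HIGH', 'CANONICAL': 'YES'})])])
ht = pick_transcript(ht, 'csq')
# ht.tx is the HIGH-impact transcript (the 'LoF' rule is skipped here
# because the dicts carry no 'LoF' key)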
Example #24
def annotate_related_pairs(related_pairs: hl.Table,
                           index_col: str) -> hl.Table:
    # NOTE: `case_parents`, `meta_ht` and `sample_qc_ht` are expected to be
    # defined in the enclosing scope
    related_pairs = related_pairs.key_by(**related_pairs[index_col])
    related_pairs = related_pairs.filter(
        hl.is_missing(case_parents[related_pairs.key]))
    return related_pairs.annotate(
        **{
            index_col:
            related_pairs[index_col].annotate(
                case_rank=hl.or_else(
                    hl.int(meta_ht[related_pairs.key].is_case), -1),
                dp_mean=hl.or_else(
                    sample_qc_ht[
                        related_pairs.key].sample_qc.dp_stats.mean, -1.0))
        }).key_by()
Example #25
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Flattens the result of `filter_duplicate_samples`, so that each line contains a single sample.
    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    dups_ht = dups_ht.annotate(dups=hl.array([(
        dups_ht.key,
        False)]).extend(dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode('dups')
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by('s')
Example #26
def compute_grouped_binned_ht(
    bin_ht: hl.Table,
    checkpoint_path: Optional[str] = None,
) -> hl.GroupedTable:
    """
    Group a Table that has been annotated with bins (`compute_ranked_bin` or `create_binned_ht`).

    The table will be grouped by bin_id (bin, biallelic, etc.), contig, snv, bi_allelic and singleton.

    .. note::

        If performing an aggregation following this grouping (such as `score_bin_agg`) then the aggregation
        function will need to use `ht._parent` to get the origin Table from the GroupedTable for the aggregation

    :param bin_ht: Input Table with a `bin_id` annotation
    :param checkpoint_path: If provided, an intermediate checkpoint table is created with all required annotations before shuffling.
    :return: Table grouped by bin(s)
    """
    # Explode the rank table by bin_id
    bin_ht = bin_ht.annotate(
        bin_groups=hl.array(
            [
                hl.Struct(bin_id=bin_name, bin=bin_ht[bin_name])
                for bin_name in bin_ht.bin_group_variant_counts
            ]
        )
    )
    bin_ht = bin_ht.explode(bin_ht.bin_groups)
    bin_ht = bin_ht.transmute(
        bin_id=bin_ht.bin_groups.bin_id, bin=bin_ht.bin_groups.bin
    )
    bin_ht = bin_ht.filter(hl.is_defined(bin_ht.bin))

    if checkpoint_path is not None:
        bin_ht = bin_ht.checkpoint(checkpoint_path, overwrite=True)
    else:
        bin_ht = bin_ht.persist()

    # Group by bin_id, bin and additional stratification desired and compute QC metrics per bin
    return bin_ht.group_by(
        bin_id=bin_ht.bin_id,
        contig=bin_ht.locus.contig,
        snv=hl.is_snp(bin_ht.alleles[0], bin_ht.alleles[1]),
        bi_allelic=~bin_ht.was_split,
        singleton=bin_ht.singleton,
        release_adj=bin_ht.ac > 0,
        bin=bin_ht.bin,
    )._set_buffer_size(20000)
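A hypothetical follow-up showing how the returned GroupedTable is typically consumed (names assumed; `bin_ht` must carry the annotations listed above):

# grouped_ht = compute_grouped_binned_ht(bin_ht, checkpoint_path=None)
# agg_ht = grouped_ht.aggregate(n_variants=hl.agg.count())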
Example #27
def filter_ht_for_plink(ht: hl.Table,
                        n_samples: int,
                        min_call_rate: float = 0.95,
                        variants_per_mac_category: int = 2000,
                        variants_per_maf_category: int = 10000) -> hl.Table:
    from gnomad.utils.filtering import filter_to_autosomes
    ht = filter_to_autosomes(ht)
    ht = ht.filter((ht.call_stats.AN >= n_samples * 2 * min_call_rate)
                   & (ht.call_stats.AC > 0))
    ht = ht.annotate(mac_category=mac_category_case_builder(ht.call_stats))
    category_counter = ht.aggregate(hl.agg.counter(ht.mac_category))
    print(category_counter)
    ht = ht.annotate_globals(category_counter=category_counter)
    return ht.filter(
        hl.rand_unif(
            0, 1) < hl.cond(ht.mac_category >= 1, variants_per_mac_category,
                            variants_per_maf_category) /
        ht.category_counter[ht.mac_category])
Example #28
def filter_mt_to_trios(mt: hl.MatrixTable, fam_ht: hl.Table) -> hl.MatrixTable:
    """
    Filters a MatrixTable to a set of trios in `fam_ht`, filters to autosomes, and annotates with adj.

    :param mt: A Matrix Table to filter to only trios
    :param fam_ht: A Table of trios to filter to, loaded using `hl.import_fam`
    :return: A MT filtered to trios and adj annotated
    """
    # Filter MT to samples present in any of the trios
    fam_ht = fam_ht.annotate(fam_members=[fam_ht.id, fam_ht.pat_id, fam_ht.mat_id])
    fam_ht = fam_ht.explode("fam_members", name="s")
    fam_ht = fam_ht.key_by("s").select().distinct()

    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.col_key]))
    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)

    return mt
Example #29
def collapse_small_pops(ht: hl.Table, min_pop_size: int) -> hl.Table:
    """

    Collapses (sub)populations that are too small for release into others.
    When collapsing subpops, the name for the other category is composed of "o" +  2 first letters of the superpop
    The original RF population assignments are kept in the `rf_pop` and `rf_subpop` columns.

    :param ht: Input Table
    :return: Table with small populations collapsed
    :rtype: Table
    """
    def get_subpop_oth(pop: str):
        for superpop, subpops in SUBPOPS.items():
            if pop.upper() in subpops:
                return "o" + superpop[:2].lower()

        raise ValueError(
            f"Subpopulation {pop} not found in possible subpopulations.")

    ht = ht.persist()
    pop_counts = ht.aggregate(hl.agg.filter(ht.release,
                                            hl.agg.counter(ht.pop)))
    pop_collapse = {
        pop: "oth"
        for pop, n in pop_counts.items() if n < min_pop_size
    }
    pop_collapse = hl.literal(pop_collapse) if pop_collapse else hl.empty_dict(
        hl.tstr, hl.tstr)

    subpop_counts = ht.aggregate(
        hl.agg.filter(ht.release, hl.agg.counter(ht.subpop)))
    subpop_collapse = {
        subpop: get_subpop_oth(subpop)
        for subpop, n in subpop_counts.items() if n < min_pop_size
    }
    subpop_collapse = hl.literal(
        subpop_collapse) if subpop_collapse else hl.empty_dict(
            hl.tstr, hl.tstr)

    return ht.annotate(pop=pop_collapse.get(ht.pop, ht.pop),
                       subpop=subpop_collapse.get(ht.subpop, ht.subpop),
                       rf_pop=ht.pop,
                       rf_subpop=ht.subpop)
Example #30
def annotate_from_dict(ht: hl.Table, dict_field: str) -> hl.Table:
    """
    Expand an dict field and add new fields.

    :param ht: HailTable
    :param dict_field: The dict field to be expanded
    :return: Annotated HailTable
    """

    # retrieve dict keys from the first row (assumes all rows share the same keys)
    dict_keys = ht[dict_field].keys().take(1)[0]

    ht = (ht.annotate(
        **{
            dict_keys[i]: ht[dict_field].get(dict_keys[i])
            for i in range(len(dict_keys))
        }))

    return ht