Пример #1
0
def compute_sample_qc(data_type: str = "genomes") -> hl.Table:
    """
    Compute stratified sample QC metrics on the split CCDG VDS.

    The VDS is loaded through `get_qc_vds` (autosomes only, split, with
    `interval_qc=True`) and handed to `compute_stratified_sample_qc`, stratified
    into bi-allelic and multi-allelic sites.

    :param data_type: Whether data is from genomes or exomes, default is genomes
    :return: Table containing sample QC metrics, repartitioned to 100 partitions
    """
    logger.info("Computing sample QC on CCDG %s VDS", data_type)

    qc_vds = get_qc_vds(
        data_type=data_type, autosome_only=True, split=True, interval_qc=True
    )

    # Strata are defined on the variant data component of the VDS.
    qc_strata = {
        "bi_allelic": bi_allelic_expr(qc_vds.variant_data),
        "multi_allelic": ~bi_allelic_expr(qc_vds.variant_data),
    }

    # The temporary prefix is the results path minus its last three characters
    # (presumably the ".ht" extension -- confirm against `get_ccdg_results_path`).
    results_path = get_ccdg_results_path(data_type=data_type, result="sample_qc")
    tmp_prefix = results_path[:-3]

    # Use modified compute_stratified_sample_qc with the vds option
    stratified_qc_ht = compute_stratified_sample_qc(
        qc_vds, strata=qc_strata, tmp_ht_prefix=tmp_prefix
    )
    return stratified_qc_ht.repartition(100)
Пример #2
0
def generate_trio_stats(
    mt: hl.MatrixTable, autosomes_only: bool = True, bi_allelic_only: bool = True
) -> hl.Table:
    """
    Run `generate_trio_stats_expr` with raw and adj strata and return the row table.

    .. note::

        Expects that `mt` is a trio matrix table that was annotated with adj; if dealing
        with a sparse MT, `hl.experimental.densify` must be run first.
        By default this pipeline function will filter `mt` to only autosomes and
        bi-allelic sites.

    :param mt: A Trio Matrix Table returned from `hl.trio_matrix`. Must be dense
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :return: Table with trio stats
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)
    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    logger.info(f"Generating trio stats using {mt.count_cols()} trios.")

    # A trio entry counts as adj only when proband, father and mother all are adj.
    all_members_adj = (
        mt.proband_entry.adj & mt.father_entry.adj & mt.mother_entry.adj
    )
    raw_adj_strata = {"raw": True, "adj": all_members_adj}

    # Each stratification argument receives its own copy of the raw/adj mapping.
    trio_stats_expr = generate_trio_stats_expr(
        mt,
        transmitted_strata=dict(raw_adj_strata),
        de_novo_strata=dict(raw_adj_strata),
        ac_strata=dict(raw_adj_strata),
    )
    return mt.select_rows(**trio_stats_expr).rows()
Пример #3
0
def generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
    autosomes_only: bool = True,
    bi_allelic_only: bool = True,
) -> hl.Table:
    """
    Default wrapper around `generate_sib_stats_expr`.

    Returns a hail Table with counts of variants shared by pairs of siblings in
    `relatedness_ht`.

    `relatedness_ht` holds one row per pair of individuals i,j (it's OK to have
    unrelated samples too). `relationship_col` gives the relationship between the two
    samples, as defined by the constants in `gnomad.utils.relatedness`; only pairs
    whose value equals `SIBLINGS` are used.

    .. note::

        By default this pipeline function will filter `mt` to only autosomes and
        bi-allelic sites. If `mt` has no `adj` entry field, it is added with
        `annotate_adj`.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as
        defined in this module constants.
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :return: A Table with the sibling shared variant counts
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)
    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    # Keep only the pairs annotated as siblings.
    is_sibling_pair = relatedness_ht[relationship_col] == SIBLINGS
    sib_ht = relatedness_ht.filter(is_sibling_pair)

    # Collect every sample ID appearing on either side of a sibling pair. The result
    # is kept as a Hail expression (`_localize=False`) rather than a Python value.
    pair_members = [sib_ht[i_col].s, sib_ht[j_col].s]
    s_to_keep = sib_ht.aggregate(
        hl.agg.explode(lambda sample: hl.agg.collect_as_set(sample), pair_members),
        _localize=False,
    )
    mt = mt.filter_cols(s_to_keep.contains(mt.s))

    if "adj" not in mt.entry:
        mt = annotate_adj(mt)

    sib_stats_expr = generate_sib_stats_expr(
        mt,
        sib_ht,
        i_col=i_col,
        j_col=j_col,
        strata={"raw": True, "adj": mt.adj},
    )
    return mt.select_rows(**sib_stats_expr).rows()
Пример #4
0
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a key of type Interval.
        The resulting table will have a key of the same type as the `intervals_ht` table and
        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.

    :param mt: Input MT
    :param intervals_ht: Table containing the intervals. Expected to have a single key
        field of type Interval (a warning is logged otherwise). The key field is accessed
        as `interval`.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, returns all intervals in intervals_ht that overlap the locus in
        the input MT (passed as `all_matches` when indexing `intervals_ht`), and rows are
        exploded so that a locus contributes to each of those intervals.
    :return: Callrate MT
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
        intervals_ht.key[0], hl.expr.IntervalExpression
    ):
        logger.warning(
            "Call rate matrix computation expects `intervals_ht` with a key of type Interval. Found: %s",
            intervals_ht.key,
        )

    # Always bind `callrate_mt`: previously it was only assigned when
    # `autosomes_only` was True, so `autosomes_only=False` raised UnboundLocalError.
    callrate_mt = filter_to_autosomes(mt) if autosomes_only else mt

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    # Copy the interval key into a regular field so it can be pulled onto MT rows.
    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(
        _interval_key=intervals_ht.index(
            callrate_mt.locus, all_matches=match
        )._interval_key
    )

    # With `all_matches=True` the annotation holds one key per overlapping interval;
    # explode so that there is one row per (locus, interval) pair.
    if match:
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    # Drop loci that do not fall in any interval.
    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval)
    )

    # Only definedness of GT matters for the call rate, so replace each defined GT
    # with an empty struct (missing stays missing).
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct())
    )
    callrate_mt = callrate_mt.group_rows_by(**callrate_mt._interval_key).aggregate(
        callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT))
    )

    # Re-attach the non-key fields of `intervals_ht` (minus the helper field).
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(
        interval_info=hl.struct(**intervals_ht[callrate_mt.row_key])
    )
    return callrate_mt
Пример #5
0
def compute_sample_qc() -> hl.Table:
    """
    Compute stratified sample QC metrics on the autosomes of the gnomAD v3 MT.

    Rows are restricted to sites with more than one allele and entries to `GT`.
    Metrics are stratified into bi-allelic and multi-allelic sites with
    `compute_stratified_sample_qc`.

    :return: Table of sample QC metrics with `n_called`, `n_not_called`, `n_filtered`
        and `call_rate` dropped from every row-value field, repartitioned to 100
        partitions
    """
    logger.info("Computing sample QC")

    v3_mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    v3_mt = filter_to_autosomes(v3_mt)
    has_alt_allele = hl.len(v3_mt.alleles) > 1
    v3_mt = v3_mt.filter_rows(has_alt_allele).select_entries('GT')

    qc_strata = {
        'bi_allelic': bi_allelic_expr(v3_mt),
        'multi_allelic': ~bi_allelic_expr(v3_mt),
    }
    # The temporary prefix is the sample QC resource path minus its last three
    # characters (presumably the ".ht" extension -- confirm against `get_sample_qc`).
    tmp_prefix = get_sample_qc().path[:-3]

    qc_ht = compute_stratified_sample_qc(
        v3_mt,
        strata=qc_strata,
        tmp_ht_prefix=tmp_prefix,
        gt_expr=v3_mt.GT,
    )

    # Remove annotations that cannot be computed from the sparse format
    sparse_unavailable = ('n_called', 'n_not_called', 'n_filtered', 'call_rate')
    trimmed_fields = {
        field: qc_ht[field].drop(*sparse_unavailable) for field in qc_ht.row_value
    }
    qc_ht = qc_ht.annotate(**trimmed_fields)

    return qc_ht.repartition(100)
Пример #6
0
def get_doubleton_sites(
    vds_path: str = VDS_PATH,
    temp_path: str = TEMP_PATH,
    tranche_data: Tuple[str, int] = TRANCHE_DATA,
    sparse_entries: List[str] = SPARSE_ENTRIES,
) -> hl.Table:
    """
    Return high quality doubleton sites from the UKB VDS.

    Sites are bi-allelic, autosomal SNPs that are not AS_lowqual, fall in interval QC
    pass regions, and have an adj allele count of two with no homozygotes. The result
    is checkpointed to `{temp_path}/high_quality_sites.ht` (overwriting any existing
    Table).

    :param vds_path: Path to UKB 455k VDS. Default is VDS_PATH.
    :param temp_path: Path to bucket to store Table and other temporary data. Default is TEMP_PATH.
    :param tranche_data: UKB tranche data (data source and data freeze number). Default is TRANCHE_DATA.
    :param sparse_entries: List of fields to select from VDS. Default is SPARSE_ENTRIES.
    :return: Table of high quality sites with doubletons.
    """
    logger.info("Reading in VDS and filtering to bi-allelic SNPs...")
    variant_mt = hl.vds.read_vds(vds_path).variant_data
    # Drop unnecessary annotations
    variant_mt = variant_mt.select_rows()
    variant_mt = variant_mt.select_entries(*sparse_entries)
    is_bi_allelic_snp = bi_allelic_expr(variant_mt) & hl.is_snp(
        variant_mt.alleles[0], variant_mt.alleles[1]
    )
    variant_mt = variant_mt.filter_rows(is_bi_allelic_snp)

    logger.info("Filter to autosomes and splitting multiallelics...")
    variant_mt = variant_mt.filter_rows(variant_mt.locus.in_autosome())
    # NOTE: UKB dataset does not have errors with changed loci
    # (`filter_changed_loci = False` will not throw errors here)
    variant_mt = hl.experimental.sparse_split_multi(variant_mt)

    logger.info("Removing AS_lowqual sites...")
    split_info_ht = hl.read_table(info_ht_path(*tranche_data, split=True))
    is_low_qual = split_info_ht[variant_mt.row_key].AS_lowqual
    variant_mt = variant_mt.filter_rows(~is_low_qual)

    logger.info("Filtering to interval QC pass regions...")
    pass_interval_ht = hl.read_table(interval_qc_path(*tranche_data, "autosomes"))
    in_pass_interval = hl.is_defined(pass_interval_ht[variant_mt.locus])
    variant_mt = variant_mt.filter_rows(in_pass_interval)

    logger.info("Filtering to adj and calculating allele count...")
    variant_mt = filter_to_adj(variant_mt)
    variant_mt = variant_mt.annotate_rows(
        call_stats=hl.agg.call_stats(variant_mt.GT, variant_mt.alleles)
    )
    # Get AC at allele index 1 (call_stats includes a count for each allele, including reference)
    variant_mt = variant_mt.transmute_rows(
        ac=variant_mt.call_stats.AC[1],
        n_hom=variant_mt.call_stats.homozygote_count[1],
    )

    logger.info("Filtering to an allele count of two and returning...")
    sites_ht = variant_mt.rows()
    is_doubleton = (sites_ht.ac == 2) & (sites_ht.n_hom == 0)
    sites_ht = sites_ht.filter(is_doubleton)
    return sites_ht.checkpoint(f"{temp_path}/high_quality_sites.ht", overwrite=True)
Пример #7
0
def filter_rows_for_qc(
    mt: hl.MatrixTable,
    min_af: Optional[float] = 0.001,
    min_callrate: Optional[float] = 0.99,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    apply_hard_filters: bool = True,
    bi_allelic_only: bool = True,
    snv_only: bool = True,
) -> hl.MatrixTable:
    """
    Annotate rows with site-level QC metrics, then apply thresholds.

    Depending on which thresholds are set, rows are annotated with `af`,
    `site_callrate`, `site_inbreeding_coeff` and `hwe`. Each threshold is applied as a
    strict "greater than" comparison.

    AF and callrate thresholds are taken from gnomAD QC; inbreeding coeff, MQ, FS and QD
    filters are taken from GATK best practices.

    .. note::

        This function expects the typical ``info`` annotation of type struct with fields
        ``MQ``, ``FS`` and ``QD`` if applying hard filters. A warning is logged for each
        hard filter that cannot be applied because the field is absent.

    :param mt: Input MT
    :param min_af: Minimum site AF to keep. Not applied if set to ``None``.
    :param min_callrate: Minimum site call rate to keep. Not applied if set to ``None``.
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep.
        Not applied if set to ``None``.
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not
        applied if set to ``None``.
    :param apply_hard_filters: Whether to apply standard GATK default site hard filters:
        QD >= 2, FS <= 60 and MQ >= 30
    :param bi_allelic_only: Whether to only keep bi-allelic sites or include
        multi-allelic sites too
    :param snv_only: Whether to only keep SNVs or include other variant types. The check
        uses the reference allele and the first alternate allele only.
    :return: Annotated and filtered MatrixTable. If no filter applies at all, the
        (possibly annotated) input is returned without filtering.
    """
    annotation_expr = {}

    if min_af is not None:
        annotation_expr["af"] = hl.agg.mean(mt.GT.n_alt_alleles()) / 2
    if min_callrate is not None:
        annotation_expr["site_callrate"] = hl.agg.fraction(hl.is_defined(mt.GT))
    if min_inbreeding_coeff_threshold is not None:
        annotation_expr["site_inbreeding_coeff"] = bi_allelic_site_inbreeding_expr(
            mt.GT
        )
    if min_hardy_weinberg_threshold is not None:
        annotation_expr["hwe"] = hl.agg.hardy_weinberg_test(mt.GT)

    if annotation_expr:
        mt = mt.annotate_rows(**annotation_expr)

    filter_expr = []
    if min_af is not None:
        filter_expr.append(mt.af > min_af)
    if min_callrate is not None:
        filter_expr.append(mt.site_callrate > min_callrate)
    if min_inbreeding_coeff_threshold is not None:
        filter_expr.append(mt.site_inbreeding_coeff > min_inbreeding_coeff_threshold)
    if min_hardy_weinberg_threshold is not None:
        filter_expr.append(mt.hwe.p_value > min_hardy_weinberg_threshold)
    if snv_only:
        filter_expr.append(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    if bi_allelic_only:
        filter_expr.append(bi_allelic_expr(mt))

    if apply_hard_filters:
        if "info" in mt.row_value:
            if "QD" in mt.info:
                filter_expr.append(mt.info.QD >= 2)
            else:
                logger.warning(
                    "Could not apply QD hard filter, as `info.QD` not found in schema."
                )
            if "FS" in mt.info:
                filter_expr.append(mt.info.FS <= 60)
            else:
                logger.warning(
                    "Could not apply FS hard filter, as `info.FS` not found in schema."
                )
            if "MQ" in mt.info:
                filter_expr.append(mt.info.MQ >= 30)
            else:
                logger.warning(
                    "Could not apply MQ hard filter, as `info.MQ` not found in schema."
                )
        else:
            logger.warning(
                "Could not apply hard filters as `info` not found in schema."
            )

    # With every filter disabled there is nothing to combine; `functools.reduce` on an
    # empty list without an initial value would raise TypeError, so return as is.
    if not filter_expr:
        return mt

    # Keep rows that satisfy all of the collected conditions.
    return mt.filter_rows(functools.reduce(operator.and_, filter_expr))