def compute_sample_qc(data_type: str = "genomes") -> hl.Table:
    """
    Run stratified sample QC on the split CCDG QC VDS.

    The VDS is obtained from `get_qc_vds` (called with `autosome_only=True`,
    `split=True` and `interval_qc=True`) and passed to
    `compute_stratified_sample_qc` with two strata: bi-allelic and
    multi-allelic sites.

    :param data_type: Whether data is from genomes or exomes. Default is "genomes".
    :return: Table containing sample QC metrics, repartitioned to 100 partitions.
    """
    logger.info("Computing sample QC on CCDG %s VDS", data_type)

    qc_vds = get_qc_vds(
        data_type=data_type,
        autosome_only=True,
        split=True,
        interval_qc=True,
    )

    # Strata are defined on the variant data component of the VDS.
    site_strata = {
        "bi_allelic": bi_allelic_expr(qc_vds.variant_data),
        "multi_allelic": ~bi_allelic_expr(qc_vds.variant_data),
    }

    # The temporary prefix is the results path minus its last three characters
    # (presumably the ".ht" extension -- confirm against `get_ccdg_results_path`).
    results_path = get_ccdg_results_path(data_type=data_type, result="sample_qc")
    tmp_prefix = results_path[:-3]

    # Use modified compute_stratified_sample_qc with the vds option
    qc_ht = compute_stratified_sample_qc(
        qc_vds,
        strata=site_strata,
        tmp_ht_prefix=tmp_prefix,
    )
    return qc_ht.repartition(100)
def generate_trio_stats(
    mt: hl.MatrixTable, autosomes_only: bool = True, bi_allelic_only: bool = True
) -> hl.Table:
    """
    Run `generate_trio_stats_expr` with default "raw" and "adj" strata and return the row table.

    .. note::

        Expects that `mt` is a trio matrix table that was annotated with adj. If dealing
        with a sparse MT, `hl.experimental.densify` must be run first.

        By default this pipeline function will filter `mt` to only autosomes and
        bi-allelic sites.

    :param mt: A Trio Matrix Table returned from `hl.trio_matrix`. Must be dense.
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation.
    :return: Table with trio stats.
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)

    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    logger.info(f"Generating trio stats using {mt.count_cols()} trios.")

    # The "adj" stratum requires the proband, father and mother entries to all be adj.
    all_members_adj = (
        mt.proband_entry.adj & mt.father_entry.adj & mt.mother_entry.adj
    )

    # Every strata argument gets its own {"raw", "adj"} mapping.
    strata_kwargs = {
        strata_name: {"raw": True, "adj": all_members_adj}
        for strata_name in ("transmitted_strata", "de_novo_strata", "ac_strata")
    }

    trio_stats_expr = generate_trio_stats_expr(mt, **strata_kwargs)
    return mt.select_rows(**trio_stats_expr).rows()
def generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
    autosomes_only: bool = True,
    bi_allelic_only: bool = True,
) -> hl.Table:
    """
    Default wrapper around `generate_sib_stats_expr`.

    Returns a Hail Table with counts of variants shared by the pairs of siblings
    found in `relatedness_ht`.

    `relatedness_ht` holds one row per pair of individuals i, j (unrelated pairs
    may be present too). `relationship_col` gives the relationship of each pair,
    using the constants defined in `gnomad.utils.relatedness`; only pairs whose
    relationship equals `SIBLINGS` are kept.

    .. note::

        By default this pipeline function will filter `mt` to only autosomes and
        bi-allelic sites.

    :param mt: Input Matrix table.
    :param relatedness_ht: Input relationship table.
    :param i_col: Column containing the 1st sample of the pair in the relationship table.
    :param j_col: Column containing the 2nd sample of the pair in the relationship table.
    :param relationship_col: Column containing the relationship for the sample pair.
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation.
    :return: A Table with the sibling shared variant counts.
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)

    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    # Restrict the relatedness table to sibling pairs.
    is_sibling_pair = relatedness_ht[relationship_col] == SIBLINGS
    sib_ht = relatedness_ht.filter(is_sibling_pair)

    # Gather the sample IDs appearing on either side of a sibling pair; the
    # result is kept as a Hail expression (`_localize=False`).
    pair_members = [sib_ht[i_col].s, sib_ht[j_col].s]
    sibling_ids = sib_ht.aggregate(
        hl.agg.explode(
            lambda sample_id: hl.agg.collect_as_set(sample_id), pair_members
        ),
        _localize=False,
    )
    mt = mt.filter_cols(sibling_ids.contains(mt.s))

    # The "adj" stratum needs an `adj` entry annotation; add it when absent.
    if "adj" not in mt.entry:
        mt = annotate_adj(mt)

    sib_stats_expr = generate_sib_stats_expr(
        mt,
        sib_ht,
        i_col=i_col,
        j_col=j_col,
        strata={"raw": True, "adj": mt.adj},
    )
    return mt.select_rows(**sib_stats_expr).rows()
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a single key of type Interval (a warning
        is logged otherwise). Rows are filtered on `_interval_key.interval`, so the
        key field is expected to be named `interval`.
        The resulting MT rows are keyed by the key fields of `intervals_ht` and carry
        an `interval_info` field containing all non-key fields of `intervals_ht`.

    :param mt: Input MT.
    :param intervals_ht: Table containing the intervals. Expected to be keyed by a
        single field of type Interval.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation.
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, `intervals_ht` is indexed with `all_matches=True` and rows are
        exploded, so a locus contributes to every interval that overlaps it.
    :return: Callrate MT with a `callrate` entry field.
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
        intervals_ht.key[0], hl.expr.IntervalExpression
    ):
        logger.warning(
            "Call rate matrix computation expects `intervals_ht` with a key of type Interval. Found: %s",
            intervals_ht.key,
        )

    # Always bind `callrate_mt`: previously it was only assigned when
    # `autosomes_only` was True, so `autosomes_only=False` raised
    # UnboundLocalError on the first later use.
    callrate_mt = filter_to_autosomes(mt) if autosomes_only else mt

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    # Carry the interval key through the join as a regular field so rows can be
    # grouped by it afterwards.
    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(
        _interval_key=intervals_ht.index(
            callrate_mt.locus, all_matches=match
        )._interval_key
    )

    # With `all_matches=True` the annotation is an array of keys: one row per match.
    if match:
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    # Drop loci that do not fall in any interval.
    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval)
    )

    # Only definedness of GT matters for call rate: replace the call with an
    # empty struct (or missing) before aggregating.
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct())
    )
    callrate_mt = callrate_mt.group_rows_by(**callrate_mt._interval_key).aggregate(
        callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT))
    )

    # Re-attach the non-key fields of the intervals table to each interval row.
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(
        interval_info=hl.struct(**intervals_ht[callrate_mt.row_key])
    )
    return callrate_mt
def compute_sample_qc() -> hl.Table:
    """
    Compute stratified sample QC on the autosomes of the MT returned by `get_gnomad_v3_mt`.

    Only rows with more than one allele are kept and only the `GT` entry field is
    retained. Metrics are stratified into bi-allelic and multi-allelic sites, and
    the call-count based fields are dropped from each stratum.

    :return: Table containing sample QC metrics, repartitioned to 100 partitions.
    """
    logger.info("Computing sample QC")

    qc_mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    qc_mt = filter_to_autosomes(qc_mt)
    qc_mt = qc_mt.filter_rows(hl.len(qc_mt.alleles) > 1)
    qc_mt = qc_mt.select_entries("GT")

    site_strata = {
        "bi_allelic": bi_allelic_expr(qc_mt),
        "multi_allelic": ~bi_allelic_expr(qc_mt),
    }
    qc_ht = compute_stratified_sample_qc(
        qc_mt,
        strata=site_strata,
        tmp_ht_prefix=get_sample_qc().path[:-3],
        gt_expr=qc_mt.GT,
    )

    # Remove annotations that cannot be computed from the sparse format
    unsupported_fields = ("n_called", "n_not_called", "n_filtered", "call_rate")
    trimmed_metrics = {
        stratum: qc_ht[stratum].drop(*unsupported_fields)
        for stratum in qc_ht.row_value
    }
    qc_ht = qc_ht.annotate(**trimmed_metrics)

    return qc_ht.repartition(100)
def get_doubleton_sites(
    vds_path: str = VDS_PATH,
    temp_path: str = TEMP_PATH,
    tranche_data: Tuple[str, int] = TRANCHE_DATA,
    sparse_entries: List[str] = SPARSE_ENTRIES,
) -> hl.Table:
    """
    Return high quality doubleton sites from the UKB VDS.

    Sites are restricted to bi-allelic autosomal SNPs that are not `AS_lowqual`
    and fall in interval QC pass regions, and are kept when their adj allele
    count is two with no homozygotes.

    :param vds_path: Path to UKB 455k VDS. Default is VDS_PATH.
    :param temp_path: Path to bucket to store Table and other temporary data. Default is TEMP_PATH.
    :param tranche_data: UKB tranche data (data source and data freeze number). Default is TRANCHE_DATA.
    :param sparse_entries: List of fields to select from VDS. Default is SPARSE_ENTRIES.
    :return: Table of high quality sites with doubletons.
    """
    logger.info("Reading in VDS and filtering to bi-allelic SNPs...")
    variant_mt = hl.vds.read_vds(vds_path).variant_data

    # Drop unnecessary annotations
    variant_mt = variant_mt.select_rows()
    variant_mt = variant_mt.select_entries(*sparse_entries)

    is_bi_allelic_snp = bi_allelic_expr(variant_mt) & hl.is_snp(
        variant_mt.alleles[0], variant_mt.alleles[1]
    )
    variant_mt = variant_mt.filter_rows(is_bi_allelic_snp)

    logger.info("Filter to autosomes and splitting multiallelics...")
    variant_mt = variant_mt.filter_rows(variant_mt.locus.in_autosome())
    # NOTE: UKB dataset does not have errors with changed loci
    # (`filter_changed_loci = False` will not throw errors here)
    variant_mt = hl.experimental.sparse_split_multi(variant_mt)

    logger.info("Removing AS_lowqual sites...")
    info_ht = hl.read_table(info_ht_path(*tranche_data, split=True))
    is_low_qual = info_ht[variant_mt.row_key].AS_lowqual
    variant_mt = variant_mt.filter_rows(~is_low_qual)

    logger.info("Filtering to interval QC pass regions...")
    interval_ht = hl.read_table(interval_qc_path(*tranche_data, "autosomes"))
    in_pass_interval = hl.is_defined(interval_ht[variant_mt.locus])
    variant_mt = variant_mt.filter_rows(in_pass_interval)

    logger.info("Filtering to adj and calculating allele count...")
    variant_mt = filter_to_adj(variant_mt)
    variant_mt = variant_mt.annotate_rows(
        call_stats=hl.agg.call_stats(variant_mt.GT, variant_mt.alleles)
    )
    # Get AC at allele index 1 (call_stats includes a count for each allele,
    # including reference)
    variant_mt = variant_mt.transmute_rows(
        ac=variant_mt.call_stats.AC[1],
        n_hom=variant_mt.call_stats.homozygote_count[1],
    )

    logger.info("Filtering to an allele count of two and returning...")
    sites_ht = variant_mt.rows()
    is_doubleton = (sites_ht.ac == 2) & (sites_ht.n_hom == 0)
    sites_ht = sites_ht.filter(is_doubleton)
    return sites_ht.checkpoint(f"{temp_path}/high_quality_sites.ht", overwrite=True)
def filter_rows_for_qc(
    mt: hl.MatrixTable,
    min_af: Optional[float] = 0.001,
    min_callrate: Optional[float] = 0.99,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    apply_hard_filters: bool = True,
    bi_allelic_only: bool = True,
    snv_only: bool = True,
) -> hl.MatrixTable:
    """
    Annotate rows with QC metrics and filter rows on the requested thresholds.

    For each threshold that is not ``None`` the matching row annotation is added
    (`af`, `site_callrate`, `site_inbreeding_coeff`, `hwe`) and rows must be strictly
    above the threshold to be kept.

    AF and callrate thresholds are taken from gnomAD QC; inbreeding coeff, MQ, FS and
    QD filters are taken from GATK best practices.

    .. note::

        This function expects the typical ``info`` annotation of type struct with
        fields ``MQ``, ``FS`` and ``QD`` if applying hard filters. A missing ``info``
        struct or missing field only logs a warning; that filter is skipped.

    :param mt: Input MT.
    :param min_af: Minimum site AF to keep. Not applied if set to ``None``.
    :param min_callrate: Minimum site call rate to keep. Not applied if set to ``None``.
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep.
        Not applied if set to ``None``.
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep.
        Not applied if set to ``None``.
    :param apply_hard_filters: Whether to apply standard GATK default site hard filters:
        QD >= 2, FS <= 60 and MQ >= 30.
    :param bi_allelic_only: Whether to only keep bi-allelic sites or include
        multi-allelic sites too.
    :param snv_only: Whether to only keep SNVs or include other variant types.
    :return: Annotated and filtered MT. If no filter ends up being applicable, the
        (possibly annotated) MT is returned unfiltered.
    """
    # Only compute the row aggregations whose thresholds are actually in use.
    annotation_expr = {}
    if min_af is not None:
        annotation_expr["af"] = hl.agg.mean(mt.GT.n_alt_alleles()) / 2
    if min_callrate is not None:
        annotation_expr["site_callrate"] = hl.agg.fraction(hl.is_defined(mt.GT))
    if min_inbreeding_coeff_threshold is not None:
        annotation_expr["site_inbreeding_coeff"] = bi_allelic_site_inbreeding_expr(
            mt.GT
        )
    if min_hardy_weinberg_threshold is not None:
        annotation_expr["hwe"] = hl.agg.hardy_weinberg_test(mt.GT)

    if annotation_expr:
        mt = mt.annotate_rows(**annotation_expr)

    filter_expr = []
    if min_af is not None:
        filter_expr.append(mt.af > min_af)
    if min_callrate is not None:
        filter_expr.append(mt.site_callrate > min_callrate)
    if min_inbreeding_coeff_threshold is not None:
        filter_expr.append(mt.site_inbreeding_coeff > min_inbreeding_coeff_threshold)
    if min_hardy_weinberg_threshold is not None:
        filter_expr.append(mt.hwe.p_value > min_hardy_weinberg_threshold)
    if snv_only:
        filter_expr.append(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    if bi_allelic_only:
        filter_expr.append(bi_allelic_expr(mt))

    if apply_hard_filters:
        if "info" in mt.row_value:
            # (info field, comparison, cutoff): QD >= 2, FS <= 60, MQ >= 30
            hard_filters = (
                ("QD", operator.ge, 2),
                ("FS", operator.le, 60),
                ("MQ", operator.ge, 30),
            )
            for field, passes, cutoff in hard_filters:
                if field in mt.info:
                    filter_expr.append(passes(mt.info[field], cutoff))
                else:
                    logger.warning(
                        "Could not apply %s hard filter, as `info.%s` not found in schema.",
                        field,
                        field,
                    )
        else:
            logger.warning(
                "Could not apply hard filters as `info` not found in schema."
            )

    # `functools.reduce` without an initial value raises TypeError on an empty
    # sequence; with nothing to filter on, return the MT as-is.
    if not filter_expr:
        return mt

    return mt.filter_rows(functools.reduce(operator.and_, filter_expr))