def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and
    stores the results as a new column-indexed struct field in the matrix, named
    based on the `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised.

    The following fields are always computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls non-missing.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of ``n_het`` and ``n_hom_var``.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles. Reference
      alleles are never counted as singletons.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.

    Raises
    ------
    ValueError
        If `mt` has no entry field `GT` of type ``call``.
    """
    require_row_key_variant(mt, 'sample_qc')

    def has_field_of_type(field_name, dtype):
        # `mt` is looked up at call time, so this always inspects the
        # current (possibly re-annotated) matrix table.
        return field_name in mt.entry and mt[field_name].dtype == dtype

    # Validate GT before anything dereferences `mt.GT`; otherwise a missing
    # field fails inside `annotate_rows` below and this error is never raised.
    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'sample_qc': expect an entry field 'GT' of type 'call'")

    from hail.expr.functions import _num_allele_type, _allele_types

    # Extend Hail's allele-type list with two extra codes so that SNPs can be
    # split into transitions and transversions.
    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_ints = {type_name: code for code, type_name in enumerate(allele_types)}

    def allele_type(ref, alt):
        # Replace the generic SNP code by the Transition/Transversion code.
        return hl.bind(
            lambda at: hl.cond(at == allele_ints['SNP'],
                               hl.cond(hl.is_transition(ref, alt),
                                       allele_ints['Transition'],
                                       allele_ints['Transversion']),
                               at),
            _num_allele_type(ref, alt))

    # Temporary row fields; dropped again before returning.
    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(**{
        variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC,
        variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt)),
    })

    exprs = {}

    if has_field_of_type('DP', hl.tint32):
        exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
    # Only alternate alleles (index != 0) can be singletons; without this
    # guard a reference allele with AC == 1 would be counted as well.
    exprs['n_singleton'] = hl.agg.sum(
        hl.sum(hl.range(0, mt['GT'].ploidy).map(
            lambda i: (mt['GT'][i] != 0) & (mt[variant_ac][mt['GT'][i]] == 1))))

    def get_allele_type(allele_idx):
        # Reference alleles (index 0) have no alternate-allele type.
        return hl.cond(allele_idx > 0,
                       mt[variant_atypes][allele_idx - 1],
                       hl.null(hl.tint32))

    exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0, mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    # First pass: raw aggregations.
    mt = mt.annotate_cols(**{name: hl.struct(**exprs)})

    zero = hl.int64(0)

    # Second pass: derive the public fields from the raw aggregations.
    select_exprs = {}
    if 'dp_stats' in exprs:
        select_exprs['dp_stats'] = mt[name].dp_stats
    if 'gq_stats' in exprs:
        select_exprs['gq_stats'] = mt[name].gq_stats

    type_counts = mt[name].allele_type_counts
    select_exprs = {
        **select_exprs,
        'call_rate': hl.float64(mt[name].n_called) / (mt[name].n_called + mt[name].n_not_called),
        'n_called': mt[name].n_called,
        'n_not_called': mt[name].n_not_called,
        'n_hom_ref': mt[name].n_hom_ref,
        'n_het': mt[name].n_het,
        'n_hom_var': mt[name].n_called - mt[name].n_hom_ref - mt[name].n_het,
        'n_non_ref': mt[name].n_called - mt[name].n_hom_ref,
        'n_singleton': mt[name].n_singleton,
        'n_snp': (type_counts.get(allele_ints["Transition"], zero)
                  + type_counts.get(allele_ints["Transversion"], zero)),
        'n_insertion': type_counts.get(allele_ints["Insertion"], zero),
        'n_deletion': type_counts.get(allele_ints["Deletion"], zero),
        'n_transition': type_counts.get(allele_ints["Transition"], zero),
        'n_transversion': type_counts.get(allele_ints["Transversion"], zero),
        'n_star': type_counts.get(allele_ints["Star"], zero),
    }

    mt = mt.annotate_cols(**{name: mt[name].select(**select_exprs)})

    # Third pass: ratios, missing when the denominator is zero.
    mt = mt.annotate_cols(**{name: mt[name].annotate(
        r_ti_tv=divide_null(hl.float64(mt[name].n_transition), mt[name].n_transversion),
        r_het_hom_var=divide_null(hl.float64(mt[name].n_het), mt[name].n_hom_var),
        r_insertion_deletion=divide_null(hl.float64(mt[name].n_insertion), mt[name].n_deletion)
    )})

    mt = mt.drop(variant_ac, variant_atypes)

    return mt
def sample_qc(vds: 'VariantDataset', *, name='sample_qc', gq_bins: 'Sequence[int]' = (0, 20, 60)) -> 'Table':
    """Run sample_qc on dataset in the split :class:`.VariantDataset` representation.

    Notes
    -----
    Reference alleles are never counted towards `n_singleton`.

    The `name` parameter is accepted but is not read by this function; the
    result fields are top-level fields of the returned table.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.
    name : :obj:`str`
        Name for resulting field (currently unused).
    gq_bins : :class:`tuple` of :obj:`int`
        Tuple containing cutoffs for genotype quality (GQ) scores.

    Returns
    -------
    :class:`.Table`
        Hail Table of results, keyed by sample.
    """
    require_first_key_field_locus(vds.reference_data, 'sample_qc')
    require_first_key_field_locus(vds.variant_data, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    # Extend Hail's allele-type list with two extra codes so that SNPs can be
    # split into transitions and transversions.
    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_ints = {type_name: code for code, type_name in enumerate(allele_types)}

    def allele_type(ref, alt):
        # Replace the generic SNP code by the Transition/Transversion code.
        return hl.bind(
            lambda at: hl.if_else(at == allele_ints['SNP'],
                                  hl.if_else(hl.is_transition(ref, alt),
                                             allele_ints['Transition'],
                                             allele_ints['Transversion']),
                                  at),
            _num_allele_type(ref, alt)
        )

    # Temporary row field names for per-variant allele counts and types.
    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()

    vmt = vds.variant_data
    if 'GT' not in vmt.entry:
        # Derive global genotypes from local genotypes and local alleles.
        vmt = vmt.annotate_entries(GT=hl.experimental.lgt_to_gt(vmt.LGT, vmt.LA))

    vmt = vmt.annotate_rows(**{
        variant_ac: hl.agg.call_stats(vmt.GT, vmt.alleles).AC,
        variant_atypes: vmt.alleles[1:].map(lambda alt: allele_type(vmt.alleles[0], alt))
    })

    bound_exprs = {}

    bound_exprs['n_het'] = hl.agg.count_where(vmt['GT'].is_het())
    bound_exprs['n_hom_var'] = hl.agg.count_where(vmt['GT'].is_hom_var())
    # Only alternate alleles (index != 0) can be singletons; without this
    # guard a reference allele with AC == 1 would be counted as well.
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.rbind(
            vmt['GT'],
            lambda gt: hl.sum(
                hl.range(0, gt.ploidy).map(
                    lambda i: hl.rbind(gt[i], lambda gti: (gti != 0) & (vmt[variant_ac][gti] == 1))
                )
            ),
        )
    )

    # One counter per allele-type code, over the non-reference alleles of
    # every genotype.
    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda atype: hl.tuple(
            hl.agg.count_where(atype == i) for i in range(len(allele_ints))
        ),
        (hl.range(0, vmt['GT'].ploidy)
         .map(lambda i: vmt['GT'][i])
         .filter(lambda allele_idx: allele_idx > 0)
         .map(lambda allele_idx: vmt[variant_atypes][allele_idx - 1]))
    )

    # Variant-side GQ counts, restricted to entries with a defined genotype.
    gq_exprs = hl.agg.filter(
        hl.is_defined(vmt.GT),
        hl.struct(**{f'gq_over_{x}': hl.agg.count_where(vmt.GQ > x) for x in gq_bins})
    )

    result_struct = hl.rbind(
        hl.struct(**bound_exprs),
        lambda x: hl.rbind(
            hl.struct(**{
                'gq_exprs': gq_exprs,
                'n_het': x.n_het,
                'n_hom_var': x.n_hom_var,
                'n_non_ref': x.n_het + x.n_hom_var,
                'n_singleton': x.n_singleton,
                'n_snp': (x.allele_type_counts[allele_ints['Transition']]
                          + x.allele_type_counts[allele_ints['Transversion']]),
                'n_insertion': x.allele_type_counts[allele_ints['Insertion']],
                'n_deletion': x.allele_type_counts[allele_ints['Deletion']],
                'n_transition': x.allele_type_counts[allele_ints['Transition']],
                'n_transversion': x.allele_type_counts[allele_ints['Transversion']],
                'n_star': x.allele_type_counts[allele_ints['Star']]
            }),
            # Ratios are missing when the denominator is zero.
            lambda s: s.annotate(
                r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),
                r_het_hom_var=divide_null(hl.float64(s.n_het), s.n_hom_var),
                r_insertion_deletion=divide_null(hl.float64(s.n_insertion), s.n_deletion)
            )
        )
    )
    variant_results = vmt.select_cols(**result_struct).cols()

    # Reference side: each entry contributes (1 + END - position) to every
    # GQ bin it exceeds.
    rmt = vds.reference_data
    ref_results = rmt.select_cols(
        gq_exprs=hl.struct(**{
            f'gq_over_{x}': hl.agg.filter(rmt.GQ > x, hl.agg.sum(1 + rmt.END - rmt.locus.position))
            for x in gq_bins
        })
    ).cols()

    # Add the reference counts to the variant counts, joined on the column key.
    joined = ref_results[variant_results.key].gq_exprs
    joined_results = variant_results.transmute(**{
        f'gq_over_{x}': variant_results.gq_exprs[f'gq_over_{x}'] + joined[f'gq_over_{x}']
        for x in gq_bins
    })
    return joined_results
def merge_sample_qc_expr(
    sample_qc_exprs: List[hl.expr.StructExpression],
) -> hl.expr.StructExpression:
    """
    Create an expression that merges results from non-overlapping strata of hail.sample_qc.

    E.g.:

    - Compute autosomes and sex chromosomes metrics separately, then merge results
    - Compute bi-allelic and multi-allelic metrics separately, then merge results

    Note regarding the merging of ``dp_stats`` and ``gq_stats``:
    Because ``n`` is needed to aggregate ``stdev``, ``n_called`` is used for this purpose.
    This should work very well on a standard GATK VCF and it essentially assumes that:

    - samples that are called have `DP` and `GQ` fields
    - samples that are not called do not have `DP` and `GQ` fields

    Even if these assumptions are broken for some genotypes, it shouldn't matter too much.

    Note regarding ``call_rate``: the merged value is the fraction of calls that are
    non-missing, i.e. ``n_called / (n_called + n_not_called)``, with ``n_filtered``
    added to the denominator when that field is present.

    :param sample_qc_exprs: List of sample QC struct expressions for each stratification
    :return: Combined sample QC results
    """
    # List of metrics that can be aggregated by summing
    additive_metrics = (
        [
            "n_called",
            "n_not_called",
            "n_filtered",
            "n_hom_ref",
            "n_het",
            "n_hom_var",
            "n_non_ref",
            "n_snp",
            "n_insertion",
            "n_deletion",
            "n_singleton",
            "n_transition",
            "n_transversion",
            "n_star",
            "n_singleton_ti",
            "n_singleton_tv",
        ]
        + [f"gq_over_{gq}" for gq in range(0, 70, 10)]
        + [f"dp_over_{dp}" for dp in range(0, 40, 10)]
    )

    # List of metrics that are a plain ratio of two summed metrics
    # (name, numerator, denominator). `call_rate` is handled separately
    # because its denominator is a sum of several metrics.
    ratio_metrics = [
        ("r_ti_tv", "n_transition", "n_transversion"),
        ("r_ti_tv_singleton", "n_singleton_ti", "n_singleton_tv"),
        ("r_het_hom_var", "n_het", "n_hom_var"),
        ("r_insertion_deletion", "n_insertion", "n_deletion"),
    ]

    # List of metrics that are struct generated by a stats counter
    stats_metrics = ["gq_stats", "dp_stats"]

    # Gather metrics present in sample qc fields
    sample_qc_fields = set(sample_qc_exprs[0])
    for sample_qc_expr in sample_qc_exprs[1:]:
        sample_qc_fields = sample_qc_fields.union(set(sample_qc_expr))

    # Merge additive metrics in sample qc fields
    merged_exprs = {
        metric: hl.sum([sample_qc_expr[metric] for sample_qc_expr in sample_qc_exprs])
        for metric in additive_metrics
        if metric in sample_qc_fields
    }

    # Merge call rate: a fraction of all calls, not the ratio of called to
    # not-called. Inserted before the other ratios to keep the field order.
    if "n_called" in merged_exprs and "n_not_called" in merged_exprs:
        n_total = merged_exprs["n_called"] + merged_exprs["n_not_called"]
        if "n_filtered" in merged_exprs:
            n_total = n_total + merged_exprs["n_filtered"]
        merged_exprs["call_rate"] = hl.float64(
            divide_null(merged_exprs["n_called"], n_total)
        )

    # Merge ratio metrics in sample qc fields
    merged_exprs.update(
        {
            metric: hl.float64(divide_null(merged_exprs[nom], merged_exprs[denom]))
            for metric, nom, denom in ratio_metrics
            if nom in sample_qc_fields and denom in sample_qc_fields
        }
    )

    # Merge stats counter metrics in sample qc fields
    # Use n_called as n for DP and GQ stats
    if "n_called" in sample_qc_fields:
        merged_exprs.update(
            {
                metric: merge_stats_counters_expr(
                    [
                        sample_qc_expr[metric].annotate(n=sample_qc_expr.n_called)
                        for sample_qc_expr in sample_qc_exprs
                    ]
                ).drop("n")
                for metric in stats_metrics
                if metric in sample_qc_fields
            }
        )

    return hl.struct(**merged_exprs)
def sample_qc(vds: 'VariantDataset', *, gq_bins: 'Sequence[int]' = (0, 20, 60),
              dp_bins: 'Sequence[int]' = (0, 1, 10, 20, 30), dp_field=None) -> 'Table':
    """Compute sample quality metrics about a :class:`.VariantDataset`.

    If the `dp_field` parameter is not specified, the ``DP`` is used for depth
    if present. If no ``DP`` field is present, the ``MIN_DP`` field is used. If no ``DP``
    or ``MIN_DP`` field is present, no depth statistics will be calculated.

    Depth statistics additionally require a ``DP`` entry field in the variant
    data; if it is absent, no depth statistics will be calculated.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.
    gq_bins : :class:`tuple` of :obj:`int`
        Tuple containing cutoffs for genotype quality (GQ) scores.
    dp_bins : :class:`tuple` of :obj:`int`
        Tuple containing cutoffs for depth (DP) scores.
    dp_field : :obj:`str`
        Name of depth field in the reference data. If not supplied, DP or
        MIN_DP will be used, in that order.

    Returns
    -------
    :class:`.Table`
        Hail Table of results, keyed by sample.
    """
    require_first_key_field_locus(vds.reference_data, 'sample_qc')
    require_first_key_field_locus(vds.variant_data, 'sample_qc')

    rmt = vds.reference_data

    # An explicitly requested depth field takes precedence; fall back to DP,
    # then MIN_DP, only when the caller did not specify one.
    ref_dp_field_to_use = dp_field
    if ref_dp_field_to_use is None:
        if 'DP' in rmt.entry:
            ref_dp_field_to_use = 'DP'
        elif 'MIN_DP' in rmt.entry:
            ref_dp_field_to_use = 'MIN_DP'

    from hail.expr.functions import _num_allele_type, _allele_types

    # Extend Hail's allele-type list with two extra codes so that SNPs can be
    # split into transitions and transversions.
    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_ints = {type_name: code for code, type_name in enumerate(allele_types)}

    def allele_type(ref, alt):
        # Replace the generic SNP code by the Transition/Transversion code.
        return hl.bind(
            lambda at: hl.if_else(
                at == allele_ints['SNP'],
                hl.if_else(hl.is_transition(ref, alt),
                           allele_ints['Transition'],
                           allele_ints['Transversion']),
                at),
            _num_allele_type(ref, alt))

    # Temporary row field names for per-variant allele counts and types.
    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()

    vmt = vds.variant_data
    if 'GT' not in vmt.entry:
        # Derive global genotypes from local genotypes and local alleles.
        vmt = vmt.annotate_entries(
            GT=hl.experimental.lgt_to_gt(vmt.LGT, vmt.LA))

    vmt = vmt.annotate_rows(**{
        variant_ac: hl.agg.call_stats(vmt.GT, vmt.alleles).AC,
        variant_atypes: vmt.alleles[1:].map(lambda alt: allele_type(vmt.alleles[0], alt)),
    })

    # Depth statistics need a depth field on both sides. Use one flag for
    # every depth-related step so the pieces cannot disagree.
    compute_dp = ref_dp_field_to_use is not None and 'DP' in vmt.entry

    bound_exprs = {}

    bound_exprs['n_het'] = hl.agg.count_where(vmt['GT'].is_het())
    bound_exprs['n_hom_var'] = hl.agg.count_where(vmt['GT'].is_hom_var())
    # Only alternate alleles (index != 0) count as singletons.
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.rbind(
            vmt['GT'],
            lambda gt: hl.sum(
                hl.range(0, gt.ploidy).map(
                    lambda i: hl.rbind(
                        gt[i],
                        lambda gti: (gti != 0) & (vmt[variant_ac][gti] == 1))))))

    # One counter per allele-type code, over the non-reference alleles of
    # every genotype.
    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda atype: hl.tuple(
            hl.agg.count_where(atype == i) for i in range(len(allele_ints))),
        (hl.range(0, vmt['GT'].ploidy)
         .map(lambda i: vmt['GT'][i])
         .filter(lambda allele_idx: allele_idx > 0)
         .map(lambda allele_idx: vmt[variant_atypes][allele_idx - 1])))

    dp_exprs = {}
    if compute_dp:
        dp_exprs['dp'] = hl.tuple(
            hl.agg.count_where(vmt.DP >= x) for x in dp_bins)

    gq_dp_exprs = hl.struct(
        gq=hl.tuple(hl.agg.count_where(vmt.GQ >= x) for x in gq_bins),
        **dp_exprs)

    result_struct = hl.rbind(
        hl.struct(**bound_exprs),
        lambda x: hl.rbind(
            hl.struct(**{
                'gq_dp_exprs': gq_dp_exprs,
                'n_het': x.n_het,
                'n_hom_var': x.n_hom_var,
                'n_non_ref': x.n_het + x.n_hom_var,
                'n_singleton': x.n_singleton,
                'n_snp': (x.allele_type_counts[allele_ints['Transition']]
                          + x.allele_type_counts[allele_ints['Transversion']]),
                'n_insertion': x.allele_type_counts[allele_ints['Insertion']],
                'n_deletion': x.allele_type_counts[allele_ints['Deletion']],
                'n_transition': x.allele_type_counts[allele_ints['Transition']],
                'n_transversion': x.allele_type_counts[allele_ints['Transversion']],
                'n_star': x.allele_type_counts[allele_ints['Star']],
            }),
            # Ratios are missing when the denominator is zero.
            lambda s: s.annotate(
                r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),
                r_het_hom_var=divide_null(hl.float64(s.n_het), s.n_hom_var),
                r_insertion_deletion=divide_null(hl.float64(s.n_insertion), s.n_deletion))))

    variant_results = vmt.select_cols(**result_struct).cols()

    # Reference side: each entry contributes (1 + END - position) to every
    # bin whose threshold it meets.
    ref_dp_expr = {}
    if compute_dp:
        ref_dp_expr['ref_bases_over_dp_threshold'] = hl.tuple(
            hl.agg.filter(rmt[ref_dp_field_to_use] >= x,
                          hl.agg.sum(1 + rmt.END - rmt.locus.position))
            for x in dp_bins)
    ref_results = rmt.select_cols(
        ref_bases_over_gq_threshold=hl.tuple(
            hl.agg.filter(rmt.GQ >= x,
                          hl.agg.sum(1 + rmt.END - rmt.locus.position))
            for x in gq_bins),
        **ref_dp_expr).cols()

    # Add the reference counts to the variant counts, joined on the column key.
    joined = ref_results[variant_results.key]

    joined_dp_expr = {}
    dp_bins_field = {}
    if compute_dp:
        joined_dp_expr['bases_over_dp_threshold'] = hl.tuple(
            x + y for x, y in zip(variant_results.gq_dp_exprs.dp,
                                  joined.ref_bases_over_dp_threshold))
        dp_bins_field['dp_bins'] = hl.tuple(dp_bins)

    joined_results = variant_results.transmute(
        bases_over_gq_threshold=hl.tuple(
            x + y for x, y in zip(variant_results.gq_dp_exprs.gq,
                                  joined.ref_bases_over_gq_threshold)),
        **joined_dp_expr)

    # Record the bin cutoffs alongside the results as table globals.
    joined_results = joined_results.annotate_globals(gq_bins=hl.tuple(gq_bins),
                                                     **dp_bins_field)
    return joined_results