def test_concordance(self):
    dataset = get_dataset()
    glob_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset)

    self.assertEqual(sum([sum(glob_conc[i]) for i in range(5)]),
                     dataset.count_rows() * dataset.count_cols())

    counts = dataset.aggregate_entries(
        hl.Struct(n_het=agg.filter(dataset.GT.is_het(), agg.count()),
                  n_hom_ref=agg.filter(dataset.GT.is_hom_ref(), agg.count()),
                  n_hom_var=agg.filter(dataset.GT.is_hom_var(), agg.count()),
                  nNoCall=agg.filter(hl.is_missing(dataset.GT), agg.count())))

    self.assertEqual(glob_conc[0][0], 0)
    self.assertEqual(glob_conc[1][1], counts.nNoCall)
    self.assertEqual(glob_conc[2][2], counts.n_hom_ref)
    self.assertEqual(glob_conc[3][3], counts.n_het)
    self.assertEqual(glob_conc[4][4], counts.n_hom_var)
    [self.assertEqual(glob_conc[i][j], 0) for i in range(5) for j in range(5) if i != j]

    self.assertTrue(cols_conc.all(hl.sum(hl.flatten(cols_conc.concordance)) == dataset.count_rows()))
    self.assertTrue(rows_conc.all(hl.sum(hl.flatten(rows_conc.concordance)) == dataset.count_cols()))

    cols_conc.write('/tmp/foo.kt', overwrite=True)
    rows_conc.write('/tmp/foo.kt', overwrite=True)
def solve(p_de_novo): return ( hl.case() .when(kid.GQ < min_gq, failure) .when((kid.DP / (parent.DP) < min_dp_ratio) | (kid_ad_ratio < min_child_ab), failure) .when((hl.sum(parent.AD) == 0), failure) .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure) .when(p_de_novo < min_p, failure) .when(~is_snp, hl.case() .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')) .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')) .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.3), hl.struct(p_de_novo=p_de_novo, confidence='LOW')) .or_missing()) .default(hl.case() .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')) .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')) .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2), hl.struct(p_de_novo=p_de_novo, confidence='LOW')) .or_missing() ) )
def combine(ts): # pylint: disable=protected-access tmp = ts.annotate( alleles=merge_alleles(ts.data.map(lambda d: d.alleles)), rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)), filters=hl.set(hl.flatten(ts.data.map(lambda d: hl.array(d.filters)))), info=hl.struct( DP=hl.sum(ts.data.map(lambda d: d.info.DP)), MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)), QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)), RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)), VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)), SB=hl.array([ hl.sum(ts.data.map(lambda d: d.info.SB[0])), hl.sum(ts.data.map(lambda d: d.info.SB[1])), hl.sum(ts.data.map(lambda d: d.info.SB[2])), hl.sum(ts.data.map(lambda d: d.info.SB[3])) ]))) tmp = tmp.annotate( __entries=hl.bind( lambda combined_allele_index: hl.range(0, hl.len(tmp.data)).flatmap( lambda i: hl.cond(hl.is_missing(tmp.data[i].__entries), hl.range(0, hl.len(tmp.g[i].__cols)) .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)), hl.bind( lambda old_to_new: tmp.data[i].__entries.map(lambda e: renumber_entry(e, old_to_new)), hl.range(0, hl.len(tmp.data[i].alleles)).map( lambda j: combined_allele_index[tmp.data[i].alleles[j]])))), hl.dict(hl.range(0, hl.len(tmp.alleles)).map( lambda j: hl.tuple([tmp.alleles[j], j]))))) tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols))) return tmp.drop('data', 'g')
def gnomad_coverage_stats_optimized(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_rows(
        mean=hl.agg.mean(mt.x),
        count_array=hl.rbind(hl.agg.counter(hl.min(100, mt.x)),
                             lambda c: hl.range(0, 100).map(lambda i: c.get(i, 0))))
    mt = mt.annotate_rows(
        median=hl.rbind(hl.sum(mt.count_array) / 2,
                        lambda s: hl.find(lambda x: x > s,
                                          hl.array_scan(lambda i, j: i + j, 0, mt.count_array))),
        **{f'above_{x}': hl.sum(mt.count_array[x:])
           for x in [1, 5, 10, 15, 20, 25, 30, 50, 100]})
    mt.rows()._force_count()
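# The `median` annotation above recovers a per-site median from the coverage
# histogram by scanning cumulative counts. A minimal sketch of the same trick on
# a made-up count array (not from the source); note that hl.find returns the
# matching cumulative count from the scan, mirroring the expression above.
import hail as hl

counts = hl.literal([2, 5, 3, 1, 1])  # hypothetical: counts[i] = samples with coverage i
median_scan = hl.rbind(
    hl.sum(counts) / 2,
    lambda half: hl.find(lambda x: x > half,
                         hl.array_scan(lambda i, j: i + j, 0, counts)))
print(hl.eval(median_scan))  # 7: the first cumulative count exceeding half of 12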
def combine(ts): # pylint: disable=protected-access tmp = ts.annotate( alleles=merge_alleles(ts.data.map(lambda d: d.alleles)), rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)), info=hl.struct( MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)), QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)), RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)), VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)), SB_TABLE=hl.array([ hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[0])), hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[1])), hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[2])), hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[3])) ]))) tmp = tmp.annotate( __entries=hl.bind( lambda combined_allele_index: hl.range(0, hl.len(tmp.data)).flatmap( lambda i: hl.cond(hl.is_missing(tmp.data[i].__entries), hl.range(0, hl.len(tmp.g[i].__cols)) .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)), hl.bind( lambda old_to_new: tmp.data[i].__entries.map(lambda e: renumber_entry(e, old_to_new)), hl.array([0]).extend( hl.range(0, hl.len(tmp.data[i].alleles)).map( lambda j: combined_allele_index[tmp.data[i].alleles[j]]))))), hl.dict(hl.range(1, hl.len(tmp.alleles) + 1).map( lambda j: hl.tuple([tmp.alleles[j - 1], j]))))) tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols))) return tmp.drop('data', 'g')
def main(args):
    ss = args.ss.split(',')
    chr_pos_ref_alt_p_beta = args.chr_pos_ref_alt_p_beta.split(';')
    ss_names = args.ss_names.split(',')

    # read in each set of sumstats
    sumstats = []
    for sumstat in range(len(ss)):
        ss_data = import_key(ss[sumstat], chr_pos_ref_alt_p_beta[sumstat], ss_names[sumstat])
        sumstats.append(ss_data)

    ss_joined = sumstats[0]
    for sumstat in range(1, len(ss)):
        ss_joined = ss_joined.join(sumstats[sumstat], 'outer')

    ss_joined = annotate_nearest_gene(ss_joined, add_only_gene_symbols_as_str=True)
    ss_joined = ss_joined.key_by()
    ss_joined = ss_joined.select(chrom=ss_joined.locus.contig,
                                 pos=ss_joined.locus.position,
                                 ref=ss_joined.alleles[0],
                                 alt=ss_joined.alleles[1],
                                 nearest_genes=ss_joined.nearest_genes,
                                 **ss_joined.row.drop('locus', 'alleles', 'nearest_genes'))

    p_colnames = [x for x in ss_joined.row if x.startswith('p_')]
    ss_filt = ss_joined.filter(hl.sum([hl.is_defined(ss_joined[x]) for x in p_colnames]) > 1)
    ss_filt.export(args.out)
def maf_filter(mt, maf, filter_ac0_after_pruning=False):
    """
    Takes a matrix table, filters it on minor allele frequency (MAF), and returns the
    filtered matrix table.

    :param mt: matrix table to prune (should be LD pruned and have x chrom removed).
    :param maf: minor allele frequency threshold; variants with AF[1] <= maf are removed.
    :param filter_ac0_after_pruning: filter variants no longer in the data, e.g. sum(AC) = 0?
    :return: returns maf filtered matrix table.
    """
    # Run hl.variant_qc() to get AFs
    mt = hl.variant_qc(mt)

    # Filter MAF
    logging.info(f'Filtering out variants with minor allele frequency < {maf}')
    mt = mt.filter_rows(mt.row.variant_qc.AF[1] > maf, keep=True)
    mt = mt.annotate_globals(maf_threshold_LDpruning=maf)

    if filter_ac0_after_pruning:
        logging.info('Removing variants with alt allele count = 0 (monomorphic variants).')
        mt = hl.variant_qc(mt)
        mt = mt.filter_rows(hl.sum(mt.row.variant_qc.AC) == hl.int(0), keep=False)
        count = mt.count()
        logging.info(f"MT count after removing monomorphic variants and MAF filtering: {count}")
    else:
        logging.info("MAF pruned mt count:" + str(mt.count()))

    return mt
def solve(p_de_novo):
    return (
        hl.case()
        .when(kid.GQ < min_gq, failure)
        .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio)
              | ~(kid_ad_ratio >= min_child_ab), failure)
        .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure)
        .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab)
              | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure)
        .when(p_de_novo < min_p, failure)
        .when(~is_snp,
              hl.case()
              .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1),
                    hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
              .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5),
                    hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
              .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                    hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
              .or_missing())
        .default(hl.case()
                 .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2))
                       | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1))
                       | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                       hl.struct(p_de_novo=p_de_novo, confidence='HIGH'))
                 .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)),
                       hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM'))
                 .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2),
                       hl.struct(p_de_novo=p_de_novo, confidence='LOW'))
                 .or_missing()))
def qual_hist_expr(
    gt_expr: Optional[hl.expr.CallExpression] = None,
    gq_expr: Optional[hl.expr.NumericExpression] = None,
    dp_expr: Optional[hl.expr.NumericExpression] = None,
    ad_expr: Optional[hl.expr.ArrayNumericExpression] = None,
    adj_expr: Optional[hl.expr.BooleanExpression] = None,
) -> hl.expr.StructExpression:
    """
    Return a struct expression with genotype quality histograms based on the arguments given (dp, gq, ad).

    .. note::

        - If `gt_expr` is provided, will return histograms for non-reference samples only as well as all samples.
        - `gt_expr` is required for the allele-balance histogram, as it is only computed on het samples.
        - If `adj_expr` is provided, additional histograms are computed using only adj samples.

    :param gt_expr: Entry expression containing genotype
    :param gq_expr: Entry expression containing genotype quality
    :param dp_expr: Entry expression containing depth
    :param ad_expr: Entry expression containing allelic depth (bi-allelic here)
    :param adj_expr: Entry expression containing adj (high quality) genotype status
    :return: Genotype quality histograms expression
    """
    qual_hists = {}
    if gq_expr is not None:
        qual_hists["gq_hist"] = hl.agg.hist(gq_expr, 0, 100, 20)
    if dp_expr is not None:
        qual_hists["dp_hist"] = hl.agg.hist(dp_expr, 0, 100, 20)

    if gt_expr is not None:
        qual_hists = {
            **{
                f"{qual_hist_name}_all": qual_hist_expr
                for qual_hist_name, qual_hist_expr in qual_hists.items()
            },
            **{
                f"{qual_hist_name}_alt": hl.agg.filter(gt_expr.is_non_ref(), qual_hist_expr)
                for qual_hist_name, qual_hist_expr in qual_hists.items()
            },
        }
        if ad_expr is not None:
            qual_hists["ab_hist_alt"] = hl.agg.filter(
                gt_expr.is_het(), hl.agg.hist(ad_expr[1] / hl.sum(ad_expr), 0, 1, 20))
    else:
        qual_hists = {
            f"{qual_hist_name}_all": qual_hist_expr
            for qual_hist_name, qual_hist_expr in qual_hists.items()
        }

    if adj_expr is not None:
        qual_hists.update({
            f"{qual_hist_name}_adj": hl.agg.filter(adj_expr, qual_hist_expr)
            for qual_hist_name, qual_hist_expr in qual_hists.items()
        })

    return hl.struct(**qual_hists)
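# Hedged usage sketch for qual_hist_expr: assumes a MatrixTable with GT, GQ, DP
# and AD entry fields. balding_nichols_model only emits GT, so GQ/DP/AD are
# faked here purely for illustration.
import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=10, n_variants=50)
mt = mt.annotate_entries(GQ=hl.int32(40), DP=hl.int32(25),
                         AD=hl.array([hl.int32(12), hl.int32(13)]))

# Per-variant quality histograms over all samples and over non-ref samples only.
mt = mt.annotate_rows(qual_hists=qual_hist_expr(gt_expr=mt.GT, gq_expr=mt.GQ,
                                                dp_expr=mt.DP, ad_expr=mt.AD))
mt.rows().select('qual_hists').show(2)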
def table_aggregate_int_stats():
    ht = hl.read_table(resource('many_ints_table.ht'))
    ht.aggregate(
        tuple([
            *(hl.agg.stats(ht[f'i{i}']) for i in range(5)),
            *(hl.agg.stats(hl.sum(ht[f'array{i}'])) for i in range(2)),
            *(hl.agg.explode(lambda elt: hl.agg.stats(elt), ht[f'array{i}']) for i in range(2))
        ]))
def explode_phase_info(ht: hl.Table, remove_all_ref: bool = True) -> hl.Table:
    ht = ht.transmute(phase_info=hl.array(ht.phase_info))
    ht = ht.explode('phase_info')
    ht = ht.transmute(pop=ht.phase_info[0], phase_info=ht.phase_info[1])

    if remove_all_ref:
        ht = ht.filter(hl.sum(ht.phase_info.gt_counts.raw[1:]) > 0)

    return ht
def table_aggregate_int_stats(ht_path):
    ht = hl.read_table(ht_path)
    ht.aggregate(
        tuple([
            *(hl.agg.stats(ht[f'i{i}']) for i in range(5)),
            *(hl.agg.stats(hl.sum(ht[f'array{i}'])) for i in range(2)),
            *(hl.agg.explode(lambda elt: hl.agg.stats(elt), ht[f'array{i}']) for i in range(2))
        ]))
def split_multi(ds):
    sm = hl.SplitMulti(ds)
    sm.update_rows(a_index=sm.a_index(), was_split=sm.was_split())
    sm.update_entries(
        GT=hl.downcode(ds.GT, sm.a_index()),
        AD=hl.or_missing(hl.is_defined(ds.AD),
                         [hl.sum(ds.AD) - ds.AD[sm.a_index()], ds.AD[sm.a_index()]]),
        DP=ds.DP
    )
    split_ds = sm.result()
    return split_ds
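# Illustration (made-up numbers) of the AD arithmetic used in update_entries above:
# for a multi-allelic call with AD = [10, 3, 7] split at a_index = 2, the
# bi-allelic AD becomes [sum(AD) - AD[2], AD[2]].
import hail as hl

ad = hl.literal([10, 3, 7])
a_index = 2
print(hl.eval(hl.array([hl.sum(ad) - ad[a_index], ad[a_index]])))  # [13, 7]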
def chet_likelihood_expr(gt_counts, e: float = 1e-6, distance: int = None):
    """
    ### Het model

    | Haplotype | Freq    |
    |-----------|:-------:|
    | aB        | p       |
    | Ab        | q       |
    | ab        | e       |
    | AB        | 1-p-q-e |

    Therefore, we have the following frequencies:

    | v0/v1 | BB                    | Bb                    | bb            |
    |-------|:---------------------:|:---------------------:|:-------------:|
    | AA    | (1-p-q-e)<sup>2</sup> | 2*(1-p-q-e)*q         | q<sup>2</sup> |
    | Aa    | 2*(1-p-q-e)*p         | 2*(p*q + (1-p-q-e)*e) | 2*q*e         |
    | aa    | p<sup>2</sup>         | 2*p*e                 | e<sup>2</sup> |

    :param gt_counts:
    :param e:
    :param distance:
    :return:
    """
    n = 2 * hl.sum(gt_counts)
    p = (gt_counts[3] + gt_counts[4] + gt_counts[7] + 2 * gt_counts[6]) / n
    q = (gt_counts[1] + gt_counts[4] + gt_counts[5] + 2 * gt_counts[2]) / n
    x = 1 - p - q - e

    # Compute log-likelihoods
    def compute_chet_log_like(n, p, q, x):
        res = (hl.cond((p > 0) & (q > 0),
                       hl.fold(lambda i, j: i + j[0] * j[1], 0,
                               hl.zip(gt_counts, [hl.log10(x) * 2,
                                                  hl.log10(2 * x * q),
                                                  hl.log10(q) * 2,
                                                  hl.log10(2 * x * p),
                                                  hl.log10(2 * (p * q + x * e)),
                                                  hl.log10(2 * q * e),
                                                  hl.log10(p) * 2,
                                                  hl.log10(2 * p * e),
                                                  hl.log10(e) * 2])),
                       -1e-31))
        # If desired, add distance posterior based on value derived from regression
        if distance is not None:
            res = res + hl.max(-6, hl.log10(0.03 + 0.03 * hl.log(distance - 1)))

        return res

    return hl.bind(compute_chet_log_like, n, p, q, x)
def with_pl(pl):
    new_exprs = {}
    dropped_fields = ['LA']
    if 'LGT' in fields:
        new_exprs['GT'] = hl.rbind(
            old_entry.LGT,
            lambda lgt: hl.if_else(
                lgt.is_non_ref(),
                hl.downcode(lgt, hl.or_else(local_a_index, hl.len(old_entry.LA))),
                lgt))
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.rbind(
            old_entry.LPGT,
            lambda lpgt: hl.if_else(
                lpgt.is_non_ref(),
                hl.downcode(lpgt, hl.or_else(local_a_index, hl.len(old_entry.LA))),
                lpgt))
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    return (hl.case()
            .when(hl.len(ds.alleles) == 1,
                  old_entry.annotate(**{f[1:]: old_entry[f]
                                        for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                                        if f in fields}).drop(*dropped_fields))
            .when(hl.or_else(old_entry.LGT.is_hom_ref(), False),
                  old_entry.annotate(**{f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                                        for f, e in new_exprs.items()}).drop(*dropped_fields))
            .default(old_entry.annotate(**new_exprs).drop(*dropped_fields)))
def variant_qc_aggregator(mt) -> hl.MatrixTable: """:func:`.variant_qc` as an aggregator.""" bound_exprs = {} gq_dp_exprs = {} def has_field_of_type(name, dtype): return name in mt.entry and mt[name].dtype == dtype if has_field_of_type('DP', hl.tint32): gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select( 'mean', 'stdev', 'min', 'max') if has_field_of_type('GQ', hl.tint32): gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select( 'mean', 'stdev', 'min', 'max') if not has_field_of_type('GT', hl.tcall): raise ValueError( "'variant_qc': expect an entry field 'GT' of type 'call'") bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT'])) bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT'])) n_cols = hl.agg.count() bound_exprs['n_filtered'] = hl.int64(n_cols) - hl.agg.count() bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles) return hl.rbind( hl.struct(**bound_exprs), lambda e1: hl.rbind( hl.case().when( hl.len(mt.alleles) == 2, hl.hardy_weinberg_test( e1.call_stats.homozygote_count[0], e1.call_stats.AC[ 1] - 2 * e1.call_stats.homozygote_count[1], e1. call_stats.homozygote_count[1])).or_missing(), lambda hwe: hl.struct( **{ **gq_dp_exprs, **e1.call_stats, 'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered), 'n_called': e1.n_called, 'n_not_called': e1.n_not_called, 'n_filtered': e1.n_filtered, 'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count), 'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0], 'het_freq_hwe': hwe.het_freq_hwe, 'p_value_hwe': hwe.p_value })))
def test_concordance(self):
    dataset = get_dataset()
    glob_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset)

    self.assertEqual(sum([sum(glob_conc[i]) for i in range(5)]),
                     dataset.count_rows() * dataset.count_cols())

    counts = dataset.aggregate_entries(
        hl.Struct(n_het=agg.count(agg.filter(dataset.GT.is_het(), dataset.GT)),
                  n_hom_ref=agg.count(agg.filter(dataset.GT.is_hom_ref(), dataset.GT)),
                  n_hom_var=agg.count(agg.filter(dataset.GT.is_hom_var(), dataset.GT)),
                  nNoCall=agg.count(agg.filter(hl.is_missing(dataset.GT), dataset.GT))))

    self.assertEqual(glob_conc[0][0], 0)
    self.assertEqual(glob_conc[1][1], counts.nNoCall)
    self.assertEqual(glob_conc[2][2], counts.n_hom_ref)
    self.assertEqual(glob_conc[3][3], counts.n_het)
    self.assertEqual(glob_conc[4][4], counts.n_hom_var)
    [self.assertEqual(glob_conc[i][j], 0) for i in range(5) for j in range(5) if i != j]

    self.assertTrue(
        cols_conc.all(hl.sum(hl.flatten(cols_conc.concordance)) == dataset.count_rows()))
    self.assertTrue(
        rows_conc.all(hl.sum(hl.flatten(rows_conc.concordance)) == dataset.count_cols()))

    cols_conc.write('/tmp/foo.kt', overwrite=True)
    rows_conc.write('/tmp/foo.kt', overwrite=True)
def get_platform_specific_intervals(platform_pc_loadings_ht: hl.Table, threshold: float) -> List[hl.Interval]:
    """
    This takes the platform PC loadings and returns a list of intervals where the sum of
    the loadings is above the given threshold. The experimental / untested idea behind this
    is that those intervals may be problematic on some platforms.

    :param Table platform_pc_loadings_ht: Platform PCA loadings indexed by interval
    :param float threshold: Minimal threshold
    :param str intervals_path: Path to the intervals file to use (default: b37 exome calling intervals)
    :return: List of intervals with PC loadings above the given threshold
    :rtype: list of Interval
    """
    platform_specific_intervals = platform_pc_loadings_ht.filter(
        hl.sum(hl.abs(platform_pc_loadings_ht.loadings)) >= threshold)
    return platform_specific_intervals.interval.collect()
def load_gene_data(directory: str, pheno_key_dict, gene_ht_map_path: str, n_cases: int = -1, n_controls: int = -1, heritability: float = -1.0, saige_version: str = 'NA', inv_normalized: str = 'NA', overwrite: bool = False): output_ht_path = f'{directory}/gene_results.ht' print(f'Loading: {directory}/*.gene.txt ...') types = {f'Nmarker_MACCate_{i}': hl.tint32 for i in range(1, 9)} types.update({ x: hl.tfloat64 for x in ('Pvalue', 'Pvalue_Burden', 'Pvalue_SKAT', 'Pvalue_skato_NA', 'Pvalue_burden_NA', 'Pvalue_skat_NA') }) ht = hl.import_table(f'{directory}/*.gene.txt', delimiter=' ', impute=True, types=types) if n_cases == -1: n_cases = hl.null(hl.tint) if n_controls == -1: n_controls = hl.null(hl.tint) if heritability == -1.0: heritability = hl.null(hl.tfloat) if saige_version == 'NA': saige_version = hl.null(hl.tstr) if inv_normalized == 'NA': inv_normalized = hl.null(hl.tstr) fields = ht.Gene.split('_') gene_ht = hl.read_table(gene_ht_map_path).select('interval').distinct() ht = ht.key_by( gene_id=fields[0], gene_symbol=fields[1], annotation=fields[2], **pheno_key_dict).drop('Gene').naive_coalesce(10).annotate_globals( n_cases=n_cases, n_controls=n_controls, heritability=heritability, saige_version=saige_version, inv_normalized=inv_normalized) ht = ht.annotate(total_variants=hl.sum( [v for k, v in list(ht.row_value.items()) if 'Nmarker' in k]), interval=gene_ht.key_by('gene_id')[ht.gene_id].interval) ht = ht.checkpoint(output_ht_path, overwrite=overwrite, _read_if_exists=not overwrite).drop( 'n_cases', 'n_controls')
def filter_genotypes_ab(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Filter high-quality genotypes based on an allelic-balance expression.
    Expects AD and GT in the entry fields.

    Rules:
      hom_ref: ab <= 0.1
      het:     0.2 <= ab <= 0.8
      hom_var: ab >= 0.9

    :param mt: Input MT
    :return: Genotype-filtered MT
    """
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                           (mt.GT.is_het() & (ab >= 0.2) & (ab <= 0.8)) |
                           (mt.GT.is_hom_var() & (ab >= 0.9)))
    return mt.filter_entries(filter_condition_ab, keep=True)
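# Hedged usage sketch for filter_genotypes_ab: 'my_dataset.mt' is a placeholder
# path, not from the source; the input is assumed to carry GT and AD entries.
import hail as hl

mt = hl.read_matrix_table('my_dataset.mt')
mt = filter_genotypes_ab(mt)
# Entries failing the allele-balance rules are now missing, so downstream
# aggregations such as hl.variant_qc treat them as uncalled.
mt = hl.variant_qc(mt)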
def all_and_leave_one_out(x, pop_array, all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each of the
    leave-one-out populations.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function for all populations. It takes the input array and returns a new value
    :param loo_f: Function for each leave-one-out population. It takes the index of the
        left-out population and the input array, and returns a new value.
    :return: Array of new values for all populations and for each of the leave-one-out populations.
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x), hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
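# Small hl.eval illustration of all_and_leave_one_out with made-up per-population
# allele counts; the population labels are only used for their count.
import hail as hl

x = hl.array([hl.int32(10), hl.int32(20), hl.int32(5)])
pops = hl.literal(['afr', 'nfe', 'eas'])
print(hl.eval(all_and_leave_one_out(x, pops)))
# [35, 25, 15, 30]: the global sum, then the sum leaving out each population in turn.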
def _genotype_fields(self):
    # Convert the mt genotype entries into num_alt, gq, ab, dp, and sample_id.
    is_called = hl.is_defined(self.mt.GT)
    return {
        'num_alt': hl.cond(is_called, self.mt.GT.n_alt_alleles(), -1),
        'gq': hl.cond(is_called, self.mt.GQ, hl.null(hl.tint)),
        'ab': hl.bind(
            lambda total: hl.cond((is_called) & (total != 0) & (hl.len(self.mt.AD) > 1),
                                  hl.float(self.mt.AD[1] / total),
                                  hl.null(hl.tfloat)),
            hl.sum(self.mt.AD)),
        'dp': hl.cond(is_called, hl.int(hl.min(self.mt.DP, 32000)), hl.null(hl.tfloat)),
        'sample_id': self.mt.s
    }
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))

    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                           (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                           (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)

    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)
    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles), P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
def variant_qc(mt, name='variant_qc') -> MatrixTable: """Compute common variant statistics (quality control metrics). .. include:: ../_templates/req_tvariant.rst Examples -------- >>> dataset_result = hl.variant_qc(dataset) Notes ----- This method computes variant statistics from the genotype data, returning a new struct field `name` with the following metrics based on the fields present in the entry schema. If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats` and `gq_stats` are structs with with four fields: - `mean` (``float64``) -- Mean value. - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom). - `min` (``int32``) -- Minimum value. - `max` (``int32``) -- Maximum value. If the dataset does not contain an entry field `GT` of type :py:data:`.tcall`, then an error is raised. The following fields are always computed from `GT`: - `AF` (``array<float64>``) -- Calculated allele frequency, one element per allele, including the reference. Sums to one. Equivalent to `AC` / `AN`. - `AC` (``array<int32>``) -- Calculated allele count, one element per allele, including the reference. Sums to `AN`. - `AN` (``int32``) -- Total number of called alleles. - `homozygote_count` (``array<int32>``) -- Number of homozygotes per allele. One element per allele, including the reference. - `n_called` (``int64``) -- Number of samples with a defined `GT`. - `n_not_called` (``int64``) -- Number of samples with a missing `GT`. - `call_rate` (``float32``) -- Fraction of samples with a defined `GT`. Equivalent to `n_called` / :meth:`.count_cols`. - `n_het` (``int64``) -- Number of heterozygous samples. - `n_non_ref` (``int64``) -- Number of samples with at least one called non-reference allele. - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous samples under Hardy-Weinberg equilibrium. See :func:`.functions.hardy_weinberg_test` for details. - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium. See :func:`.functions.hardy_weinberg_test` for details. Warning ------- `het_freq_hwe` and `p_value_hwe` are calculated as in :func:`.functions.hardy_weinberg_test`, with non-diploid calls (``ploidy != 2``) ignored in the counts. As this test is only statistically rigorous in the biallelic setting, :func:`.variant_qc` sets both fields to missing for multiallelic variants. Consider using :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand. Parameters ---------- mt : :class:`.MatrixTable` Dataset. name : :obj:`str` Name for resulting field. 
Returns ------- :class:`.MatrixTable` """ require_row_key_variant(mt, 'variant_qc') exprs = {} struct_exprs = [] def has_field_of_type(name, dtype): return name in mt.entry and mt[name].dtype == dtype n_samples = mt.count_cols() if has_field_of_type('DP', hl.tint32): exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max') if has_field_of_type('GQ', hl.tint32): exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max') if not has_field_of_type('GT', hl.tcall): raise ValueError(f"'variant_qc': expect an entry field 'GT' of type 'call'") exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT'])) struct_exprs.append(hl.agg.call_stats(mt.GT, mt.alleles)) # the structure of this function makes it easy to add new nested computations def flatten_struct(*struct_exprs): flat = {} for struct in struct_exprs: for k, v in struct.items(): flat[k] = v return hl.struct( **flat, **exprs, ) mt = mt.annotate_rows(**{name: hl.bind(flatten_struct, *struct_exprs)}) hwe = hl.hardy_weinberg_test(mt[name].homozygote_count[0], mt[name].AC[1] - 2 * mt[name].homozygote_count[1], mt[name].homozygote_count[1]) hwe = hwe.select(het_freq_hwe=hwe.het_freq_hwe, p_value_hwe=hwe.p_value) mt = mt.annotate_rows(**{name: mt[name].annotate(n_not_called=n_samples - mt[name].n_called, call_rate=mt[name].n_called / n_samples, n_het=mt[name].n_called - hl.sum(mt[name].homozygote_count), n_non_ref=mt[name].n_called - mt[name].homozygote_count[0], **hl.cond(hl.len(mt.alleles) == 2, hwe, hl.null(hwe.dtype)))}) return mt
def fs_from_sb( sb: Union[hl.expr.ArrayNumericExpression, hl.expr.ArrayExpression], normalize: bool = True, min_cell_count: int = 200, min_count: int = 4, min_p_value: float = 1e-320, ) -> hl.expr.Int64Expression: """ Computes `FS` (Fisher strand balance) annotation from the `SB` (strand balance table) field. `FS` is the phred-scaled value of the double-sided Fisher exact test on strand balance. Using default values will have the same behavior as the GATK implementation, that is: - If sum(counts) > 2*`min_cell_count` (default to GATK value of 200), they are normalized - If sum(counts) < `min_count` (default to GATK value of 4), returns missing - Any p-value < `min_p_value` (default to GATK value of 1e-320) is truncated to that value In addition to the default GATK behavior, setting `normalize` to `False` will perform a chi-squared test for large counts (> `min_cell_count`) instead of normalizing the cell values. .. note:: This function can either take - an array of length four containing the forward and reverse strands' counts of ref and alt alleles: [ref fwd, ref rev, alt fwd, alt rev] - a two dimensional array with arrays of length two, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]] GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FisherStrand.java :param sb: Count of ref/alt reads on each strand :param normalize: Whether to normalize counts is sum(counts) > min_cell_count (normalize=True), or use a chi sq instead of FET (normalize=False) :param min_cell_count: Maximum count for performing a FET :param min_count: Minimum total count to output FS (otherwise null it output) :return: FS value """ if not isinstance(sb, hl.expr.ArrayNumericExpression): sb = hl.bind(lambda x: hl.flatten(x), sb) sb_sum = hl.bind(lambda x: hl.sum(x), sb) # Normalize table if counts get too large if normalize: fs_expr = hl.bind( lambda sb, sb_sum: hl.cond( sb_sum <= 2 * min_cell_count, sb, sb.map(lambda x: hl.int(x / (sb_sum / min_cell_count))), ), sb, sb_sum, ) # FET fs_expr = to_phred( hl.max( hl.fisher_exact_test( fs_expr[0], fs_expr[1], fs_expr[2], fs_expr[3] ).p_value, min_p_value, ) ) else: fs_expr = to_phred( hl.max( hl.contingency_table_test( sb[0], sb[1], sb[2], sb[3], min_cell_count=min_cell_count ).p_value, min_p_value, ) ) # Return null if counts <= `min_count` return hl.or_missing( sb_sum > min_count, hl.max(0, fs_expr) # Needed to avoid -0.0 values )
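# Hedged illustration for fs_from_sb with made-up strand counts
# [ref fwd, ref rev, alt fwd, alt rev]; assumes the companion to_phred helper
# used inside fs_from_sb is importable alongside it.
import hail as hl

sb = hl.literal([40, 38, 2, 20])
print(hl.eval(fs_from_sb(sb)))  # phred-scaled Fisher exact p-value for strand imbalance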
def sample_qc(mt, name='sample_qc') -> MatrixTable: """Compute per-sample metrics useful for quality control. .. include:: ../_templates/req_tvariant.rst Examples -------- Compute sample QC metrics and remove low-quality samples: >>> dataset = hl.sample_qc(dataset, name='sample_qc') >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5)) Notes ----- This method computes summary statistics per sample from a genetic matrix and stores the results as a new column-indexed struct field in the matrix, named based on the `name` parameter. If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats` and `gq_stats` are structs with with four fields: - `mean` (``float64``) -- Mean value. - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom). - `min` (``int32``) -- Minimum value. - `max` (``int32``) -- Maximum value. If the dataset does not contain an entry field `GT` of type :py:data:`.tcall`, then an error is raised. The following fields are always computed from `GT`: - `call_rate` (``float64``) -- Fraction of calls non-missing. - `n_called` (``int64``) -- Number of non-missing calls. - `n_not_called` (``int64``) -- Number of missing calls. - `n_hom_ref` (``int64``) -- Number of homozygous reference calls. - `n_het` (``int64``) -- Number of heterozygous calls. - `n_hom_var` (``int64``) -- Number of homozygous alternate calls. - `n_non_ref` (``int64``) -- Sum of ``n_het`` and ``n_hom_var``. - `n_snp` (``int64``) -- Number of SNP alternate alleles. - `n_insertion` (``int64``) -- Number of insertion alternate alleles. - `n_deletion` (``int64``) -- Number of deletion alternate alleles. - `n_singleton` (``int64``) -- Number of private alleles. - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles. - `n_transversion` (``int64``) -- Number of transversion alternate alleles. - `n_star` (``int64``) -- Number of star (upstream deletion) alleles. - `r_ti_tv` (``float64``) -- Transition/Transversion ratio. - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio. - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio. Missing values ``NA`` may result from division by zero. Parameters ---------- mt : :class:`.MatrixTable` Dataset. name : :obj:`str` Name for resulting field. Returns ------- :class:`.MatrixTable` Dataset with a new column-indexed field `name`. 
""" require_row_key_variant(mt, 'sample_qc') from hail.expr.functions import _num_allele_type , _allele_types allele_types = _allele_types[:] allele_types.extend(['Transition', 'Transversion']) allele_enum = {i: v for i, v in enumerate(allele_types)} allele_ints = {v: k for k, v in allele_enum.items()} def allele_type(ref, alt): return hl.bind(lambda at: hl.cond(at == allele_ints['SNP'], hl.cond(hl.is_transition(ref, alt), allele_ints['Transition'], allele_ints['Transversion']), at), _num_allele_type(ref, alt)) variant_ac = Env.get_uid() variant_atypes = Env.get_uid() mt = mt.annotate_rows(**{variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC, variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))}) exprs = {} def has_field_of_type(name, dtype): return name in mt.entry and mt[name].dtype == dtype if has_field_of_type('DP', hl.tint32): exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max') if has_field_of_type('GQ', hl.tint32): exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max') if not has_field_of_type('GT', hl.tcall): raise ValueError(f"'sample_qc': expect an entry field 'GT' of type 'call'") exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT'])) exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT'])) exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref()) exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het()) exprs['n_singleton'] = hl.agg.sum(hl.sum(hl.range(0, mt['GT'].ploidy).map(lambda i: mt[variant_ac][mt['GT'][i]] == 1))) def get_allele_type(allele_idx): return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1], hl.null(hl.tint32)) exprs['allele_type_counts'] = hl.agg.explode( lambda elt: hl.agg.counter(elt), hl.range(0, mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i]))) mt = mt.annotate_cols(**{name: hl.struct(**exprs)}) zero = hl.int64(0) select_exprs = {} if 'dp_stats' in exprs: select_exprs['dp_stats'] = mt[name].dp_stats if 'gq_stats' in exprs: select_exprs['gq_stats'] = mt[name].gq_stats select_exprs = { **select_exprs, 'call_rate': hl.float64(mt[name].n_called) / (mt[name].n_called + mt[name].n_not_called), 'n_called': mt[name].n_called, 'n_not_called': mt[name].n_not_called, 'n_hom_ref': mt[name].n_hom_ref, 'n_het': mt[name].n_het, 'n_hom_var': mt[name].n_called - mt[name].n_hom_ref - mt[name].n_het, 'n_non_ref': mt[name].n_called - mt[name].n_hom_ref, 'n_singleton': mt[name].n_singleton, 'n_snp': mt[name].allele_type_counts.get(allele_ints["Transition"], zero) + \ mt[name].allele_type_counts.get(allele_ints["Transversion"], zero), 'n_insertion': mt[name].allele_type_counts.get(allele_ints["Insertion"], zero), 'n_deletion': mt[name].allele_type_counts.get(allele_ints["Deletion"], zero), 'n_transition': mt[name].allele_type_counts.get(allele_ints["Transition"], zero), 'n_transversion': mt[name].allele_type_counts.get(allele_ints["Transversion"], zero), 'n_star': mt[name].allele_type_counts.get(allele_ints["Star"], zero) } mt = mt.annotate_cols(**{name: mt[name].select(**select_exprs)}) mt = mt.annotate_cols(**{name: mt[name].annotate( r_ti_tv=divide_null(hl.float64(mt[name].n_transition), mt[name].n_transversion), r_het_hom_var=divide_null(hl.float64(mt[name].n_het), mt[name].n_hom_var), r_insertion_deletion=divide_null(hl.float64(mt[name].n_insertion), mt[name].n_deletion) )}) mt = mt.drop(variant_ac, variant_atypes) return mt
def n_discordant(counter):
    return hl.sum(
        hl.array(counter)
        .filter(lambda tup: ~hl.literal({i**2 for i in range(5)}).contains(tup[0]))
        .map(lambda tup: tup[1]))
def create_binned_concordance(data_type: str, truth_sample: str, metric: str, nbins: int, overwrite: bool) -> None: """ Creates and writes a concordance table binned by rank (both absolute and relative) for a given data type, truth sample and metric. :param str data_type: One 'exomes' or 'genomes' :param str truth_sample: Which truth sample concordance to load :param str metric: One of the evaluation metrics (or a RF hash) :param int nbins: Number of bins for the rank :param bool overwrite: Whether to overwrite existing table :return: Nothing -- just writes the table :rtype: None """ if hl.hadoop_exists( binned_concordance_path(data_type, truth_sample, metric) + '/_SUCCESS') and not overwrite: logger.warn( f"Skipping binned concordance creation as {binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False" ) else: ht = hl.read_table( annotations_ht_path(data_type, f'{truth_sample}_concordance')) # Remove 1bp indels for syndip as cannot be trusted if truth_sample == 'syndip': ht = ht.filter( hl.is_indel(ht.alleles[0], ht.alleles[1]) & (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1), keep=False) high_conf_intervals = hl.import_locus_intervals( syndip_high_conf_regions_bed_path) else: high_conf_intervals = hl.import_locus_intervals( NA12878_high_conf_regions_bed_path) lcr = hl.import_locus_intervals(lcr_intervals_path) segdup = hl.import_locus_intervals(segdup_intervals_path) ht = ht.filter( hl.is_defined(high_conf_intervals[ht.locus]) & hl.is_missing(lcr[ht.locus]) & hl.is_missing(segdup[ht.locus])) if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']: metric_ht = hl.read_table(score_ranking_path(data_type, metric)) else: metric_ht = hl.read_table( rf_path(data_type, 'rf_result', run_hash=metric)) metric_snvs, metrics_indels = metric_ht.aggregate([ hl.agg.count_where( hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])), hl.agg.count_where( ~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])) ]) snvs, indels = ht.aggregate([ hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])), hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1])) ]) ht = ht.annotate_globals(global_counts=hl.struct( snvs=metric_snvs, indels=metrics_indels), counts=hl.struct(snvs=snvs, indels=indels)) ht = ht.annotate( snv=hl.is_snp(ht.alleles[0], ht.alleles[1]), score=metric_ht[ht.key].score, global_rank=metric_ht[ht.key].rank, # TP => allele is found in both data sets n_tp=ht.concordance[3][3] + ht.concordance[3][4] + ht.concordance[4][3] + ht.concordance[4][4], # FP => allele is found only in test data set n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]), # FN => allele is found only in truth data set n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4]))) ht = add_rank(ht, -1.0 * ht.score) ht = ht.annotate(rank=[ hl.tuple([ 'global_rank', (ht.global_rank + 1) / hl.cond(ht.snv, ht.globals.global_counts.snvs, ht.globals.global_counts.indels) ]), hl.tuple([ 'truth_sample_rank', (ht.rank + 1) / hl.cond( ht.snv, ht.globals.counts.snvs, ht.globals.counts.indels) ]) ]) ht = ht.explode(ht.rank) ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins)) ht = ht.group_by('rank_name', 'snv', 'bin').aggregate( # Look at site-level metrics -> tp > fp > fn -- only important for multi-sample comparisons tp=hl.agg.count_where(ht.n_tp > 0), fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)), fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0) & (ht.n_fn > 0)), min_score=hl.agg.min(ht.score), max_score=hl.agg.max(ht.score), 
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric), overwrite=overwrite)
def variant_qc(mt, name='variant_qc') -> MatrixTable: """Compute common variant statistics (quality control metrics). .. include:: ../_templates/req_tvariant.rst Examples -------- >>> dataset_result = hl.variant_qc(dataset) Notes ----- This method computes variant statistics from the genotype data, returning a new struct field `name` with the following metrics based on the fields present in the entry schema. If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats` and `gq_stats` are structs with with four fields: - `mean` (``float64``) -- Mean value. - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom). - `min` (``int32``) -- Minimum value. - `max` (``int32``) -- Maximum value. If the dataset does not contain an entry field `GT` of type :py:data:`.tcall`, then an error is raised. The following fields are always computed from `GT`: - `AF` (``array<float64>``) -- Calculated allele frequency, one element per allele, including the reference. Sums to one. Equivalent to `AC` / `AN`. - `AC` (``array<int32>``) -- Calculated allele count, one element per allele, including the reference. Sums to `AN`. - `AN` (``int32``) -- Total number of called alleles. - `homozygote_count` (``array<int32>``) -- Number of homozygotes per allele. One element per allele, including the reference. - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered. Equivalent to `n_called` / :meth:`.count_cols`. - `n_called` (``int64``) -- Number of samples with a defined `GT`. - `n_not_called` (``int64``) -- Number of samples with a missing `GT`. - `n_filtered` (``int64``) -- Number of filtered entries. - `n_het` (``int64``) -- Number of heterozygous samples. - `n_non_ref` (``int64``) -- Number of samples with at least one called non-reference allele. - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous samples under Hardy-Weinberg equilibrium. See :func:`.functions.hardy_weinberg_test` for details. - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium. See :func:`.functions.hardy_weinberg_test` for details. Warning ------- `het_freq_hwe` and `p_value_hwe` are calculated as in :func:`.functions.hardy_weinberg_test`, with non-diploid calls (``ploidy != 2``) ignored in the counts. As this test is only statistically rigorous in the biallelic setting, :func:`.variant_qc` sets both fields to missing for multiallelic variants. Consider using :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand. Parameters ---------- mt : :class:`.MatrixTable` Dataset. name : :obj:`str` Name for resulting field. 
Returns ------- :class:`.MatrixTable` """ require_row_key_variant(mt, 'variant_qc') bound_exprs = {} gq_dp_exprs = {} def has_field_of_type(name, dtype): return name in mt.entry and mt[name].dtype == dtype if has_field_of_type('DP', hl.tint32): gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select( 'mean', 'stdev', 'min', 'max') if has_field_of_type('GQ', hl.tint32): gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select( 'mean', 'stdev', 'min', 'max') if not has_field_of_type('GT', hl.tcall): raise ValueError( f"'variant_qc': expect an entry field 'GT' of type 'call'") bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT'])) bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT'])) bound_exprs['n_filtered'] = mt.count_cols(_localize=False) - hl.agg.count() bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles) result = hl.rbind( hl.struct(**bound_exprs), lambda e1: hl.rbind( hl.case().when( hl.len(mt.alleles) == 2, hl.hardy_weinberg_test( e1.call_stats.homozygote_count[0], e1.call_stats.AC[ 1] - 2 * e1.call_stats.homozygote_count[1], e1. call_stats.homozygote_count[1])).or_missing(), lambda hwe: hl.struct( **{ **gq_dp_exprs, **e1.call_stats, 'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered), 'n_called': e1.n_called, 'n_not_called': e1.n_not_called, 'n_filtered': e1.n_filtered, 'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count), 'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0], 'het_freq_hwe': hwe.het_freq_hwe, 'p_value_hwe': hwe.p_value }))) return mt.annotate_rows(**{name: result})
def merge_sample_qc_expr( sample_qc_exprs: List[hl.expr.StructExpression], ) -> hl.expr.StructExpression: """ Create an expression that merges results from non-overlapping strata of hail.sample_qc. E.g.: - Compute autosomes and sex chromosomes metrics separately, then merge results - Compute bi-allelic and multi-allelic metrics separately, then merge results Note regarding the merging of ``dp_stats`` and ``gq_stats``: Because ``n`` is needed to aggregate ``stdev``, ``n_called`` is used for this purpose. This should work very well on a standard GATK VCF and it essentially assumes that: - samples that are called have `DP` and `GQ` fields - samples that are not called do not have `DP` and `GQ` fields Even if these assumptions are broken for some genotypes, it shouldn't matter too much. :param sample_qc_exprs: List of sample QC struct expressions for each stratification :return: Combined sample QC results """ # List of metrics that can be aggregated by summing additive_metrics = ([ "n_called", "n_not_called", "n_filtered", "n_hom_ref", "n_het", "n_hom_var", "n_non_ref", "n_snp", "n_insertion", "n_deletion", "n_singleton", "n_transition", "n_transversion", "n_star", "n_singleton_ti", "n_singleton_tv", ] + ["gq_over_" + f"{GQ}" for GQ in range(0, 70, 10)] + ["dp_over_" + f"{DP}" for DP in range(0, 40, 10)]) # List of metrics that are ratio of summed metrics (name, nominator, denominator) ratio_metrics = [ ("call_rate", "n_called", "n_not_called"), ("r_ti_tv", "n_transition", "n_transversion"), ("r_ti_tv_singleton", "n_singleton_ti", "n_singleton_tv"), ("r_het_hom_var", "n_het", "n_hom_var"), ("r_insertion_deletion", "n_insertion", "n_deletion"), ] # List of metrics that are struct generated by a stats counter stats_metrics = ["gq_stats", "dp_stats"] # Gather metrics present in sample qc fields sample_qc_fields = set(sample_qc_exprs[0]) for sample_qc_expr in sample_qc_exprs[1:]: sample_qc_fields = sample_qc_fields.union(set(sample_qc_expr)) # Merge additive metrics in sample qc fields merged_exprs = { metric: hl.sum([sample_qc_expr[metric] for sample_qc_expr in sample_qc_exprs]) for metric in additive_metrics if metric in sample_qc_fields } # Merge ratio metrics in sample qc fields merged_exprs.update({ metric: hl.float64(divide_null(merged_exprs[nom], merged_exprs[denom])) for metric, nom, denom in ratio_metrics if nom in sample_qc_fields and denom in sample_qc_fields }) # Merge stats counter metrics in sample qc fields # Use n_called as n for DP and GQ stats if "n_called" in sample_qc_fields: merged_exprs.update({ metric: merge_stats_counters_expr([ sample_qc_expr[metric].annotate(n=sample_qc_expr.n_called) for sample_qc_expr in sample_qc_exprs ]).drop("n") for metric in stats_metrics if metric in sample_qc_fields }) return hl.struct(**merged_exprs)
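# Hedged sketch for merge_sample_qc_expr: run hl.sample_qc separately on
# autosomes and chrX, then merge the two strata. 'my_dataset.mt' is a
# placeholder and the dataset is assumed to have DP and GQ entries; the stats
# structs are dropped to keep the sketch minimal (merging them would also need
# the companion merge_stats_counters_expr helper).
import hail as hl

mt = hl.read_matrix_table('my_dataset.mt')
auto_qc = hl.sample_qc(mt.filter_rows(mt.locus.in_autosome())).cols()
x_qc = hl.sample_qc(mt.filter_rows(mt.locus.in_x_nonpar())).cols()
merged = auto_qc.select(sample_qc=merge_sample_qc_expr([
    auto_qc.sample_qc.drop('dp_stats', 'gq_stats'),
    x_qc[auto_qc.key].sample_qc.drop('dp_stats', 'gq_stats'),
]))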
def same_hap_likelihood_expr(gt_counts, e: float = 1e-6, distance: int = None):
    """
    ### Same haplotype model

    | Haplotype | Frequency |
    |-----------|:---------:|
    | aB        | p         |
    | Ab        | e         |
    | ab        | q         |
    | AB        | 1-p-q-e   |

    With: p >= q and p = 0 if in perfect LD.

    Therefore, we have the following frequencies:

    | v0/v1 | BB                    | Bb                    | bb            |
    |-------|:---------------------:|:---------------------:|:-------------:|
    | AA    | (1-p-q-e)<sup>2</sup> | 2*(1-p-q-e)*e         | e<sup>2</sup> |
    | Ab    | 2*(1-p-q-e)*p         | 2*(p*e + (1-p-q-e)*q) | 2*q*e         |
    | ab    | p<sup>2</sup>         | 2*p*q                 | q<sup>2</sup> |

    :param gt_counts:
    :param e:
    :param distance:
    :return:
    """
    n = 2 * hl.sum(gt_counts)
    f1 = hl.sum(gt_counts[3:6] + 2 * hl.sum(gt_counts[6:])) / n
    f2 = (gt_counts[1] + gt_counts[4] + gt_counts[7] + 2 * (gt_counts[2] + gt_counts[5] + gt_counts[8])) / n
    p = hl.cond(f1 > f2, f1, f2)
    q = (gt_counts[4] + gt_counts[5] + gt_counts[7] + 2 * gt_counts[8]) / n
    x = 1 - p - q - e

    # Compute log-likelihoods
    def compute_same_hap_log_like(n, p, q, x):
        res = (hl.cond(q > 0,
                       hl.fold(lambda i, j: i + j[0] * j[1], 0.0,
                               hl.zip(gt_counts, [hl.log10(x) * 2,
                                                  hl.log10(2 * x * e),
                                                  hl.log10(e) * 2,
                                                  hl.log10(2 * x * p),
                                                  hl.log10(2 * (p * e + x * q)),
                                                  hl.log10(2 * q * e),
                                                  hl.log10(p) * 2,
                                                  hl.log10(2 * p * q),
                                                  hl.log10(q) * 2])),
                       -1e31  # Very large negative value if no q is present
                       ))
        # If desired, add distance posterior based on value derived from regression
        if distance is not None:
            res = res + hl.max(-6, hl.log10(0.97 - 0.03 * hl.log(distance + 1)))

        return res

    return hl.bind(compute_same_hap_log_like, n, p, q, x)
def de_novo(mt: MatrixTable, pedigree: Pedigree, pop_frequency_prior, *, min_gq: int = 20, min_p: float = 0.05, max_parent_ab: float = 0.05, min_child_ab: float = 0.20, min_dp_ratio: float = 0.10) -> Table: r"""Call putative *de novo* events from trio data. .. include:: ../_templates/req_tstring.rst .. include:: ../_templates/req_tvariant.rst .. include:: ../_templates/req_biallelic.rst Examples -------- Call de novo events: >>> pedigree = hl.Pedigree.read('data/trios.fam') >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True) >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles') >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF) Notes ----- This method assumes the GATK high-throughput sequencing fields exist: `GT`, `AD`, `DP`, `GQ`, `PL`. This method replicates the functionality of `Kaitlin Samocha's de novo caller <https://github.com/ksamocha/de_novo_scripts>`__. The version corresponding to git commit ``bde3e40`` is implemented in Hail with her permission and assistance. This method produces a :class:`.Table` with the following fields: - `locus` (``locus``) -- Variant locus. - `alleles` (``array<str>``) -- Variant alleles. - `id` (``str``) -- Proband sample ID. - `prior` (``float64``) -- Site frequency prior. It is the maximum of: the computed dataset alternate allele frequency, the `pop_frequency_prior` parameter, and the global prior ``1 / 3e7``. - `proband` (``struct``) -- Proband column fields from `mt`. - `father` (``struct``) -- Father column fields from `mt`. - `mother` (``struct``) -- Mother column fields from `mt`. - `proband_entry` (``struct``) -- Proband entry fields from `mt`. - `father_entry` (``struct``) -- Father entry fields from `mt`. - `proband_entry` (``struct``) -- Mother entry fields from `mt`. - `is_female` (``bool``) -- ``True`` if proband is female. - `p_de_novo` (``float64``) -- Unfiltered posterior probability that the event is *de novo* rather than a missed heterozygous event in a parent. - `confidence` (``str``) Validation confidence. One of: ``'HIGH'``, ``'MEDIUM'``, ``'LOW'``. The key of the table is ``['locus', 'alleles', 'id']``. The model looks for de novo events in which both parents are homozygous reference and the proband is a heterozygous. The model makes the simplifying assumption that when this configuration ``x = (AA, AA, AB)`` of calls occurs, exactly one of the following is true: - ``d``: a de novo mutation occurred in the proband and all calls are accurate. - ``m``: at least one parental allele is actually heterozygous and the proband call is accurate. We can then estimate the posterior probability of a de novo mutation as: .. math:: \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)} Applying Bayes rule to the numerator and denominator yields .. math:: \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) + \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)} The prior on de novo mutation is estimated from the rate in the literature: .. math:: \mathrm{P}(d) = \frac{1 \text{mutation}}{30,000,000\, \text{bases}} The prior used for at least one alternate allele between the parents depends on the alternate allele frequency: .. math:: \mathrm{P}(m) = 1 - (1 - AF)^4 The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)` are computed from the PL (genotype likelihood) fields using these factorizations: .. 
math:: \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big( &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\ \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\ \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) \Big) .. math:: \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( & \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB) \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\ + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AB) \Big) \\ \cdot \, &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) (Technically, the second factorization assumes there is exactly (rather than at least) one alternate allele among the parents, which may be justified on the grounds that it is typically the most likely case by far.) While this posterior probability is a good metric for grouping putative de novo mutations by validation likelihood, there exist error modes in high-throughput sequencing data that are not appropriately accounted for by the phred-scaled genotype likelihoods. To this end, a number of hard filters are applied in order to assign validation likelihood. These filters are different for SNPs and insertions/deletions. In the below rules, the following variables are used: - ``DR`` refers to the ratio of the read depth in the proband to the combined read depth in the parents. - ``AB`` refers to the read allele balance of the proband (number of alternate reads divided by total reads). - ``AC`` refers to the count of alternate alleles across all individuals in the dataset at the site. - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`. - ``min_p`` refers to the ``min_p`` function parameter. HIGH-quality SNV: .. code-block:: text p > 0.99 && AB > 0.3 && DR > 0.2 or p > 0.99 && AB > 0.3 && AC == 1 MEDIUM-quality SNV: .. code-block:: text p > 0.5 && AB > 0.3 or p > 0.5 && AB > 0.2 && AC == 1 LOW-quality SNV: .. code-block:: text p > min_p && AB > 0.2 HIGH-quality indel: .. code-block:: text p > 0.99 && AB > 0.3 && DR > 0.2 or p > 0.99 && AB > 0.3 && AC == 1 MEDIUM-quality indel: .. code-block:: text p > 0.5 && AB > 0.3 or p > 0.5 && AB > 0.2 and AC == 1 LOW-quality indel: .. code-block:: text p > min_p && AB > 0.2 Additionally, de novo candidates are not considered if the proband GQ is smaller than the ``min_gq`` parameter, if the proband allele balance is lower than the ``min_child_ab`` parameter, if the depth ratio between the proband and parents is smaller than the ``min_depth_ratio`` parameter, or if the allele balance in a parent is above the ``max_parent_ab`` parameter. Parameters ---------- mt : :class:`.MatrixTable` High-throughput sequencing dataset. pedigree : :class:`.Pedigree` Sample pedigree. pop_frequency_prior : :class:`.Float64Expression` Expression for population alternate allele frequency prior. min_gq Minimum proband GQ to be considered for *de novo* calling. min_p Minimum posterior probability to be considered for *de novo* calling. max_parent_ab Maximum parent allele balance. min_child_ab Minimum proband allele balance/ min_dp_ratio Minimum ratio between proband read depth and parental read depth. 
Returns ------- :class:`.Table` """ DE_NOVO_PRIOR = 1 / 30000000 MIN_POP_PRIOR = 100 / 30000000 required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'} missing_fields = required_entry_fields - set(mt.entry) if missing_fields: raise ValueError(f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, " f"missing {missing_fields}") mt = mt.annotate_rows(__prior=pop_frequency_prior, __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()), __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT))) # subtract 1 from __alt_alleles to correct for the observed genotype mt = mt.annotate_rows(__site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles, mt.__prior, MIN_POP_PRIOR)) mt = require_biallelic(mt, 'de_novo') # FIXME check that __site_freq is between 0 and 1 when possible in expr tm = trio_matrix(mt, pedigree, complete_trios=True) autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female) hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female hemi_mt = tm.locus.in_mito() & tm.is_female is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1]) n_alt_alleles = tm.__alt_alleles prior = tm.__site_freq het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref() & tm.mother_entry.GT.is_hom_ref() kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr)) kid = tm.proband_entry dad = tm.father_entry mom = tm.mother_entry kid_linear_pl = 10 ** (-kid.PL / 10) kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl) dad_linear_pl = 10 ** (-dad.PL / 10) dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl) mom_linear_pl = 10 ** (-mom.PL / 10) mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl) kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD) dp_ratio = kid.DP / (dad.DP + mom.DP) def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio): p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR p_het_in_parent = 1 - (1 - prior) ** 4 p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het) def solve(p_de_novo): return ( hl.case() .when(kid.GQ < min_gq, failure) .when((kid.DP / (dad.DP + mom.DP) < min_dp_ratio) | ~(kid_ad_ratio >= min_child_ab), failure) .when((hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure) .when((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure) .when(p_de_novo < min_p, failure) .when(~is_snp, hl.case() .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')) .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')) .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2), hl.struct(p_de_novo=p_de_novo, confidence='LOW')) .or_missing()) .default(hl.case() .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')) .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')) .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2), hl.struct(p_de_novo=p_de_novo, confidence='LOW')) .or_missing() ) ) return hl.bind(solve, p_de_novo) def 
call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio): p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR p_het_in_parent = 1 - (1 - prior) ** 4 p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het) def solve(p_de_novo): return ( hl.case() .when(kid.GQ < min_gq, failure) .when((kid.DP / (parent.DP) < min_dp_ratio) | (kid_ad_ratio < min_child_ab), failure) .when((hl.sum(parent.AD) == 0), failure) .when(parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure) .when(p_de_novo < min_p, failure) .when(~is_snp, hl.case() .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')) .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')) .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.3), hl.struct(p_de_novo=p_de_novo, confidence='LOW')) .or_missing()) .default(hl.case() .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')) .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')) .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2), hl.struct(p_de_novo=p_de_novo, confidence='LOW')) .or_missing() ) ) return hl.bind(solve, p_de_novo) de_novo_call = ( hl.case() .when(~het_hom_hom | kid_ad_fail, failure) .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio)) .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio)) .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio)) .or_missing() ) tm = tm.annotate_entries(__call=de_novo_call) tm = tm.filter_entries(hl.is_defined(tm.__call)) entries = tm.entries() return (entries.select('__site_freq', 'proband', 'father', 'mother', 'proband_entry', 'father_entry', 'mother_entry', 'is_female', **entries.__call) .rename({'__site_freq': 'prior'}))
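# --- Hedged usage sketch (added for illustration; not part of the Hail source above). ---
# One plausible way to call `hl.de_novo` and post-process its result. The file paths and the
# frequency table are hypothetical placeholders; the call itself mirrors the docstring example.
import hail as hl

mt = hl.read_matrix_table('data/trios.mt')  # hypothetical dataset with GT, AD, DP, GQ, PL entry fields
pedigree = hl.Pedigree.read('data/trios.fam')
priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')

dn = hl.de_novo(mt, pedigree, pop_frequency_prior=priors[mt.row_key].AF)

# Keep only high-confidence candidates and count them per proband.
dn_high = dn.filter(dn.confidence == 'HIGH')
dn_high.group_by(dn_high.id).aggregate(n_de_novo=hl.agg.count()).show()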
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al., 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``.
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants # block variants for each phenotype n_phenotypes = mt.count_cols() mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=[ hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta) for i in range(n_blocks)]) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ 
mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n)/M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=[ hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) for i in range(n_blocks)]) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0/(mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
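# --- Hedged illustration (added; plain NumPy rather than Hail expressions). ---
# The `__step*_jackknife_*` annotations above implement a delete-one-block jackknife:
# pseudovalues are formed as n_blocks * full_estimate - (n_blocks - 1) * leave_one_block_out,
# and their mean and (sum(x^2) - sum(x)^2 / n) / (n - 1) / n give the point estimate and its
# variance. The toy numbers below are made up purely to show the arithmetic.
import numpy as np

def block_jackknife(full_estimate, loo_estimates):
    """Return (jackknife mean, jackknife variance) from leave-one-block-out estimates."""
    loo = np.asarray(loo_estimates, dtype=float)
    n_blocks = loo.shape[0]
    pseudovalues = n_blocks * full_estimate - (n_blocks - 1) * loo
    mean = pseudovalues.mean()
    variance = ((pseudovalues ** 2).sum()
                - pseudovalues.sum() ** 2 / n_blocks) / (n_blocks - 1) / n_blocks
    return mean, variance

est, var = block_jackknife(0.25, [0.24, 0.26, 0.25, 0.23, 0.27])
print(est, var ** 0.5)  # point estimate and its standard error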
def de_novo(mt: MatrixTable, pedigree: Pedigree, pop_frequency_prior, *, min_gq: int = 20, min_p: float = 0.05, max_parent_ab: float = 0.05, min_child_ab: float = 0.20, min_dp_ratio: float = 0.10) -> Table: r"""Call putative *de novo* events from trio data. .. include:: ../_templates/req_tstring.rst .. include:: ../_templates/req_tvariant.rst .. include:: ../_templates/req_biallelic.rst Examples -------- Call de novo events: >>> pedigree = hl.Pedigree.read('data/trios.fam') >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True) >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles') >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF) Notes ----- This method assumes the GATK high-throughput sequencing fields exist: `GT`, `AD`, `DP`, `GQ`, `PL`. This method replicates the functionality of `Kaitlin Samocha's de novo caller <https://github.com/ksamocha/de_novo_scripts>`__. The version corresponding to git commit ``bde3e40`` is implemented in Hail with her permission and assistance. This method produces a :class:`.Table` with the following fields: - `locus` (``locus``) -- Variant locus. - `alleles` (``array<str>``) -- Variant alleles. - `id` (``str``) -- Proband sample ID. - `prior` (``float64``) -- Site frequency prior. It is the maximum of: the computed dataset alternate allele frequency, the `pop_frequency_prior` parameter, and the global prior ``1 / 3e7``. - `proband` (``struct``) -- Proband column fields from `mt`. - `father` (``struct``) -- Father column fields from `mt`. - `mother` (``struct``) -- Mother column fields from `mt`. - `proband_entry` (``struct``) -- Proband entry fields from `mt`. - `father_entry` (``struct``) -- Father entry fields from `mt`. - `mother_entry` (``struct``) -- Mother entry fields from `mt`. - `is_female` (``bool``) -- ``True`` if proband is female. - `p_de_novo` (``float64``) -- Unfiltered posterior probability that the event is *de novo* rather than a missed heterozygous event in a parent. - `confidence` (``str``) -- Validation confidence. One of: ``'HIGH'``, ``'MEDIUM'``, ``'LOW'``. The key of the table is ``['locus', 'alleles', 'id']``. The model looks for de novo events in which both parents are homozygous reference and the proband is heterozygous. The model makes the simplifying assumption that when this configuration ``x = (AA, AA, AB)`` of calls occurs, exactly one of the following is true: - ``d``: a de novo mutation occurred in the proband and all calls are accurate. - ``m``: at least one parental allele is actually heterozygous and the proband call is accurate. We can then estimate the posterior probability of a de novo mutation as: .. math:: \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)} Applying Bayes' rule to the numerator and denominator yields .. math:: \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) + \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)} The prior on de novo mutation is estimated from the rate in the literature: .. math:: \mathrm{P}(d) = \frac{1\,\text{mutation}}{30,000,000\,\text{bases}} The prior used for at least one alternate allele between the parents depends on the alternate allele frequency: .. math:: \mathrm{P}(m) = 1 - (1 - AF)^4 The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)` are computed from the PL (genotype likelihood) fields using these factorizations: ..
math:: \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big( &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\ \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\ \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) \Big) .. math:: \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( & \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB) \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\ + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AB) \Big) \\ \cdot \, &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) (Technically, the second factorization assumes there is exactly (rather than at least) one alternate allele among the parents, which may be justified on the grounds that it is typically the most likely case by far.) While this posterior probability is a good metric for grouping putative de novo mutations by validation likelihood, there exist error modes in high-throughput sequencing data that are not appropriately accounted for by the phred-scaled genotype likelihoods. To this end, a number of hard filters are applied in order to assign validation likelihood. These filters are different for SNPs and insertions/deletions. In the rules below, the following variables are used: - ``DR`` refers to the ratio of the read depth in the proband to the combined read depth in the parents. - ``AB`` refers to the read allele balance of the proband (number of alternate reads divided by total reads). - ``AC`` refers to the count of alternate alleles across all individuals in the dataset at the site. - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`. - ``min_p`` refers to the ``min_p`` function parameter. HIGH-quality SNV: .. code-block:: text p > 0.99 && AB > 0.3 && DR > 0.2 or p > 0.99 && AB > 0.3 && AC == 1 MEDIUM-quality SNV: .. code-block:: text p > 0.5 && AB > 0.3 or p > 0.5 && AB > 0.2 && AC == 1 LOW-quality SNV: .. code-block:: text p > min_p && AB > 0.2 HIGH-quality indel: .. code-block:: text p > 0.99 && AB > 0.3 && DR > 0.2 or p > 0.99 && AB > 0.3 && AC == 1 MEDIUM-quality indel: .. code-block:: text p > 0.5 && AB > 0.3 or p > 0.5 && AB > 0.2 && AC == 1 LOW-quality indel: .. code-block:: text p > min_p && AB > 0.2 Additionally, de novo candidates are not considered if the proband GQ is smaller than the ``min_gq`` parameter, if the proband allele balance is lower than the ``min_child_ab`` parameter, if the depth ratio between the proband and parents is smaller than the ``min_dp_ratio`` parameter, or if the allele balance in a parent is above the ``max_parent_ab`` parameter. Parameters ---------- mt : :class:`.MatrixTable` High-throughput sequencing dataset. pedigree : :class:`.Pedigree` Sample pedigree. pop_frequency_prior : :class:`.Float64Expression` Expression for population alternate allele frequency prior. min_gq Minimum proband GQ to be considered for *de novo* calling. min_p Minimum posterior probability to be considered for *de novo* calling. max_parent_ab Maximum parent allele balance. min_child_ab Minimum proband allele balance. min_dp_ratio Minimum ratio between proband read depth and parental read depth.
Returns ------- :class:`.Table` """ DE_NOVO_PRIOR = 1 / 30000000 MIN_POP_PRIOR = 100 / 30000000 required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'} missing_fields = required_entry_fields - set(mt.entry) if missing_fields: raise ValueError( f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, " f"missing {missing_fields}") mt = mt.annotate_rows(__prior=pop_frequency_prior, __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()), __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT))) # subtract 1 from __alt_alleles to correct for the observed genotype mt = mt.annotate_rows( __site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles, mt.__prior, MIN_POP_PRIOR)) mt = require_biallelic(mt, 'de_novo') # FIXME check that __site_freq is between 0 and 1 when possible in expr tm = trio_matrix(mt, pedigree, complete_trios=True) autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female) hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female hemi_mt = tm.locus.in_mito() & tm.is_female is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1]) n_alt_alleles = tm.__alt_alleles prior = tm.__site_freq het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref( ) & tm.mother_entry.GT.is_hom_ref() kid_ad_fail = tm.proband_entry.AD[1] / hl.sum( tm.proband_entry.AD) < min_child_ab failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr)) kid = tm.proband_entry dad = tm.father_entry mom = tm.mother_entry kid_linear_pl = 10**(-kid.PL / 10) kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl) dad_linear_pl = 10**(-dad.PL / 10) dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl) mom_linear_pl = 10**(-mom.PL / 10) mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl) kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD) dp_ratio = kid.DP / (dad.DP + mom.DP) def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio): p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR p_het_in_parent = 1 - (1 - prior)**4 p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het) def solve(p_de_novo): return (hl.case().when(kid.GQ < min_gq, failure).when( (kid.DP / (dad.DP + mom.DP) < min_dp_ratio) | ~(kid_ad_ratio >= min_child_ab), failure).when( (hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0), failure).when( (mom.AD[1] / hl.sum(mom.AD) > max_parent_ab) | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab), failure).when(p_de_novo < min_p, failure).when( ~is_snp, hl.case().when( (p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')).when( (p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5), hl.struct( p_de_novo=p_de_novo, confidence='MEDIUM')).when( (p_de_novo > 0.05) & (kid_ad_ratio > 0.2), hl.struct( p_de_novo=p_de_novo, confidence='LOW')). or_missing()).default(hl.case().when( ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')).when( (p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)), hl.struct( p_de_novo=p_de_novo, confidence='MEDIUM')).when( (p_de_novo > 0.05) & (kid_ad_ratio > 0.2), hl.struct( p_de_novo=p_de_novo, confidence='LOW')). 
or_missing())) return hl.bind(solve, p_de_novo) def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio): p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR p_het_in_parent = 1 - (1 - prior)**4 p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het) def solve(p_de_novo): return (hl.case().when(kid.GQ < min_gq, failure).when( (kid.DP / (parent.DP) < min_dp_ratio) | (kid_ad_ratio < min_child_ab), failure).when((hl.sum(parent.AD) == 0), failure).when( parent.AD[1] / hl.sum(parent.AD) > max_parent_ab, failure).when(p_de_novo < min_p, failure).when( ~is_snp, hl.case().when( (p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1), hl.struct( p_de_novo=p_de_novo, confidence='HIGH')).when( (p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')).when( (p_de_novo > 0.05) & (kid_ad_ratio > 0.3), hl.struct( p_de_novo=p_de_novo, confidence='LOW')). or_missing()).default( hl.case().when( ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2)) | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1)) | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)), hl.struct(p_de_novo=p_de_novo, confidence='HIGH')).when( (p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)), hl.struct(p_de_novo=p_de_novo, confidence='MEDIUM')). when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2), hl.struct(p_de_novo=p_de_novo, confidence='LOW')).or_missing())) return hl.bind(solve, p_de_novo) de_novo_call = (hl.case().when(~het_hom_hom | kid_ad_fail, failure).when( autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio)).when( hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio)).when( hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio)).or_missing()) tm = tm.annotate_entries(__call=de_novo_call) tm = tm.filter_entries(hl.is_defined(tm.__call)) entries = tm.entries() return (entries.select('__site_freq', 'proband', 'father', 'mother', 'proband_entry', 'father_entry', 'mother_entry', 'is_female', **entries.__call).rename({'__site_freq': 'prior'}))
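# --- Hedged toy illustration (added; plain Python rather than Hail expressions). ---
# The arithmetic inside `call_auto` above, worked for a single trio with made-up PL values:
# convert phred-scaled likelihoods to normalized genotype probabilities, then apply the
# de novo vs. missed-parental-het posterior described in the docstring.
def pl_to_pp(pl):
    linear = [10 ** (-x / 10) for x in pl]  # phred scale -> linear scale
    total = sum(linear)
    return [x / total for x in linear]      # normalized genotype probabilities

DE_NOVO_PRIOR = 1 / 30000000
site_af = 1e-4                              # hypothetical site allele frequency (the `prior` above)

kid_pp = pl_to_pp([60, 0, 70])              # proband: confidently heterozygous
dad_pp = pl_to_pp([0, 90, 600])             # father: confidently homozygous reference
mom_pp = pl_to_pp([0, 80, 550])             # mother: confidently homozygous reference

p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
p_het_in_parent = 1 - (1 - site_af) ** 4
p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)
print(p_de_novo)                            # close to 1 for this confidently called trio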
def test_annotate(self): schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32)) rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]}, {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []}, {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}] kt = hl.Table.parallelize(rows, schema) self.assertTrue(kt.annotate()._same(kt)) result1 = convert_struct_to_dict(kt.annotate(foo=kt.a + 1, foo2=kt.a).take(1)[0]) self.assertDictEqual(result1, {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'foo': 5, 'foo2': 4}) result3 = convert_struct_to_dict(kt.annotate( x1=kt.f.map(lambda x: x * 2), x2=kt.f.map(lambda x: [x, x + 1]).flatmap(lambda x: x), x3=hl.min(kt.f), x4=hl.max(kt.f), x5=hl.sum(kt.f), x6=hl.product(kt.f), x7=kt.f.length(), x8=kt.f.filter(lambda x: x == 3), x9=kt.f[1:], x10=kt.f[:], x11=kt.f[1:2], x12=kt.f.map(lambda x: [x, x + 1]), x13=kt.f.map(lambda x: [[x, x + 1], [x + 2]]).flatmap(lambda x: x), x14=hl.cond(kt.a < kt.b, kt.c, hl.null(hl.tint32)), x15={1, 2, 3} ).take(1)[0]) self.assertDictEqual(result3, {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'x1': [2, 4, 6], 'x2': [1, 2, 2, 3, 3, 4], 'x3': 1, 'x4': 3, 'x5': 6, 'x6': 6, 'x7': 3, 'x8': [3], 'x9': [2, 3], 'x10': [1, 2, 3], 'x11': [2], 'x12': [[1, 2], [2, 3], [3, 4]], 'x13': [[1, 2], [3], [2, 3], [4], [3, 4], [5]], 'x14': None, 'x15': set([1, 2, 3])}) kt.annotate( x1=kt.a + 5, x2=5 + kt.a, x3=kt.a + kt.b, x4=kt.a - 5, x5=5 - kt.a, x6=kt.a - kt.b, x7=kt.a * 5, x8=5 * kt.a, x9=kt.a * kt.b, x10=kt.a / 5, x11=5 / kt.a, x12=kt.a / kt.b, x13=-kt.a, x14=+kt.a, x15=kt.a == kt.b, x16=kt.a == 5, x17=5 == kt.a, x18=kt.a != kt.b, x19=kt.a != 5, x20=5 != kt.a, x21=kt.a > kt.b, x22=kt.a > 5, x23=5 > kt.a, x24=kt.a >= kt.b, x25=kt.a >= 5, x26=5 >= kt.a, x27=kt.a < kt.b, x28=kt.a < 5, x29=5 < kt.a, x30=kt.a <= kt.b, x31=kt.a <= 5, x32=5 <= kt.a, x33=(kt.a == 0) & (kt.b == 5), x34=(kt.a == 0) | (kt.b == 5), x35=False, x36=True )
def compute_coverage_stats( mt: hl.MatrixTable, reference_ht: hl.Table, coverage_over_x_bins: List[int] = [1, 5, 10, 15, 20, 25, 30, 50, 100], ) -> hl.Table: """ Computes the following coverage statistics for every base of the `reference_ht` provided: - mean - median - total DP - fraction of samples with coverage of at least X, for each x in `coverage_over_x_bins` The `reference_ht` is a table that contains a row for each locus that coverage should be computed on. It needs to be keyed with the same keys as `mt`, typically either `locus` or `locus, alleles`. The `reference_ht` can, for example, be created using `get_reference_ht`. :param mt: Input sparse MT :param reference_ht: Input reference HT :param coverage_over_x_bins: List of boundaries for computing samples over X :return: Table with per-base coverage stats """ n_samples = mt.count_cols() print(f"Computing coverage stats on {n_samples} samples.") # Create an outer join with the reference Table mt = mt.select_entries("END", "DP").select_cols().select_rows() col_key_fields = list(mt.col_key) t = mt._localize_entries("__entries", "__cols") t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True), how="outer") t = t.annotate( __entries=hl.or_else( t.__entries, hl.range(n_samples).map(lambda x: hl.null(t.__entries.dtype.element_type)), ) ) mt = t._unlocalize_entries("__entries", "__cols", col_key_fields) # Densify mt = hl.experimental.densify(mt) # Keep only rows that are present in the reference table mt = mt.filter_rows(mt._in_ref) # Unfilter entries so that entries with no ref block overlap aren't null mt = mt.unfilter_entries() # Compute coverage stats coverage_over_x_bins = sorted(coverage_over_x_bins) max_coverage_bin = coverage_over_x_bins[-1] hl_coverage_over_x_bins = hl.array(coverage_over_x_bins) # This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin coverage_counter_expr = hl.agg.counter( hl.min(max_coverage_bin, hl.or_else(mt.DP, 0)) ) # This expression aggregates the DP counter in reverse order of the coverage_over_x_bins # and computes the cumulative sum over them. # It needs to be in reverse order because we want the sum over samples covered by >= X. count_array_expr = hl.cumulative_sum( hl.array( [ hl.int32(coverage_counter_expr.get(max_coverage_bin, 0)) ] # The coverage was already floored to the max_coverage_bin, so no more aggregation is needed for the max bin ).extend( # For each of the other bins, coverage needs to be summed between the boundaries hl.range(hl.len(hl_coverage_over_x_bins) - 1, 0, step=-1).map( lambda i: hl.sum( hl.range( hl_coverage_over_x_bins[i - 1], hl_coverage_over_x_bins[i] ).map(lambda j: hl.int32(coverage_counter_expr.get(j, 0))) ) ) ) ) mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0)) # Annotate rows now return mt.select_rows( mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr), median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)), 0), total_DP=hl.agg.sum(mt.DP), **{ f"over_{x}": count_array_expr[i] / n_samples for i, x in zip( range( len(coverage_over_x_bins) - 1, -1, -1 ), # Reverse the bin index as count_array_expr has the reverse order coverage_over_x_bins, ) }, ).rows()
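# --- Hedged illustration (added; plain Python rather than Hail aggregators). ---
# How the reverse cumulative sum above turns a DP -> n_samples counter into the `over_X`
# fractions. The toy counter (10 samples, DP already floored at the top bin) is made up.
bins = [1, 5, 10, 15, 20, 25, 30, 50, 100]
counter = {0: 1, 3: 1, 12: 2, 22: 3, 45: 2, 100: 1}
n_samples = sum(counter.values())

# counts[0] is the number of samples at the top bin; each later element adds the samples
# whose DP falls in the next-lower interval, so counts[k] holds the number of samples with
# DP >= bins[len(bins) - 1 - k].
counts = [counter.get(bins[-1], 0)]
for i in range(len(bins) - 1, 0, -1):
    counts.append(counts[-1] + sum(counter.get(j, 0) for j in range(bins[i - 1], bins[i])))

over_x = {f"over_{x}": counts[i] / n_samples
          for i, x in zip(range(len(bins) - 1, -1, -1), bins)}
print(over_x)  # e.g. over_1 == 0.9 because one of the ten samples has DP 0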
def variant_qc(mt, name='variant_qc') -> MatrixTable: """Compute common variant statistics (quality control metrics). .. include:: ../_templates/req_tvariant.rst Examples -------- >>> dataset_result = hl.variant_qc(dataset) Notes ----- This method computes variant statistics from the genotype data, returning a new struct field `name` with the following metrics based on the fields present in the entry schema. If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats` and `gq_stats` are structs with four fields: - `mean` (``float64``) -- Mean value. - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom). - `min` (``int32``) -- Minimum value. - `max` (``int32``) -- Maximum value. If the dataset does not contain an entry field `GT` of type :py:data:`.tcall`, then an error is raised. The following fields are always computed from `GT`: - `AF` (``array<float64>``) -- Calculated allele frequency, one element per allele, including the reference. Sums to one. Equivalent to `AC` / `AN`. - `AC` (``array<int32>``) -- Calculated allele count, one element per allele, including the reference. Sums to `AN`. - `AN` (``int32``) -- Total number of called alleles. - `homozygote_count` (``array<int32>``) -- Number of homozygotes per allele. One element per allele, including the reference. - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered. Equivalent to `n_called` / :meth:`.count_cols`. - `n_called` (``int64``) -- Number of samples with a defined `GT`. - `n_not_called` (``int64``) -- Number of samples with a missing `GT`. - `n_filtered` (``int64``) -- Number of filtered entries. - `n_het` (``int64``) -- Number of heterozygous samples. - `n_non_ref` (``int64``) -- Number of samples with at least one called non-reference allele. - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous samples under Hardy-Weinberg equilibrium. See :func:`.functions.hardy_weinberg_test` for details. - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium. See :func:`.functions.hardy_weinberg_test` for details. Warning ------- `het_freq_hwe` and `p_value_hwe` are calculated as in :func:`.functions.hardy_weinberg_test`, with non-diploid calls (``ploidy != 2``) ignored in the counts. As this test is only statistically rigorous in the biallelic setting, :func:`.variant_qc` sets both fields to missing for multiallelic variants. Consider using :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand. Parameters ---------- mt : :class:`.MatrixTable` Dataset. name : :obj:`str` Name for resulting field.
Returns ------- :class:`.MatrixTable` """ require_row_key_variant(mt, 'variant_qc') bound_exprs = {} gq_dp_exprs = {} def has_field_of_type(name, dtype): return name in mt.entry and mt[name].dtype == dtype if has_field_of_type('DP', hl.tint32): gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max') if has_field_of_type('GQ', hl.tint32): gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max') if not has_field_of_type('GT', hl.tcall): raise ValueError(f"'variant_qc': expect an entry field 'GT' of type 'call'") bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT'])) bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT'])) bound_exprs['n_filtered'] = mt.count_cols(_localize=False) - hl.agg.count() bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles) result = hl.rbind(hl.struct(**bound_exprs), lambda e1: hl.rbind( hl.case().when(hl.len(mt.alleles) == 2, hl.hardy_weinberg_test(e1.call_stats.homozygote_count[0], e1.call_stats.AC[1] - 2 * e1.call_stats.homozygote_count[1], e1.call_stats.homozygote_count[1]) ).or_missing(), lambda hwe: hl.struct(**{ **gq_dp_exprs, **e1.call_stats, 'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered), 'n_called': e1.n_called, 'n_not_called': e1.n_not_called, 'n_filtered': e1.n_filtered, 'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count), 'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0], 'het_freq_hwe': hwe.het_freq_hwe, 'p_value_hwe': hwe.p_value}))) return mt.annotate_rows(**{name: result})
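# --- Hedged usage sketch (added for illustration; not part of the Hail source above). ---
# Annotate a dataset with `variant_qc` and filter rows on the computed metrics. The dataset
# path and the thresholds (95% call rate, HWE p-value > 1e-6) are illustrative assumptions
# only, not recommendations from the source.
import hail as hl

mt = hl.read_matrix_table('data/example.mt')  # hypothetical dataset with a GT entry field
mt = hl.variant_qc(mt)                        # adds the `variant_qc` row struct described above

mt = mt.filter_rows((mt.variant_qc.call_rate >= 0.95)
                    & (mt.variant_qc.p_value_hwe > 1e-6))
print(mt.count_rows())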