def generate_trio_stats(
    mt: hl.MatrixTable, autosomes_only: bool = True, bi_allelic_only: bool = True
) -> hl.Table:
    """
    Run `generate_trio_stats_expr` with the default raw / adj stratification.

    .. note::

        `mt` is expected to be a trio MatrixTable whose proband, father and mother
        entries all carry an `adj` annotation. A sparse MT must be densified with
        `hl.experimental.densify` before being passed in.

    Unless disabled through the flags below, `mt` is first restricted to autosomes
    and to bi-allelic sites.

    :param mt: A trio MatrixTable returned from `hl.trio_matrix`. Must be dense.
    :param autosomes_only: If set, `mt` is passed through `filter_to_autosomes`.
    :param bi_allelic_only: If set, only rows satisfying `bi_allelic_expr` are kept.
    :return: Row Table holding the trio stats.
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)

    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    logger.info(f"Generating trio stats using {mt.count_cols()} trios.")

    # A trio only counts towards "adj" when all three members pass adj.
    trio_adj = mt.proband_entry.adj & mt.father_entry.adj & mt.mother_entry.adj

    # The same raw / adj split is applied to every family of statistics.
    strata_kwargs = {
        strata_name: {"raw": True, "adj": trio_adj}
        for strata_name in ("transmitted_strata", "de_novo_strata", "ac_strata")
    }

    stats_exprs = generate_trio_stats_expr(mt, **strata_kwargs)
    return mt.select_rows(**stats_exprs).rows()
def generate_fam_stats(
    mt: hl.MatrixTable,
    fam_file: str
) -> hl.Table:
    """
    Calculate transmission and de novo mutation statistics using trios in the dataset.

    The input is restricted to samples listed in the pedigree file, filtered to
    autosomes, annotated with adj, densified, restricted to rows with exactly two
    alleles and then reshaped into a trio MatrixTable before the stats are computed.
    Rows whose raw de novo, transmitted and untransmitted counts sum to zero are
    dropped from the result.

    NOTE(review): a second `generate_fam_stats` appears later in this file; if both
    live in the same module the later one replaces this one -- confirm which is
    intended.

    :param mt: Input MatrixTable
    :param fam_file: Path to tab-delimited text file containing trio pedigree
    :return: Table containing trio stats
    """
    # The pedigree file is read twice: once as a Pedigree (for `hl.trio_matrix`)
    # and once as a Table (to build the set of sample IDs to keep).
    ped = hl.Pedigree.read(fam_file, delimiter="\t")
    members_ht = hl.import_fam(fam_file, delimiter="\t")
    members_ht = members_ht.annotate(
        fam_members=[members_ht.id, members_ht.pat_id, members_ht.mat_id]
    )
    # One row per distinct sample ID appearing as proband, father or mother.
    members_ht = (
        members_ht.explode('fam_members', name='s')
        .key_by('s')
        .select()
        .distinct()
    )

    mt = mt.filter_cols(hl.is_defined(members_ht[mt.col_key]))
    logger.info(f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios.")

    # adj is annotated before the entry fields are trimmed down.
    mt = annotate_adj(filter_to_autosomes(mt))
    mt = hl.experimental.densify(
        mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    )
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    trio_mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)

    # A trio only counts towards "adj" when all three members pass adj.
    trio_adj = (
        trio_mt.proband_entry.adj
        & trio_mt.father_entry.adj
        & trio_mt.mother_entry.adj
    )

    stats_exprs = generate_trio_stats_expr(
        trio_mt,
        transmitted_strata={'raw': True, 'adj': trio_adj},
        de_novo_strata={'raw': True, 'adj': trio_adj},
        proband_is_female_expr=trio_mt.is_female,
    )
    ht = trio_mt.select_rows(**stats_exprs).rows()

    # Keep only rows with at least one raw de novo / transmitted / untransmitted count.
    n_raw_events = ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw
    return ht.filter(n_raw_events > 0)
def generate_fam_stats(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Compute transmission and de novo statistics for the trios listed in `fam_file`.

    The input is restricted to samples listed in the pedigree file, filtered to
    autosomes, annotated with adj, densified, restricted to rows with exactly two
    alleles and reshaped into a trio MatrixTable. Transmission stats are stratified
    by `raw` and `adj`; de novo stats additionally get an `hq` stratum requiring
    adj in all three members plus, for both parents, GQ >= 30, AD[0] + AD[1] > 20
    and AD[1] == 0. Rows whose raw de novo, transmitted and untransmitted counts
    sum to zero are dropped from the result.

    NOTE(review): an earlier `generate_fam_stats` appears in this file; if both live
    in the same module this later definition replaces it -- confirm that is intended.

    :param mt: Input MatrixTable
    :param fam_file: Path to tab-delimited text file containing trio pedigree
    :return: Table containing trio stats
    """
    # The pedigree file is read twice: once as a Pedigree (for `hl.trio_matrix`)
    # and once as a Table (to build the set of sample IDs to keep).
    ped = hl.Pedigree.read(fam_file, delimiter="\t")
    sample_ht = hl.import_fam(fam_file, delimiter="\t")
    sample_ht = sample_ht.annotate(
        fam_members=[sample_ht.id, sample_ht.pat_id, sample_ht.mat_id]
    )
    # One row per distinct sample ID appearing as proband, father or mother.
    sample_ht = sample_ht.explode('fam_members', name='s')
    sample_ht = sample_ht.key_by('s').select().distinct()

    in_pedigree = hl.is_defined(sample_ht[mt.col_key])
    mt = mt.filter_cols(in_pedigree)
    logger.info(
        f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios."
    )

    # adj is annotated before the entry fields are trimmed down.
    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt).select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    trio_mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)
    proband = trio_mt.proband_entry
    father = trio_mt.father_entry
    mother = trio_mt.mother_entry

    # A trio only counts towards "adj" when all three members pass adj.
    trio_adj = proband.adj & father.adj & mother.adj

    # Parental quality criteria used for the high-quality ("hq") de novo stratum.
    parents_no_alt = (mother.AD[1] == 0) & (father.AD[1] == 0)
    parents_high_depth = (mother.AD[0] + mother.AD[1] > 20) & (
        father.AD[0] + father.AD[1] > 20
    )
    parents_high_gq = (mother.GQ >= 30) & (father.GQ >= 30)
    hq_de_novo = trio_adj & parents_high_gq & parents_high_depth & parents_no_alt

    # NOTE(review): the 'raw' strata are passed as None here (presumably meaning
    # "no filter") -- confirm `generate_trio_stats_expr` accepts None for a stratum.
    transmitted_strata = {'raw': None, 'adj': trio_adj}
    de_novo_strata = {'raw': None, 'adj': trio_adj, 'hq': hq_de_novo}

    stats_exprs = generate_trio_stats_expr(
        trio_mt,
        transmitted_strata=transmitted_strata,
        de_novo_strata=de_novo_strata,
        proband_is_female_expr=trio_mt.is_female,
    )
    ht = trio_mt.select_rows(**stats_exprs).rows()

    # Keep only rows with at least one raw de novo / transmitted / untransmitted count.
    n_raw_events = ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw
    return ht.filter(n_raw_events > 0)