def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Explodes the result of `get_duplicated_samples_ht`, so that each line contains a single sample.

    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.
    Requires a `filtered` field whose element type matches the key of the input duplicated samples Table.

    :param dups_ht: Input HT
    :return: Flattened HT
    """

    def get_dups_to_keep_expr():
        if dups_ht.filtered.dtype.element_type == dups_ht.key.dtype:
            return (dups_ht.key, False)
        elif (len(dups_ht.key) == 1) and (
            dups_ht.filtered.dtype.element_type == dups_ht.key[0].dtype
        ):
            return (dups_ht.key[0], False)
        else:
            raise TypeError(
                f"Cannot explode table as types of the filtered field ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are incompatible."
            )

    dups_ht = dups_ht.annotate(
        dups=hl.array([get_dups_to_keep_expr()]).extend(
            dups_ht.filtered.map(lambda x: (x, True))
        )
    )
    dups_ht = dups_ht.explode("dups")
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0], dup_filtered=dups_ht.dups[1]).key_by("s")
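# Hedged usage sketch (not part of the original module): shows the shape of the flattened
# output on a tiny in-memory duplicates Table. The sample names and the `filtered` field
# layout are made up for illustration.
def _example_explode_duplicate_samples() -> None:
    dups_ht = hl.Table.parallelize(
        [{"s": "sampleA", "filtered": ["sampleB", "sampleC"]}],
        hl.tstruct(s=hl.tstr, filtered=hl.tarray(hl.tstr)),
        key="s",
    )
    exploded_ht = explode_duplicate_samples_ht(dups_ht)
    # Expected rows: (sampleA, False), (sampleB, True), (sampleC, True)
    exploded_ht.show()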
def prepare_exomes(
    exome_ht: hl.Table, groupings: List, impose_high_af_cutoff_upfront: bool = True
) -> hl.Table:
    # Manipulate VEP annotations and explode by them
    exome_ht = add_most_severe_csq_to_tc_within_ht(exome_ht)
    exome_ht = exome_ht.transmute(
        transcript_consequences=exome_ht.vep.transcript_consequences
    )
    exome_ht = exome_ht.explode(exome_ht.transcript_consequences)

    # Annotate variants with grouping variables. This function needs to be adapted.
    exome_ht, groupings = annotate_constraint_groupings(exome_ht, groupings)
    exome_ht = exome_ht.select(
        'context', 'ref', 'alt', 'methylation_level', 'freq', 'pass_filters', *groupings
    )

    # Filter by allele count. Likely to need to adapt this function as well.
    # `dataset` is assumed to be defined at module scope (the frequency subset to use).
    af_cutoff = 0.001
    freq_index = exome_ht.freq_index_dict.collect()[0][dataset]

    def keep_criteria(ht):
        crit = (ht.freq[freq_index].AC > 0) & ht.pass_filters & (ht.coverage > 0)
        if impose_high_af_cutoff_upfront:
            crit &= (ht.freq[freq_index].AF <= af_cutoff)
        return crit

    exome_ht = exome_ht.filter(keep_criteria(exome_ht))
    return exome_ht
def explode_phase_info(ht: hl.Table, remove_all_ref: bool = True) -> hl.Table:
    ht = ht.transmute(phase_info=hl.array(ht.phase_info))
    ht = ht.explode('phase_info')
    ht = ht.transmute(pop=ht.phase_info[0], phase_info=ht.phase_info[1])

    if remove_all_ref:
        ht = ht.filter(hl.sum(ht.phase_info.gt_counts.raw[1:]) > 0)

    return ht
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Flattens the result of `filter_duplicate_samples`, so that each line contains a single sample.

    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    dups_ht = dups_ht.annotate(
        dups=hl.array([(dups_ht.key, False)]).extend(
            dups_ht.filtered.map(lambda x: (x, True))
        )
    )
    dups_ht = dups_ht.explode('dups')
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0], dup_filtered=dups_ht.dups[1]).key_by('s')
def compute_grouped_binned_ht(
    bin_ht: hl.Table,
    checkpoint_path: Optional[str] = None,
) -> hl.GroupedTable:
    """
    Group a Table that has been annotated with bins (`compute_ranked_bin` or `create_binned_ht`).

    The Table is grouped by bin_id (bin, biallelic, etc.), contig, snv, bi_allelic and singleton.

    .. note::

        If performing an aggregation following this grouping (such as `score_bin_agg`), the aggregation
        function will need to use `ht._parent` to get the origin Table from the GroupedTable for the aggregation.

    :param bin_ht: Input Table with a `bin_id` annotation
    :param checkpoint_path: If provided, an intermediate checkpoint table is created with all required annotations before shuffling.
    :return: Table grouped by bin(s)
    """
    # Explode the rank table by bin_id
    bin_ht = bin_ht.annotate(
        bin_groups=hl.array(
            [
                hl.Struct(bin_id=bin_name, bin=bin_ht[bin_name])
                for bin_name in bin_ht.bin_group_variant_counts
            ]
        )
    )
    bin_ht = bin_ht.explode(bin_ht.bin_groups)
    bin_ht = bin_ht.transmute(
        bin_id=bin_ht.bin_groups.bin_id, bin=bin_ht.bin_groups.bin
    )
    bin_ht = bin_ht.filter(hl.is_defined(bin_ht.bin))

    if checkpoint_path is not None:
        bin_ht.checkpoint(checkpoint_path, overwrite=True)
    else:
        bin_ht = bin_ht.persist()

    # Group by bin_id, bin and additional stratification desired and compute QC metrics per bin
    return bin_ht.group_by(
        bin_id=bin_ht.bin_id,
        contig=bin_ht.locus.contig,
        snv=hl.is_snp(bin_ht.alleles[0], bin_ht.alleles[1]),
        bi_allelic=~bin_ht.was_split,
        singleton=bin_ht.singleton,
        release_adj=bin_ht.ac > 0,
        bin=bin_ht.bin,
    )._set_buffer_size(20000)
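# Hedged sketch (not from the original code) of the `ht._parent` pattern described in the
# note above: aggregations following `compute_grouped_binned_ht` must reference the parent
# Table of the returned GroupedTable. The `score` field is an assumption about `bin_ht`.
def _example_aggregate_grouped_bins(bin_ht: hl.Table) -> hl.Table:
    grouped_ht = compute_grouped_binned_ht(bin_ht)
    parent_ht = grouped_ht._parent
    return grouped_ht.aggregate(
        min_score=hl.agg.min(parent_ht.score),
        max_score=hl.agg.max(parent_ht.score),
        n=hl.agg.count(),
    )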
def filter_mt_to_trios(mt: hl.MatrixTable, fam_ht: hl.Table) -> hl.MatrixTable:
    """
    Filters a MatrixTable to the set of trios in `fam_ht`, filters to autosomes, and annotates with adj.

    :param mt: A MatrixTable to filter to only trios
    :param fam_ht: A Table of trios to filter to, loaded using `hl.import_fam`
    :return: A MatrixTable filtered to trios and annotated with adj
    """
    # Filter MT to samples present in any of the trios
    fam_ht = fam_ht.annotate(fam_members=[fam_ht.id, fam_ht.pat_id, fam_ht.mat_id])
    fam_ht = fam_ht.explode("fam_members", name="s")
    fam_ht = fam_ht.key_by("s").select().distinct()

    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.col_key]))
    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)

    return mt
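# Hedged usage sketch (not part of the original module); the paths are placeholders, and
# `filter_to_autosomes`/`annotate_adj` are assumed to be importable as used above.
def _example_filter_mt_to_trios() -> hl.MatrixTable:
    fam_ht = hl.import_fam("gs://my-bucket/trios.fam")  # provides id, pat_id, mat_id, ...
    mt = hl.read_matrix_table("gs://my-bucket/dataset.mt")
    return filter_mt_to_trios(mt, fam_ht)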
def rank_related_samples(
    relatedness_ht: hl.Table,
    meta_ht: hl.Table,
    sample_qc_ht: hl.Table,
    fam_ht: hl.Table,
) -> Tuple[hl.Table, Callable[[hl.expr.Expression, hl.expr.Expression], hl.expr.NumericExpression]]:
    # Load families and identify parents of cases, as they will be thrown away anyway
    fam_ht = fam_ht.transmute(
        trio=[
            hl.struct(s=fam_ht.id, is_parent=False),
            hl.struct(s=fam_ht.pat_id, is_parent=True),
            hl.struct(s=fam_ht.mat_id, is_parent=True),
        ]
    )
    fam_ht = fam_ht.explode(fam_ht.trio)
    fam_ht = fam_ht.key_by(s=fam_ht.trio.s)
    case_parents = fam_ht.filter(meta_ht[fam_ht.key].is_case & fam_ht.trio.is_parent)

    def annotate_related_pairs(related_pairs: hl.Table, index_col: str) -> hl.Table:
        related_pairs = related_pairs.key_by(**related_pairs[index_col])
        related_pairs = related_pairs.filter(hl.is_missing(case_parents[related_pairs.key]))
        return related_pairs.annotate(
            **{
                index_col: related_pairs[index_col].annotate(
                    case_rank=hl.or_else(hl.int(meta_ht[related_pairs.key].is_case), -1),
                    dp_mean=hl.or_else(
                        sample_qc_ht[related_pairs.key].sample_qc.dp_stats.mean, -1.0
                    ),
                )
            }
        ).key_by()

    relatedness_ht = annotate_related_pairs(relatedness_ht, "i")
    relatedness_ht = annotate_related_pairs(relatedness_ht, "j")

    def tie_breaker(l, r):
        return (
            hl.case()
            .when(l.case_rank != r.case_rank, r.case_rank - l.case_rank)  # smaller is better
            .default(l.dp_mean - r.dp_mean)  # larger is better
        )

    return relatedness_ht, tie_breaker
def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')),
        )
        for x in ht.row
        if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht_truth_data = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)
    ht = ht.annotate(
        **ht_truth_data[ht.key],
        # **fam_ht[ht.key],
        # **gnomad_ht[ht.key],
        # **denovo_ht[ht.key],
        # clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array(
            [
                hl.Struct(
                    rank_id=rank_name,
                    bin=hl.int(
                        hl.ceil(
                            hl.float(ht[rank_name] + 1)
                            / hl.floor(
                                ht.globals.rank_variant_counts[rank_name][
                                    hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')
                                ]
                                / n_bins
                            )
                        )
                    ),
                )
                for rank_name in rank_variant_counts
            ]
        ),
        # lcr=hl.is_defined(lcr_intervals[ht.locus])
    )
    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))
    ht = ht.checkpoint(f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True)

    # Create binned data
    return (
        ht.group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.transmitted_singleton,
            trans_singletons=hl.is_defined(ht.singleton_rank),
            de_novo_high_quality=ht.de_novo_high_quality_rank,
            de_novo_medium_quality=hl.is_defined(ht.de_novo_medium_quality_rank),
            de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank),
            # release_adj=ht.ac > 0,
            bin=ht.bin,
        )
        ._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            # n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.transmitted_singleton),
            n_high_quality_de_novos=hl.agg.count_where(ht.de_novo_data.p_de_novo[0] > 0.99),
            n_validated_DDD_denovos=hl.agg.count_where(ht.inheritance.contains("De novo")),
            n_medium_quality_de_novos=hl.agg.count_where(ht.de_novo_data.p_de_novo[0] > 0.5),
            n_high_confidence_de_novos=hl.agg.count_where(ht.de_novo_data.confidence[0] == 'HIGH'),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.sum(ht.family_stats.mendel[0].errors),
            ),
            n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) & (ht.consequence == "synonymous_variant")
            ),
            # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
            #     hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.count_where(ht.family_stats.mendel[0].errors > 0),
            ),
            # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
            #     hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t),
            ),
            n_trans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3)
                & (ht.consequence == "synonymous_variant")
                & (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t),
            ),
            n_untrans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u),
            ),
            n_untrans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3)
                & (ht.consequence == "synonymous_variant")
                & (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u),
            ),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1)
                & (ht.family_stats.tdt[0].t == 1)
            ),
            n_omni=hl.agg.count_where(ht.omni),
            n_mills=hl.agg.count_where(ht.mills),
            n_hapmap=hl.agg.count_where(ht.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(ht.kgp_phase1_hc),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)
        )
    )
def create_binned_data(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.), contig, snv,
    bi_allelic and singleton containing the information needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')),
        )
        for x in ht.row
        if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH',
    )
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2)
        | (gnomad_ht.info.FS > 60)
        | (gnomad_ht.info.MQ < 30),
    )
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array(
            [
                hl.Struct(
                    rank_id=rank_name,
                    bin=hl.int(
                        hl.ceil(
                            hl.float(ht[rank_name] + 1)
                            / hl.floor(
                                ht.globals.rank_variant_counts[rank_name][
                                    hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')
                                ]
                                / n_bins
                            )
                        )
                    ),
                )
                for rank_name in rank_variant_counts
            ]
        ),
        lcr=hl.is_defined(lcr_intervals[ht.locus]),
    )
    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))
    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht', overwrite=True
    )

    # Create binned data
    return (
        ht.group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.singleton,
            release_adj=ht.ac > 0,
            bin=ht.bin,
        )
        ._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors),
            ),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors),
            ),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0),
            ),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0),
            ),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) & (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t),
            ),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) & (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u),
            ),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1)
                & (ht.family_stats.tdt.t == 1)
            ),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site),
        )
    )
def compute_binned_truth_sample_concordance(
    ht: hl.Table,
    binned_score_ht: hl.Table,
    n_bins: int = 100,
    add_bins: Dict[str, hl.expr.BooleanExpression] = {},
) -> hl.Table:
    """
    Determine the concordance (TP, FP, FN) between a truth sample within the callset and the sample's truth data,
    grouped by bins computed using `compute_ranked_bin`.

    .. note::

        The input `ht` should contain three row fields:
            - score: value to use for binning
            - GT: a CallExpression containing the genotype of the evaluation data for the sample
            - truth_GT: a CallExpression containing the genotype of the truth sample

        The input `binned_score_ht` should contain:
            - score: value used to bin the full callset
            - bin: the full callset bin

    `add_bins` can be used to add additional global and truth sample binning to the final binned truth sample
    concordance HT. The keys in `add_bins` must be present in `binned_score_ht` and the values in `add_bins`
    should be expressions on `ht` that define a subset of variants to bin in the truth sample. For example, to
    look at the global and truth sample binning on only bi-allelic variants, `add_bins` could be set to
    {'biallelic_bin': ht.biallelic}.

    The table is grouped by global/truth sample bin and variant type and contains TP, FP and FN.

    :param ht: Input HT
    :param binned_score_ht: Table with the bin annotation for each variant
    :param n_bins: Number of bins to bin the data into
    :param add_bins: Dictionary of additional global bin columns (key) and the expr to use for binning the truth sample (value)
    :return: Binned truth sample concordance HT
    """
    # Annotate score and global bin
    indexed_binned_score_ht = binned_score_ht[ht.key]
    ht = ht.annotate(
        **{f"global_{bin_id}": indexed_binned_score_ht[bin_id] for bin_id in add_bins},
        **{f"_{bin_id}": bin_expr for bin_id, bin_expr in add_bins.items()},
        score=indexed_binned_score_ht.score,
        global_bin=indexed_binned_score_ht.bin,
    )

    # Annotate the truth sample bin
    bin_ht = compute_ranked_bin(
        ht,
        score_expr=ht.score,
        bin_expr={
            "truth_sample_bin": hl.expr.bool(True),
            **{f"truth_sample_{bin_id}": ht[f"_{bin_id}"] for bin_id in add_bins},
        },
        n_bins=n_bins,
    )
    ht = ht.join(bin_ht, how="left")

    bin_list = [
        hl.tuple(["global_bin", ht.global_bin]),
        hl.tuple(["truth_sample_bin", ht.truth_sample_bin]),
    ]
    bin_list.extend(
        [hl.tuple([f"global_{bin_id}", ht[f"global_{bin_id}"]]) for bin_id in add_bins]
    )
    bin_list.extend(
        [hl.tuple([f"truth_sample_{bin_id}", ht[f"truth_sample_{bin_id}"]]) for bin_id in add_bins]
    )

    # Explode the global and truth sample bins
    ht = ht.annotate(bin=bin_list)
    ht = ht.explode(ht.bin)
    ht = ht.annotate(bin_id=ht.bin[0], bin=hl.int(ht.bin[1]))

    # Compute TP, FP and FN by bin_id, variant type and bin
    return (
        ht.group_by("bin_id", "snv", "bin")
        .aggregate(
            # TP => allele is found in both data sets
            tp=hl.agg.count_where(ht.GT.is_non_ref() & ht.truth_GT.is_non_ref()),
            # FP => allele is found only in test data set
            fp=hl.agg.count_where(
                ht.GT.is_non_ref() & hl.or_else(ht.truth_GT.is_hom_ref(), True)
            ),
            # FN => allele is found in truth data only
            fn=hl.agg.count_where(
                hl.or_else(ht.GT.is_hom_ref(), True) & ht.truth_GT.is_non_ref()
            ),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count(),
        )
        .repartition(5)
    )
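# Hedged sketch of the bi-allelic `add_bins` example from the docstring above (not from the
# original code). It assumes the input `ht` carries a `was_split` field (e.g. from
# hl.split_multi_hts) and that `binned_score_ht` already contains a `biallelic_bin` column,
# as required for keys of `add_bins`.
def _example_biallelic_truth_concordance(ht: hl.Table, binned_score_ht: hl.Table) -> hl.Table:
    ht = ht.annotate(biallelic=~ht.was_split)
    return compute_binned_truth_sample_concordance(
        ht,
        binned_score_ht,
        n_bins=100,
        add_bins={"biallelic_bin": ht.biallelic},
    )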
def generate_sib_stats_expr(
    mt: hl.MatrixTable,
    sib_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    strata: Dict[str, hl.expr.BooleanExpression] = {"raw": True},
    is_female: Optional[hl.expr.BooleanExpression] = None,
) -> hl.expr.StructExpression:
    """
    Generates a row-wise expression containing the number of alternate alleles shared between sibling pairs.

    The sibling sharing counts can be stratified with additional filters using `strata`.

    .. note::

        This function expects that the `mt` has either been split or filtered to only bi-allelics.
        If a sample has multiple sibling pairs, only one pair will be counted.

    :param mt: Input matrix table
    :param sib_ht: Table defining sibling pairs with one sample in a column (`i_col`) and the second in another column (`j_col`)
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param strata: Dict with additional strata to use when computing shared sibling variant counts
    :param is_female: An optional column in mt giving the sample sex. If not given, counts are only computed for autosomes.
    :return: A struct expression with the sibling shared variant counts
    """

    def _get_alt_count(locus, gt, is_female):
        """Calculate the alt allele count, using sex info if present."""
        if is_female is None:
            return hl.or_missing(locus.in_autosome(), gt.n_alt_alleles())
        return (
            hl.case()
            .when(locus.in_autosome_or_par(), gt.n_alt_alleles())
            .when(
                ~is_female & (locus.in_x_nonpar() | locus.in_y_nonpar()),
                hl.min(1, gt.n_alt_alleles()),
            )
            .when(is_female & locus.in_y_nonpar(), 0)
            .default(0)
        )

    if is_female is None:
        logger.warning(
            "Since no sex expression was given to generate_sib_stats_expr, only variants in autosomes will be counted."
        )

    # If a sample is in sib_ht more than one time, keep only one of the sibling pairs
    # First filter to only samples found in mt to keep as many pairs as possible
    s_to_keep = mt.aggregate_cols(hl.agg.collect_as_set(mt.s), _localize=False)
    sib_ht = sib_ht.filter(
        s_to_keep.contains(sib_ht[i_col].s) & s_to_keep.contains(sib_ht[j_col].s)
    )
    sib_ht = sib_ht.add_index("sib_idx")
    sib_ht = sib_ht.annotate(sibs=[sib_ht[i_col].s, sib_ht[j_col].s])
    sib_ht = sib_ht.explode("sibs")
    sib_ht = sib_ht.group_by("sibs").aggregate(
        sib_idx=(hl.agg.take(sib_ht.sib_idx, 1, ordering=sib_ht.sib_idx)[0])
    )
    sib_ht = sib_ht.group_by(sib_ht.sib_idx).aggregate(sibs=hl.agg.collect(sib_ht.sibs))
    sib_ht = sib_ht.filter(hl.len(sib_ht.sibs) == 2).persist()

    logger.info(f"Generating sibling variant sharing counts using {sib_ht.count()} pairs.")
    sib_ht = sib_ht.explode("sibs").key_by("sibs")[mt.s]

    # Create sibling sharing counters
    sib_stats = hl.struct(
        **{
            f"n_sib_shared_variants_{name}": hl.sum(
                hl.agg.filter(
                    expr,
                    hl.agg.group_by(
                        sib_ht.sib_idx,
                        hl.or_missing(
                            hl.agg.sum(hl.is_defined(mt.GT)) == 2,
                            hl.agg.min(_get_alt_count(mt.locus, mt.GT, is_female)),
                        ),
                    ),
                ).values()
            )
            for name, expr in strata.items()
        }
    )

    sib_stats = sib_stats.annotate(
        **{
            f"ac_sibs_{name}": hl.agg.filter(
                expr & hl.is_defined(sib_ht.sib_idx), hl.agg.sum(mt.GT.n_alt_alleles())
            )
            for name, expr in strata.items()
        }
    )

    return sib_stats
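# Hedged usage sketch (not from the original code): materializing the sibling sharing counts
# as a row-keyed Table. `mt` and `sib_ht` are assumed to follow the docstring above; the
# counts land in a single `sib_stats` struct row field.
def _example_sib_stats_ht(mt: hl.MatrixTable, sib_ht: hl.Table) -> hl.Table:
    return mt.select_rows(sib_stats=generate_sib_stats_expr(mt, sib_ht)).rows()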
def compute_binned_truth_sample_concordance(
    ht: hl.Table, binned_score_ht: hl.Table, n_bins: int = 100
) -> hl.Table:
    """
    Determines the concordance (TP, FP, FN) between a truth sample within the callset and the sample's truth data,
    grouped by bins computed using `compute_quantile_bin`.

    .. note::

        The input `ht` should contain three row fields:
            - score: value to use for quantile binning
            - GT: a CallExpression containing the genotype of the evaluation data for the sample
            - truth_GT: a CallExpression containing the genotype of the truth sample

        The input `binned_score_ht` should contain:
            - score: value used to bin the full callset
            - bin: the full callset quantile bin

    The table is grouped by global/truth sample bin and variant type and contains TP, FP and FN.

    :param ht: Input HT
    :param binned_score_ht: Table with an annotation for the quantile bin of each variant
    :param n_bins: Number of bins to bin the data into
    :return: Binned truth sample concordance HT
    """
    # Annotate score and global bin
    indexed_binned_score_ht = binned_score_ht[ht.key]
    ht = ht.annotate(
        score=indexed_binned_score_ht.score,
        global_bin=indexed_binned_score_ht.bin,
    )

    # Annotate the truth sample quantile bin
    bin_ht = compute_quantile_bin(
        ht,
        score_expr=ht.score,
        bin_expr={"truth_sample_bin": hl.expr.bool(True)},
        n_bins=n_bins,
    )
    ht = ht.join(bin_ht, how="left")

    # Explode the global and truth sample bins
    ht = ht.annotate(
        bin=[
            hl.tuple(["global_bin", ht.global_bin]),
            hl.tuple(["truth_sample_bin", ht.truth_sample_bin]),
        ]
    )
    ht = ht.explode(ht.bin)
    ht = ht.annotate(bin_id=ht.bin[0], bin=hl.int(ht.bin[1]))

    # Compute TP, FP and FN by bin_id, variant type and bin
    return (
        ht.group_by("bin_id", "snv", "bin")
        .aggregate(
            # TP => allele is found in both data sets
            tp=hl.agg.count_where(ht.GT.is_non_ref() & ht.truth_GT.is_non_ref()),
            # FP => allele is found only in test data set
            fp=hl.agg.count_where(
                ht.GT.is_non_ref() & hl.or_else(ht.truth_GT.is_hom_ref(), True)
            ),
            # FN => allele is found in truth data only
            fn=hl.agg.count_where(
                hl.or_else(ht.GT.is_hom_ref(), True) & ht.truth_GT.is_non_ref()
            ),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count(),
        )
        .repartition(5)
    )