def _is_dnm(
    proband_gt: hl.expr.CallExpression,
    father_gt: hl.expr.CallExpression,
    mother_gt: hl.expr.CallExpression,
    locus: hl.expr.LocusExpression,
    proband_is_female: Optional[hl.expr.BooleanExpression],
) -> hl.expr.BooleanExpression:
    """
    Build an expression telling whether a trio genotype combination is a de novo mutation (DNM).

    Two genotype patterns are used:

        - Diploid pattern: proband is het and both parents are hom-ref.
        - Hemizygous pattern: proband is hom-var and the father is hom-ref
          (the mother's genotype is not consulted in this pattern).

    When `proband_is_female` is None, the diploid pattern is applied on autosomes and the
    result is missing everywhere else. Otherwise the diploid pattern is applied on
    autosomes/PAR and, for female probands, on non-PAR X; at all remaining loci the
    hemizygous pattern is applied for non-female probands and the result is missing for
    female probands.

    :param proband_gt: Proband genotype call
    :param father_gt: Father genotype call
    :param mother_gt: Mother genotype call
    :param locus: Locus at which the genotypes were called
    :param proband_is_female: Whether the proband is female, or None if sex is not available
    :return: BooleanExpression that is true for a DNM (may be missing, see above)
    """
    diploid_dnm = (
        proband_gt.is_het() & father_gt.is_hom_ref() & mother_gt.is_hom_ref()
    )

    # Guard clause: without sex information only autosomes can be evaluated.
    if proband_is_female is None:
        logger.warning(
            "Since no proband sex expression was given to generate_trio_stats_expr, "
            "only DNMs in autosomes will be counted."
        )
        return hl.or_missing(locus.in_autosome(), diploid_dnm)

    hemizygous_dnm = proband_gt.is_hom_var() & father_gt.is_hom_ref()
    use_diploid_pattern = locus.in_autosome_or_par() | (
        proband_is_female & locus.in_x_nonpar()
    )

    return hl.cond(
        use_diploid_pattern,
        diploid_dnm,
        hl.or_missing(~proband_is_female, hemizygous_dnm),
    )
def _get_copy_state(locus: hl.expr.LocusExpression) -> hl.expr.Int32Expression:
    """
    Map a locus to the copy-state value used for indexing into the trans_count_map.

    The first matching rule wins:

        - autosome or PAR -> `auto_or_par`
        - non-PAR X -> `hemi_x`
        - non-PAR Y -> `hemi_y`

    Any other locus yields a missing value.

    NOTE(review): `auto_or_par`, `hemi_x` and `hemi_y` are not defined in this block;
    they are presumably int constants provided by the enclosing scope -- confirm.

    :param locus: Locus to classify
    :return: Copy-state expression, missing when no rule matches
    """
    copy_state_rules = (
        (locus.in_autosome_or_par(), auto_or_par),
        (locus.in_x_nonpar(), hemi_x),
        (locus.in_y_nonpar(), hemi_y),
    )

    builder = hl.case()
    for condition, copy_state in copy_state_rules:
        builder = builder.when(condition, copy_state)
    return builder.or_missing()
def faf_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    locus: hl.expr.LocusExpression,
    pops_to_exclude: Optional[Set[str]] = None,
    faf_thresholds: List[float] = [0.95, 0.99],
) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, str]]]:
    """
    Calculate the filtering allele frequency (FAF) for each threshold specified in `faf_thresholds`.

    See http://cardiodb.org/allelefrequencyapp/ for more information.

    The FAF is computed for each of the following population stratifications if found in `freq_meta`:

        - All samples, with adj criteria
        - For each population, with adj criteria
        - For all sex/population on the non-PAR regions of sex chromosomes (will be missing on
          autosomes and PAR regions of sex chromosomes)

    Each FAF entry is a struct with one field per threshold specified in `faf_thresholds`.
    The field name is "faf" followed by the characters after the first two of `str(threshold)`
    (e.g. 0.95 -> `faf95`).

    This returns a tuple with two elements:

        1. An array of FAF expressions as described above
        2. A list of dicts containing the metadata for each of the array elements, in the same
           format as that produced by `annotate_freq`. This list is obtained with `hl.eval`,
           so `freq_meta` is evaluated when this function is called.

    :param freq: ArrayExpression of call stats structs (typically generated by hl.agg.call_stats)
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (typically generated using annotate_freq)
    :param locus: locus
    :param pops_to_exclude: Set of populations to exclude from faf calculation (typically bottlenecked or consanguineous populations).
        None or an empty set excludes nothing.
    :param faf_thresholds: List of FAF thresholds to compute. The default list is only read, never mutated.
    :return: (FAF expression, FAF metadata)
    """
    # The exclusion set must always be a Hail expression: the filters below call
    # `.contains` on it, which a plain Python `{}` does not provide (this used to raise
    # AttributeError whenever `pops_to_exclude` was left at its default). An explicitly
    # empty collection is routed to `hl.empty_set` as well, because an element type
    # cannot be inferred from an empty Python collection.
    _pops_to_exclude = (
        hl.literal(pops_to_exclude) if pops_to_exclude else hl.empty_set(hl.tstr)
    )

    def _faf_struct(i):
        """Build the struct holding one FAF value per threshold for `freq[i]`."""
        return hl.struct(
            **{
                f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
                    freq[i].AC, freq[i].AN, threshold
                )
                for threshold in faf_thresholds
            }
        )

    # pylint: disable=invalid-unary-operand-type
    # Indices of adj entries that are either unstratified ("group" only) or stratified by
    # population only, skipping excluded populations.
    faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (
            (freq_meta[i].size() == 1)
            | (
                (hl.set(freq_meta[i].keys()) == {"pop", "group"})
                & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
            )
        )
    )

    # Indices of adj entries stratified by sex only, or by sex and population, skipping
    # excluded populations.
    sex_faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (freq_meta[i].contains("sex"))
        & (
            (freq_meta[i].size() == 2)
            | (
                (hl.set(freq_meta[i].keys()) == {"pop", "group", "sex"})
                & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
            )
        )
    )

    faf_structs = faf_freq_indices.map(_faf_struct)

    # Sex-stratified FAFs are only reported outside of autosomes and PAR regions.
    faf_structs = faf_structs.extend(
        sex_faf_freq_indices.map(
            lambda i: hl.or_missing(~locus.in_autosome_or_par(), _faf_struct(i))
        )
    )

    # Metadata is emitted in the same order as the FAF structs above.
    faf_meta = faf_freq_indices.extend(sex_faf_freq_indices).map(
        lambda i: freq_meta[i]
    )
    return faf_structs, hl.eval(faf_meta)
def get_summary_counts_dict(
    locus_expr: hl.expr.LocusExpression,
    allele_expr: hl.expr.ArrayExpression,
    lof_expr: hl.expr.StringExpression,
    no_lof_flags_expr: hl.expr.BooleanExpression,
    most_severe_csq_expr: hl.expr.StringExpression,
    prefix_str: str = "",
) -> Dict[str, hl.expr.Int64Expression]:
    """
    Return a dictionary of aggregation expressions counting multiple variant categories.

    Categories are:

        - Number of variants
        - Number of indels
        - Number of SNVs
        - Number of LoF variants (`lof_expr` is defined)
        - Number of LoF variants that pass LOFTEE (`lof_expr` is "HC")
        - Number of LoF variants that pass LOFTEE without any flags
        - Number of LoF variants annotated as 'other splice' (OS) by LOFTEE
        - Number of LoF variants that fail LOFTEE (`lof_expr` is "LC")
        - Number of missense variants
        - Number of synonymous variants
        - Number of autosomal variants (counted with `in_autosome_or_par`, so PAR loci are included)
        - Number of allosomal variants (non-PAR X or non-PAR Y)

    .. warning::
        Assumes `allele_expr` contains only two alleles (multi-allelics have been split).
        Only the first two elements of `allele_expr` are inspected.

    :param locus_expr: LocusExpression.
    :param allele_expr: ArrayExpression containing alleles.
    :param lof_expr: StringExpression containing LOFTEE annotation.
    :param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has any flags.
    :param most_severe_csq_expr: StringExpression containing most severe consequence annotation.
    :param prefix_str: Desired prefix string for category names. Default is empty str.
    :return: Dict of categories and counts per category.
    """
    logger.warning("This function expects that multi-allelic variants have been split!")

    ref_allele = allele_expr[0]
    alt_allele = allele_expr[1]
    passes_loftee = lof_expr == "HC"
    on_allosome = locus_expr.in_x_nonpar() | locus_expr.in_y_nonpar()

    # Unprefixed category name -> aggregation; insertion order defines the output order.
    category_counts = {
        "num_variants": hl.agg.count(),
        "indels": hl.agg.count_where(hl.is_indel(ref_allele, alt_allele)),
        "snps": hl.agg.count_where(hl.is_snp(ref_allele, alt_allele)),
        "LOF": hl.agg.count_where(hl.is_defined(lof_expr)),
        "pass_loftee": hl.agg.count_where(passes_loftee),
        "pass_loftee_no_flag": hl.agg.count_where(
            passes_loftee & (no_lof_flags_expr)
        ),
        "loftee_os": hl.agg.count_where(lof_expr == "OS"),
        "fail_loftee": hl.agg.count_where(lof_expr == "LC"),
        "num_missense": hl.agg.count_where(
            most_severe_csq_expr == "missense_variant"
        ),
        "num_synonymous": hl.agg.count_where(
            most_severe_csq_expr == "synonymous_variant"
        ),
        "num_autosomal_variants": hl.agg.filter(
            locus_expr.in_autosome_or_par(), hl.agg.count()
        ),
        "num_allosomal_variants": hl.agg.filter(on_allosome, hl.agg.count()),
    }

    return {
        f"{prefix_str}{category}": count_expr
        for category, count_expr in category_counts.items()
    }