def adjusted_sex_ploidy_expr(
    locus_expr: hl.expr.LocusExpression,
    gt_expr: hl.expr.CallExpression,
    karyotype_expr: hl.expr.StringExpression,
    xy_karyotype_str: str = "XY",
    xx_karyotype_str: str = "XX",
) -> hl.expr.CallExpression:
    """
    Build an entry expression that adjusts a genotype for sex ploidy.

    The branches are evaluated in order:

        1. XX sample anywhere on chrY (PAR or non-PAR): missing call.
        2. XY sample on non-PAR chrX/chrY with a het genotype: missing call.
        3. XY sample on non-PAR chrX/chrY otherwise: unphased haploid call
           built from the first allele of `gt_expr`.
        4. Anything else: `gt_expr` unchanged.

    :param locus_expr: Locus
    :param gt_expr: Genotype
    :param karyotype_expr: Karyotype
    :param xy_karyotype_str: Male sex karyotype representation
    :param xx_karyotype_str: Female sex karyotype representation
    :return: Genotype adjusted for sex ploidy
    """
    is_xy = karyotype_expr == xy_karyotype_str
    is_xx = karyotype_expr == xx_karyotype_str

    on_chr_y = locus_expr.in_y_par() | locus_expr.in_y_nonpar()
    on_sex_nonpar = locus_expr.in_x_nonpar() | locus_expr.in_y_nonpar()
    xy_hemizygous_site = is_xy & on_sex_nonpar

    no_call = hl.null(hl.tcall)

    # Branch order matters: the het check must precede the generic haploid
    # conversion, otherwise het calls would be silently collapsed to gt[0].
    ploidy_case = hl.case(missing_false=True)
    ploidy_case = ploidy_case.when(is_xx & on_chr_y, no_call)
    ploidy_case = ploidy_case.when(xy_hemizygous_site & gt_expr.is_het(), no_call)
    ploidy_case = ploidy_case.when(
        xy_hemizygous_site, hl.call(gt_expr[0], phased=False)
    )
    return ploidy_case.default(gt_expr)
def _is_dnm(
    proband_gt: hl.expr.CallExpression,
    father_gt: hl.expr.CallExpression,
    mother_gt: hl.expr.CallExpression,
    locus: hl.expr.LocusExpression,
    proband_is_female: Optional[hl.expr.BooleanExpression],
) -> hl.expr.BooleanExpression:
    """
    Tell whether a trio genotype combination is a de novo mutation (DNM).

    :param proband_gt: Proband genotype
    :param father_gt: Father genotype
    :param mother_gt: Mother genotype
    :param locus: Locus of the variant
    :param proband_is_female: Whether the proband is female. When `None`,
        only autosomal sites are evaluated (a warning is logged) and the
        result is missing everywhere else.
    :return: Boolean expression; missing where the combination of locus and
        proband sex is not handled
    """
    # Het proband with two hom-ref parents: the DNM pattern at diploid sites.
    het_from_homref_parents = (
        proband_gt.is_het() & father_gt.is_hom_ref() & mother_gt.is_hom_ref()
    )

    if proband_is_female is None:
        logger.warning(
            "Since no proband sex expression was given to generate_trio_stats_expr, only DNMs in autosomes will be counted."
        )
        return hl.or_missing(locus.in_autosome(), het_from_homref_parents)

    proband_is_diploid_here = locus.in_autosome_or_par() | (
        proband_is_female & locus.in_x_nonpar()
    )

    # Remaining sites are treated as hemizygous; only male probands get a
    # defined answer there.
    # NOTE(review): this branch checks only the father's genotype, including
    # on non-PAR X -- confirm that is the intended parent for that region.
    hemizygous_dnm = hl.or_missing(
        ~proband_is_female,
        proband_gt.is_hom_var() & father_gt.is_hom_ref(),
    )

    return hl.cond(proband_is_diploid_here, het_from_homref_parents, hemizygous_dnm)
def _get_copy_state(
    locus: hl.expr.LocusExpression,
) -> hl.expr.Int32Expression:
    """
    Helper method mapping a LocusExpression to a copy-state int for indexing
    into the trans_count_map.

    :param locus: Locus to classify
    :return: `auto_or_par` on autosomes/PAR, `hemi_x` on non-PAR X,
        `hemi_y` on non-PAR Y, missing otherwise
    """
    # (condition, copy-state) pairs, tested in this order.
    region_states = (
        (locus.in_autosome_or_par(), auto_or_par),
        (locus.in_x_nonpar(), hemi_x),
        (locus.in_y_nonpar(), hemi_y),
    )

    state_case = hl.case()
    for in_region, state in region_states:
        state_case = state_case.when(in_region, state)
    return state_case.or_missing()
def phase_diploid_proband(
        locus: hl.expr.LocusExpression,
        alleles: hl.expr.ArrayExpression,
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the case of a diploid proband
    (autosomes, PAR regions of sex chromosomes or non-PAR regions of a female proband)

    The result is missing unless exactly one (father allele, mother allele)
    pair reproduces the proband genotype.

    :param LocusExpression locus: Locus in the trio MatrixTable
    :param ArrayExpression alleles: Alleles in the trio MatrixTable
    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    target_vector = proband_call.one_hot_alleles(alleles)

    # On non-PAR X/Y the father must be haploid; a diploid father there makes
    # the paternal vectors (and therefore the whole result) missing.
    on_sex_nonpar = locus.in_x_nonpar() | locus.in_y_nonpar()
    haploid_paternal_vectors = hl.or_missing(
        father_call.is_haploid(),
        hl.array([father_call.one_hot_alleles(alleles)]),
    )
    paternal_vectors = hl.cond(
        on_sex_nonpar,
        haploid_paternal_vectors,
        call_to_one_hot_alleles_array(father_call, alleles),
    )
    maternal_vectors = call_to_one_hot_alleles_array(mother_call, alleles)
    indexed_maternal = hl.zip_with_index(maternal_vectors)

    def _maternal_matches(paternal):
        # `paternal` / `maternal` are (allele index, one-hot vector) tuples;
        # keep maternal alleles that, added to this paternal allele, give
        # the proband genotype.
        return (
            indexed_maternal
            .filter(lambda maternal: maternal[1] + paternal[1] == target_vector)
            .map(lambda maternal: hl.struct(m=maternal[0], f=paternal[0]))
        )

    transmissions = hl.flatmap(
        _maternal_matches, hl.zip_with_index(paternal_vectors)
    )
    unambiguous = hl.is_defined(transmissions) & (hl.len(transmissions) == 1)
    chosen = transmissions[0]

    phased_proband = hl.call(
        father_call[chosen.f], mother_call[chosen.m], phased=True
    )
    phased_father = hl.cond(
        father_call.is_haploid(),
        hl.call(father_call[0], phased=True),
        phase_parent_call(father_call, chosen.f),
    )
    phased_mother = phase_parent_call(mother_call, chosen.m)

    return hl.or_missing(
        unambiguous,
        hl.array([phased_proband, phased_father, phased_mother]),
    )
def hemi_expr(
    locus: hl.expr.LocusExpression,
    sex_expr: hl.expr.StringExpression,
    gt: hl.expr.CallExpression,
    male_str: str = "XY",
) -> hl.expr.BooleanExpression:
    """
    Return whether genotypes are hemizygous.

    Return missing expression if locus is not in chrX/chrY non-PAR regions.

    :param locus: Input locus.
    :param sex_expr: Input StringExpression indicating whether sample is XX or XY.
    :param gt: Input genotype.
    :param male_str: String indicating whether sample is XY. Default is "XY".
    :return: BooleanExpression indicating whether genotypes are hemizygous.
    """
    in_sex_nonpar = locus.in_x_nonpar() | locus.in_y_nonpar()

    # A haploid genotype carries a single allele index, so gt[0] == 1 tests
    # whether that one allele is the alternate allele.
    haploid_alt_in_male = (
        gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1)
    )

    return hl.or_missing(in_sex_nonpar, haploid_alt_in_male)
def faf_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    locus: hl.expr.LocusExpression,
    pops_to_exclude: Optional[Set[str]] = None,
    faf_thresholds: List[float] = [0.95, 0.99],
) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, str]]]:
    """
    Calculates the filtering allele frequency (FAF) for each threshold specified in `faf_thresholds`.

    See http://cardiodb.org/allelefrequencyapp/ for more information.

    The FAF is computed for each of the following population stratification if found in `freq_meta`:

        - All samples, with adj criteria
        - For each population, with adj criteria
        - For all sex/population on the non-PAR regions of sex chromosomes (will be missing on autosomes and PAR regions of sex chromosomes)

    Each of the FAF entry is a struct with one entry per threshold specified in `faf_thresholds` of type float64.

    This returns a tuple with two expressions:

        1. An array of FAF expressions as described above
        2. An array of dict containing the metadata for each of the array elements, in the same format as that produced by `annotate_freq`.

    :param freq: ArrayExpression of call stats structs (typically generated by hl.agg.call_stats)
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (typically generated using annotate_freq)
    :param locus: locus
    :param pops_to_exclude: Set of populations to exclude from faf calculation (typically bottlenecked or consanguineous populations). `None` or an empty set excludes nothing.
    :param faf_thresholds: List of FAF thresholds to compute. The default list is only read, never mutated.
    :return: (FAF expression, FAF metadata)
    """
    # The filters below call `.contains` on the exclusion set, so it must be a
    # Hail expression even when there is nothing to exclude: a plain Python
    # `{}` has no `.contains`. An empty Python set is routed to
    # `hl.empty_set` as well, since `hl.literal` has no elements to infer
    # the set's type from.
    nothing_to_exclude = pops_to_exclude is None or (
        isinstance(pops_to_exclude, (set, frozenset)) and not pops_to_exclude
    )
    _pops_to_exclude = (
        hl.empty_set(hl.tstr) if nothing_to_exclude else hl.literal(pops_to_exclude)
    )

    def _faf_struct(i):
        # One struct field per threshold; e.g. 0.95 -> "faf95"
        # (the leading "0." of the threshold's string form is dropped).
        return hl.struct(
            **{
                f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
                    freq[i].AC, freq[i].AN, threshold
                )
                for threshold in faf_thresholds
            }
        )

    # pylint: disable=invalid-unary-operand-type
    # Indices of adj entries that are either the global group (only the
    # "group" key) or a single non-excluded population.
    faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (
            (freq_meta[i].size() == 1)
            | (
                (hl.set(freq_meta[i].keys()) == {"pop", "group"})
                & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
            )
        )
    )

    # Same selection, but for sex-stratified entries (global-by-sex or
    # population-by-sex).
    sex_faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (freq_meta[i].contains("sex"))
        & (
            (freq_meta[i].size() == 2)
            | (
                (hl.set(freq_meta[i].keys()) == {"pop", "group", "sex"})
                & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
            )
        )
    )

    faf = faf_freq_indices.map(_faf_struct)

    # Sex-stratified FAF is only defined on non-PAR sex-chromosome sites.
    sex_faf = sex_faf_freq_indices.map(
        lambda i: hl.or_missing(~locus.in_autosome_or_par(), _faf_struct(i))
    )
    faf = faf.extend(sex_faf)

    faf_meta = faf_freq_indices.extend(sex_faf_freq_indices).map(
        lambda i: freq_meta[i]
    )
    # The metadata is evaluated eagerly so callers get plain Python dicts.
    return faf, hl.eval(faf_meta)
def get_summary_counts_dict(
    locus_expr: hl.expr.LocusExpression,
    allele_expr: hl.expr.ArrayExpression,
    lof_expr: hl.expr.StringExpression,
    no_lof_flags_expr: hl.expr.BooleanExpression,
    most_severe_csq_expr: hl.expr.StringExpression,
    prefix_str: str = "",
) -> Dict[str, hl.expr.Int64Expression]:
    """
    Return dictionary containing counts of multiple variant categories.

    Categories are:
        - Number of variants
        - Number of indels
        - Number of SNVs
        - Number of LoF variants
        - Number of LoF variants that pass LOFTEE
        - Number of LoF variants that pass LOFTEE without any flags
        - Number of LoF variants annotated as 'other splice' (OS) by LOFTEE
        - Number of LoF variants that fail LOFTEE
        - Number of missense variants
        - Number of synonymous variants
        - Number of autosomal variants (autosomes or PAR regions)
        - Number of allosomal variants (non-PAR regions of chrX/chrY)

    .. warning::
        Assumes `allele_expr` contains only two alleles (multi-allelics have been split).

    :param locus_expr: LocusExpression.
    :param allele_expr: ArrayExpression containing alleles.
    :param lof_expr: StringExpression containing LOFTEE annotation.
    :param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has any flags.
    :param most_severe_csq_expr: StringExpression containing most severe consequence annotation.
    :param prefix_str: Desired prefix string for category names. Default is empty str.
    :return: Dict of categories and counts per category.
    """
    logger.warning(
        "This function expects that multi-allelic variants have been split!"
    )

    ref_allele, alt_allele = allele_expr[0], allele_expr[1]
    passes_loftee = lof_expr == "HC"
    in_sex_nonpar = locus_expr.in_x_nonpar() | locus_expr.in_y_nonpar()

    # Unprefixed category name -> aggregation; the prefix is applied once below.
    category_counts = {
        "num_variants": hl.agg.count(),
        "indels": hl.agg.count_where(hl.is_indel(ref_allele, alt_allele)),
        "snps": hl.agg.count_where(hl.is_snp(ref_allele, alt_allele)),
        "LOF": hl.agg.count_where(hl.is_defined(lof_expr)),
        "pass_loftee": hl.agg.count_where(passes_loftee),
        "pass_loftee_no_flag": hl.agg.count_where(
            passes_loftee & (no_lof_flags_expr)
        ),
        "loftee_os": hl.agg.count_where(lof_expr == "OS"),
        "fail_loftee": hl.agg.count_where(lof_expr == "LC"),
        "num_missense": hl.agg.count_where(
            most_severe_csq_expr == "missense_variant"
        ),
        "num_synonymous": hl.agg.count_where(
            most_severe_csq_expr == "synonymous_variant"
        ),
        "num_autosomal_variants": hl.agg.filter(
            locus_expr.in_autosome_or_par(), hl.agg.count()
        ),
        "num_allosomal_variants": hl.agg.filter(in_sex_nonpar, hl.agg.count()),
    }

    return {
        f"{prefix_str}{category}": count
        for category, count in category_counts.items()
    }
def phase_by_transmission(
        locus: hl.expr.LocusExpression,
        alleles: hl.expr.ArrayExpression,
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """Phases genotype calls in a trio based on allele transmission.

    Notes
    -----
    In the phased calls returned, the order is as follows:
    - Proband: father_allele | mother_allele
    - Parents: transmitted_allele | untransmitted_allele

    Phasing of sex chromosomes:
    - Sex chromosomes of male individuals should be haploid to be phased correctly.
    - If `proband_call` is diploid on non-par regions of the sex chromosomes, it is assumed to be female.

    Returns `NA` when genotype calls cannot be phased.
    The following genotype calls combinations cannot be phased by transmission:
    1. One of the calls in the trio is missing
    2. The proband genotype cannot be obtained from the parents alleles (Mendelian violation)
    3. All individuals of the trio are heterozygous for the same two alleles
    4. Father is diploid on non-PAR region of X or Y
    5. Proband is diploid on non-PAR region of Y

    In addition, individual phased genotype calls are returned as missing in the following situations:
    1. All mother genotype calls on the non-PAR region of Y
    2. Diploid father genotype calls on non-PAR region of X for a male proband (proband and mother are still phased as father doesn't participate in allele transmission)

    Note
    ----
    :meth:`.experimental.phase_trio_matrix_by_transmission` provides a convenience wrapper for phasing a trio matrix.

    Parameters
    ----------
    locus : :class:`.LocusExpression`
        Expression for the locus in the trio matrix
    alleles : :class:`.ArrayExpression`
        Expression for the alleles in the trio matrix
    proband_call : :class:`.CallExpression`
        Expression for the proband call in the trio matrix
    father_call : :class:`.CallExpression`
        Expression for the father call in the trio matrix
    mother_call : :class:`.CallExpression`
        Expression for the mother call in the trio matrix

    Returns
    -------
    :class:`.ArrayExpression`
        Array containing: [phased proband call, phased father call, phased mother call]"""

    def call_to_one_hot_alleles_array(call: hl.expr.CallExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
        """
        Get the set of all different one-hot-encoded allele-vectors in a genotype call.
        It is returned as an ordered array where the first vector corresponds to the first allele,
        and the second vector (only present if het) the second allele.

        :param CallExpression call: genotype
        :param ArrayExpression alleles: Alleles at the site
        :return: Array of one-hot-encoded alleles
        :rtype: ArrayExpression
        """
        # Each allele is wrapped in its own single-allele call so that it can
        # be one-hot encoded independently of the other allele.
        return hl.cond(
            call.is_het(),
            hl.array([
                hl.call(call[0]).one_hot_alleles(alleles),
                hl.call(call[1]).one_hot_alleles(alleles),
            ]),
            hl.array([hl.call(call[0]).one_hot_alleles(alleles)])
        )

    def phase_parent_call(call: hl.expr.CallExpression, transmitted_allele_index: int):
        """
        Given a genotype and which allele was transmitted to the offspring, returns the parent phased genotype.
        The transmitted allele is placed first, the untransmitted allele second.

        :param CallExpression call: Parent genotype
        :param int transmitted_allele_index: index of transmitted allele (0 or 1)
        :return: Phased parent genotype
        :rtype: CallExpression
        """
        # `transmitted_allele_index == 0` converts to 1 when index 0 was
        # transmitted and to 0 otherwise, i.e. the index of the other allele.
        return hl.call(
            call[transmitted_allele_index],
            call[hl.int(transmitted_allele_index == 0)],
            phased=True
        )

    def phase_diploid_proband(
            locus: hl.expr.LocusExpression,
            alleles: hl.expr.ArrayExpression,
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
            mother_call: hl.expr.CallExpression
    ) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the case of a diploid proband
        (autosomes, PAR regions of sex chromosomes or non-PAR regions of a female proband)

        :param LocusExpression locus: Locus in the trio MatrixTable
        :param ArrayExpression alleles: Alleles in the trio MatrixTable
        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :param CallExpression mother_call: Input mother genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """

        proband_v = proband_call.one_hot_alleles(alleles)
        # On non-PAR X/Y only a haploid father is accepted; a diploid father
        # there makes `father_v` missing, which propagates to the result.
        father_v = hl.cond(
            locus.in_x_nonpar() | locus.in_y_nonpar(),
            hl.or_missing(father_call.is_haploid(), hl.array([father_call.one_hot_alleles(alleles)])),
            call_to_one_hot_alleles_array(father_call, alleles)
        )
        mother_v = call_to_one_hot_alleles_array(mother_call, alleles)

        # Every (father allele index, mother allele index) pair whose one-hot
        # vectors add up to the proband's genotype vector. `f` and `m` are
        # (index, vector) tuples produced by zip_with_index.
        combinations = hl.flatmap(
            lambda f:
            hl.zip_with_index(mother_v)
                .filter(lambda m: m[1] + f[1] == proband_v)
                .map(lambda m: hl.struct(m=m[0], f=f[0])),
            hl.zip_with_index(father_v)
        )

        # Phase only when exactly one transmission pattern explains the
        # proband; zero matches or several matches both yield missing.
        return (
            hl.or_missing(
                hl.is_defined(combinations) & (hl.len(combinations) == 1),
                hl.array([
                    hl.call(father_call[combinations[0].f], mother_call[combinations[0].m], phased=True),
                    hl.cond(father_call.is_haploid(), hl.call(father_call[0], phased=True), phase_parent_call(father_call, combinations[0].f)),
                    phase_parent_call(mother_call, combinations[0].m)
                ])
            )
        )

    def phase_haploid_proband_x_nonpar(
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
            mother_call: hl.expr.CallExpression
    ) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the case of a haploid proband in the non-PAR region of X

        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :param CallExpression mother_call: Input mother genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """

        # First maternal allele (as an (index, allele) tuple) that equals the
        # proband's single allele; missing when neither maternal allele matches.
        transmitted_allele = hl.zip_with_index(hl.array([mother_call[0], mother_call[1]])).find(lambda m: m[1] == proband_call[0])
        return hl.or_missing(
            hl.is_defined(transmitted_allele),
            hl.array([
                hl.call(proband_call[0], phased=True),
                # The father's call is only reported when haploid; it is not
                # used to decide whether the proband can be phased.
                hl.or_missing(father_call.is_haploid(), hl.call(father_call[0], phased=True)),
                phase_parent_call(mother_call, transmitted_allele[0])
            ])
        )

    def phase_y_nonpar(
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
    ) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the non-PAR region of Y (requires both father and proband to be haploid to return phase)

        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """
        # The mother's slot in the returned array is always a missing call.
        return hl.or_missing(
            proband_call.is_haploid() & father_call.is_haploid() & (father_call[0] == proband_call[0]),
            hl.array([
                hl.call(proband_call[0], phased=True),
                hl.call(father_call[0], phased=True),
                hl.null(hl.tcall)
            ])
        )

    # Branches are tested in order: haploid proband on non-PAR X first, then
    # every non-PAR Y site, then any remaining diploid proband. Anything not
    # matched by a branch is missing.
    return (
        hl.case()
            .when(locus.in_x_nonpar() & proband_call.is_haploid(), phase_haploid_proband_x_nonpar(proband_call, father_call, mother_call))
            .when(locus.in_y_nonpar(), phase_y_nonpar(proband_call, father_call))
            .when(proband_call.is_diploid(), phase_diploid_proband(locus, alleles, proband_call, father_call, mother_call))
            .or_missing()
    )