def test_hw_func_and_agg_agree(self):
    """Check that the HWE aggregator and the scalar HWE function agree on every row.

    ``hw`` is computed directly from the genotype calls with the aggregator;
    ``hw2`` is recomputed from the genotype counts derived from ``call_stats``.
    """
    mt = hl.import_vcf(resource('sample.vcf'))
    mt = mt.annotate_rows(
        stats=hl.agg.call_stats(mt.GT, mt.alleles),
        hw=hl.agg.hardy_weinberg_test(mt.GT),
    )

    hom_counts = mt.stats.homozygote_count
    # Alt-allele count minus the two alt alleles carried by each hom-alt call.
    n_het = mt.stats.AC[1] - 2 * hom_counts[1]
    mt = mt.annotate_rows(
        hw2=hl.hardy_weinberg_test(hom_counts[0], n_het, hom_counts[1]),
    )

    rows = mt.rows()
    results_match = rows.all(rows.hw == rows.hw2)
    self.assertTrue(results_match)
def getVariantStats(ipvDict, studyPerVariant, centersPerHomoVus, inList, outList):
    """Build a per-variant summary dictionary from the raw per-variant records.

    For each key of ``ipvDict`` the record's class, frequencies, genotype
    counts, ``F``/``Z``/``chisquare`` values and first homozygous and
    heterozygous individual are collected. Allele frequencies ``p`` and ``q``
    are derived from the genotype counts and a Hardy-Weinberg p-value is
    obtained from Hail.

    :param ipvDict: Mapping of variant key to a record providing the fields
        'class', 'maxFreq', 'cohortFreq', 'aa', 'Aa', 'AA', 'F', 'Z',
        'exonic', 'chisquare', 'homozygous individuals' and
        'heterozygous individuals'.
    :param studyPerVariant: Mapping of cleaned variant key to study.
    :param centersPerHomoVus: Mapping of cleaned variant key to sequencing
        center(s).
    :param inList: Cleaned variant keys reported as 'True' for 'inGnomad'.
    :param outList: Cleaned variant keys reported as 'False' for 'inGnomad'.
    :return: Mapping of cleaned variant key (spaces and single quotes
        removed) to a dictionary of summary values.
    """
    variantsDict = {}
    for rawKey in ipvDict.keys():
        record = ipvDict[rawKey]

        vClass = record['class']
        vPopFreq = '%.4f' % (record['maxFreq'])
        vCohortFreq = '%.4f' % (record['cohortFreq'])
        aa = str(record['aa'])
        Aa = str(record['Aa'])
        AA = str(record['AA'])
        F = str(record['F'])
        Z = str(record['Z'])

        # Counts are kept as strings in the output; convert once for the math.
        nAA = int(AA)
        nAa = int(Aa)
        naa = int(aa)
        p = (2 * nAA + nAa) / (2 * (nAA + nAa + naa))
        q = 1 - p

        exonic = str(record['exonic'])
        chisquare = str(record['chisquare'])

        homozygotes = record['homozygous individuals']
        homoSample = homozygotes[0] if len(homozygotes) != 0 else "None"
        heterozygotes = record['heterozygous individuals']
        heteroSample = heterozygotes[0] if len(heterozygotes) != 0 else "None"

        # The lookup tables and the output are keyed by the key with spaces
        # and single quotes stripped.
        v = rawKey.replace(' ', '').replace("'", "")
        study = studyPerVariant[v]

        if v in inList:
            vIn = 'True'
        elif v in outList:
            vIn = 'False'
        else:
            vIn = 'NA'

        variantsDict[v] = {
            'class': vClass,
            'popFreq': vPopFreq,
            'cohortFreq': vCohortFreq,
            'homozygousSample': homoSample,
            'heterozygousSample': heteroSample,
            'inGnomad': vIn,
            'aa': aa,
            'Aa': Aa,
            'AA': AA,
            'hail_hweafp': hl.eval(hl.hardy_weinberg_test(nAA, nAa, naa)).p_value,
            'F': F,
            'Z': Z,
            'p': p,
            'q': q,
            'chisquare': chisquare,
            'sequenceCenter': str(centersPerHomoVus[v]).replace(" ", ""),
            'exonic': exonic,
            'study': study,
        }
    return variantsDict
def test_hw_func_and_agg_agree(self):
    """Verify the scalar Hardy-Weinberg test reproduces the aggregator's result.

    The aggregator result (``hw``) is computed from the calls; the scalar
    result (``hw2``) is computed from genotype counts derived from
    ``call_stats``. Both must be equal for every row of the sample VCF.
    """
    dataset = hl.import_vcf(resource('sample.vcf'))
    dataset = dataset.annotate_rows(
        stats=hl.agg.call_stats(dataset.GT, dataset.alleles),
        hw=hl.agg.hardy_weinberg_test(dataset.GT),
    )

    stats = dataset.stats
    n_hom_ref = stats.homozygote_count[0]
    n_hom_var = stats.homozygote_count[1]
    # Heterozygotes: alt alleles not accounted for by homozygous-alt calls.
    n_het = stats.AC[1] - 2 * stats.homozygote_count[1]
    dataset = dataset.annotate_rows(
        hw2=hl.hardy_weinberg_test(n_hom_ref, n_het, n_hom_var),
    )

    row_table = dataset.rows()
    self.assertTrue(row_table.all(row_table.hw == row_table.hw2))
def variant_qc_aggregator(mt) -> hl.expr.StructExpression:
    """:func:`.variant_qc` as an aggregator.

    Builds the variant QC struct as an aggregation expression over the entries
    of `mt` and returns that expression; `mt` itself is not annotated.

    The returned struct contains, in order:

    - `dp_stats` / `gq_stats` (only when `mt` has an ``int32`` entry field
      `DP` / `GQ`) -- `mean`, `stdev`, `min` and `max` of the field.
    - every field of :func:`hl.agg.call_stats` for `GT`.
    - `call_rate`, `n_called`, `n_not_called`, `n_filtered`, `n_het`,
      `n_non_ref`.
    - `het_freq_hwe`, `p_value_hwe` -- from :func:`hl.hardy_weinberg_test`;
      missing unless the variant has exactly two alleles.

    :param mt: MatrixTable with an entry field `GT` of type ``call`` and a row
        field `alleles`.
    :return: Struct expression with the QC metrics described above.
    :raises ValueError: If `mt` has no entry field `GT` of type ``call``.
    """
    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            "'variant_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    # NOTE(review): `n_cols` is the same `hl.agg.count()` aggregation that is
    # subtracted from it below, so `n_filtered` looks like it always evaluates
    # to 0 here -- confirm whether a true column count was intended.
    n_cols = hl.agg.count()
    bound_exprs['n_filtered'] = hl.int64(n_cols) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)

    def _with_counts(e1):
        """Bind the HWE result (or missing) and assemble the final struct."""
        call_stats = e1.call_stats
        hom_counts = call_stats.homozygote_count
        # HWE is only computed for biallelic variants; otherwise missing.
        hwe_or_missing = hl.case().when(
            hl.len(mt.alleles) == 2,
            hl.hardy_weinberg_test(
                hom_counts[0],
                call_stats.AC[1] - 2 * hom_counts[1],
                hom_counts[1])).or_missing()

        def _assemble(hwe):
            return hl.struct(**{
                **gq_dp_exprs,
                **call_stats,
                'call_rate': hl.float(e1.n_called) / (
                    e1.n_called + e1.n_not_called + e1.n_filtered),
                'n_called': e1.n_called,
                'n_not_called': e1.n_not_called,
                'n_filtered': e1.n_filtered,
                'n_het': e1.n_called - hl.sum(hom_counts),
                'n_non_ref': e1.n_called - hom_counts[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value,
            })

        return hl.rbind(hwe_or_missing, _assemble)

    return hl.rbind(hl.struct(**bound_exprs), _with_counts)
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `call_rate` (``float32``) -- Fraction of samples with a defined `GT`.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg
      equilibrium. See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants
    beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    def has_field_of_type(field, dtype):
        return field in mt.entry and mt[field].dtype == dtype

    # Column count, used below for n_not_called and call_rate.
    n_samples = mt.count_cols()

    summary_fields = ('mean', 'stdev', 'min', 'max')
    exprs = {}
    if has_field_of_type('DP', hl.tint32):
        exprs['dp_stats'] = hl.agg.stats(mt.DP).select(*summary_fields)
    if has_field_of_type('GQ', hl.tint32):
        exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(*summary_fields)

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")

    exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))

    # Struct-valued aggregations whose fields are flattened into the result;
    # further nested computations can be added to this list.
    struct_exprs = [hl.agg.call_stats(mt.GT, mt.alleles)]

    def flatten_struct(*structs):
        merged = {key: value for struct in structs for key, value in struct.items()}
        return hl.struct(**merged, **exprs)

    mt = mt.annotate_rows(**{name: hl.bind(flatten_struct, *struct_exprs)})

    qc = mt[name]
    hom_counts = qc.homozygote_count
    hwe = hl.hardy_weinberg_test(
        hom_counts[0],
        qc.AC[1] - 2 * hom_counts[1],
        hom_counts[1])
    hwe = hwe.select(het_freq_hwe=hwe.het_freq_hwe, p_value_hwe=hwe.p_value)
    # HWE fields are only kept for biallelic variants; missing otherwise.
    hwe_if_biallelic = hl.cond(hl.len(mt.alleles) == 2, hwe, hl.null(hwe.dtype))

    extended = qc.annotate(
        n_not_called=n_samples - qc.n_called,
        call_rate=qc.n_called / n_samples,
        n_het=qc.n_called - hl.sum(hom_counts),
        n_non_ref=qc.n_called - hom_counts[0],
        **hwe_if_biallelic)
    return mt.annotate_rows(**{name: extended})
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor
      filtered. Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg
      equilibrium. See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants
    beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    def has_field_of_type(field, dtype):
        return field in mt.entry and mt[field].dtype == dtype

    summary_fields = ('mean', 'stdev', 'min', 'max')
    gq_dp_exprs = {}
    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(*summary_fields)
    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(*summary_fields)

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")

    # Aggregations bound once so the derived metrics below can reuse them.
    bound_exprs = {
        'n_called': hl.agg.count_where(hl.is_defined(mt['GT'])),
        'n_not_called': hl.agg.count_where(hl.is_missing(mt['GT'])),
        'n_filtered': mt.count_cols(_localize=False) - hl.agg.count(),
        'call_stats': hl.agg.call_stats(mt.GT, mt.alleles),
    }

    def _with_counts(e1):
        call_stats = e1.call_stats
        hom_counts = call_stats.homozygote_count
        # HWE is only computed for biallelic variants; otherwise missing.
        hwe_or_missing = hl.case().when(
            hl.len(mt.alleles) == 2,
            hl.hardy_weinberg_test(
                hom_counts[0],
                call_stats.AC[1] - 2 * hom_counts[1],
                hom_counts[1])).or_missing()

        def _assemble(hwe):
            return hl.struct(**{
                **gq_dp_exprs,
                **call_stats,
                'call_rate': hl.float(e1.n_called) / (
                    e1.n_called + e1.n_not_called + e1.n_filtered),
                'n_called': e1.n_called,
                'n_not_called': e1.n_not_called,
                'n_filtered': e1.n_filtered,
                'n_het': e1.n_called - hl.sum(hom_counts),
                'n_non_ref': e1.n_called - hom_counts[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value,
            })

        return hl.rbind(hwe_or_missing, _assemble)

    result = hl.rbind(hl.struct(**bound_exprs), _with_counts)
    return mt.annotate_rows(**{name: result})
def hailHWTest(AA, Aa, aa):
    """Return the Hardy-Weinberg test p-value computed by Hail.

    The three genotype counts are passed to ``hail.hardy_weinberg_test`` in
    the order given, the resulting expression is evaluated locally, and the
    ``p_value`` field of the result is returned.
    """
    hweExpr = hail.hardy_weinberg_test(AA, Aa, aa)
    hweResult = hail.eval(hweExpr)
    return hweResult.p_value
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor
      filtered. Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg
      equilibrium. See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants
    beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    def has_field_of_type(field, dtype):
        return field in mt.entry and mt[field].dtype == dtype

    stat_names = ('mean', 'stdev', 'min', 'max')
    gq_dp_exprs = {}
    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(*stat_names)
    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(*stat_names)

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")

    # Aggregations computed once and bound, then reused by the derived fields.
    bound_exprs = {
        'n_called': hl.agg.count_where(hl.is_defined(mt['GT'])),
        'n_not_called': hl.agg.count_where(hl.is_missing(mt['GT'])),
        'n_filtered': mt.count_cols(_localize=False) - hl.agg.count(),
        'call_stats': hl.agg.call_stats(mt.GT, mt.alleles),
    }

    def _build_from_counts(e1):
        stats = e1.call_stats
        hom = stats.homozygote_count
        # Only biallelic variants get an HWE result; others are set to missing.
        hwe_expr = hl.case().when(
            hl.len(mt.alleles) == 2,
            hl.hardy_weinberg_test(hom[0], stats.AC[1] - 2 * hom[1], hom[1]),
        ).or_missing()

        def _build_struct(hwe):
            total_entries = e1.n_called + e1.n_not_called + e1.n_filtered
            return hl.struct(**{
                **gq_dp_exprs,
                **stats,
                'call_rate': hl.float(e1.n_called) / total_entries,
                'n_called': e1.n_called,
                'n_not_called': e1.n_not_called,
                'n_filtered': e1.n_filtered,
                'n_het': e1.n_called - hl.sum(hom),
                'n_non_ref': e1.n_called - hom[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value,
            })

        return hl.rbind(hwe_expr, _build_struct)

    result = hl.rbind(hl.struct(**bound_exprs), _build_from_counts)
    return mt.annotate_rows(**{name: result})
def determine_pca_variants(
    autosomes_only: bool = True,
    snv_only: bool = True,
    bi_allelic_only: bool = False,
    adj_only: bool = True,
    min_gnomad_v3_ac: Optional[int] = None,
    high_qual_ccdg_exome_interval_only: bool = False,
    high_qual_ukbb_exome_interval_only: bool = False,
    pct_samples_ukbb_exome_interval: float = 0.8,
    min_joint_af: float = 0.0001,  # TODO: Konrad mentioned that he might want to lower this
    min_joint_callrate: float = 0.95,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    min_ccdg_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    min_ukbb_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    filter_lcr: bool = True,
    filter_segdup: bool = True,
    ld_pruning: bool = True,
    ld_pruning_dataset: str = "ccdg_genomes",
    ld_r2: float = 0.1,
    read_per_dataset_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_ht_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_mt_checkpoint_if_exists: bool = False,
    overwrite: bool = True,
    filter_washu: bool = False,
) -> None:
    """
    Determine a diverse set of variants for relatedness/ancestry PCA using CCDG, gnomAD v3, and UK Biobank.

    The resulting Tables/MatrixTables are checkpointed or written to the project result paths; nothing is returned.

    :param autosomes_only: Whether to filter to variants in autosomes
    :param snv_only: Whether to filter to SNVs
    :param bi_allelic_only: Whether to filter to variants that are bi-allelic in either CCDG and gnomAD v3
    :param adj_only: If set, only ADJ genotypes (QD >= 2, FS <= 60 and MQ >= 30) are kept. This filter is applied
        before the call rate and AF calculation
    :param min_gnomad_v3_ac: Optional lower bound of AC for variants in gnomAD v3 genomes
    :param high_qual_ccdg_exome_interval_only: Whether to filter to high quality intervals in CCDG exomes
    :param high_qual_ukbb_exome_interval_only: Whether to filter to high quality intervals in UKBB 455K exomes.
        Only available when `autosomes_only` is set
    :param pct_samples_ukbb_exome_interval: Lower bound on the UK Biobank interval QC `pct_samples_20x` value
        (samples with coverage greater than 20x over the interval) for an interval to be kept when
        `high_qual_ukbb_exome_interval_only` is set
    :param min_joint_af: Lower bound for combined MAF computed from CCDG and gnomAD v3 genomes
    :param min_joint_callrate: Lower bound for combined callrate computed from CCDG and gnomAD v3 genomes
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to `None`
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to `None`
    :param min_ccdg_exome_callrate: Lower bound for CCDG exomes callrate
    :param min_ukbb_exome_callrate: Lower bound for UKBB exomes callrate
    :param filter_lcr: Whether to filter LCR regions
    :param filter_segdup: Whether to filter Segdup regions
    :param ld_pruning: Whether to conduct LD pruning
    :param ld_pruning_dataset: Which dataset is used for LD pruning, 'ccdg_genomes' or 'gnomad_genomes'
    :param ld_r2: LD pruning cutoff
    :param read_per_dataset_checkpoint_if_exists: Whether to read the CCDG exome/genome pre filtered HT if it exists.
        Each dataset possible filtered to: autosomes only, SNVs only, gnomAD v3.1.2 AC filter, CCDG high quality
        exome intervals, and UK Biobank high quality exome intervals
    :param read_pre_ld_prune_ht_checkpoint_if_exists: Whether to read in the PCA variant HT with no LD-pruning if
        it exists
    :param read_pre_ld_prune_mt_checkpoint_if_exists: Whether to read in the checkpointed MT filtered to variants in
        the PCA variant HT with no LD-pruning if it exists
    :param overwrite: Whether to overwrite the final variant HT
    :param filter_washu: Whether to filter out washU samples
    :return: None
    :raises ValueError: If `ld_pruning` is set and `ld_pruning_dataset` is not one of the supported datasets, or if
        UK Biobank interval filtering is requested without `autosomes_only`
    """
    if not read_pre_ld_prune_ht_checkpoint_if_exists:
        logger.info(
            "Loading gnomAD v3.1.2 release HT and UK Biobank 455K release HT ..."
        )
        flag = "_without_washu" if filter_washu else ""
        gnomad_ht = gnomad_public_release("genomes").ht()
        gnomad_ht = gnomad_ht.select(
            gnomad_was_split=gnomad_ht.was_split,
            gnomad_AC=gnomad_ht.freq[0].AC,
            gnomad_AN=gnomad_ht.freq[0].AN,
            gnomad_genomes_site_inbreeding_coeff=gnomad_ht.info.InbreedingCoeff,
            gnomad_genomes_homozygote_count=gnomad_ht.freq[0].homozygote_count,
        )
        if min_hardy_weinberg_threshold is not None:
            gnomad_ht = gnomad_ht.annotate(
                gnomad_genomes_hwe=hl.hardy_weinberg_test(
                    hl.int32(
                        (gnomad_ht.gnomad_AN / 2)
                        - gnomad_ht.gnomad_genomes_homozygote_count
                        - (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num hom ref genotypes
                    hl.int32(
                        (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num het genotypes
                    gnomad_ht.gnomad_genomes_homozygote_count,  # Num hom alt genotypes
                ),
            )

        ukbb_ht = hl.read_table(ukbb_release_ht_path("broad", 7))
        ukbb_ht = ukbb_ht.select(
            ukbb_AC=ukbb_ht.freq[0].AC,
            ukbb_AN=ukbb_ht.freq[0].AN,
        )
        ukbb_meta_ht = hl.read_table(ukbb_meta_ht_path("broad", 7))

        # Only count samples used in the UK Biobank exome frequency calculations
        ukbb_exome_count = ukbb_meta_ht.filter(
            ukbb_meta_ht.sample_filters.high_quality
            & hl.is_defined(ukbb_meta_ht.ukbb_meta.batch)
            & ~ukbb_meta_ht.sample_filters.related
        ).count()

        logger.info("Getting CCDG genome and exome sample counts...")
        ccdg_genome_count = get_ccdg_vds(
            "genomes", filter_washu=filter_washu
        ).variant_data.count_cols()
        logger.info(f"Number of CCDG genome samples: {ccdg_genome_count}...")
        ccdg_exome_count = get_ccdg_vds("exomes").variant_data.count_cols()
        logger.info(f"Number of CCDG exome samples: {ccdg_exome_count} ...")

        def _initial_filter(data_type):
            """
            Get Table of CCDG variants passing desired filters.

            Possible filters are:
                - Autosomes only
                - SNVs only
                - gnomAD v3.1.2 AC filter
                - CCDG high quality exome intervals
                - UK Biobank high quality exome intervals

            After densification of the VDS, rows are annotated with:
                - ccdg_{data_type}_was_split
                - ccdg_{data_type}_AC
                - ccdg_{data_type}_AN

            The filtered and annotated rows are returned as a Table and are also checkpointed

            :param data_type: Whether data is from genomes or exomes
            :return: Table of CCDG filtered variants
            """
            logger.info(
                "Loading CCDG %s VDS and splitting multi-allelics for initial filtering steps...",
                data_type,
            )
            vds = get_ccdg_vds(data_type, filter_washu=filter_washu)
            logger.info(
                f"{vds.variant_data.count_cols()} CCDG {data_type} samples loaded..."
            )
            vds = hl.vds.split_multi(vds)

            if autosomes_only:
                logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
                vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)

            ht = vds.variant_data.rows()
            # Starts as a plain `True` and is AND-ed with each enabled filter.
            variant_filter_expr = True
            if snv_only:
                logger.info("Filtering CCDG %s VDS to SNVs...", data_type)
                variant_filter_expr &= hl.is_snp(ht.alleles[0], ht.alleles[1])

            if min_gnomad_v3_ac:
                logger.info(
                    "Filtering CCDG %s VDS to gnomAD v3.1.2 variants with adj-filtered AC > %d...",
                    data_type,
                    min_gnomad_v3_ac,
                )
                variant_filter_expr &= gnomad_ht[ht.key].gnomad_AC > min_gnomad_v3_ac

            vds = hl.vds.filter_variants(vds, ht.filter(variant_filter_expr), keep=True)

            if high_qual_ccdg_exome_interval_only:
                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with %dX coverage) CCDG exome intervals...",
                    data_type,
                    INTERVAL_DP,
                )
                interval_qc_ht = hl.read_table(
                    get_ccdg_results_path(
                        data_type="exomes", result=f"intervals_{INTERVAL_DP}x"
                    )
                )
                interval_qc_ht = interval_qc_ht.filter(interval_qc_ht.to_keep)
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            if high_qual_ukbb_exome_interval_only:
                if not autosomes_only:
                    raise ValueError(
                        "UK Biobank interval QC filtering is only available for autosomes!"
                    )

                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with 20X coverage) UK Biobank exome intervals...",
                    data_type,
                )
                interval_qc_ht = hl.read_table(
                    ukbb_interval_qc_path("broad", 7, "autosomes")
                )  # Note: freeze 7 is all included in gnomAD v4
                interval_qc_ht = interval_qc_ht.filter(
                    interval_qc_ht["pct_samples_20x"] > pct_samples_ukbb_exome_interval
                )
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            logger.info("Densifying filtered CCDG %s VDS...", data_type)
            mt = hl.vds.to_dense_mt(vds)
            if adj_only:
                mt = filter_to_adj(mt)

            annotation_expr = {
                f"ccdg_{data_type}_was_split": mt.was_split,
                f"ccdg_{data_type}_AC": hl.agg.sum(mt.GT.n_alt_alleles()),
                f"ccdg_{data_type}_AN": hl.agg.count_where(hl.is_defined(mt.GT)) * 2,
            }

            if min_inbreeding_coeff_threshold is not None:
                annotation_expr[
                    f"ccdg_{data_type}_site_inbreeding_coeff"
                ] = bi_allelic_site_inbreeding_expr(mt.GT)
            if min_hardy_weinberg_threshold is not None:
                annotation_expr[f"ccdg_{data_type}_hwe"] = hl.agg.hardy_weinberg_test(
                    mt.GT
                )

            mt = mt.annotate_rows(**annotation_expr)
            ht = mt.rows().checkpoint(
                get_ccdg_results_path(
                    data_type=data_type,
                    mt=False,
                    result=f"pre_filtered_variants_interval{INTERVAL_DP}x{flag}",
                ),
                overwrite=(not read_per_dataset_checkpoint_if_exists),
                _read_if_exists=read_per_dataset_checkpoint_if_exists,
            )

            return ht

        logger.info(
            "Creating Table with joint gnomAD v3.1.2 and CCDG genome allele frequencies and callrate...",
        )
        ccdg_genomes_ht = _initial_filter("genomes")
        ccdg_exomes_ht = _initial_filter("exomes")
        ht = ccdg_exomes_ht.join(ccdg_genomes_ht, how="inner")
        ht = ht.annotate(**gnomad_ht[ht.key], **ukbb_ht[ht.key])
        ht = ht.annotate(
            joint_biallelic=(~ht.ccdg_genomes_was_split) | (~ht.gnomad_was_split),
            joint_AC=ht.ccdg_genomes_AC + ht.gnomad_AC,
            joint_AN=ht.ccdg_genomes_AN + ht.gnomad_AN,
        )
        total_genome_an = hl.eval(
            (gnomad_ht.freq_sample_count[0] + ccdg_genome_count) * 2
        )
        ht = ht.annotate(
            joint_AF=ht.joint_AC / ht.joint_AN,
            joint_callrate=ht.joint_AN / total_genome_an,
        )
        ht = ht.checkpoint(
            f"{get_joint_pca_variants_ht_path(filter_washu=filter_washu)}",
            overwrite=(not read_pre_ld_prune_ht_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_ht_checkpoint_if_exists,
        )

        logger.info(
            "Filtering variants to combined gnomAD v3.1.2 and CCDG genome AF of %.3f and callrate of %.2f, CCDG exome callrate "
            "of %.2f, and UK Biobank exome callrate of %.2f....",
            min_joint_af,
            min_joint_callrate,
            min_ccdg_exome_callrate,
            min_ukbb_exome_callrate,
        )

        variant_filter_expr = True
        if bi_allelic_only:
            variant_filter_expr &= ht.joint_biallelic
        if min_inbreeding_coeff_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            ) & (
                ht.gnomad_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            )
        if min_hardy_weinberg_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_hwe.p_value > min_hardy_weinberg_threshold
            ) & (ht.gnomad_genomes_hwe.p_value > min_hardy_weinberg_threshold)

        variant_filter_expr &= (
            (ht.joint_AF > min_joint_af)
            & (ht.joint_callrate > min_joint_callrate)
            & (ht.ccdg_exomes_AN / (ccdg_exome_count * 2) > min_ccdg_exome_callrate)
            & (ht.ukbb_AN / (ukbb_exome_count * 2) > min_ukbb_exome_callrate)
        )

        ht = ht.filter(variant_filter_expr)

        # Record the filtering parameters alongside the resulting variants.
        ht = ht.annotate_globals(
            autosomes_only=autosomes_only,
            snv_only=snv_only,
            adj_only=adj_only,
            bi_allelic_only=bi_allelic_only,
            min_gnomad_v3_ac=min_gnomad_v3_ac,
            high_qual_ccdg_exome_interval_only=high_qual_ccdg_exome_interval_only,
            high_qual_ukbb_exome_interval_only=high_qual_ukbb_exome_interval_only,
            filter_lcr=filter_lcr,
            filter_segdup=filter_segdup,
            min_af=min_joint_af,
            min_callrate=min_joint_callrate,
            min_ccdg_exome_callrate=min_ccdg_exome_callrate,
            min_ukbb_exome_callrate=min_ukbb_exome_callrate,
            min_inbreeding_coeff_threshold=min_inbreeding_coeff_threshold,
            min_hardy_weinberg_threshold=min_hardy_weinberg_threshold,
        )

        ht = filter_low_conf_regions(
            ht,
            filter_lcr=filter_lcr,
            filter_decoy=False,  # No decoy for GRCh38
            filter_segdup=filter_segdup,
        )

        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=False, filter_washu=filter_washu),
            overwrite=True,
        )
    else:
        ht = hl.read_table(
            get_pca_variants_path(
                ld_pruned=False, data=ld_pruning_dataset, filter_washu=filter_washu
            )
        )

    if ld_pruning:
        # Whether this is still required?
        logger.warning(
            "The LD-prune step of this function requires non-preemptible workers only!"
        )
        logger.info("Creating Table after LD pruning of %s...", ld_pruning_dataset)
        if ld_pruning_dataset == "ccdg_genomes":
            vds = get_ccdg_vds("genomes")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht, keep=True)
            mt = hl.vds.to_dense_mt(vds)
        elif ld_pruning_dataset == "gnomad_genomes":
            mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
            logger.info("Converting gnomAD v3.1 MatrixTable to VDS...")
            mt = mt.select_entries(
                "END", "LA", "LGT", adj=get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD)
            )
            vds = hl.vds.VariantDataset.from_merged_representation(mt)

            logger.info("Performing split-multi and filtering variants...")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht)
            logger.info("Densifying data...")
            mt = hl.vds.to_dense_mt(vds)
        else:
            # The exception must actually be raised; otherwise execution falls
            # through to the checkpoint below with `mt` unbound.
            raise ValueError(
                "Only options for LD pruning are `ccdg_genomes` and `gnomad_genomes`"
            )

        hl._set_flags(no_whole_stage_codegen="1")
        mt = mt.checkpoint(
            get_pca_variants_path(ld_pruned=False, data=ld_pruning_dataset, mt=True),
            overwrite=(not read_pre_ld_prune_mt_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_mt_checkpoint_if_exists,
        )
        hl._set_flags(no_whole_stage_codegen=None)
        ht = hl.ld_prune(mt.GT, r2=ld_r2)
        ht = ht.annotate_globals(ld_r2=ld_r2, ld_pruning_dataset=ld_pruning_dataset)
        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset),
            overwrite=overwrite,
            _read_if_exists=(not overwrite),
        )
        mt = mt.filter_rows(hl.is_defined(ht[mt.row_key]))
        mt.naive_coalesce(1000).write(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset, mt=True),
            overwrite=overwrite,
        )