示例#1
0
 def test_hw_func_and_agg_agree(self):
     # The Hardy-Weinberg aggregator applied to GT must give the same result
     # as the scalar Hardy-Weinberg function fed with genotype counts derived
     # from the call_stats aggregation, on every row of the sample VCF.
     dataset = hl.import_vcf(resource('sample.vcf'))
     dataset = dataset.annotate_rows(
         stats=hl.agg.call_stats(dataset.GT, dataset.alleles),
         hw=hl.agg.hardy_weinberg_test(dataset.GT))

     call_stats = dataset.stats
     n_hom_first = call_stats.homozygote_count[0]
     n_hom_second = call_stats.homozygote_count[1]
     # Second-allele count minus two per second-allele homozygote.
     n_het = call_stats.AC[1] - 2 * n_hom_second
     dataset = dataset.annotate_rows(
         hw2=hl.hardy_weinberg_test(n_hom_first, n_het, n_hom_second))

     rows = dataset.rows()
     self.assertTrue(rows.all(rows.hw == rows.hw2))
示例#2
0
def getVariantStats(ipvDict, studyPerVariant, centersPerHomoVus, inList, outList):
    """Build a per-variant summary dictionary from the raw variant records.

    Every key of ``ipvDict`` maps to a record from which the fields 'class',
    'maxFreq', 'cohortFreq', 'aa', 'Aa', 'AA', 'F', 'Z', 'exonic',
    'chisquare', 'homozygous individuals' and 'heterozygous individuals' are
    read.  The result is keyed by the variant key with spaces and single
    quotes removed; ``studyPerVariant`` and ``centersPerHomoVus`` are indexed
    by that cleaned key.

    The 'inGnomad' entry is the string 'True' when the cleaned key is in
    ``inList``, 'False' when it is in ``outList`` and 'NA' otherwise.

    Raises ZeroDivisionError when the three genotype counts of a variant sum
    to zero, and KeyError when a record or lookup table lacks a needed key.
    """
    summary = {}
    for raw_key in ipvDict.keys():
        record = ipvDict[raw_key]

        variant_class = record['class']
        pop_freq = '%.4f' % (record['maxFreq'])
        cohort_freq = '%.4f' % (record['cohortFreq'])

        # Genotype counts are stored as text; the numeric values are parsed
        # back from that text for the frequency and the Hardy-Weinberg test.
        aa_text = str(record['aa'])
        Aa_text = str(record['Aa'])
        AA_text = str(record['AA'])
        f_text = str(record['F'])
        z_text = str(record['Z'])

        n_AA = int(AA_text)
        n_Aa = int(Aa_text)
        n_aa = int(aa_text)
        p = (2 * n_AA + n_Aa) / (2 * (n_AA + n_Aa + n_aa))
        q = 1 - p

        exonic_text = str(record['exonic'])
        chisquare_text = str(record['chisquare'])

        # Only the first listed individual is kept; the literal string "None"
        # marks an empty list.
        homozygous = record['homozygous individuals']
        homo_sample = "None" if len(homozygous) == 0 else homozygous[0]
        heterozygous = record['heterozygous individuals']
        hetero_sample = "None" if len(heterozygous) == 0 else heterozygous[0]

        key = raw_key.replace(' ', '').replace("'", "")
        study = studyPerVariant[key]
        in_gnomad = 'True' if key in inList else ('False' if key in outList else 'NA')

        summary[key] = {
            'class': variant_class,
            'popFreq': pop_freq,
            'cohortFreq': cohort_freq,
            'homozygousSample': homo_sample,
            'heterozygousSample': hetero_sample,
            'inGnomad': in_gnomad,
            'aa': aa_text,
            'Aa': Aa_text,
            'AA': AA_text,
            'hail_hweafp': hl.eval(hl.hardy_weinberg_test(n_AA, n_Aa, n_aa)).p_value,
            'F': f_text,
            'Z': z_text,
            'p': p,
            'q': q,
            'chisquare': chisquare_text,
            'sequenceCenter': str(centersPerHomoVus[key]).replace(" ", ""),
            'exonic': exonic_text,
            'study': study,
        }

    return summary
示例#3
0
 def test_hw_func_and_agg_agree(self):
     # Compare, row by row, the Hardy-Weinberg aggregator computed from GT
     # with the Hardy-Weinberg function computed from call_stats counts.
     vcf_mt = hl.import_vcf(resource('sample.vcf'))
     aggregated = vcf_mt.annotate_rows(
         stats=hl.agg.call_stats(vcf_mt.GT, vcf_mt.alleles),
         hw=hl.agg.hardy_weinberg_test(vcf_mt.GT))

     hom_counts = aggregated.stats.homozygote_count
     het_count = aggregated.stats.AC[1] - 2 * hom_counts[1]
     compared = aggregated.annotate_rows(
         hw2=hl.hardy_weinberg_test(hom_counts[0], het_count, hom_counts[1]))

     row_table = compared.rows()
     self.assertTrue(row_table.all(row_table.hw == row_table.hw2))
示例#4
0
def variant_qc_aggregator(mt) -> hl.expr.StructExpression:
    """:func:`.variant_qc` as an aggregator.

    Builds the variant QC statistics as a single aggregation expression and
    returns that expression instead of annotating ``mt``.

    Parameters
    ----------
    mt
        Dataset whose entry fields (`GT`, and optionally `DP` / `GQ`) and row
        field `alleles` are aggregated over.

    Returns
    -------
    Struct expression
        Contains `dp_stats` / `gq_stats` (only when `mt` has an int32 entry
        field `DP` / `GQ`), every field of ``hl.agg.call_stats``, and
        `call_rate`, `n_called`, `n_not_called`, `n_filtered`, `n_het`,
        `n_non_ref`, `het_freq_hwe` and `p_value_hwe`.  The two Hardy-Weinberg
        fields are missing unless the variant has exactly two alleles.

    Raises
    ------
    ValueError
        If `mt` has no entry field `GT` of type call.
    """
    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')
    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')
    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            "'variant_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    # NOTE(review): both operands below are the same count aggregation, so
    # this difference looks like it is always 0 -- presumably because an
    # aggregator cannot see filtered entries; confirm this is intended.
    n_cols = hl.agg.count()
    bound_exprs['n_filtered'] = hl.int64(n_cols) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)

    # Bind the aggregated counts once (e1), derive the Hardy-Weinberg result
    # from them (hwe), then assemble the final struct from both.
    return hl.rbind(
        hl.struct(**bound_exprs),
        lambda e1: hl.rbind(
            hl.case().when(
                hl.len(mt.alleles) == 2,
                hl.hardy_weinberg_test(
                    e1.call_stats.homozygote_count[0],
                    e1.call_stats.AC[1] - 2 * e1.call_stats.homozygote_count[1],
                    e1.call_stats.homozygote_count[1])
            ).or_missing(),
            lambda hwe: hl.struct(**{
                **gq_dp_exprs,
                **e1.call_stats,
                'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered),
                'n_called': e1.n_called,
                'n_not_called': e1.n_not_called,
                'n_filtered': e1.n_filtered,
                'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value,
            })))
示例#5
0
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    Variant statistics are computed from the genotype data and stored in a
    new row struct field `name`. Which metrics appear depends on the fields
    present in the entry schema.

    The field `dp_stats` is computed when `mt` has an entry field `DP` of
    type :py:data:`.tint32`, and the field `gq_stats` is computed when `mt`
    has an entry field `GQ` of type :py:data:`.tint32`. Both `dp_stats` and
    `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    An error is raised if the dataset has no entry field `GT` of type
    :py:data:`.tcall`. The following fields are always computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `call_rate` (``float32``) -- Fraction of samples with a defined `GT`.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    def entry_field_has_type(field, dtype):
        return field in mt.entry and mt[field].dtype == dtype

    # The column count is taken up front; the derived metrics below divide by
    # it and subtract from it.
    n_samples = mt.count_cols()

    stat_fields = ('mean', 'stdev', 'min', 'max')
    scalar_exprs = {}
    if entry_field_has_type('DP', hl.tint32):
        scalar_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(*stat_fields)
    if entry_field_has_type('GQ', hl.tint32):
        scalar_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(*stat_fields)

    if not entry_field_has_type('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")
    scalar_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))

    # Struct-valued aggregations whose fields are lifted to the top level of
    # the result; further nested computations can be appended to this list.
    nested_exprs = [hl.agg.call_stats(mt.GT, mt.alleles)]

    def merge_into_struct(*structs):
        lifted = {key: value for struct in structs for key, value in struct.items()}
        return hl.struct(**lifted, **scalar_exprs)

    mt = mt.annotate_rows(**{name: hl.bind(merge_into_struct, *nested_exprs)})

    # Second pass: metrics derived from the fields annotated above.
    qc = mt[name]
    hwe = hl.hardy_weinberg_test(
        qc.homozygote_count[0],
        qc.AC[1] - 2 * qc.homozygote_count[1],
        qc.homozygote_count[1])
    hwe = hwe.select(het_freq_hwe=hwe.het_freq_hwe, p_value_hwe=hwe.p_value)

    derived = qc.annotate(
        n_not_called=n_samples - qc.n_called,
        call_rate=qc.n_called / n_samples,
        n_het=qc.n_called - hl.sum(qc.homozygote_count),
        n_non_ref=qc.n_called - qc.homozygote_count[0],
        # Hardy-Weinberg fields are kept only for biallelic variants.
        **hl.cond(hl.len(mt.alleles) == 2, hwe, hl.null(hwe.dtype)))
    mt = mt.annotate_rows(**{name: derived})
    return mt
示例#6
0
文件: qc.py 项目: troels/hail
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    Variant statistics are computed from the genotype data and stored in a
    new row struct field `name`. Which metrics appear depends on the fields
    present in the entry schema.

    The field `dp_stats` is computed when `mt` has an entry field `DP` of
    type :py:data:`.tint32`, and the field `gq_stats` is computed when `mt`
    has an entry field `GQ` of type :py:data:`.tint32`. Both `dp_stats` and
    `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    An error is raised if the dataset has no entry field `GT` of type
    :py:data:`.tcall`. The following fields are always computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    def entry_field_has_type(field, dtype):
        return field in mt.entry and mt[field].dtype == dtype

    # Optional summary statistics, present only for int32 DP / GQ fields.
    depth_quality_stats = {}
    if entry_field_has_type('DP', hl.tint32):
        depth_quality_stats['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')
    if entry_field_has_type('GQ', hl.tint32):
        depth_quality_stats['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')

    if not entry_field_has_type('GT', hl.tcall):
        raise ValueError(
            "'variant_qc': expect an entry field 'GT' of type 'call'")

    # Aggregations that are bound once and reused by the derived metrics.
    aggregated = hl.struct(
        n_called=hl.agg.count_where(hl.is_defined(mt['GT'])),
        n_not_called=hl.agg.count_where(hl.is_missing(mt['GT'])),
        n_filtered=mt.count_cols(_localize=False) - hl.agg.count(),
        call_stats=hl.agg.call_stats(mt.GT, mt.alleles),
    )

    def hwe_if_biallelic(counts):
        # Hardy-Weinberg result for two-allele variants, missing otherwise.
        return hl.case().when(
            hl.len(mt.alleles) == 2,
            hl.hardy_weinberg_test(
                counts.call_stats.homozygote_count[0],
                counts.call_stats.AC[1] - 2 * counts.call_stats.homozygote_count[1],
                counts.call_stats.homozygote_count[1])
        ).or_missing()

    def assemble(counts, hwe):
        # Final struct: optional stats, call_stats fields, derived metrics.
        return hl.struct(**{
            **depth_quality_stats,
            **counts.call_stats,
            'call_rate': hl.float(counts.n_called) / (
                counts.n_called + counts.n_not_called + counts.n_filtered),
            'n_called': counts.n_called,
            'n_not_called': counts.n_not_called,
            'n_filtered': counts.n_filtered,
            'n_het': counts.n_called - hl.sum(counts.call_stats.homozygote_count),
            'n_non_ref': counts.n_called - counts.call_stats.homozygote_count[0],
            'het_freq_hwe': hwe.het_freq_hwe,
            'p_value_hwe': hwe.p_value,
        })

    result = hl.rbind(
        aggregated,
        lambda counts: hl.rbind(
            hwe_if_biallelic(counts),
            lambda hwe: assemble(counts, hwe)))

    return mt.annotate_rows(**{name: result})
 def hailHWTest(AA, Aa, aa):
     # Evaluate Hail's Hardy-Weinberg test on the three genotype counts and
     # return only the p-value of the result.
     hwe_result = hail.eval(hail.hardy_weinberg_test(AA, Aa, aa))
     return hwe_result.p_value
示例#8
0
文件: qc.py 项目: jigold/hail
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    Variant statistics are computed from the genotype data and stored in a
    new row struct field `name`. Which metrics appear depends on the fields
    present in the entry schema.

    The field `dp_stats` is computed when `mt` has an entry field `DP` of
    type :py:data:`.tint32`, and the field `gq_stats` is computed when `mt`
    has an entry field `GQ` of type :py:data:`.tint32`. Both `dp_stats` and
    `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    An error is raised if the dataset has no entry field `GT` of type
    :py:data:`.tcall`. The following fields are always computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    def has_entry_field(field, dtype):
        return field in mt.entry and mt[field].dtype == dtype

    summary_fields = ('mean', 'stdev', 'min', 'max')

    # Optional DP / GQ summaries, added only when the int32 field exists.
    optional_stats = {}
    if has_entry_field('DP', hl.tint32):
        optional_stats['dp_stats'] = hl.agg.stats(mt.DP).select(*summary_fields)
    if has_entry_field('GQ', hl.tint32):
        optional_stats['gq_stats'] = hl.agg.stats(mt.GQ).select(*summary_fields)

    if not has_entry_field('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")

    genotype = mt['GT']
    aggregations = {
        'n_called': hl.agg.count_where(hl.is_defined(genotype)),
        'n_not_called': hl.agg.count_where(hl.is_missing(genotype)),
        'n_filtered': mt.count_cols(_localize=False) - hl.agg.count(),
        'call_stats': hl.agg.call_stats(mt.GT, mt.alleles),
    }

    def summarize(agg):
        # `agg` is the bound struct of aggregations defined above.
        stats = agg.call_stats

        def finish(hwe):
            # `hwe` is the Hardy-Weinberg result, missing unless biallelic.
            return hl.struct(**{
                **optional_stats,
                **stats,
                'call_rate': hl.float(agg.n_called) / (agg.n_called + agg.n_not_called + agg.n_filtered),
                'n_called': agg.n_called,
                'n_not_called': agg.n_not_called,
                'n_filtered': agg.n_filtered,
                'n_het': agg.n_called - hl.sum(stats.homozygote_count),
                'n_non_ref': agg.n_called - stats.homozygote_count[0],
                'het_freq_hwe': hwe.het_freq_hwe,
                'p_value_hwe': hwe.p_value,
            })

        hwe_expr = hl.case().when(
            hl.len(mt.alleles) == 2,
            hl.hardy_weinberg_test(
                stats.homozygote_count[0],
                stats.AC[1] - 2 * stats.homozygote_count[1],
                stats.homozygote_count[1])
        ).or_missing()
        return hl.rbind(hwe_expr, finish)

    result = hl.rbind(hl.struct(**aggregations), summarize)

    return mt.annotate_rows(**{name: result})
示例#9
0
def determine_pca_variants(
    autosomes_only: bool = True,
    snv_only: bool = True,
    bi_allelic_only: bool = False,
    adj_only: bool = True,
    min_gnomad_v3_ac: Optional[int] = None,
    high_qual_ccdg_exome_interval_only: bool = False,
    high_qual_ukbb_exome_interval_only: bool = False,
    pct_samples_ukbb_exome_interval: float = 0.8,
    min_joint_af: float = 0.0001,  # TODO: Konrad mentioned that he might want to lower this
    min_joint_callrate: float = 0.95,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    min_ccdg_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    min_ukbb_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    filter_lcr: bool = True,
    filter_segdup: bool = True,
    ld_pruning: bool = True,
    ld_pruning_dataset: str = "ccdg_genomes",
    ld_r2: float = 0.1,
    read_per_dataset_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_ht_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_mt_checkpoint_if_exists: bool = False,
    overwrite: bool = True,
    filter_washu: bool = False,
) -> None:
    """
    Determine a diverse set of variants for relatedness/ancestry PCA using CCDG, gnomAD v3, and UK Biobank.

    :param autosomes_only: Whether to filter to variants in autosomes
    :param snv_only: Whether to filter to SNVs
    :param bi_allelic_only: Whether to filter to variants that are bi-allelic in either CCDG and gnomAD v3
    :param adj_only: If set, only ADJ genotypes (QD >= 2, FS <= 60 and MQ >= 30) are kept. This filter is applied before the call rate and AF calculation
    :param min_gnomad_v3_ac: Optional lower bound of AC for variants in gnomAD v3 genomes
    :param high_qual_ccdg_exome_interval_only: Whether to filter to high quality intervals in CCDG exomes
    :param float pct_samples_ukbb_exome_interval: Percent of samples with over 80% of bases having coverage of over 20x per interval
    :param high_qual_ukbb_exome_interval_only: Whether to filter to high quality intervals in UKBB 455K exomes
    :param float pct_samples_ukbb: Percent of samples with coverage greater than 20x over the interval for filtering
    :param min_joint_af: Lower bound for combined MAF computed from CCDG and gnomAD v3 genomes
    :param min_joint_callrate: Lower bound for combined callrate computed from CCDG and gnomAD v3 genomes
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to `None`
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to `None`
    :param min_ccdg_exome_callrate: Lower bound for CCDG exomes callrate
    :param min_ukbb_exome_callrate: Lower bound for UKBB exomes callrate
    :param filter_lcr: Whether to filter LCR regions
    :param filter_segdup: Whether to filter Segdup regions
    :param ld_pruning: Whether to conduct LD pruning
    :param ld_pruning_dataset: Which dataset is used for LD pruning, 'ccdg_genomes' or 'gnomAD_genomes'
    :param ld_r2: LD pruning cutoff
    :param read_per_dataset_checkpoint_if_exists: Whether to read the CCDG exome/genome pre filtered HT if it exists.
        Each dataset possible filtered to: autosomes only, SNVs only, gnomAD v3.1.2 AC filter, CCDG high quality exome
        intervals, and UK Biobank high quality exome intervals
    :param read_pre_ld_prune_ht_checkpoint_if_exists: Whether to read in the PCA variant HT with no LD-pruning if it exists
    :param read_pre_ld_prune_mt_checkpoint_if_exists: Whether to read in the checkpointed MT filtered to variants in the
        PCA variant HT with no LD-pruning if it exists
    :param overwrite: Whether to overwrite the final variant HT
    :param filter_washu: Whether to filter out washU samples
    :return: Table with desired variants for PCA
    """
    if not read_pre_ld_prune_ht_checkpoint_if_exists:
        logger.info(
            "Loading gnomAD v3.1.2 release HT and UK Biobank 455K release HT ..."
        )
        flag = "_without_washu" if filter_washu else ""
        gnomad_ht = gnomad_public_release("genomes").ht()
        gnomad_ht = gnomad_ht.select(
            gnomad_was_split=gnomad_ht.was_split,
            gnomad_AC=gnomad_ht.freq[0].AC,
            gnomad_AN=gnomad_ht.freq[0].AN,
            gnomad_genomes_site_inbreeding_coeff=gnomad_ht.info.InbreedingCoeff,
            gnomad_genomes_homozygote_count=gnomad_ht.freq[0].homozygote_count,
        )
        if min_hardy_weinberg_threshold is not None:
            gnomad_ht = gnomad_ht.annotate(
                gnomad_genomes_hwe=hl.hardy_weinberg_test(
                    hl.int32(
                        (gnomad_ht.gnomad_AN / 2)
                        - gnomad_ht.gnomad_genomes_homozygote_count
                        - (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num hom ref genotypes
                    hl.int32(
                        (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num het genotypes
                    gnomad_ht.gnomad_genomes_homozygote_count,  # Num hom alt genotypes
                ),
            )

        ukbb_ht = hl.read_table(ukbb_release_ht_path("broad", 7))
        ukbb_ht = ukbb_ht.select(
            ukbb_AC=ukbb_ht.freq[0].AC,
            ukbb_AN=ukbb_ht.freq[0].AN,
        )
        ukbb_meta_ht = hl.read_table(ukbb_meta_ht_path("broad", 7))

        # Only count samples used in the UK Biobank exome frequency calculations
        ukbb_exome_count = ukbb_meta_ht.filter(
            ukbb_meta_ht.sample_filters.high_quality
            & hl.is_defined(ukbb_meta_ht.ukbb_meta.batch)
            & ~ukbb_meta_ht.sample_filters.related
        ).count()

        logger.info("Getting CCDG genome and exome sample counts...")
        ccdg_genome_count = get_ccdg_vds(
            "genomes", filter_washu=filter_washu
        ).variant_data.count_cols()
        logger.info(f"Number of CCDG genome samples: {ccdg_genome_count}...")
        ccdg_exome_count = get_ccdg_vds("exomes").variant_data.count_cols()
        logger.info(f"Number of CCDG exome samples: {ccdg_exome_count} ...")

        def _initial_filter(data_type):
            """
            Get Table of CCDG variants passing desired filters.

            Possible filters are:
                - Autosomes only
                - SNVs only
                - gnomAD v3.1.2 AC filter
                - CCDG high quality exome intervals
                - UK Biobank high quality exome intervals

            After densification of the VDS, rows are annotated with:
                - ccdg_{data_type}_was_split
                - ccdg_{data_type}_AC
                - ccdg_{data_type}_AN

            The filtered and annotated rows are returned as a Table and are also checkpointed
            :param data_type: Whether data is from genomes or exomes

            :return: Table of CCDG filtered variants
            """
            logger.info(
                "Loading CCDG %s VDS and splitting multi-allelics for initial filtering steps...",
                data_type,
            )
            vds = get_ccdg_vds(data_type, filter_washu=filter_washu)
            logger.info(
                f"{vds.variant_data.count_cols()} CCDG {data_type} samples loaded..."
            )
            vds = hl.vds.split_multi(vds)

            if autosomes_only:
                logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
                vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)

            ht = vds.variant_data.rows()
            variant_filter_expr = True
            if snv_only:
                logger.info("Filtering CCDG %s VDS to SNVs...", data_type)
                variant_filter_expr &= hl.is_snp(ht.alleles[0], ht.alleles[1])

            if min_gnomad_v3_ac:
                logger.info(
                    "Filtering CCDG %s VDS to gnomAD v3.1.2 variants with adj-filtered AC > %d...",
                    data_type,
                    min_gnomad_v3_ac,
                )
                variant_filter_expr &= gnomad_ht[ht.key].gnomad_AC > min_gnomad_v3_ac

            vds = hl.vds.filter_variants(vds, ht.filter(variant_filter_expr), keep=True)

            if high_qual_ccdg_exome_interval_only:
                logger.info(
                    f"Filtering CCDG %s VDS to high quality (>80%% of samples with %dX coverage) CCDG exome intervals...",
                    data_type,
                    INTERVAL_DP,
                )
                interval_qc_ht = hl.read_table(
                    get_ccdg_results_path(
                        data_type="exomes", result=f"intervals_{INTERVAL_DP}x"
                    )
                )
                interval_qc_ht = interval_qc_ht.filter(interval_qc_ht.to_keep)
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            if high_qual_ukbb_exome_interval_only:
                if not autosomes_only:
                    raise ValueError(
                        "UK Biobank interval QC filtering is only available for autosomes!"
                    )

                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with 20X coverage) UK Biobank exome intervals...",
                    data_type,
                )
                interval_qc_ht = hl.read_table(
                    ukbb_interval_qc_path("broad", 7, "autosomes")
                )  # Note: freeze 7 is all included in gnomAD v4
                interval_qc_ht = interval_qc_ht.filter(
                    interval_qc_ht["pct_samples_20x"] > pct_samples_ukbb_exome_interval
                )
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            logger.info("Densifying filtered CCDG %s VDS...", data_type)
            mt = hl.vds.to_dense_mt(vds)
            if adj_only:
                mt = filter_to_adj(mt)

            annotation_expr = {
                f"ccdg_{data_type}_was_split": mt.was_split,
                f"ccdg_{data_type}_AC": hl.agg.sum(mt.GT.n_alt_alleles()),
                f"ccdg_{data_type}_AN": hl.agg.count_where(hl.is_defined(mt.GT)) * 2,
            }

            if min_inbreeding_coeff_threshold is not None:
                annotation_expr[
                    f"ccdg_{data_type}_site_inbreeding_coeff"
                ] = bi_allelic_site_inbreeding_expr(mt.GT)
            if min_hardy_weinberg_threshold is not None:
                annotation_expr[f"ccdg_{data_type}_hwe"] = hl.agg.hardy_weinberg_test(
                    mt.GT
                )

            mt = mt.annotate_rows(**annotation_expr)
            ht = mt.rows().checkpoint(
                get_ccdg_results_path(
                    data_type=data_type,
                    mt=False,
                    result=f"pre_filtered_variants_interval{INTERVAL_DP}x{flag}",
                ),
                overwrite=(not read_per_dataset_checkpoint_if_exists),
                _read_if_exists=read_per_dataset_checkpoint_if_exists,
            )

            return ht

        logger.info(
            "Creating Table with joint gnomAD v3.1.2 and CCDG genome allele frequencies and callrate...",
        )
        ccdg_genomes_ht = _initial_filter("genomes")
        ccdg_exomes_ht = _initial_filter("exomes")
        ht = ccdg_exomes_ht.join(ccdg_genomes_ht, how="inner")
        ht = ht.annotate(**gnomad_ht[ht.key], **ukbb_ht[ht.key])
        ht = ht.annotate(
            joint_biallelic=(~ht.ccdg_genomes_was_split) | (~ht.gnomad_was_split),
            joint_AC=ht.ccdg_genomes_AC + ht.gnomad_AC,
            joint_AN=ht.ccdg_genomes_AN + ht.gnomad_AN,
        )
        total_genome_an = hl.eval(
            (gnomad_ht.freq_sample_count[0] + ccdg_genome_count) * 2
        )
        ht = ht.annotate(
            joint_AF=ht.joint_AC / ht.joint_AN,
            joint_callrate=ht.joint_AN / total_genome_an,
        )
        ht = ht.checkpoint(
            f"{get_joint_pca_variants_ht_path(filter_washu=filter_washu)}",
            overwrite=(not read_pre_ld_prune_ht_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_ht_checkpoint_if_exists,
        )

        logger.info(
            "Filtering variants to combined gnomAD v3.1.2 and CCDG genome AF of %.3f and callrate of %.2f, CCDG exome callrate "
            "of %.2f, and UK Biobank exome callrate of %.2f....",
            min_joint_af,
            min_joint_callrate,
            min_ccdg_exome_callrate,
            min_ukbb_exome_callrate,
        )

        variant_filter_expr = True
        if bi_allelic_only:
            variant_filter_expr &= ht.joint_biallelic
        if min_inbreeding_coeff_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            ) & (
                ht.gnomad_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            )
        if min_hardy_weinberg_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_hwe.p_value > min_hardy_weinberg_threshold
            ) & (ht.gnomad_genomes_hwe.p_value > min_hardy_weinberg_threshold)

        variant_filter_expr &= (
            (ht.joint_AF > min_joint_af)
            & (ht.joint_callrate > min_joint_callrate)
            & (ht.ccdg_exomes_AN / (ccdg_exome_count * 2) > min_ccdg_exome_callrate)
            & (ht.ukbb_AN / (ukbb_exome_count * 2) > min_ukbb_exome_callrate)
        )

        ht = ht.filter(variant_filter_expr)

        ht = ht.annotate_globals(
            autosomes_only=autosomes_only,
            snv_only=snv_only,
            adj_only=adj_only,
            bi_allelic_only=bi_allelic_only,
            min_gnomad_v3_ac=min_gnomad_v3_ac,
            high_qual_ccdg_exome_interval_only=high_qual_ccdg_exome_interval_only,
            high_qual_ukbb_exome_interval_only=high_qual_ukbb_exome_interval_only,
            filter_lcr=filter_lcr,
            filter_segdup=filter_segdup,
            min_af=min_joint_af,
            min_callrate=min_joint_callrate,
            min_ccdg_exome_callrate=min_ccdg_exome_callrate,
            min_ukbb_exome_callrate=min_ukbb_exome_callrate,
            min_inbreeding_coeff_threshold=min_inbreeding_coeff_threshold,
            min_hardy_weinberg_threshold=min_hardy_weinberg_threshold,
        )

        ht = filter_low_conf_regions(
            ht,
            filter_lcr=filter_lcr,
            filter_decoy=False,  # No decoy for GRCh38
            filter_segdup=filter_segdup,
        )

        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=False, filter_washu=filter_washu),
            overwrite=True,
        )
    else:
        ht = hl.read_table(
            get_pca_variants_path(
                ld_pruned=False, data=ld_pruning_dataset, filter_washu=filter_washu
            )
        )

    if ld_pruning:
        # TODO: Confirm whether non-preemptible workers are still required for the LD-prune step.
        logger.warning(
            "The LD-prune step of this function requires non-preemptible workers only!"
        )
        logger.info("Creating Table after LD pruning of %s...", ld_pruning_dataset)
        # Build the dense MatrixTable used for LD pruning from the selected
        # dataset, keeping only the variants present in `ht`.
        if ld_pruning_dataset == "ccdg_genomes":
            vds = get_ccdg_vds("genomes")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht, keep=True)
            mt = hl.vds.to_dense_mt(vds)
        elif ld_pruning_dataset == "gnomad_genomes":
            mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
            logger.info("Converting gnomAD v3.1 MatrixTable to VDS...")
            # Keep only the END/LA/LGT entry fields and add an `adj` annotation
            # before converting the MatrixTable into a VariantDataset.
            mt = mt.select_entries(
                "END", "LA", "LGT", adj=get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD)
            )
            vds = hl.vds.VariantDataset.from_merged_representation(mt)

            logger.info("Performing split-multi and filtering variants...")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht)

            logger.info("Densifying data...")
            mt = hl.vds.to_dense_mt(vds)
        else:
            # The exception has to be raised, not just constructed; otherwise
            # execution falls through and fails later on the unbound `mt`.
            raise ValueError(
                "Only options for LD pruning are `ccdg_genomes` and `gnomad_genomes`"
            )

        hl._set_flags(no_whole_stage_codegen="1")
        mt = mt.checkpoint(
            get_pca_variants_path(ld_pruned=False, data=ld_pruning_dataset, mt=True),
            overwrite=(not read_pre_ld_prune_mt_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_mt_checkpoint_if_exists,
        )
        hl._set_flags(no_whole_stage_codegen=None)
        ht = hl.ld_prune(mt.GT, r2=ld_r2)
        ht = ht.annotate_globals(ld_r2=ld_r2, ld_pruning_dataset=ld_pruning_dataset)
        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset),
            overwrite=overwrite,
            _read_if_exists=(not overwrite),
        )
        mt = mt.filter_rows(hl.is_defined(ht[mt.row_key]))
        mt.naive_coalesce(1000).write(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset, mt=True),
            overwrite=overwrite,
        )