def _finish(self, default):
    """Collapse the accumulated cases into one nested ``cond`` expression.

    The entries of ``self._cases`` are folded from last to first, so the
    first entry becomes the outermost conditional and ``default`` is the
    innermost fallback.

    Parameters
    ----------
    default
        Expression used as the final "else" branch of the chain.

    Returns
    -------
    The expression produced by the outermost ``cond`` call.
    """
    assert len(self._cases) > 0

    from hail.expr.functions import cond

    chained = default
    for predicate, outcome in reversed(self._cases):
        chained = cond(predicate,
                       outcome,
                       chained,
                       missing_false=self._missing_false)
    return chained
def hwe_normalized_pca(dataset, k=10, compute_loadings=False, as_array=False):
    """Run principal component analysis (PCA) on the Hardy-Weinberg-normalized
    call matrix.

    Examples
    --------

    >>> eigenvalues, scores, loadings = methods.hwe_normalized_pca(dataset, k=5)

    Notes
    -----
    Variants that are all homozygous reference or all homozygous variant are
    removed before evaluation.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    as_array : :obj:`bool`
        If ``True``, return scores and loadings as an array field. If ``False``,
        return one field per element (`PC1`, `PC2`, ... `PCk`).

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.

    Raises
    ------
    FatalError
        If no variants remain after filtering out monomorphic sites.
    """
    # Per-variant alternate allele count and number of called genotypes.
    dataset = dataset.annotate_rows(
        AC=agg.sum(dataset.GT.num_alt_alleles()),
        n_called=agg.count_where(functions.is_defined(dataset.GT)))

    # Keep only variants whose called genotypes are neither all hom-ref
    # (AC == 0) nor all hom-var (AC == 2 * n_called).
    dataset = dataset.filter_rows(
        (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called)).persist()

    # The persisted dataset must be released on every exit path, including
    # the zero-variant error below and any failure inside `pca`.
    try:
        n_variants = dataset.count_rows()
        if n_variants == 0:
            raise FatalError(
                "Cannot run PCA: found 0 variants after filtering out monomorphic sites."
            )
        info("Running PCA using {} variants.".format(n_variants))

        # With mean_gt = AC / n_called, each called entry becomes
        # (n_alt - mean_gt) / sqrt(mean_gt * (2 - mean_gt) * n_variants / 2);
        # missing genotypes are replaced by 0.
        entry_expr = functions.bind(
            dataset.AC / dataset.n_called,
            lambda mean_gt: functions.cond(
                functions.is_defined(dataset.GT),
                (dataset.GT.num_alt_alleles() - mean_gt) /
                functions.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2),
                0))

        return pca(entry_expr, k, compute_loadings, as_array)
    finally:
        dataset.unpersist()
def f(base):
    # Fold the cases from last to first: the first entry of self._cases ends
    # up as the outermost conditional and `default` as the innermost fallback.
    # NOTE(review): `default`, `self` and `cond` are free names taken from an
    # enclosing scope that is not visible here; `base` is not used in this
    # body.
    chained = default
    for predicate, outcome in reversed(self._cases):
        chained = cond(predicate, outcome, chained)
    return chained
def grm(dataset):
    """Compute the Genetic Relatedness Matrix (GRM).

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    >>> km = methods.grm(dataset)

    Notes
    -----

    The genetic relationship matrix (GRM) :math:`G` encodes genetic correlation
    between each pair of samples. It is defined by :math:`G = MM^T` where
    :math:`M` is a standardized version of the genotype matrix, computed as
    follows. Let :math:`C` be the :math:`n \\times m` matrix of raw genotypes
    in the variant dataset, with rows indexed by :math:`n` samples and columns
    indexed by :math:`m` biallelic autosomal variants; :math:`C_{ij}` is the
    number of alternate alleles of variant :math:`j` carried by sample
    :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`,
    the sample alternate allele frequency :math:`p_j` is computed as half the
    mean of the non-missing entries of column :math:`j`. Entries of :math:`M`
    are then mean-centered and variance-normalized as

    .. math::

        M_{ij} = \\frac{C_{ij}-2p_j}{\\sqrt{2p_j(1-p_j)m}},

    with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype
    imputation). This scaling normalizes genotype variances to a common value
    :math:`1/m` for variants in Hardy-Weinberg equilibrium and is further
    motivated in the paper `Patterson, Price and Reich, 2006
    <http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.0020190>`__.
    (The resulting amplification of signal from the low end of the allele
    frequency spectrum will also introduce noise for rare variants; common
    practice is to filter out variants with minor allele frequency below some
    cutoff.) The factor :math:`1/m` gives each sample row approximately unit
    total variance (assuming linkage equilibrium) so that the diagonal entries
    of the GRM are approximately 1. Equivalently,

    .. math::

        G_{ik} = \\frac{1}{m} \\sum_{j=1}^m \\frac{(C_{ij}-2p_j)(C_{kj}-2p_j)}{2 p_j (1-p_j)}

    Warning
    -------
    Since Hardy-Weinberg normalization cannot be applied to variants that
    contain only reference alleles or only alternate alleles, all such variants
    are removed prior to calculating the GRM.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset to compute the GRM from.

    Returns
    -------
    :class:`genetics.KinshipMatrix`
        Genetic Relatedness Matrix for all samples.

    Raises
    ------
    FatalError
        If no variants remain after filtering out monomorphic sites.
    """
    # Per-variant alternate allele count and number of called genotypes.
    dataset = dataset.annotate_rows(
        AC=agg.sum(dataset.GT.num_alt_alleles()),
        n_called=agg.count_where(functions.is_defined(dataset.GT)))

    # Keep only variants whose called genotypes are neither all hom-ref
    # (AC == 0) nor all hom-var (AC == 2 * n_called).
    dataset = dataset.filter_rows(
        (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called)).persist()

    # Release the persisted dataset on every exit path, including the
    # zero-variant error below and any failure while building the block
    # matrix. On success this happens at the same point as before: right
    # after the block matrix has been created.
    try:
        n_variants = dataset.count_rows()
        if n_variants == 0:
            raise FatalError(
                "Cannot run GRM: found 0 variants after filtering out monomorphic sites."
            )
        info("Computing GRM using {} variants.".format(n_variants))

        # With mean_gt = AC / n_called (i.e. 2 * p_j), each called entry is
        # (n_alt - mean_gt) / sqrt(mean_gt * (2 - mean_gt) * n_variants / 2);
        # missing genotypes are replaced by 0.
        normalized_genotype_expr = functions.bind(
            dataset.AC / dataset.n_called,
            lambda mean_gt: functions.cond(
                functions.is_defined(dataset.GT),
                (dataset.GT.num_alt_alleles() - mean_gt) /
                functions.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2),
                0))

        bm = BlockMatrix.from_matrix_table(normalized_genotype_expr)
    finally:
        dataset.unpersist()

    # NOTE(review): presumably `bm` is variants x samples, so bm.T.dot(bm) is
    # the samples x samples product described in the docstring -- confirm
    # against BlockMatrix.from_matrix_table.
    # Named `kinship` rather than `grm` to avoid shadowing this function.
    kinship = bm.T.dot(bm)

    return KinshipMatrix._from_block_matrix(
        dataset.colkey_schema,
        kinship,
        [row.s for row in dataset.cols_table().select('s').collect()],
        n_variants)