def _make_tsm(entry_expr, block_size):
    mt = matrix_table_source('_make_tsm/entry_expr', entry_expr)
    A, ht = mt_to_table_of_ndarray(entry_expr, block_size, return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
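
# A minimal usage sketch of `_make_tsm` (the MatrixTable `mt` and its numeric
# entry field `x` are assumptions for illustration). The returned
# TallSkinnyMatrix wraps a persisted table whose rows each hold one
# (~block_size x n_cols) ndarray block of the full matrix:
#
#     tsm = _make_tsm(mt.x, block_size=128)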


def hwe_normalize(call_expr):
    mt = matrix_table_source('hwe_normalize/call_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                          __n_called=agg.count_where(hl.is_defined(mt.__gt)))
    mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

    n_variants = mt.count_rows()
    if n_variants == 0:
        raise FatalError("hwe_normalize: found 0 variants after filtering out monomorphic sites.")
    info(f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites.")

    mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
    mt = mt.annotate_rows(
        __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2))
    mt = mt.unfilter_entries()

    normalized_gt = hl.or_else((mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0)
    return normalized_gt
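
# `hwe_normalize` returns an entry expression, so it composes directly with
# the PCA entry points in this module. A sketch (assumes a MatrixTable `mt`
# with a call field `GT`); this is essentially what `hwe_normalized_pca`
# does:
#
#     eigenvalues, scores, loadings = hl.pca(hwe_normalize(mt.GT), k=10)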


def mt_to_table_of_ndarray(entry_expr, block_size=16, return_checkpointed_table_also=False):
    check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr)
    mt = matrix_table_source('mt_to_table_of_ndarray/entry_expr', entry_expr)

    if entry_expr in mt._fields_inverse:
        field = mt._fields_inverse[entry_expr]
    else:
        field = Env.get_uid()
        mt = mt.select_entries(**{field: entry_expr})
    mt = mt.select_cols().select_rows().select_globals()
    mt = mt.select_entries(x=mt[field])

    def get_even_partitioning(ht, partition_size, total_num_rows):
        ht = ht.select().add_index("_even_partitioning_index")
        filt = ht.filter((ht._even_partitioning_index % partition_size == 0)
                         | (ht._even_partitioning_index == (total_num_rows - 1)))
        interval_bounds = filt.select().collect()
        intervals = []
        num_intervals = len(interval_bounds)
        for i in range(num_intervals - 2):
            intervals.append(hl.utils.Interval(start=interval_bounds[i],
                                               end=interval_bounds[i + 1],
                                               includes_start=True,
                                               includes_end=False))
        last_interval = hl.utils.Interval(start=interval_bounds[num_intervals - 2],
                                          end=interval_bounds[num_intervals - 1],
                                          includes_start=True,
                                          includes_end=True)
        intervals.append(last_interval)
        return intervals

    ht = mt.localize_entries(entries_array_field_name="entries",
                             columns_array_field_name="cols")
    ht = ht.select(xs=ht.entries.map(lambda e: e['x']))
    temp_file_name = hl.utils.new_temp_file("mt_to_table_of_ndarray", "ht")
    ht = ht.checkpoint(temp_file_name)
    num_rows = ht.count()
    new_partitioning = get_even_partitioning(ht, block_size, num_rows)
    new_part_ht = hl.read_table(temp_file_name, _intervals=new_partitioning)

    grouped = new_part_ht._group_within_partitions("groups", block_size)
    A = grouped.select(ndarray=hl.nd.array(grouped.groups.map(lambda group: group.xs)))

    if return_checkpointed_table_also:
        return A, ht
    return A
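
# Worked example of `get_even_partitioning` above: with 7 rows and
# block_size=3, the filter keeps row indices 0, 3, and 6 (6 is both a
# multiple of 3 and the last row index), yielding the partitions [0, 3)
# and [3, 6] (the last interval is closed, so the final partition holds
# rows 3-6). `_group_within_partitions` then chunks each partition into
# ndarrays of at most block_size rows.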


def _make_tsm_from_call(call_expr, block_size, mean_center=False, hwe_normalize=False):
    mt = matrix_table_source('_make_tsm/entry_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    if mean_center or hwe_normalize:
        mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                              __n_called=agg.count_where(hl.is_defined(mt.__gt)))
        mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

        n_variants = mt.count_rows()
        if n_variants == 0:
            raise FatalError("_make_tsm: found 0 variants after filtering out monomorphic sites.")
        info(f"_make_tsm: found {n_variants} variants after filtering out monomorphic sites.")

        mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
        mt = mt.unfilter_entries()

        mt = mt.select_entries(__x=hl.or_else(mt.__gt - mt.__mean_gt, 0.0))

        if hwe_normalize:
            mt = mt.annotate_rows(
                __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2))
            mt = mt.select_entries(__x=mt.__x / mt.__hwe_scaled_std_dev)
    else:
        mt = mt.select_entries(__x=mt.__gt)

    A, ht = mt_to_table_of_ndarray(mt.__x, block_size, return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
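
# A usage sketch (the MatrixTable `mt` and its call field `GT` are assumed
# for illustration):
#
#     tsm = _make_tsm_from_call(mt.GT, block_size=128, hwe_normalize=True)
#
# With neither `mean_center` nor `hwe_normalize` set, the entries are the
# raw alternate-allele counts from `n_alt_alleles`.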


def pc_relate(call_expr,
              min_individual_maf,
              *,
              k=None,
              scores_expr=None,
              min_kinship=None,
              statistics="all",
              block_size=None,
              include_self_kinship=False) -> Table:
    r"""Compute relatedness estimates between individuals using a variant of
    the PC-Relate method.

    .. include:: ../_templates/req_diploid_gt.rst

    Examples
    --------
    Estimate kinship, identity-by-descent two, identity-by-descent one, and
    identity-by-descent zero for every pair of samples, using a minimum minor
    allele frequency filter of 0.01 and 10 principal components to control
    for population structure.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10)

    Only compute the kinship statistic. This is more efficient than
    computing all statistics.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, statistics='kin')

    Compute all statistics, excluding sample-pairs with kinship less than
    0.1. This is more efficient than producing the full table and then
    filtering using :meth:`.Table.filter`.

    >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, min_kinship=0.1)

    One can also pass in pre-computed principal component scores.
    To produce the same results as in the previous example:

    >>> _, scores_table, _ = hl.hwe_normalized_pca(dataset.GT,
    ...                                            k=10,
    ...                                            compute_loadings=False)
    >>> rel = hl.pc_relate(dataset.GT,
    ...                    0.01,
    ...                    scores_expr=scores_table[dataset.col_key].scores,
    ...                    min_kinship=0.1)

    Notes
    -----
    The traditional estimator for kinship between a pair of individuals
    :math:`i` and :math:`j`, sharing the set :math:`S_{ij}` of
    single-nucleotide variants, from a population with allele frequencies
    :math:`p_s`, is given by:

    .. math::

        \widehat{\phi_{ij}} \coloneqq
            \frac{1}{|S_{ij}|}
            \sum_{s \in S_{ij}}
                \frac{(g_{is} - 2 p_s) (g_{js} - 2 p_s)}
                     {4 \sum_{s \in S_{ij}} p_s (1 - p_s)}

    This estimator is true under the model that the sharing of common
    (relative to the population) alleles is not very informative to
    relatedness (because they're common) and the sharing of rare alleles
    suggests a recent common ancestor from which the allele was inherited by
    descent.

    When multiple ancestry groups are mixed in a sample, this model breaks
    down. Alleles that are rare in all but one ancestry group are treated as
    very informative to relatedness. However, these alleles are simply
    markers of the ancestry group. The PC-Relate method corrects for this
    situation and the related situation of admixed individuals.

    PC-Relate slightly modifies the usual estimator for relatedness:
    occurrences of population allele frequency are replaced with an
    "individual-specific allele frequency". This modification allows the
    method to correctly weight an allele according to an individual's unique
    ancestry profile.

    The "individual-specific allele frequency" at a given genetic locus is
    modeled by PC-Relate as a linear function of a sample's first ``k``
    principal component coordinates. As such, the efficacy of this method
    rests on two assumptions:

     - an individual's first ``k`` principal component coordinates fully
       describe their allele-frequency-relevant ancestry, and

     - the relationship between ancestry (as described by principal
       component coordinates) and population allele frequency is linear

    The estimators for kinship, and identity-by-descent zero, one, and two
    follow.

    Let:

     - :math:`S_{ij}` be the set of genetic loci at which both individuals
       :math:`i` and :math:`j` have a defined genotype

     - :math:`g_{is} \in \{0, 1, 2\}` be the number of alternate alleles
       that individual :math:`i` has at genetic locus :math:`s`

     - :math:`\widehat{\mu_{is}} \in [0, 1]` be the individual-specific
       allele frequency for individual :math:`i` at genetic locus :math:`s`

     - :math:`{\widehat{\sigma^2_{is}}} \coloneqq \widehat{\mu_{is}} (1 -
       \widehat{\mu_{is}})`, the binomial variance of
       :math:`\widehat{\mu_{is}}`

     - :math:`\widehat{\sigma_{is}} \coloneqq \sqrt{\widehat{\sigma^2_{is}}}`,
       the binomial standard deviation of :math:`\widehat{\mu_{is}}`

     - :math:`\text{IBS}^{(0)}_{ij} \coloneqq \sum_{s \in S_{ij}}
       \mathbb{1}_{|g_{is} - g_{js}| = 2}`, the number of genetic loci at
       which individuals :math:`i` and :math:`j` share no alleles

     - :math:`\widehat{f_i} \coloneqq 2 \widehat{\phi_{ii}} - 1`, the
       inbreeding coefficient for individual :math:`i`

     - :math:`g^D_{is}` be a dominance encoding of the genotype matrix, and
       :math:`X_{is}` be a normalized dominance-coded genotype matrix

    .. math::

        g^D_{is} \coloneqq
          \begin{cases}
            \widehat{\mu_{is}}     & g_{is} = 0 \\
            0                      & g_{is} = 1 \\
            1 - \widehat{\mu_{is}} & g_{is} = 2
          \end{cases}

        \qquad
        X_{is} \coloneqq g^D_{is} - \widehat{\sigma^2_{is}} (1 - \widehat{f_i})

    The estimator for kinship is given by:

    .. math::

        \widehat{\phi_{ij}} \coloneqq
            \frac{\sum_{s \in S_{ij}}(g - 2 \mu)_{is} (g - 2 \mu)_{js}}
                 {4 \sum_{s \in S_{ij}}
                       \widehat{\sigma_{is}} \widehat{\sigma_{js}}}

    The estimator for identity-by-descent two is given by:

    .. math::

        \widehat{k^{(2)}_{ij}} \coloneqq
            \frac{\sum_{s \in S_{ij}} X_{is} X_{js}}
                 {\sum_{s \in S_{ij}}
                       \widehat{\sigma^2_{is}} \widehat{\sigma^2_{js}}}

    The estimator for identity-by-descent zero is given by:

    .. math::

        \widehat{k^{(0)}_{ij}} \coloneqq
            \begin{cases}
              \frac{\text{IBS}^{(0)}_{ij}}
                   {\sum_{s \in S_{ij}}
                       \widehat{\mu_{is}}^2 (1 - \widehat{\mu_{js}})^2
                       + (1 - \widehat{\mu_{is}})^2 \widehat{\mu_{js}}^2}
                & \widehat{\phi_{ij}} > 2^{-5/2} \\
              1 - 4 \widehat{\phi_{ij}} + k^{(2)}_{ij}
                & \widehat{\phi_{ij}} \le 2^{-5/2}
            \end{cases}

    The estimator for identity-by-descent one is given by:

    .. math::

        \widehat{k^{(1)}_{ij}} \coloneqq
            1 - \widehat{k^{(2)}_{ij}} - \widehat{k^{(0)}_{ij}}

    Note that, even if present, phase information is ignored by this method.

    The PC-Relate method is described in "Model-free Estimation of Recent
    Genetic Relatedness". Conomos MP, Reiner AP, Weir BS, Thornton TA. in
    American Journal of Human Genetics. 2016 Jan 7. The reference
    implementation is available in the `GENESIS Bioconductor package
    <https://bioconductor.org/packages/release/bioc/html/GENESIS.html>`_ .

    :func:`.pc_relate` differs from the reference implementation in a few
    ways:

     - if `k` is supplied, sample scores are computed via PCA on all
       samples, not a specified subset of genetically unrelated samples. The
       latter can be achieved by filtering samples, computing PCA variant
       loadings, and using these loadings to compute and pass in scores for
       all samples.

     - the estimators do not perform small sample correction

     - the algorithm does not provide an option to use population-wide
       allele frequency estimates

     - the algorithm does not provide an option to not use "overall
       standardization" (see R ``pcrelate`` documentation)

    Under the PC-Relate model, kinship, :math:`\phi_{ij}`, ranges from 0 to
    0.5, and is precisely half of the fraction-of-genetic-material-shared.

    Listed below are the statistics for a few pairings:

     - Monozygotic twins share all their genetic material, so their kinship
       statistic is 0.5 in expectation.

     - Parent-child and sibling pairs both have kinship 0.25 in expectation
       and are separated by the identity-by-descent-zero,
       :math:`k^{(0)}_{ij}`, statistic, which is zero for parent-child pairs
       and 0.25 for sibling pairs.

     - Avuncular pairs and grandparent-grandchild pairs both have kinship
       0.125 in expectation and both have identity-by-descent-zero 0.5 in
       expectation.

     - "Third degree relatives" are those pairs sharing
       :math:`2^{-3} = 12.5 \%` of their genetic material; the results of
       PC-Relate are often too noisy to reliably distinguish these pairs
       from higher-degree-relative-pairs or unrelated pairs.

    Note that :math:`g_{is}` is the number of alternate alleles. Hence, for
    multi-allelic variants, a value of 2 may indicate two distinct
    alternative alleles rather than a homozygous variant genotype. To
    enforce the latter, either filter or split multi-allelic variants first.

    The resulting table has the first 3, 4, 5, or 6 fields below, depending
    on the `statistics` parameter:

     - `i` (``col_key.dtype``) -- First sample. (key field)
     - `j` (``col_key.dtype``) -- Second sample. (key field)
     - `kin` (:py:data:`.tfloat64`) -- Kinship estimate,
       :math:`\widehat{\phi_{ij}}`.
     - `ibd2` (:py:data:`.tfloat64`) -- IBD2 estimate,
       :math:`\widehat{k^{(2)}_{ij}}`.
     - `ibd0` (:py:data:`.tfloat64`) -- IBD0 estimate,
       :math:`\widehat{k^{(0)}_{ij}}`.
     - `ibd1` (:py:data:`.tfloat64`) -- IBD1 estimate,
       :math:`\widehat{k^{(1)}_{ij}}`.

    Here ``col_key`` refers to the column key of the source matrix table,
    and ``col_key.dtype`` is a struct containing the column key fields.

    There is one row for each pair of distinct samples (columns), where `i`
    corresponds to the column of smaller column index. In particular, if the
    same column key value exists for :math:`n` columns, then the resulting
    table will have :math:`\binom{n}{2}` rows with both key fields equal to
    that column key value. This may result in unexpected behavior in
    downstream processing.

    Parameters
    ----------
    call_expr : :class:`.CallExpression`
        Entry-indexed call expression.
    min_individual_maf : :obj:`float`
        The minimum individual-specific minor allele frequency.
        If either individual-specific minor allele frequency for a pair of
        individuals is below this threshold, then the variant will not
        be used to estimate relatedness for the pair.
    k : :obj:`int`, optional
        If set, `k` principal component scores are computed and used.
        Exactly one of `k` and `scores_expr` must be specified.
    scores_expr : :class:`.ArrayNumericExpression`, optional
        Column-indexed expression of principal component scores, with the
        same source as `call_expr`. All array values must have the same
        positive length, corresponding to the number of principal
        components, and all scores must be non-missing. Exactly one of `k`
        and `scores_expr` must be specified.
    min_kinship : :obj:`float`, optional
        If set, pairs of samples with kinship lower than `min_kinship` are
        excluded from the results.
    statistics : :class:`str`
        Set of statistics to compute.
        If ``'kin'``, only estimate the kinship statistic.
        If ``'kin2'``, estimate the above and IBD2.
        If ``'kin20'``, estimate the above and IBD0.
        If ``'all'``, estimate the above and IBD1.
    block_size : :obj:`int`, optional
        Block size of block matrices used in the algorithm.
        Default given by :meth:`.BlockMatrix.default_block_size`.
    include_self_kinship : :obj:`bool`
        If ``True``, include entries for an individual's estimated kinship
        with themselves. Defaults to ``False``.

    Returns
    -------
    :class:`.Table`
        A :class:`.Table` mapping pairs of samples to their pair-wise
        statistics.
    """
    mt = matrix_table_source('pc_relate/call_expr', call_expr)

    if k and scores_expr is None:
        _, scores, _ = hwe_normalized_pca(call_expr, k, compute_loadings=False)
        scores_expr = scores[mt.col_key].scores
    elif not k and scores_expr is not None:
        analyze('pc_relate/scores_expr', scores_expr, mt._col_indices)
    elif k and scores_expr is not None:
        raise ValueError("pc_relate: exactly one of 'k' and 'scores_expr' must be set, found both")
    else:
        raise ValueError("pc_relate: exactly one of 'k' and 'scores_expr' must be set, found neither")

    scores_table = (mt.select_cols(__scores=scores_expr)
                    .key_cols_by()
                    .select_cols('__scores')
                    .cols())

    n_missing = scores_table.aggregate(agg.count_where(hl.is_missing(scores_table.__scores)))
    if n_missing > 0:
        raise ValueError(f'Found {n_missing} columns with missing scores array.')

    mt = mt.select_entries(__gt=call_expr.n_alt_alleles()).unfilter_entries()
    mt = mt.annotate_rows(__mean_gt=agg.mean(mt.__gt))
    mean_imputed_gt = hl.or_else(hl.float64(mt.__gt), mt.__mean_gt)

    if not block_size:
        block_size = BlockMatrix.default_block_size()

    g = BlockMatrix.from_entry_expr(mean_imputed_gt, block_size=block_size)

    pcs = scores_table.collect(_localize=False).map(lambda x: x.__scores)

    ht = Table(ir.BlockMatrixToTableApply(g._bmir, pcs._ir, {
        'name': 'PCRelate',
        'maf': min_individual_maf,
        'blockSize': block_size,
        'minKinship': min_kinship,
        'statistics': {'kin': 0, 'kin2': 1, 'kin20': 2, 'all': 3}[statistics]
    }))

    if statistics == 'kin':
        ht = ht.drop('ibd0', 'ibd1', 'ibd2')
    elif statistics == 'kin2':
        ht = ht.drop('ibd0', 'ibd1')
    elif statistics == 'kin20':
        ht = ht.drop('ibd1')

    if not include_self_kinship:
        ht = ht.filter(ht.i == ht.j, keep=False)

    col_keys = hl.literal(mt.select_cols().key_cols_by().cols().collect(),
                          dtype=tarray(mt.col_key.dtype))
    return ht.key_by(i=col_keys[ht.i], j=col_keys[ht.j])
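
# A common follow-up, sketched here as one possible pattern (not part of
# this function): prune related samples by building a maximal independent
# set over pairs above a kinship threshold, then removing its complement.
#
#     rel = hl.pc_relate(dataset.GT, 0.01, k=10, statistics='kin')
#     pairs = rel.filter(rel.kin > 0.125)
#     related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j,
#                                                            keep=False)
#     result = dataset.filter_cols(
#         hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)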


def _blanczos_pca(entry_expr, k=10, compute_loadings=False, q_iterations=2,
                  oversampling_param=2, block_size=128):
    r"""Run a randomized principal component analysis (PCA) approximation on
    numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype
    entries, compute the top 2 PC sample scores and eigenvalues of the
    matrix of 0s and 1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                           k=2)

    Warning
    -------
    This method does **not** automatically mean-center or normalize each
    column. If desired, such transformations should be incorporated in
    `entry_expr`.

    Hail will return an error if `entry_expr` evaluates to missing, nan, or
    infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the
    rows of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

        M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and
    :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values
    :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and
    values, yielding the best rank :math:`k` approximation
    :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`,
    :math:`S_k` and :math:`V_k` are :math:`n \times k`, :math:`k \times k`
    and :math:`m \times k` respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features.

    The eigenvalues of the Gramian :math:`MM^T` are the squares of the
    singular values :math:`s_1^2, s_2^2, \ldots`, which represent the
    variances carried by the respective PCs.

    Hail only computes the loadings if the ``compute_loadings`` parameter is
    ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>``
    containing the principal component loadings.

    The eigenvalues are returned in descending order, with scores and
    loadings given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular
        values. Usually a value satisfying ``0 <= oversampling_param <= k``.
    block_size : :obj:`int`
        Number of rows grouped into each ndarray block of the intermediate
        representation (see :func:`.mt_to_table_of_ndarray`).

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row
        loadings.
""" check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) A, ht = mt_to_table_of_ndarray(entry_expr, block_size, return_checkpointed_table_also=True) A = A.persist() # Set Parameters q = q_iterations L = k + oversampling_param n = A.take(1)[0].ndarray.shape[1] # Generate random matrix G G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1)) def hailBlanczos(A, G, k, q): h_list = [] G_i = hl.nd.qr(G)[0] for j in range(0, q): info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i) result = temp.aggregate(hl.struct( Hi_chunks=hl.agg.collect(temp.H_i), G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)), _localize=False)._persist() localized_H_i = hl.nd.vstack(result.Hi_chunks) h_list.append(localized_H_i) G_i = hl.nd.qr(result.G_i)[0] info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) result = temp.aggregate(hl.agg.collect(temp.H_i), _localize=False)._persist() info("blanczos_pca: Iterations complete. Computing local QR") localized_H_i = hl.nd.vstack(result) h_list.append(localized_H_i) H = hl.nd.hstack(h_list) Q = hl.nd.qr(H)[0]._persist() A = A.annotate(part_size=A.ndarray.shape[0]) A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size))) A = A.annotate_globals(Qt=Q.T) T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding + A.part_size] @ A.ndarray) arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False) info("blanczos_pca: QR Complete. Computing local SVD") U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist() V = Q @ U truncV = V[:, :k] truncS = S[:k] truncW = W[:k, :] return truncV, truncS, truncW U, S, V = hailBlanczos(A, G, k, q) scores = V.transpose() * S eigens = hl.eval(S * S) info("blanczos_pca: SVD Complete. Computing conversion to PCs.") hail_array_scores = scores._data_array() cols_and_scores = hl.zip( A.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key)) lt = ht.select() lt = lt.annotate_globals(U=U) idx_name = '_tmp_pca_loading_index' lt = lt.add_index(idx_name) lt = lt.annotate( loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() lt = lt.drop(lt[idx_name]) if compute_loadings: return eigens, st, lt else: return eigens, st, None


def pca(entry_expr, k=10, compute_loadings=False) -> Tuple[List[float], Table, Table]:
    r"""Run principal component analysis (PCA) on numeric columns derived
    from a matrix table.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype
    entries, compute the top 2 PC sample scores and eigenvalues of the
    matrix of 0s and 1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl.pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                 k=2)

    Warning
    -------
    This method does **not** automatically mean-center or normalize each
    column. If desired, such transformations should be incorporated in
    `entry_expr`.

    Hail will return an error if `entry_expr` evaluates to missing, nan, or
    infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the
    rows of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

        M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and
    :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values
    :math:`s_1 \ge s_2 \ge \cdots \ge 0`.
    Typically one computes only the first :math:`k` singular vectors and
    values, yielding the best rank :math:`k` approximation
    :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`,
    :math:`S_k` and :math:`V_k` are :math:`n \times k`, :math:`k \times k`
    and :math:`m \times k` respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features.

    The eigenvalues of the Gramian :math:`MM^T` are the squares of the
    singular values :math:`s_1^2, s_2^2, \ldots`, which represent the
    variances carried by the respective PCs.

    Hail only computes the loadings if the ``compute_loadings`` parameter is
    ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing
    the principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>``
    containing the principal component loadings.

    The eigenvalues are returned in descending order, with scores and
    loadings given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row
        loadings.
""" check_entry_indexed('pca/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) # FIXME: remove once select_entries on a field is free if entry_expr in mt._fields_inverse: field = mt._fields_inverse[entry_expr] else: field = Env.get_uid() mt = mt.select_entries(**{field: entry_expr}) mt = mt.select_cols().select_rows().select_globals() t = (Table( ir.MatrixToTableApply( mt._mir, { 'name': 'PCA', 'entryField': field, 'k': k, 'computeLoadings': compute_loadings })).persist()) g = t.index_globals() scores = hl.Table.parallelize(g.scores, key=list(mt.col_key)) if not compute_loadings: t = None return hl.eval(g.eigenvalues), scores, None if t is None else t.drop( 'eigenvalues', 'scores')