def _spectral_moments(A, num_moments, p=None, moment_samples=500, block_size=128): if not isinstance(A, TallSkinnyMatrix): check_entry_indexed('_spectral_moments/entry_expr', A) A = _make_tsm_from_call(A, block_size) n = A.ncols if p is None: p = min(num_moments // 2, 10) # TODO: When moment_samples > n, we should just do a TSQR on A, and compute # the spectrum of R. assert moment_samples < n, '_spectral_moments: moment_samples must be smaller than num cols of A' G = hl.nd.zeros( (n, moment_samples)).map(lambda n: hl.if_else(hl.rand_bool(0.5), -1, 1)) Q1, R1 = hl.nd.qr(G)._persist() fact = _krylov_factorization(A, Q1, p, compute_U=False) moments_and_stdevs = hl.eval(fact.spectral_moments(num_moments, R1)) moments = moments_and_stdevs.moments stdevs = moments_and_stdevs.stdevs return moments, stdevs
def mt_to_table_of_ndarray(entry_expr, block_size=16, return_checkpointed_table_also=False): check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr) mt = matrix_table_source('mt_to_table_of_ndarray/entry_expr', entry_expr) if entry_expr in mt._fields_inverse: field = mt._fields_inverse[entry_expr] else: field = Env.get_uid() mt = mt.select_entries(**{field: entry_expr}) mt = mt.select_cols().select_rows().select_globals() mt = mt.select_entries(x=mt[field]) def get_even_partitioning(ht, partition_size, total_num_rows): ht = ht.select().add_index("_even_partitioning_index") filt = ht.filter((ht._even_partitioning_index % partition_size == 0) | (ht._even_partitioning_index == (total_num_rows - 1))) interval_bounds = filt.select().collect() intervals = [] num_intervals = len(interval_bounds) for i in range(num_intervals - 2): intervals.append( hl.utils.Interval(start=interval_bounds[i], end=interval_bounds[i + 1], includes_start=True, includes_end=False)) last_interval = hl.utils.Interval( start=interval_bounds[num_intervals - 2], end=interval_bounds[num_intervals - 1], includes_start=True, includes_end=True) intervals.append(last_interval) return intervals ht = mt.localize_entries(entries_array_field_name="entries", columns_array_field_name="cols") ht = ht.select(xs=ht.entries.map(lambda e: e['x'])) temp_file_name = hl.utils.new_temp_file("mt_to_table_of_ndarray", "ht") ht = ht.checkpoint(temp_file_name) num_rows = ht.count() new_partitioning = get_even_partitioning(ht, block_size, num_rows) new_part_ht = hl.read_table(temp_file_name, _intervals=new_partitioning) grouped = new_part_ht._group_within_partitions("groups", block_size) A = grouped.select( ndarray=hl.nd.array(grouped.groups.map(lambda group: group.xs))) if return_checkpointed_table_also: return A, ht return A
def _hwe_normalized_blanczos(call_expr, k=10, compute_loadings=False, q_iterations=2, oversampling_param=2, block_size=128): r"""Run randomized principal component analysis approximation (PCA) on the Hardy-Weinberg-normalized genotype call matrix. Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert. Examples -------- >>> eigenvalues, scores, loadings = hl._hwe_normalized_blanczos(dataset.GT, k=5) Notes ----- This method specializes :func:`._blanczos_pca` for the common use case of PCA in statistical genetics, that of projecting samples to a small number of ancestry coordinates. Variants that are all homozygous reference or all homozygous alternate are unnormalizable and removed before evaluation. See :func:`._blanczos_pca` for more details. Parameters ---------- call_expr : :class:`.CallExpression` Entry-indexed call expression. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` If ``True``, compute row loadings. Returns ------- (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. """ check_entry_indexed('_blanczos_pca/entry_expr', call_expr) A = _make_tsm_from_call(call_expr, block_size, hwe_normalize=True) return _blanczos_pca(A, k, compute_loadings=compute_loadings, q_iterations=q_iterations, oversampling_param=oversampling_param, block_size=block_size)
def _blanczos_pca(entry_expr, k=10, compute_loadings=False, q_iterations=2, oversampling_param=2, block_size=128): r"""Run randomized principal component analysis approximation (PCA) on numeric columns derived from a matrix table. Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert. Examples -------- For a matrix table with variant rows, sample columns, and genotype entries, compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and 1s encoding missingness of genotype calls. >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)), ... k=2) Warning ------- This method does **not** automatically mean-center or normalize each column. If desired, such transformations should be incorporated in `entry_expr`. Hail will return an error if `entry_expr` evaluates to missing, nan, or infinity on any entry. Notes ----- PCA is run on the columns of the numeric matrix obtained by evaluating `entry_expr` on each entry of the matrix table, or equivalently on the rows of the **transposed** numeric matrix :math:`M` referenced below. PCA computes the SVD .. math:: M = USV^T where columns of :math:`U` are left singular vectors (orthonormal in :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first :math:`k` singular vectors and values, yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \times k`, :math:`k \times k` and :math:`m \times k` respectively. From the perspective of the rows of :math:`M` as samples (data points), :math:`V_k` contains the loadings for the first :math:`k` PCs while :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each sample. The loadings represent a new basis of features while the scores represent the projected data on those features. The eigenvalues of the Gramian :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the respective PCs. By default, Hail only computes the loadings if the ``loadings`` parameter is specified. Scores are stored in a :class:`.Table` with the column key of the matrix table as key and a field `scores` of type ``array<float64>`` containing the principal component scores. Loadings are stored in a :class:`.Table` with the row key of the matrix table as key and a field `loadings` of type ``array<float64>`` containing the principal component loadings. The eigenvalues are returned in descending order, with scores and loadings given the corresponding array order. Parameters ---------- entry_expr : :class:`.Expression` Numeric expression for matrix entries. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` If ``True``, compute row loadings. q_iterations : :obj:`int` Number of rounds of power iteration to amplify singular values. oversampling_param : :obj:`int` Amount of oversampling to use when approximating the singular values. Usually a value between `0 <= oversampling_param <= k`. Returns ------- (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. """ check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) A, ht = mt_to_table_of_ndarray(entry_expr, block_size, return_checkpointed_table_also=True) A = A.persist() # Set Parameters q = q_iterations L = k + oversampling_param n = A.take(1)[0].ndarray.shape[1] # Generate random matrix G G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1)) def hailBlanczos(A, G, k, q): h_list = [] G_i = hl.nd.qr(G)[0] for j in range(0, q): info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i) result = temp.aggregate(hl.struct( Hi_chunks=hl.agg.collect(temp.H_i), G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)), _localize=False)._persist() localized_H_i = hl.nd.vstack(result.Hi_chunks) h_list.append(localized_H_i) G_i = hl.nd.qr(result.G_i)[0] info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) result = temp.aggregate(hl.agg.collect(temp.H_i), _localize=False)._persist() info("blanczos_pca: Iterations complete. Computing local QR") localized_H_i = hl.nd.vstack(result) h_list.append(localized_H_i) H = hl.nd.hstack(h_list) Q = hl.nd.qr(H)[0]._persist() A = A.annotate(part_size=A.ndarray.shape[0]) A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size))) A = A.annotate_globals(Qt=Q.T) T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding + A.part_size] @ A.ndarray) arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False) info("blanczos_pca: QR Complete. Computing local SVD") U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist() V = Q @ U truncV = V[:, :k] truncS = S[:k] truncW = W[:k, :] return truncV, truncS, truncW U, S, V = hailBlanczos(A, G, k, q) scores = V.transpose() * S eigens = hl.eval(S * S) info("blanczos_pca: SVD Complete. Computing conversion to PCs.") hail_array_scores = scores._data_array() cols_and_scores = hl.zip( A.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key)) lt = ht.select() lt = lt.annotate_globals(U=U) idx_name = '_tmp_pca_loading_index' lt = lt.add_index(idx_name) lt = lt.annotate( loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() lt = lt.drop(lt[idx_name]) if compute_loadings: return eigens, st, lt else: return eigens, st, None
def pca(entry_expr, k=10, compute_loadings=False) -> Tuple[List[float], Table, Table]: r"""Run principal component analysis (PCA) on numeric columns derived from a matrix table. Examples -------- For a matrix table with variant rows, sample columns, and genotype entries, compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and 1s encoding missingness of genotype calls. >>> eigenvalues, scores, _ = hl.pca(hl.int(hl.is_defined(dataset.GT)), ... k=2) Warning ------- This method does **not** automatically mean-center or normalize each column. If desired, such transformations should be incorporated in `entry_expr`. Hail will return an error if `entry_expr` evaluates to missing, nan, or infinity on any entry. Notes ----- PCA is run on the columns of the numeric matrix obtained by evaluating `entry_expr` on each entry of the matrix table, or equivalently on the rows of the **transposed** numeric matrix :math:`M` referenced below. PCA computes the SVD .. math:: M = USV^T where columns of :math:`U` are left singular vectors (orthonormal in :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first :math:`k` singular vectors and values, yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \times k`, :math:`k \times k` and :math:`m \times k` respectively. From the perspective of the rows of :math:`M` as samples (data points), :math:`V_k` contains the loadings for the first :math:`k` PCs while :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each sample. The loadings represent a new basis of features while the scores represent the projected data on those features. The eigenvalues of the Gramian :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the respective PCs. By default, Hail only computes the loadings if the ``loadings`` parameter is specified. Scores are stored in a :class:`.Table` with the column key of the matrix table as key and a field `scores` of type ``array<float64>`` containing the principal component scores. Loadings are stored in a :class:`.Table` with the row key of the matrix table as key and a field `loadings` of type ``array<float64>`` containing the principal component loadings. The eigenvalues are returned in descending order, with scores and loadings given the corresponding array order. Parameters ---------- entry_expr : :class:`.Expression` Numeric expression for matrix entries. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` If ``True``, compute row loadings. Returns ------- (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. """ check_entry_indexed('pca/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) # FIXME: remove once select_entries on a field is free if entry_expr in mt._fields_inverse: field = mt._fields_inverse[entry_expr] else: field = Env.get_uid() mt = mt.select_entries(**{field: entry_expr}) mt = mt.select_cols().select_rows().select_globals() t = (Table( ir.MatrixToTableApply( mt._mir, { 'name': 'PCA', 'entryField': field, 'k': k, 'computeLoadings': compute_loadings })).persist()) g = t.index_globals() scores = hl.Table.parallelize(g.scores, key=list(mt.col_key)) if not compute_loadings: t = None return hl.eval(g.eigenvalues), scores, None if t is None else t.drop( 'eigenvalues', 'scores')
def _blanczos_pca(A, k=10, compute_loadings=False, q_iterations=2, oversampling_param=2, block_size=128): r"""Run randomized principal component analysis approximation (PCA) on numeric columns derived from a matrix table. Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert. Examples -------- For a matrix table with variant rows, sample columns, and genotype entries, compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and 1s encoding missingness of genotype calls. >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)), ... k=2) Warning ------- This method does **not** automatically mean-center or normalize each column. If desired, such transformations should be incorporated in `entry_expr`. Hail will return an error if `entry_expr` evaluates to missing, nan, or infinity on any entry. Notes ----- PCA is run on the columns of the numeric matrix obtained by evaluating `entry_expr` on each entry of the matrix table, or equivalently on the rows of the **transposed** numeric matrix :math:`M` referenced below. PCA computes the SVD .. math:: M = USV^T where columns of :math:`U` are left singular vectors (orthonormal in :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first :math:`k` singular vectors and values, yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \times k`, :math:`k \times k` and :math:`m \times k` respectively. From the perspective of the rows of :math:`M` as samples (data points), :math:`V_k` contains the loadings for the first :math:`k` PCs while :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each sample. The loadings represent a new basis of features while the scores represent the projected data on those features. The eigenvalues of the Gramian :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the respective PCs. By default, Hail only computes the loadings if the ``loadings`` parameter is specified. Scores are stored in a :class:`.Table` with the column key of the matrix table as key and a field `scores` of type ``array<float64>`` containing the principal component scores. Loadings are stored in a :class:`.Table` with the row key of the matrix table as key and a field `loadings` of type ``array<float64>`` containing the principal component loadings. The eigenvalues are returned in descending order, with scores and loadings given the corresponding array order. Parameters ---------- entry_expr : :class:`.Expression` Numeric expression for matrix entries. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` If ``True``, compute row loadings. q_iterations : :obj:`int` Number of rounds of power iteration to amplify singular values. oversampling_param : :obj:`int` Amount of oversampling to use when approximating the singular values. Usually a value between `0 <= oversampling_param <= k`. Returns ------- (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. """ if not isinstance(A, TallSkinnyMatrix): check_entry_indexed('_blanczos_pca/entry_expr', A) A = _make_tsm(A, block_size) U, S, V = _reduced_svd(A, k, compute_loadings, q_iterations, k + oversampling_param) scores = V * S eigens = hl.eval(S * S) info("blanczos_pca: SVD Complete. Computing conversion to PCs.") hail_array_scores = scores._data_array() cols_and_scores = hl.zip( A.source_table.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=A.col_key) if compute_loadings: lt = A.source_table.select() lt = lt.annotate_globals(U=U) idx_name = '_tmp_pca_loading_index' lt = lt.add_index(idx_name) lt = lt.annotate( loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() lt = lt.drop(lt[idx_name]) return eigens, st, lt else: return eigens, st, None
def _pca_and_moments(A, k=10, num_moments=5, compute_loadings=False, q_iterations=2, oversampling_param=2, block_size=128, moment_samples=100): if not isinstance(A, TallSkinnyMatrix): check_entry_indexed('_spectral_moments/entry_expr', A) A = _make_tsm_from_call(A, block_size) # Set Parameters q = q_iterations L = k + oversampling_param n = A.ncols # Generate random matrix G G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1)) G = hl.nd.qr(G)[0]._persist() fact = _krylov_factorization(A, G, q, compute_loadings) info("_reduced_svd: Computing local SVD") U, S, V = fact.reduced_svd(k) p = min(num_moments // 2, 10) # Generate random matrix G2 for moment estimation G2 = hl.nd.zeros( (n, moment_samples)).map(lambda n: hl.if_else(hl.rand_bool(0.5), -1, 1)) # Project out components in subspace fact.V, which we can compute exactly G2 = G2 - fact.V @ (fact.V.T @ G2) Q1, R1 = hl.nd.qr(G2)._persist() fact2 = _krylov_factorization(A, Q1, p, compute_U=False) moments_and_stdevs = fact2.spectral_moments(num_moments, R1) # Add back exact moments moments = moments_and_stdevs.moments + hl.nd.array([ fact.S.map(lambda x: x**(2 * i)).sum() for i in range(1, num_moments + 1) ]) moments_and_stdevs = hl.eval( hl.struct(moments=moments, stdevs=moments_and_stdevs.stdevs)) moments = moments_and_stdevs.moments stdevs = moments_and_stdevs.stdevs scores = V * S eigens = hl.eval(S * S) info("blanczos_pca: SVD Complete. Computing conversion to PCs.") hail_array_scores = scores._data_array() cols_and_scores = hl.zip( A.source_table.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=A.col_key) if compute_loadings: lt = A.source_table.select() lt = lt.annotate_globals(U=U) idx_name = '_tmp_pca_loading_index' lt = lt.add_index(idx_name) lt = lt.annotate( loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() lt = lt.drop(lt[idx_name]) else: lt = None return eigens, st, lt, moments, stdevs