Пример #1
0
def spectra_helper(spec_func):
    """Compare ``hl._blanczos_pca`` with matrices whose spectrum is known.

    For each ``(k, m, n)`` in ``dim_triplets`` a matrix ``A = U @ sigma @ V.T``
    is built, where ``sigma`` is diagonal with entries ``spec_func(j, k)`` for
    ``j = 1 .. min(m, n)`` and ``U`` / ``V`` are the Q factors of seeded
    Gaussian random matrices.  The square roots of the eigenvalues returned by
    ``hl._blanczos_pca`` are compared with the first ``k`` diagonal entries of
    ``sigma``, and the 2-norm of ``A`` minus its reconstruction is compared
    with ``spec_func(k + 1, k)``.

    ``dim_triplets`` and ``matrix_table_from_numpy`` are not defined here; they
    are looked up outside this function.

    :param spec_func: callable taking ``(j, k)`` with 1-based ``j`` and
        returning the j-th diagonal entry of ``sigma``.
    """
    seed = 1025

    for triplet in dim_triplets:
        k, m, n = triplet
        rank = min(m, n)
        sigma = np.diag([spec_func(j, k) for j in range(1, rank + 1)])

        # Re-seed for every triplet so the random draws are reproducible.
        np.random.seed(seed)
        left_gauss = np.random.normal(0, 1, (m, rank))
        U = np.linalg.qr(left_gauss)[0]
        right_gauss = np.random.normal(0, 1, (n, rank))
        V = np.linalg.qr(right_gauss)[0]

        A = U @ sigma @ V.T
        mt_A = matrix_table_from_numpy(A)

        pca_result = hl._blanczos_pca(
            mt_A.ent,
            k=k,
            oversampling_param=k,
            compute_loadings=True,
            q_iterations=4,
        )
        eigenvalues, scores, loadings = pca_result

        # The test treats sqrt(eigenvalue) as the corresponding singular value.
        singulars = np.sqrt(eigenvalues)
        collected_scores = np.array(scores.scores.collect())
        hail_V = (collected_scores / singulars).T
        hail_U = np.array(loadings.loadings.collect())

        approx_A = hail_U @ np.diag(singulars) @ hail_V
        residual = A - approx_A
        norm_of_diff = np.linalg.norm(residual, 2)

        expected_norm = spec_func(k + 1, k)
        np.testing.assert_allclose(norm_of_diff,
                                   expected_norm,
                                   rtol=1e-02,
                                   err_msg=f"Norm test failed on triplet {triplet} ")

        expected_singulars = np.diag(sigma)[:k]
        np.testing.assert_allclose(singulars,
                                   expected_singulars,
                                   rtol=1e-01,
                                   err_msg=f"Failed on triplet {triplet}")
Пример #2
0
def test_blanczos_against_numpy():
    """Check ``hl._blanczos_pca`` on ``tiny_m.vcf`` against a numpy SVD.

    The VCF is filtered to biallelic rows whose alternate-allele count is
    neither zero nor all alleles, genotypes are mean-centred and scaled, and
    the resulting eigenvalues / loadings are compared with those obtained from
    ``np.linalg.svd`` on a hand-written genotype matrix.
    """
    k = 3

    def concat_field(field, horizontal=True):
        # Collect the field and join the collected pieces along one axis.
        axis = 0 if horizontal else 1
        return np.concatenate(field.collect(), axis=axis)

    mt = hl.import_vcf(resource('tiny_m.vcf'))
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = mt.annotate_rows(
        AC=hl.agg.sum(mt.GT.n_alt_alleles()),
        n_called=hl.agg.count_where(hl.is_defined(mt.GT)),
    )
    is_variable = (mt.AC > 0) & (mt.AC < 2 * mt.n_called)
    mt = mt.filter_rows(is_variable).persist()
    n_rows = mt.count_rows()

    # Centre on the per-row mean and scale; undefined genotypes become 0.
    mean = mt.AC / mt.n_called
    scaled = (mt.GT.n_alt_alleles() - mean) / hl.sqrt(mean * (2 - mean) * n_rows / 2)
    float_expr = hl.if_else(hl.is_defined(mt.GT), scaled, 0)

    eigens, scores_t, loadings_t = hl._blanczos_pca(float_expr,
                                                    k=k,
                                                    q_iterations=7,
                                                    compute_loadings=True)

    # NOTE(review): the (3, 4) reshape assumes the filtered table has exactly
    # 12 entries - confirm against tiny_m.vcf.
    A = np.array(float_expr.collect()).reshape((3, 4)).T

    flat_scores = concat_field(scores_t.scores)
    flat_loadings = concat_field(loadings_t.loadings)
    scores = np.reshape(flat_scores, (len(flat_scores) // k, k))
    loadings = np.reshape(flat_loadings, (len(flat_loadings) // k, k))

    assert len(eigens) == 3
    assert scores_t.count() == mt.count_cols()
    assert loadings_t.count() == n_rows
    np.testing.assert_almost_equal(A @ loadings, scores)

    assert len(scores_t.globals) == 0
    assert len(loadings_t.globals) == 0

    # compute PCA with numpy
    def normalize(a):
        col_means = np.mean(a, axis=0, keepdims=True)
        freq = col_means / 2.0
        scale = np.sqrt(2.0 * (freq * (1 - freq)) * a.shape[1])
        return (a - col_means) / scale

    # Reference genotype matrix: diag(1, 1, 2) with an extra zero row appended
    # and entry [1, 0] set to 1/3.
    g = np.array([
        [1.0, 0.0, 0.0],
        [1.0 / 3, 1.0, 0.0],
        [0.0, 0.0, 2.0],
        [0.0, 0.0, 0.0],
    ])
    n = normalize(g)
    _, s, vt = np.linalg.svd(n, full_matrices=False)
    np_loadings = vt.transpose()
    np_eigenvalues = s * s

    # equation 12 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4827102/pdf/main.pdf
    def bound(vs, us):
        projected_norms = [np.linalg.norm(us.T @ vs[:, i]) for i in range(k)]
        return 1 / k * sum(projected_norms)

    np.testing.assert_allclose(eigens, np_eigenvalues, rtol=0.05)
    assert bound(np_loadings, loadings) > 0.9
Пример #3
0
def test_blanczos_against_hail():
    """Check that ``hl._blanczos_pca`` agrees with ``hl.pca`` on the 1kg data.

    Both routines are run with ``k = 10`` on the 0/1 indicator of whether
    ``GT`` is defined.  Eigenvalues must agree within 5% and the bound from
    equation 12 of the referenced paper, computed on the two loading matrices,
    must exceed 0.9.
    """
    k = 10

    def concat_field(field, horizontal=True):
        # Collect the field and join the collected pieces along one axis.
        axis = 0 if horizontal else 1
        return np.concatenate(field.collect(), axis=axis)

    def as_k_columns(flat):
        # Reshape a flat collected array into rows of length k.
        return np.reshape(flat, (len(flat) // k, k))

    # Fetch the dataset and convert it to a matrix table on disk before use.
    hl.utils.get_1kg('data/')
    hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)
    dataset = hl.read_matrix_table('data/1kg.mt')

    entry_expr = hl.int(hl.is_defined(dataset.GT))

    b_eigens, b_scores_t, b_loadings_t = hl._blanczos_pca(entry_expr,
                                                          k=k,
                                                          q_iterations=3,
                                                          compute_loadings=True)
    b_scores = as_k_columns(concat_field(b_scores_t.scores))
    b_loadings = as_k_columns(concat_field(b_loadings_t.loadings))

    h_eigens, h_scores_t, h_loadings_t = hl.pca(entry_expr,
                                                k=k,
                                                compute_loadings=True)
    # Shape the reference results exactly like the blanczos ones.
    h_scores = np.reshape(concat_field(h_scores_t.scores), b_scores.shape)
    h_loadings = np.reshape(concat_field(h_loadings_t.loadings),
                            b_loadings.shape)

    # equation 12 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4827102/pdf/main.pdf
    def bound(vs, us):
        projected_norms = [np.linalg.norm(us.T @ vs[:, i]) for i in range(k)]
        return 1 / k * sum(projected_norms)

    MEV = bound(h_loadings, b_loadings)

    np.testing.assert_allclose(b_eigens, h_eigens, rtol=0.05)
    assert MEV > 0.9
Пример #4
0
def test_spectra():
    """Run ``hl._blanczos_pca`` on matrices built from five known spectra.

    For every ``(k, m, n)`` triplet and every spectral function, an ``m x n``
    matrix ``A = U @ sigma @ V`` is built from seeded random orthogonal
    factors.  The square roots of the returned eigenvalues are compared with
    the first ``k`` diagonal entries of ``sigma``, and the 2-norm of ``A``
    minus its reconstruction is compared with the ``(k + 1)``-th spectral
    value.
    """

    def make_spectral_matrix(index_func, k, m, n):
        # m x n matrix that is zero except for index_func(j, k) on the diagonal.
        diag_len = min(m, n)
        answer = np.zeros((m, n))
        positions = np.arange(diag_len)
        answer[positions, positions] = [index_func(j, k) for j in range(1, diag_len + 1)]
        return answer

    def matrix_table_from_numpy(np_mat):
        # Store the array as a global and read each entry from it by index.
        n_rows, n_cols = np_mat.shape
        table = hl.utils.range_matrix_table(n_rows, n_cols)
        table = table.annotate_globals(entries_global=np_mat)
        return table.annotate_entries(ent=table.entries_global[table.row_idx, table.col_idx])

    # Defined for j >= 1
    def spec1(j, k):
        return 1 / j

    def spec2(j, k):
        if j == 1:
            return 1
        if j > k:
            return (10**-5) * (k + 1) / j
        return 2 * 10**-5

    def spec3(j, k):
        if j > k:
            return (10**-5) * (k + 1) / j
        return 10**(-5 * (j - 1) / (k - 1))

    def spec4(j, k):
        if j <= k:
            return 10**(-5 * (j - 1) / (k - 1))
        if j == (k + 1):
            return 10**-5
        return 0

    def spec5(j, k):
        if j > k:
            return 10**-5 * math.sqrt((k + 1) / j)
        return 10**-5 + (1 - 10**-5) * (k - j) / (k - 1)

    spectral_functions = [spec1, spec2, spec3, spec4, spec5]

    # k, m, n
    dim_triplets = [(10, 1000, 1000), (20, 1000, 1000), (10, 100, 200)]

    seed = 1025

    for triplet in dim_triplets:
        k, m, n = triplet
        for idx, spec_func in enumerate(spectral_functions):
            sigma = make_spectral_matrix(spec_func, k, m, n)

            # Re-seed for every case so the random draws are reproducible.
            np.random.seed(seed)
            left_gauss = np.random.normal(0, 1, (m, m))
            U = np.linalg.qr(left_gauss)[0]
            right_gauss = np.random.normal(0, 1, (n, n))
            V = np.linalg.qr(right_gauss)[0]

            A = U @ sigma @ V
            mt_A = matrix_table_from_numpy(A)

            eigenvalues, scores, loadings = hl._blanczos_pca(
                mt_A.ent,
                k=k,
                oversampling_param=k,
                compute_loadings=True,
                q_iterations=4,
            )

            # The test treats sqrt(eigenvalue) as the corresponding singular value.
            singulars = np.sqrt(eigenvalues)
            collected_scores = np.array(scores.scores.collect())
            hail_V = (collected_scores / singulars).T
            hail_U = np.array(loadings.loadings.collect())

            approx_A = hail_U @ np.diag(singulars) @ hail_V
            norm_of_diff = np.linalg.norm(A - approx_A, 2)

            np.testing.assert_allclose(
                norm_of_diff,
                spec_func(k + 1, k),
                rtol=1e-02,
                err_msg=f"Norm test failed on triplet {triplet} on spec{idx + 1}")
            np.testing.assert_allclose(
                singulars,
                np.diag(sigma)[:k],
                rtol=1e-01,
                err_msg=f"Failed on triplet {triplet} on spec{idx + 1}")