示例#1
0
    def test_pca(self):
        """Check the types and sizes of the results returned by ``hl.pca``.

        Runs PCA with ``k=2`` on the dataset produced by
        ``hl.balding_nichols_model(3, 100, 100)`` and verifies that:

        * two eigenvalues are returned,
        * scores and loadings are ``hl.Table`` objects with 100 rows each when
          ``compute_loadings=True``,
        * loadings are ``None`` when ``compute_loadings=False``.
        """
        dataset = hl.balding_nichols_model(3, 100, 100)
        eigenvalues, scores, loadings = hl.pca(dataset.GT.n_alt_alleles(), k=2, compute_loadings=True)

        self.assertEqual(len(eigenvalues), 2)
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)), which only reports "False is not true".
        self.assertIsInstance(scores, hl.Table)
        self.assertEqual(scores.count(), 100)
        self.assertIsInstance(loadings, hl.Table)
        self.assertEqual(loadings.count(), 100)

        _, _, loadings = hl.pca(dataset.GT.n_alt_alleles(), k=2, compute_loadings=False)
        # Identity check: does not depend on how the returned object implements ==.
        self.assertIsNone(loadings)
示例#2
0
def run_platform_pca(
    callrate_mt: hl.MatrixTable,
    binarization_threshold: Optional[float] = 0.25
) -> Tuple[List[float], hl.Table, hl.Table]:
    """
    Run a PCA on a sample/interval MT whose entries hold a ``callrate`` value.

    When `binarization_threshold` is not None, each call rate is first replaced by a 0/1 value:
    entries with a call rate strictly greater than the threshold become 1, all others become 0.
    E.g. with the default threshold of 0.25, a call rate of 0.25 or less maps to 0.

    The (optionally binarized) call rates are then centered per row by subtracting the row mean
    before being passed to `hl.pca`.

    :param callrate_mt: Input callrate MT; must have a `callrate` entry field
    :param binarization_threshold: Binarization threshold. Pass None to skip binarization
    :return: eigenvalues, scores_ht, loadings_ht
    """
    logger.info("Running platform PCA")

    if binarization_threshold is not None:
        above_threshold = callrate_mt.callrate > binarization_threshold
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(above_threshold))

    # Center until Hail's PCA does it for you
    row_mean = hl.agg.mean(callrate_mt.callrate)
    callrate_mt = callrate_mt.annotate_rows(mean_callrate=row_mean)
    centered_callrate = callrate_mt.callrate - callrate_mt.mean_callrate
    callrate_mt = callrate_mt.annotate_entries(callrate=centered_callrate)

    # TODO:  Evaluate whether computing loadings is a good / worthy thing
    pca_result = hl.pca(callrate_mt.callrate, compute_loadings=True)
    eigenvalues, scores, loadings = pca_result
    logger.info("Platform PCA eigenvalues: {}".format(eigenvalues))

    return eigenvalues, scores, loadings
示例#3
0
def test_pca_against_numpy():
    """Compare ``hl.pca`` on ``tiny_m.vcf`` with an SVD computed by numpy.

    The Hail side keeps biallelic rows whose alternate allele count is neither
    zero nor equal to twice the number of called genotypes, normalizes the
    genotypes, and runs PCA with ``k=3``. The numpy side applies its own
    normalization to a hard-coded matrix and decomposes it with
    ``np.linalg.svd``. Eigenvalues are compared directly; scores and loadings
    are compared by absolute value.
    """
    mt = hl.import_vcf(resource('tiny_m.vcf'))
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = mt.annotate_rows(
        AC=hl.agg.sum(mt.GT.n_alt_alleles()),
        n_called=hl.agg.count_where(hl.is_defined(mt.GT)),
    )
    has_both_alleles = (mt.AC > 0) & (mt.AC < 2 * mt.n_called)
    mt = mt.filter_rows(has_both_alleles).persist()
    n_rows = mt.count_rows()

    def make_expr(mean):
        # Missing genotypes contribute 0 after normalization.
        centered = mt.GT.n_alt_alleles() - mean
        scale = hl.sqrt(mean * (2 - mean) * n_rows / 2)
        return hl.if_else(hl.is_defined(mt.GT), centered / scale, 0)

    normalized_gt = hl.bind(make_expr, mt.AC / mt.n_called)
    eigen, scores, loadings = hl.pca(normalized_gt, k=3, compute_loadings=True)
    hail_scores = scores.explode('scores').scores.collect()
    hail_loadings = loadings.explode('loadings').loadings.collect()

    assert len(eigen) == 3
    assert scores.count() == mt.count_cols()
    assert loadings.count() == n_rows

    assert len(scores.globals) == 0
    assert len(loadings.globals) == 0

    # compute PCA with numpy
    def normalize(a):
        col_means = np.mean(a, axis=0, keepdims=True)
        half_means = col_means / 2.0
        variance_term = 2.0 * (half_means * (1 - half_means)) * a.shape[1]
        return (a - col_means) / np.sqrt(variance_term)

    # NOTE(review): presumably mirrors the genotypes in tiny_m.vcf after
    # filtering -- confirm against the resource file.
    g = np.pad(np.diag([1.0, 1, 2]), ((0, 1), (0, 0)), mode='constant')
    g[1, 0] = 1.0 / 3
    U, s, V = np.linalg.svd(normalize(g), full_matrices=False)
    np_scores = np.dot(U, np.diag(s)).flatten()
    np_loadings = V.T.flatten()
    np_eigenvalues = (s * s).flatten()

    np.testing.assert_allclose(eigen, np_eigenvalues, rtol=1e-5)
    # Compared by absolute value, so a sign flip of a component does not fail the test.
    np.testing.assert_allclose(np.abs(hail_scores), np.abs(np_scores), rtol=1e-5)
    np.testing.assert_allclose(np.abs(hail_loadings), np.abs(np_loadings), rtol=1e-5)
示例#4
0
def main(args):
    """Prepare interval call rates, run platform PCA and assign platforms for exomes.

    Three stages run in order; the first two can be skipped through flags on ``args``:

    1. Unless ``args.skip_prepare_data_for_platform_pca`` is set, compute a per-interval
       call-rate MatrixTable and write it to ``exome_callrate_mt_path``.
    2. Unless ``args.skip_run_platform_pca`` is set, binarize and center the call rates,
       run PCA and write the scores table to ``exome_callrate_scores_ht_path``.
    3. Always: read the scores back, pass them to ``assign_platform_pcs`` and write the
       result to ``qc_ht_path('exomes', 'platforms')``.

    Every write honours ``args.overwrite``.

    :param args: Parsed arguments. Reads ``skip_prepare_data_for_platform_pca``,
        ``skip_run_platform_pca``, ``pc_scores_in_separate_fields`` and ``overwrite``.
    """
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes', adj=True, raw=False, meta_root=None, fam_root=None, split=False)
        mt = filter_to_autosomes(mt)
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        # Keep biallelic rows that fall inside an evaluation interval.
        mt = mt.filter_rows(hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        # Only definedness of GT is needed below, so replace it with an empty struct
        # (presumably to shrink the entries carried into the aggregation).
        mt = mt.select_entries(GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, overwrite=args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        # Keep only samples with an empty hard_filters collection.
        callrate_mt = callrate_mt.filter_cols(hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
        # Binarize: 1 when the interval call rate is above 0.25, else 0.
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate, compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205, 26308703.539534636, 21102437.512725923, 16949828.555817757, 12994894.187041137, 8372332.274295175, 8128326.814388647]
        # Honour --overwrite here too, so a rerun does not fail on an existing scores table.
        scores.write(exome_callrate_scores_ht_path, overwrite=args.overwrite)

    logger.info('Annotating with platform PCs and known platform annotations...')
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(data_type='exomes')
    if args.pc_scores_in_separate_fields:
        # Collapse the separate "PC<n>" fields into a single `scores` array,
        # ordered by the numeric suffix <n>.
        scores = scores.transmute(scores=[
            scores[ann] for ann in sorted(
                [ann for ann in scores.row if ann.startswith("PC")],
                key=lambda x: int(x[2:])
            )
        ])
    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'), overwrite=args.overwrite)
示例#5
0
def test_blanczos_against_hail():
    """Compare ``hl._blanczos_pca`` with ``hl.pca`` on the 1kg example dataset.

    Both methods are run with ``k = 10`` on a 0/1 indicator of whether each
    genotype is defined. The test passes when the eigenvalues agree within a
    relative tolerance of 0.05 and the value computed by ``bound`` on the two
    loading matrices exceeds 0.9.
    """
    k = 10

    def concat_to_numpy(field, horizontal=True):
        # Collect the field and join the pieces along axis 0 when
        # ``horizontal`` is true, otherwise along axis 1.
        axis = 0 if horizontal else 1
        return np.concatenate(field.collect(), axis=axis)

    def defined_indicator():
        # 1 where the genotype call is defined, 0 where it is missing.
        return hl.int(hl.is_defined(dataset.GT))

    hl.utils.get_1kg('data/')
    hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)
    dataset = hl.read_matrix_table('data/1kg.mt')

    b_eigens, b_scores_ht, b_loadings_ht = hl._blanczos_pca(
        defined_indicator(), k=k, q_iterations=3, compute_loadings=True)
    flat_b_scores = concat_to_numpy(b_scores_ht.scores)
    flat_b_loadings = concat_to_numpy(b_loadings_ht.loadings)
    # Reshape the flat arrays into one row of k components per key.
    b_scores = flat_b_scores.reshape((len(flat_b_scores) // k, k))
    b_loadings = flat_b_loadings.reshape((len(flat_b_loadings) // k, k))

    h_eigens, h_scores_ht, h_loadings_ht = hl.pca(
        defined_indicator(), k=k, compute_loadings=True)
    h_scores = concat_to_numpy(h_scores_ht.scores).reshape(b_scores.shape)
    h_loadings = concat_to_numpy(h_loadings_ht.loadings).reshape(b_loadings.shape)

    # equation 12 from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4827102/pdf/main.pdf
    def bound(vs, us):
        projection_norms = (np.linalg.norm(us.T @ vs[:, i]) for i in range(k))
        return 1 / k * sum(projection_norms)

    MEV = bound(h_loadings, b_loadings)

    np.testing.assert_allclose(b_eigens, h_eigens, rtol=0.05)
    assert MEV > 0.9