예제 #1
0
def get_test_genotypes_bm(chrom, genotype_bm_path):
    """Write the dosage matrix of the small test cohort as a BlockMatrix.

    Rows are restricted to variants that appear as rows of the
    meta-analysis results; columns are restricted to samples listed in
    the case and control sample tables stored under ``scratch_dir``.

    Parameters
    ----------
    chrom
        ``'all'`` to load genotypes through ``get_filtered_mt_with_x()``;
        any other value is forwarded to ``get_filtered_mt(chrom=...)``.
    genotype_bm_path
        Destination path of the BlockMatrix (overwritten if present).
    """
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())

    genotypes = (get_filtered_mt_with_x() if chrom == 'all' else
                 get_filtered_mt(chrom=chrom, entry_fields=('dosage', )))

    # Keep only variants that have a row in the meta-analysis results.
    meta_rows = meta_mt.rows()
    genotypes = genotypes.filter_rows(
        hl.is_defined(meta_rows[genotypes.row_key]))

    # The sample tables are read from scratch_dir rather than built here.
    controls = hl.read_table(f'{scratch_dir}/genotype_samples_n10.ht')
    cases = hl.read_table(f'{scratch_dir}/genotype_samples_n10_cases.ht')
    samples_ht = cases.union(controls)
    in_cohort = hl.is_defined(samples_ht[hl.int32(genotypes.s)])
    genotypes = genotypes.filter_cols(in_cohort)

    # Re-key columns by the integer form of the sample id.
    genotypes = genotypes.key_cols_by(userId=hl.int32(genotypes.s))
    print(genotypes.count())

    # Drop all non-key row/column fields before writing the entries.
    genotypes = genotypes.select_cols()
    genotypes = genotypes.select_rows()
    genotypes = genotypes.repartition(1000)
    BlockMatrix.write_from_entry_expr(genotypes.dosage,
                                      genotype_bm_path,
                                      overwrite=True)
예제 #2
0
def make_sumstats_bm(sumstats_bm_path, high_quality):
    """Write a BlockMatrix of clumped meta-analysis effect sizes.

    The meta-analysis results are joined with the clumping results,
    both are separated by population, and columns are kept only where
    the meta-analysis population set equals the clumping population set.
    After exploding by p-value threshold, only the
    ``'Type 2 diabetes'`` columns with ``p_threshold == 1`` are written.

    Each written entry is ``BETA`` multiplied by two indicators (clump
    result defined, ``Pvalue < p_threshold``); missing products are
    written as ``0.0``.

    Parameters
    ----------
    sumstats_bm_path
        Destination path of the BlockMatrix (overwritten if present).
    high_quality
        Forwarded as ``high_quality_only`` to
        ``get_clumping_results_path``.
    """
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())

    clump_path = get_clumping_results_path(high_quality_only=high_quality)
    clump_mt = hl.read_matrix_table(clump_path)
    clump_mt = clump_mt.rename({'pop': 'clump_pops'})

    joined = all_axis_join(meta_mt, clump_mt)
    for pop_field, entry_field in (('clump_pops', 'plink_clump'),
                                   ('meta_analysis_data', 'meta_analysis')):
        joined = separate_results_mt_by_pop(joined,
                                            pop_field,
                                            entry_field,
                                            skip_drop=True)

    same_pops = joined.meta_analysis_data.pop == joined.clump_pops
    joined = joined.filter_cols(same_pops)

    exploded = explode_by_p_threshold(joined)
    exploded = exploded.unfilter_entries()

    is_t2d = exploded.description == 'Type 2 diabetes'
    is_unit_threshold = exploded.p_threshold == 1
    exploded = exploded.filter_cols(is_t2d & is_unit_threshold)

    # BETA survives only where a clump result exists and the p-value
    # passes the column's threshold; everything else becomes 0.0.
    has_clump = hl.is_defined(exploded.plink_clump.TOTAL)
    passes_threshold = hl.int(
        exploded.meta_analysis.Pvalue < exploded.p_threshold)
    effect = exploded.meta_analysis.BETA * has_clump * passes_threshold

    BlockMatrix.write_from_entry_expr(hl.or_else(effect, 0.0),
                                      sumstats_bm_path,
                                      overwrite=True)
예제 #3
0
    def test_from_entry_expr(self):
        """Inline, field, cast, and written entry expressions must agree.

        The alt-allele count (missing replaced by 0) is turned into a
        BlockMatrix four ways; all four must produce the same array.
        """
        dataset = get_dataset()
        dataset = dataset.annotate_entries(
            x=hl.or_else(dataset.GT.n_alt_alleles(), 0)).cache()

        # Reference: the expression built inline rather than read from a field.
        inline_expr = hl.or_else(dataset.GT.n_alt_alleles(), 0)
        reference = BlockMatrix.from_entry_expr(inline_expr, block_size=32).to_numpy()
        from_field = BlockMatrix.from_entry_expr(dataset.x, block_size=32).to_numpy()
        from_cast = BlockMatrix.from_entry_expr(hl.float64(dataset.x), block_size=32).to_numpy()

        for candidate in (from_field, from_cast):
            self._assert_eq(reference, candidate)

        # Round trip through write_from_entry_expr and read.
        out_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(dataset.x, out_path, block_size=32)
        round_tripped = BlockMatrix.read(out_path).to_numpy()
        self._assert_eq(reference, round_tripped)
예제 #4
0
    def test_from_entry_expr(self):
        """Inline, field, cast, and written entry expressions must agree.

        The alt-allele count (missing replaced by 0) is turned into a
        BlockMatrix four ways; all four must produce the same array.
        """
        dataset = get_dataset()
        dataset = dataset.annotate_entries(
            x=hl.or_else(dataset.GT.n_alt_alleles(), 0)).cache()

        # Reference: the expression built inline rather than read from a field.
        inline_expr = hl.or_else(dataset.GT.n_alt_alleles(), 0)
        reference = BlockMatrix.from_entry_expr(inline_expr, block_size=32).to_numpy()
        from_field = BlockMatrix.from_entry_expr(dataset.x, block_size=32).to_numpy()
        from_cast = BlockMatrix.from_entry_expr(hl.float64(dataset.x), block_size=32).to_numpy()

        for candidate in (from_field, from_cast):
            self._assert_eq(reference, candidate)

        # Round trip through write_from_entry_expr and read.
        out_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(dataset.x, out_path, block_size=32)
        round_tripped = BlockMatrix.read(out_path).to_numpy()
        self._assert_eq(reference, round_tripped)
예제 #5
0
    def test_from_entry_expr(self):
        """Inline, field, cast, and written entry expressions must agree.

        The alt-allele count (missing replaced by 0) is turned into a
        BlockMatrix four ways; all four must produce the same array. The
        written copy lives in a temporary directory for the duration of
        the check.
        """
        dataset = get_dataset()
        dataset = dataset.annotate_entries(
            x=hl.or_else(dataset.GT.n_alt_alleles(), 0)).cache()

        # Reference: the expression built inline rather than read from a field.
        inline_expr = hl.or_else(dataset.GT.n_alt_alleles(), 0)
        reference = BlockMatrix.from_entry_expr(inline_expr, block_size=32).to_numpy()
        from_field = BlockMatrix.from_entry_expr(dataset.x, block_size=32).to_numpy()
        from_cast = BlockMatrix.from_entry_expr(hl.float64(dataset.x), block_size=32).to_numpy()

        for candidate in (from_field, from_cast):
            self._assert_eq(reference, candidate)

        # Round trip through write_from_entry_expr and read.
        with hl.TemporaryDirectory(ensure_exists=False) as out_path:
            BlockMatrix.write_from_entry_expr(dataset.x, out_path, block_size=32)
            round_tripped = BlockMatrix.read(out_path).to_numpy()
            self._assert_eq(reference, round_tripped)
예제 #6
0
    def test_write_from_entry_expr_overwrite(self):
        """Writing to an existing path must fail unless ``overwrite=True``.

        Checked twice: once with a plain entry field and once with a
        derived (non-field) expression.
        """
        mt = hl.balding_nichols_model(1, 1, 1)
        mt = mt.select_entries(x=mt.GT.n_alt_alleles())
        expected = BlockMatrix.from_entry_expr(mt.x)

        # Plain entry field.
        field_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, field_path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x, field_path)

        BlockMatrix.write_from_entry_expr(mt.x, field_path, overwrite=True)
        self._assert_eq(BlockMatrix.read(field_path), expected)

        # non-field expressions currently take a separate code path
        expr_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)

        BlockMatrix.write_from_entry_expr(mt.x + 2, expr_path, overwrite=True)
        self._assert_eq(BlockMatrix.read(expr_path), expected + 2)
예제 #7
0
    def test_write_from_entry_expr_overwrite(self):
        """Writing to an existing path must fail unless ``overwrite=True``.

        Checked twice: once with a plain entry field and once with a
        derived (non-field) expression.
        """
        mt = hl.balding_nichols_model(1, 1, 1)
        mt = mt.select_entries(x=mt.GT.n_alt_alleles())
        expected = BlockMatrix.from_entry_expr(mt.x)

        # Plain entry field.
        field_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, field_path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x, field_path)

        BlockMatrix.write_from_entry_expr(mt.x, field_path, overwrite=True)
        self._assert_eq(BlockMatrix.read(field_path), expected)

        # non-field expressions currently take a separate code path
        expr_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)

        BlockMatrix.write_from_entry_expr(mt.x + 2, expr_path, overwrite=True)
        self._assert_eq(BlockMatrix.read(expr_path), expected + 2)
예제 #8
0
    def test_write_from_entry_expr_overwrite(self):
        """Writing to an existing path must fail unless ``overwrite=True``.

        Checked twice, each time in its own temporary directory: once
        with a plain entry field and once with a derived (non-field)
        expression.
        """
        mt = hl.balding_nichols_model(1, 1, 1)
        mt = mt.select_entries(x=mt.GT.n_alt_alleles())
        expected = BlockMatrix.from_entry_expr(mt.x)

        # Plain entry field.
        with hl.TemporaryDirectory(ensure_exists=False) as field_path:
            BlockMatrix.write_from_entry_expr(mt.x, field_path)
            with self.assertRaises(FatalError):
                BlockMatrix.write_from_entry_expr(mt.x, field_path)

            BlockMatrix.write_from_entry_expr(mt.x, field_path, overwrite=True)
            self._assert_eq(BlockMatrix.read(field_path), expected)

        # non-field expressions currently take a separate code path
        with hl.TemporaryDirectory(ensure_exists=False) as expr_path:
            BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)
            with self.assertRaises(FatalError):
                BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)

            BlockMatrix.write_from_entry_expr(mt.x + 2, expr_path, overwrite=True)
            self._assert_eq(BlockMatrix.read(expr_path), expected + 2)
예제 #9
0
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Compute LD scores for every variant, optionally per annotation.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` fails when ``entry_expr`` evaluates to a missing
        value for any entry. The float value ``nan`` does not count as
        missing.

    **Further reading**

    Background on LD scores can be found in:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    All of `entry_expr`, `locus_expr`, `coord_expr` (when given) and
    `annotation_exprs` (when given) have to originate from one and the
    same MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Entry-indexed expression giving the genotype matrix values
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Window radius around each row, measured in units of
        `coord_expr` when that is set and in basepairs otherwise.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression used as the row coordinate for
        windowing. When omitted, the locus position is used.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) by which LD scores are partitioned.
        The univariate annotation is always added and need not be
        passed.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` holding one LD score per variant for
        each `annotation_expr`, plus the univariate (all SNPs) score,
        which is always present.

    Raises
    ------
    ValueError
        If the expressions do not all come from the same MatrixTable."""

    source_mt = entry_expr._indices.source
    locus_source = locus_expr._indices.source
    coord_source = (locus_source if coord_expr is None
                    else coord_expr._indices.source)

    # Collect one comparison per expression; all of them must hold.
    source_checks = [source_mt == locus_source,
                     source_mt == coord_source]
    if annotation_exprs:
        source_checks += [source_mt == x._indices.source
                          for x in wrap_to_list(annotation_exprs)]

    if not all(source_checks):
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n_samples = source_mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2

    # Adjusted r^2: ((n - 1) / (n - 2)) * r^2 - 1 / (n - 2).
    adj_scale = (n_samples - 1.0) / (n_samples - 2.0)
    adj_shift = 1.0 / (n_samples - 2.0)
    r2_adj = adj_scale * r2 - adj_shift

    # Keep only the entries that fall inside each row's window.
    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    windowed = r2_adj.sparsify_row_intervals(starts, stops)

    # Persist the sparse matrix and continue from the stored copy.
    windowed_path = new_temp_file()
    windowed.write(windowed_path)
    windowed = BlockMatrix.read(windowed_path)

    if annotation_exprs:
        annot_ht = source_mt.select_rows(
            *wrap_to_list(annotation_exprs)).rows()
        annot_ht = annot_ht.annotate(univariate=hl.literal(1.0))
        names = [field for field in annot_ht.row
                 if field not in annot_ht.key]

        # One (name, value) table per annotation, unioned and pivoted
        # into a variants-by-annotations MatrixTable.
        long_tables = []
        for field in names:
            long_ht = annot_ht.annotate(name=hl.str(field),
                                        value=hl.float(annot_ht[field]))
            long_tables.append(long_ht.select('name', 'value'))
        ht_union = hl.Table.union(*long_tables)
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()

        annot_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, annot_path)

        l2 = windowed @ BlockMatrix.read(annot_path)
    else:
        cols = ['univariate']
        l2 = windowed.sum(axis=1)

    # Round trip through a TSV export to get the scores into a Table.
    l2_bm_path = new_temp_file()
    l2_tsv_path = new_temp_file()
    l2.write(l2_bm_path, force_row_major=True)
    BlockMatrix.export(l2_bm_path, l2_tsv_path)

    ht_scores = hl.import_table(l2_tsv_path, no_header=True, impute=True)
    ht_scores = ht_scores.add_index().key_by('idx')
    # Imported columns are named f0, f1, ...; give them annotation names.
    ht_scores = ht_scores.rename({'f{:}'.format(i): name
                                  for i, name in enumerate(cols)})

    # Attach the scores to the loci by row index.
    ht = source_mt.select_rows(__locus=locus_expr).rows().add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    score_fields = [field for field in ht_scores.row
                    if field not in ht_scores.key]
    ht = ht.select(*score_fields)

    return ht.rename({'__locus': 'locus'})
예제 #10
0
import hail as hl
from hail.linalg import BlockMatrix

# Name shared by the output BlockMatrix path and the metadata struct.
dataset_name = '1000_Genomes_phase3_European_autosomes_maf_gt_001'
bm_path = f'gs://hail-datasets-hail-data/{dataset_name}.bm'

mt = hl.read_matrix_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_autosomes.GRCh37.mt')

# Restrict to the EUR super population, then keep variants whose first two
# allele frequencies both exceed 0.001.
mt = mt.filter_cols(mt.super_population == 'EUR')
mt = hl.variant_qc(mt)
allele_freqs = mt.variant_qc.AF
mt = mt.filter_rows((allele_freqs[0] > 0.001)
                    & (allele_freqs[1] > 0.001))

# Write mean-imputed (not centered, not normalized) alt-allele counts.
BlockMatrix.write_from_entry_expr(
    entry_expr=mt.GT.n_alt_alleles(),
    path=bm_path,
    mean_impute=True,
    center=False,
    normalize=False,
    block_size=4096,
    overwrite=True)

bm = BlockMatrix.read(bm_path)

# Describe the matrix that was just written.
metadata = hl.struct(name=dataset_name,
                     reference_genome='GRCh37',
                     n_rows=bm.n_rows,
                     n_cols=bm.n_cols,
                     block_size=bm.block_size)
예제 #11
0
    def _test_linear_mixed_model_low_rank(self):
        """Fit a low-rank ``LinearMixedModel`` on simulated data.

        Genotypes come from a Balding-Nichols model; the phenotype is
        drawn from a multivariate normal whose mean depends on the first
        ``n_culprits`` variants and the covariates. The test checks the
        low-rank decomposition against the full-rank one, then compares
        the Hail and NumPy per-alternative fits.
        """
        seed = 0
        n_populations = 8
        fst = n_populations * [.9]
        n_samples = 500
        n_variants = 200
        n_orig_markers = 100
        n_culprits = 10
        n_covariates = 3
        sigma_sq = 1
        tau_sq = 1

        from numpy.random import RandomState
        prng = RandomState(seed)

        # Covariates: an intercept column followed by random normal columns.
        intercept = np.ones(shape=(n_samples, 1))
        random_covariates = prng.normal(size=(n_samples, n_covariates - 1))
        x = np.hstack((intercept, random_covariates))

        mt = hl.balding_nichols_model(n_populations=n_populations,
                                      n_samples=n_samples,
                                      n_variants=n_variants,
                                      fst=fst,
                                      af_dist=hl.rand_unif(0.1, 0.9, seed=seed),
                                      seed=seed)

        pa_t_path = utils.new_temp_file(suffix='bm')
        a_t_path = utils.new_temp_file(suffix='bm')

        BlockMatrix.write_from_entry_expr(mt.GT.n_alt_alleles(), a_t_path)

        # Samples-by-variants genotype array; the last n_orig_markers
        # columns are used to build the kinship matrix.
        genotypes = BlockMatrix.read(a_t_path).T.to_numpy()
        marker_cols = genotypes[:, -n_orig_markers:]
        markers_std = self._filter_and_standardize_cols(marker_cols)

        n_markers = markers_std.shape[1]

        kinship = (markers_std @ markers_std.T) * n_samples / n_markers

        beta = np.arange(n_covariates)
        beta_stars = np.array([1] * n_culprits)

        pheno_mean = (np.hstack((genotypes[:, 0:n_culprits], x))
                      @ np.hstack((beta_stars, beta)))
        pheno_cov = sigma_sq * kinship + tau_sq * np.eye(n_samples)
        y = prng.multivariate_normal(pheno_mean, pheno_cov)

        # low rank computation of S, P
        gram = markers_std.T @ markers_std
        gram_eigvals, gram_eigvecs = np.linalg.eigh(gram)
        n_eigenvectors = int(np.sum(gram_eigvals > 1e-10))
        gram_eigvals = gram_eigvals[-n_eigenvectors:]
        gram_eigvecs = gram_eigvecs[:, -n_eigenvectors:]
        s = gram_eigvals * (n_samples / n_markers)
        p = (markers_std @ (gram_eigvecs / np.sqrt(gram_eigvals))).T

        # compare with full rank S, P
        kin_eigvals, kin_eigvecs = np.linalg.eigh(kinship)
        top_kin_eigvals = kin_eigvals[-n_eigenvectors:]
        top_kin_eigvecs_t = kin_eigvecs[:, -n_eigenvectors:].T
        assert np.allclose(top_kin_eigvals, s)
        assert np.allclose(np.abs(top_kin_eigvecs_t), np.abs(p))

        # build and fit model
        py = p @ y
        px = p @ x
        pa = p @ genotypes

        model = LinearMixedModel(py, px, s, y, x)
        assert model.n == n_samples
        assert model.f == n_covariates
        assert model.r == n_eigenvectors
        assert model.low_rank

        model.fit()

        # check effect sizes tend to be near 1 for first n_marker alternative models
        BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
        df_lmm = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        mean_culprit_beta = np.mean(df_lmm['beta'][:n_culprits])
        assert 0.9 < mean_culprit_beta < 1.1

        # compare NumPy and Hail LMM per alternative
        df_numpy = model.fit_alternatives_numpy(pa, genotypes).to_pandas()
        assert np.min(df_numpy['chi_sq']) > 0

        na_numpy = df_numpy.isna().any(axis=1)
        na_lmm = df_lmm.isna().any(axis=1)

        assert na_numpy.sum() <= 10
        assert na_lmm.sum() <= 10
        assert np.logical_xor(na_numpy, na_lmm).sum() <= 5

        # Compare p-values only where both fits produced a full row.
        both_defined = ~(na_numpy | na_lmm)

        p_value_gaps = np.sort(np.abs(df_lmm['p_value'][both_defined]
                                      - df_numpy['p_value'][both_defined]))

        assert p_value_gaps[10] < 1e-12  # 10 least p-values differences
        assert p_value_gaps[-1] < 1e-8   # all p-values
예제 #12
0
파일: ldscore.py 프로젝트: bcajes/hail
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Compute LD scores for every variant, optionally per annotation.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` fails when ``entry_expr`` evaluates to a missing
        value for any entry. The float value ``nan`` does not count as
        missing.

    **Further reading**

    Background on LD scores can be found in:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    All of `entry_expr`, `locus_expr`, `coord_expr` (when given) and
    `annotation_exprs` (when given) have to originate from one and the
    same MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Entry-indexed expression giving the genotype matrix values
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Window radius around each row, measured in units of
        `coord_expr` when that is set and in basepairs otherwise.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression used as the row coordinate for
        windowing. When omitted, the locus position is used.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) by which LD scores are partitioned.
        The univariate annotation is always added and need not be
        passed.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` holding one LD score per variant for
        each `annotation_expr`, plus the univariate (all SNPs) score,
        which is always present.

    Raises
    ------
    ValueError
        If the expressions do not all come from the same MatrixTable."""

    source_mt = entry_expr._indices.source
    locus_source = locus_expr._indices.source
    coord_source = (locus_source if coord_expr is None
                    else coord_expr._indices.source)

    # Collect one comparison per expression; all of them must hold.
    source_checks = [source_mt == locus_source,
                     source_mt == coord_source]
    if annotation_exprs:
        source_checks += [source_mt == x._indices.source
                          for x in wrap_to_list(annotation_exprs)]

    if not all(source_checks):
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n_samples = source_mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2

    # Adjusted r^2: ((n - 1) / (n - 2)) * r^2 - 1 / (n - 2).
    adj_scale = (n_samples - 1.0) / (n_samples - 2.0)
    adj_shift = 1.0 / (n_samples - 2.0)
    r2_adj = adj_scale * r2 - adj_shift

    # Keep only the entries that fall inside each row's window.
    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    windowed = r2_adj.sparsify_row_intervals(starts, stops)

    # Persist the sparse matrix and continue from the stored copy.
    windowed_path = new_temp_file()
    windowed.write(windowed_path)
    windowed = BlockMatrix.read(windowed_path)

    if annotation_exprs:
        annot_ht = source_mt.select_rows(
            *wrap_to_list(annotation_exprs)).rows()
        annot_ht = annot_ht.annotate(univariate=hl.literal(1.0))
        names = [field for field in annot_ht.row
                 if field not in annot_ht.key]

        # One (name, value) table per annotation, unioned and pivoted
        # into a variants-by-annotations MatrixTable.
        long_tables = []
        for field in names:
            long_ht = annot_ht.annotate(name=hl.str(field),
                                        value=hl.float(annot_ht[field]))
            long_tables.append(long_ht.select('name', 'value'))
        ht_union = hl.Table.union(*long_tables)
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()

        annot_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, annot_path)

        l2 = windowed @ BlockMatrix.read(annot_path)
    else:
        cols = ['univariate']
        l2 = windowed.sum(axis=1)

    # Round trip through a TSV export to get the scores into a Table.
    l2_bm_path = new_temp_file()
    l2_tsv_path = new_temp_file()
    l2.write(l2_bm_path, force_row_major=True)
    BlockMatrix.export(l2_bm_path, l2_tsv_path)

    ht_scores = hl.import_table(l2_tsv_path, no_header=True, impute=True)
    ht_scores = ht_scores.add_index().key_by('idx')
    # Imported columns are named f0, f1, ...; give them annotation names.
    ht_scores = ht_scores.rename({'f{:}'.format(i): name
                                  for i, name in enumerate(cols)})

    # Attach the scores to the loci by row index.
    ht = source_mt.select_rows(__locus=locus_expr).rows().add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    score_fields = [field for field in ht_scores.row
                    if field not in ht_scores.key]
    ht = ht.select(*score_fields)

    return ht.rename({'__locus': 'locus'})
예제 #13
0
def main(args):
    """Run the requested stages of the per-population LD pipeline.

    Each stage is switched on by a flag on ``args``:

    * ``write_mt`` - build and checkpoint the dosage MatrixTable
      (``chrom='all'`` rows unioned with ``chrom='X'`` rows) and write
      the variant index table; otherwise the checkpoint is read back.
    * ``write_bm`` - write the mean-imputed dosage BlockMatrix.
    * ``compute_ld_matrix`` - compute the covariate-adjusted,
      window-sparsified, triangular LD matrix; otherwise it is read
      from ``get_ld_matrix_path(pop)``.
    * ``write_ldsc_hm3_snplist`` - write the snplists.
    * ``compute_ldscore`` - compute and export LD scores.

    Other attributes read from ``args``: ``pop``, ``overwrite``,
    ``radius`` and ``ld_score_radius``.
    """
    pop = args.pop
    num_pcs = 10
    basic_covars = ['sex', 'age', 'age2', 'age_sex', 'age2_sex']
    pc_covars = [f'PC{x}' for x in range(1, num_pcs + 1)]
    covariates = basic_covars + pc_covars

    tmp_mt_path = f'{temp_bucket_7day}/{pop}.mt'
    tmp_bm_path = f'{temp_bucket_7day}/{pop}.bm'

    if args.write_mt:
        # Same filters for both inputs; only the chrom argument differs.
        mt, mt_x = [get_filtered_mt(chrom=chrom,
                                    pop=pop,
                                    entry_fields=['dosage'],
                                    min_mac=19,
                                    filter_mac_instead_of_ac=True)
                    for chrom in ('all', 'X')]
        mt = mt.union_rows(mt_x)
        mt = mt.annotate_rows(AF=hl.agg.mean(mt.dosage) / 2)
        mt = mt.checkpoint(tmp_mt_path, overwrite=args.overwrite)
        _, n = mt.count()

        # write variant indexes
        index_ht = mt.rows().select().add_index()
        index_ht = index_ht.annotate_globals(n_samples=n, pop=pop)
        index_ht.write(get_ld_variant_index_path(pop),
                       overwrite=args.overwrite)
    else:
        mt = hl.read_matrix_table(tmp_mt_path)
        _, n = mt.count()

    if args.write_bm:
        # convert mt to bm
        BlockMatrix.write_from_entry_expr(mt.dosage,
                                          tmp_bm_path,
                                          mean_impute=True,
                                          center=False,
                                          normalize=False,
                                          overwrite=args.overwrite)
    bm = BlockMatrix.read(tmp_bm_path)

    if args.compute_ld_matrix:
        print(f'BlockMatrix shape: {bm.shape}')

        # mean-center and normalize bm
        bm_norm = checkpoint_tmp(normalize_bm(bm))

        # Covariates plus an intercept column, with the sample id dropped.
        cov = mt.cols().select(*covariates).to_pandas()
        cov = cov.drop(['s'], axis=1)
        cov['Intercept'] = 1.0

        # Two factors of the projection onto the covariates:
        # hat1 = C and hat2 = (C^T C)^-1 C^T.
        cov_t = cov.transpose()
        hat1 = cov.values
        hat2 = np.dot(np.linalg.inv(np.dot(cov_t, cov)), cov_t)
        bm_hat1 = checkpoint_tmp(BlockMatrix.from_numpy(hat1))
        bm_hat2 = checkpoint_tmp(BlockMatrix.from_numpy(hat2))

        # Covariate adjustment, checkpointed after each of three steps
        # because the matrices involved are huge.
        projected = checkpoint_tmp(bm_norm @ bm_hat1)
        projected = checkpoint_tmp(projected @ bm_hat2)
        bm_Z = checkpoint_tmp(bm_norm - projected)

        # compute ld matrix with a specified radius
        bm_ldadj = (bm_Z @ bm_Z.T) / n
        starts_and_stops = hl.linalg.utils.locus_windows(mt.locus,
                                                         radius=args.radius,
                                                         _localize=False)
        bm_ldadj = bm_ldadj._sparsify_row_intervals_expr(starts_and_stops,
                                                         blocks_only=False)

        # sparsify to a triangle matrix
        bm_ldadj = bm_ldadj.sparsify_triangle()
        bm_ldadj = bm_ldadj.checkpoint(get_ld_matrix_path(pop),
                                       overwrite=args.overwrite,
                                       force_row_major=True)
    else:
        bm_ldadj = BlockMatrix.read(get_ld_matrix_path(pop))

    if args.write_ldsc_hm3_snplist:
        # Note: currently, this writes snplists for all the populations at once
        write_ldsc_hm3_snplist(overwrite=args.overwrite)

    if args.compute_ldscore:
        ht_ldscore = copmute_ldscore(mt.rows(),
                                     bm_ldadj,
                                     n,
                                     radius=args.ld_score_radius,
                                     out_name=get_ld_score_ht_path(pop),
                                     overwrite=args.overwrite)
        export_ldscore(ht_ldscore, pop)
예제 #14
0
def _prepare_sumstats_matrix(args):
    """Build and write the clumped, p-value-thresholded effect-size matrix.

    Combines the meta-analysis results and the per-population summary
    statistics into one matrix table, joins it with the clumping results,
    passes it through ``explode_by_p_threshold`` and writes two outputs:

    * the column table, indexed by ``idx``, to
      ``get_clump_sumstats_col_ht_path(...)``;
    * a BlockMatrix to ``get_clump_sumstats_bm_path(...)`` whose entries are
      ``BETA`` where the clump ``TOTAL`` is defined and
      ``Pvalue < p_threshold``, and ``0.0`` otherwise (including when the
      product is missing).

    Reads ``args.high_quality``, ``args.max_pops`` and ``args.overwrite``.
    """
    # get meta mt and separate by pop combo
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    meta_mt = separate_results_mt_by_pop(meta_mt, 'meta_analysis_data', 'meta_analysis')
    meta_mt = meta_mt.annotate_cols(clump_pops=meta_mt.meta_analysis_data.pop)
    meta_mt = meta_mt.key_cols_by('clump_pops', *meta_mt.col_key)

    # get sumstats mt and separate by pop combo; the single pop is wrapped in
    # an array so `clump_pops` has the same type as on the meta-analysis side
    ss_mt = get_final_sumstats_mt_for_export()
    ss_mt = separate_results_mt_by_pop(ss_mt, 'pheno_data', 'summary_stats')
    ss_mt = ss_mt.annotate_cols(clump_pops=hl.array([ss_mt.pheno_data.pop]))
    ss_mt = ss_mt.key_cols_by(*meta_mt.col_key)

    # join meta results and sumstats mt
    # NOTE: union_cols() requires the same entry fields schema, so both sides
    # are reduced to (BETA, Pvalue) entries with key-only rows and columns
    meta_mt = meta_mt.select_entries(
        BETA=meta_mt.meta_analysis.BETA,
        Pvalue=meta_mt.meta_analysis.Pvalue,
    ).select_cols().select_rows()
    ss_mt = ss_mt.select_entries(
        BETA=ss_mt.summary_stats.BETA,
        Pvalue=ss_mt.summary_stats.Pvalue,
    ).select_cols().select_rows()
    mt = meta_mt.union_cols(ss_mt)

    # filter to distinct cols
    # NOTE: distinct_by_col() does not allow a col key of type `list`, so the
    # population array is replaced in the key by its delimited string form
    non_pop_keys = [k for k in mt.col_key if k != 'clump_pops']
    mt = mt.annotate_cols(clump_pops_str=hl.delimit(mt.clump_pops))
    mt = mt.key_cols_by('clump_pops_str', *non_pop_keys).distinct_by_col()

    # ensure that betas are not missing: fall back to the per-population
    # summary statistics wherever the combined entry is missing
    ss_non_pop_keys = [k for k in ss_mt.col_key if k != 'clump_pops']
    ss_mt = ss_mt.annotate_cols(clump_pops_str=hl.delimit(ss_mt.clump_pops))
    ss_mt = ss_mt.key_cols_by('clump_pops_str', *ss_non_pop_keys)
    mt = mt.annotate_entries(
        BETA=hl.or_else(mt.BETA, ss_mt[mt.row_key, mt.col_key].BETA),
        Pvalue=hl.or_else(mt.Pvalue, ss_mt[mt.row_key, mt.col_key].Pvalue),
    )

    # read clump mt and separate by pop combo
    clump_mt = hl.read_matrix_table(
        get_clumping_results_path(high_quality=args.high_quality,
                                  max_pops=args.max_pops))
    if args.max_pops:
        # if max_pops=True, the clump_mt is already separated by pop
        # these steps are necessary to make downstream code usable for both max_pops=True/False
        clump_mt = clump_mt.annotate_entries(plink_clump=hl.struct(TOTAL=clump_mt.TOTAL))
        clump_mt = clump_mt.annotate_cols(pop_index=0)
    else:
        clump_mt = separate_results_mt_by_pop(clump_mt, 'clump_pops', 'plink_clump', skip_drop=True)

    clump_mt = clump_mt.annotate_cols(clump_pops_str=hl.delimit(clump_mt.clump_pops))
    clump_mt = clump_mt.drop('clump_pops').key_cols_by(*mt.col_key)

    # join sumstats/meta-analysis with clump mt
    mt = all_axis_join(mt, clump_mt)

    # keep only columns that have `pop_index` defined after the join
    mt = mt.filter_cols(hl.is_defined(mt.pop_index))

    print(f'\n\nMatrix dimensions (before explode by p-threshold): {mt.count()}\n')
    mt = explode_by_p_threshold(mt).unfilter_entries()

    # Write pheno data for later use
    mt.add_col_index('idx').key_cols_by('idx').cols().write(
        get_clump_sumstats_col_ht_path(high_quality=args.high_quality,
                                       max_pops=args.max_pops),
        overwrite=args.overwrite)

    # Effect size masked by two indicators; or_else maps a missing product to 0.0
    in_clump = hl.is_defined(mt.plink_clump.TOTAL)
    below_threshold = hl.int(mt.Pvalue < mt.p_threshold)
    BlockMatrix.write_from_entry_expr(
        hl.or_else(mt.BETA * in_clump * below_threshold, 0.0),
        get_clump_sumstats_bm_path(high_quality=args.high_quality,
                                   max_pops=args.max_pops),
        overwrite=args.overwrite)
    # Log line from an earlier run:
    # 2020-06-25 01:49:32 Hail: INFO: Wrote all 7078 blocks of 28987534 x 3530 matrix with block size 4096.
    # If clump_mt is significantly smaller than meta_mt, consider putting that on the left of the join,
    # then filter the genotype matrix to only those SNPs (pilot would go from 28.9M -> 21.2M)


def _prepare_genotype_matrix(args):
    """Write the genotype dosage BlockMatrix and its indexed sample table.

    Rows of ``get_filtered_mt_with_x()`` are restricted to those present in
    the meta-analysis results. Columns are re-keyed by ``userId`` (``s`` cast
    to int32); the indexed column table is written to
    ``genotype_samples_ht_path`` and the ``dosage`` entries to
    ``genotype_bm_path``.

    Reads ``args.overwrite``.
    """
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    mt = get_filtered_mt_with_x()
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))

    # Write sample data for later use
    mt = mt.key_cols_by(userId=hl.int32(mt.s))
    mt.cols().add_index().write(genotype_samples_ht_path, overwrite=args.overwrite)
    BlockMatrix.write_from_entry_expr(mt.dosage, genotype_bm_path, overwrite=args.overwrite)
    # Log line from an earlier run:
    # 2020-06-25 19:18:14 Hail: INFO: Wrote all 764424 blocks of 28987534 x 441345 matrix with block size 4096.


def _compute_prs(args):
    """Multiply transposed genotypes by the sumstats matrix and write the result.

    The product is delegated to ``tree_matmul_tree_matsum`` with hard-coded
    split counts and written to ``get_prs_bm_path(...)``.

    Reads ``args.high_quality``, ``args.max_pops`` and ``args.overwrite``.
    """
    sumstats_bm = BlockMatrix.read(
        get_clump_sumstats_bm_path(high_quality=args.high_quality,
                                   max_pops=args.max_pops))
    genotype_bm = BlockMatrix.read(genotype_bm_path)

    mul_splits = 197  # sumstats_bm.shape[1]//10000*10
    sum_splits = 20  # int(mul_splits/10)
    assert mul_splits > 10  # if not more than 10, sum_splits is not necessary

    # presumably intermediates are stored under path_prefix and reused when
    # read_if_exists=True -- confirm against tree_matmul_tree_matsum
    prs_bm = tree_matmul_tree_matsum(
        genotype_bm.T,
        sumstats_bm,
        mul_splits=mul_splits,
        sum_splits=sum_splits,
        path_prefix=f'{temp_bucket}/prs/tree_matmul{"_max_pops" if args.max_pops else ""}',
        read_if_exists=True)
    prs_bm.write(get_prs_bm_path(high_quality=args.high_quality,
                                 max_pops=args.max_pops),
                 overwrite=args.overwrite)


def _create_prs_mt(args):
    """Convert the PRS BlockMatrix into an annotated matrix table and write it.

    Matrix columns are annotated from the sumstats column table and re-keyed
    by ``PHENO_KEY_FIELDS``; matrix rows are annotated from the genotype
    sample table and re-keyed by ``userId``. Both lookups go through the
    ``idx`` key of the respective table. The entry field is renamed from
    ``element`` to ``score``.

    Reads ``args.high_quality``, ``args.max_pops`` and ``args.overwrite``.
    """
    prs_bm = BlockMatrix.read(get_prs_bm_path(high_quality=args.high_quality,
                                              max_pops=args.max_pops))
    pheno_ht = hl.read_table(
        get_clump_sumstats_col_ht_path(high_quality=args.high_quality,
                                       max_pops=args.max_pops)).key_by('idx')
    samples_ht = hl.read_table(genotype_samples_ht_path).key_by('idx')

    # 10k partitions for 370 GB table (441k x 108k) = 37 MB/partition
    # 5014 partitions for 240 GB table (441k x 72k) = 48 MB/partition (max_pops)
    n_partitions = 15000  # int(1000*(pheno_ht.count()/72*5)//1000) # or hard code

    mt = BlockMatrix.to_matrix_table_row_major(prs_bm, n_partitions=n_partitions)
    mt = mt.rename({'element': 'score'})
    mt = mt.annotate_cols(**pheno_ht[mt.col_key]).key_cols_by(*PHENO_KEY_FIELDS)
    mt = mt.annotate_rows(**samples_ht[mt.row_key]).key_rows_by('userId')
    mt.write(get_prs_mt_path(high_quality=args.high_quality,
                             max_pops=args.max_pops),
             overwrite=args.overwrite)


def _assess_prs(args):
    """Regress phenotype on PRS per population and export one TSV per population.

    For each entry of ``POPS``, rows are filtered to that population and a
    per-column linear regression of ``both_sexes`` on an intercept plus
    ``score`` is aggregated. The exported columns are the description,
    p-threshold, '-'-joined clump populations, and the regression's R^2,
    score p-value and n.

    Reads ``args.high_quality`` and ``args.max_pops``.
    """
    prs_mt = hl.read_matrix_table(get_prs_mt_path(high_quality=args.high_quality,
                                                  max_pops=args.max_pops))
    pheno_mt = get_ukb_pheno_mt()  # TODO: fix all phenos to new keying scheme
    # Re-key phenotypes so that biomarker traits use the "irnt" modifier
    pheno_mt = pheno_mt.key_cols_by(
        **pheno_mt.col_key.annotate(
            modifier=hl.if_else(pheno_mt.trait_type == "biomarkers", "irnt", pheno_mt.modifier)))
    mt = prs_mt.annotate_entries(**pheno_mt[prs_mt.row_key, prs_mt.col_key])
    mt = mt.annotate_cols(description=pheno_mt.cols()[mt.col_key].description)

    # NOTE(review): the export bucket is hard-coded here, whereas the compute
    # stage builds its prefix from `temp_bucket` -- confirm whether they should match.
    for pop in POPS:
        mt_pop = mt.filter_rows(mt.pop == pop)
        mt_pop = mt_pop.annotate_cols(
            prs_corr=hl.agg.linreg(mt_pop.both_sexes, [1.0, mt_pop.score]))
        cols = mt_pop.cols()
        cols.select(
            'description',
            'p_threshold',
            clump_pops_str=hl.delimit(cols.clump_pops, '-'),
            prs_corr_r2=cols.prs_corr.multiple_r_squared,
            prs_corr_pval=cols.prs_corr.p_value[1],  # index 1 = the score term
            prs_corr_n=cols.prs_corr.n,
        ).export(f'gs://ukbb-diverse-temp-30day/prs/assess_prs{"_max_pops" if args.max_pops else ""}.{pop}.tsv.gz')


def main(args):
    """Run the PRS pipeline stages selected by flags on ``args``.

    Hail is initialised once, then each enabled stage runs in this order:
    ``prepare_sumstats_matrix``, ``prepare_genotype_matrix``, ``compute_prs``,
    ``create_prs_mt``, ``assess_prs``. The stages share no in-memory state;
    each one reads its inputs from the paths a previous stage wrote.

    Args:
        args: Namespace providing the boolean stage flags listed above, plus
            ``high_quality``, ``max_pops`` and ``overwrite``.
    """
    hl.init(default_reference='GRCh37', log='/prs.log',
            spark_conf={'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO',
                        'spark.hadoop.fs.gs.requester.pays.project.id': 'ukbb-diversepops-neale'})

    if args.prepare_sumstats_matrix:
        _prepare_sumstats_matrix(args)

    if args.prepare_genotype_matrix:
        _prepare_genotype_matrix(args)

    if args.compute_prs:
        _compute_prs(args)

    if args.create_prs_mt:
        _create_prs_mt(args)

    if args.assess_prs:
        _assess_prs(args)