Example #1
 def test_read_stored_globals(self):
     ds = self.get_vds()
     ds = ds.annotate_globals(x=5, baz='foo')
     f = new_temp_file(suffix='vds')
     ds.write(f)
     t = hl.read_table(f + '/globals')
     self.assertTrue(ds.globals_table()._same(t))
Example #2
    def test_backward_compatability(self):
        import os

        all_values_table, all_values_matrix_table = create_all_values_datasets()

        table_dir = resource('backward_compatability/1.0.0/table')
        matrix_table_dir = resource('backward_compatability/1.0.0/matrix_table')

        n = 0
        i = 0
        f = os.path.join(table_dir, '{}.ht'.format(i))
        while os.path.exists(f):
            ds = hl.read_table(f)
            self.assertTrue(ds._same(all_values_table))
            i += 1
            f = os.path.join(table_dir, '{}.ht'.format(i))
            n += 1

        i = 0
        f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
        while os.path.exists(f):
            ds = hl.read_matrix_table(f)
            self.assertTrue(ds._same(all_values_matrix_table))
            i += 1
            f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
            n += 1

        self.assertEqual(n, 8)
Example #3
 def test_codecs_table(self):
     from hail.utils.java import scala_object
     codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
     rt = self.get_vds().rows()
     temp = new_temp_file(suffix='ht')
     for codec in codecs:
         rt.write(temp, overwrite=True, _codec_spec=codec.toString())
         rt2 = hl.read_table(temp)
         self.assertTrue(rt._same(rt2))
Example #4
 def test_fix3307_read_mt_wrong(self):
     mt = hl.import_vcf(resource('sample2.vcf'))
     mt = hl.split_multi_hts(mt)
     mt.write('/tmp/foo.mt', overwrite=True)
     mt2 = hl.read_matrix_table('/tmp/foo.mt')
     t = hl.read_table('/tmp/foo.mt/rows')
     self.assertTrue(mt.rows()._same(t))
     self.assertTrue(mt2.rows()._same(t))
     self.assertTrue(mt._same(mt2))
Example #5
def test_large_number_of_fields(tmpdir):
    ht = hl.utils.range_table(100)
    ht = ht.annotate(**{
        str(k): k for k in range(1000)
    })
    f = tmpdir.join("foo.ht")
    assert_time(lambda: ht.count(), 5)
    assert_time(lambda: ht.write(str(f)), 5)
    ht = assert_time(lambda: hl.read_table(str(f)), 5)
    assert_time(lambda: ht.count(), 5)
Example #6
def read_expression(path):
    """Read an :class:`Expression` written with :meth:`.experimental.write_expression`.

    Example
    -------
    >>> hl.experimental.write_expression(hl.array([1, 2]), 'output/test_expression.he')
    >>> expression = hl.experimental.read_expression('output/test_expression.he')
    >>> hl.eval(expression)
    [1, 2]

    Parameters
    ----------
    path : :obj:`str`
        File to read.

    Returns
    -------
    :class:`Expression`
    """
    return hl.read_table(path).index_globals().expr
Example #7
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

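    # assign each entry to one of `n_blocks` contiguous jackknife blocks based on the
    # running count of step-1 entries in its column; an entry that is not in step 1 but
    # falls exactly on a block boundary is pushed into the preceding block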
    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)
    
    # initial coefficient estimates
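    # method-of-moments start: intercept 1.0 (no confounding) and slope
    # (mean(chi^2) - 1) / mean(LD score), from E[chi^2] = 1 + (N h_g^2 / M) * l_j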
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
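    # regression weights: 1 / (2 * mu^2) is the inverse variance of a 1-df chi^2 with
    # mean mu (the current fitted value), and dividing by the LD-score floor downweights
    # variants whose statistics are correlated with many neighbors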
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                               mt.__step1_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

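    # delete-one-block jackknife pseudovalues: n * theta_hat - (n - 1) * theta_hat_(-i)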
    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor *
                 2.0 * (mt.__step2_betas[0] +
                        mt.__step2_betas[1] *
                        mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

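    # jackknife mean and variance of the pseudovalues:
    # var = (sum(p^2) - sum(p)^2 / n) / ((n - 1) * n)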
    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)
    
    return ht
Example #8
def maximal_independent_set(i,
                            j,
                            keep=True,
                            tie_breaker=None,
                            keyed=True) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...        s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field, this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """

    if i.dtype != j.dtype:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to have same type. "
            "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError(
            "'maximal_independent_set' expects an expression of 'Table'. Found {}"
            .format("expression of '{}'".format(source.__class__)
                    if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

    mis_nodes = Env.hail().utils.Graph.maximalIndependentSet(
        edges._jt.collect(), node_t._parsable_string(),
        joption(tie_breaker_str))

    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    # avoid serializing `mis_nodes` from java to python and back to java
    nodes = Table._from_java(
        nodes._jt.annotateGlobal(mis_nodes,
                                 hl.tset(node_t)._parsable_string(),
                                 'mis_nodes'))
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        return nodes.key_by('node')
    return nodes
Example #9
def prepare_variant_results(table_urls):
    annotations = None
    analysis_groups = []

    for annotations_table_url, results_table_url in table_urls:
        group_annotations = hl.import_table(
            annotations_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "in_analysis": hl.tbool,
                "gene_id": hl.tstr,
                "gene_name": hl.tstr,
                "transcript_id": hl.tstr,
                "hgvsc": hl.tstr,
                "hgvsp": hl.tstr,
                "csq_analysis": hl.tstr,
                "csq_worst": hl.tstr,
                "mpc": hl.tfloat,
                "polyphen": hl.tstr,
            },
        )

        group_results = hl.import_table(
            results_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "analysis_group": hl.tstr,
                "ac_case": hl.tint,
                "an_case": hl.tstr,
                "af_case": hl.tstr,
                "ac_ctrl": hl.tint,
                "an_ctrl": hl.tstr,
                "af_ctrl": hl.tstr,
            },
        )

        groups_in_table = group_results.aggregate(
            hl.agg.collect_as_set(group_results.analysis_group))
        assert len(groups_in_table) == 1, groups_in_table
        group_name = groups_in_table.pop()
        analysis_groups.append(group_name)

        group_results = group_results.annotate(
            an_case=hl.int(group_results.an_case),
            af_case=hl.float(group_results.af_case),
            an_ctrl=hl.int(group_results.an_ctrl),
            af_ctrl=hl.float(group_results.af_ctrl),
            in_analysis=group_annotations[group_results.v].in_analysis,
        )

        group_results.drop("analysis_group").write(f"temp_{group_name}.ht")

        group_annotations = group_annotations.drop("in_analysis")

        if annotations is None:
            annotations = group_annotations
        else:
            annotations = annotations.union(group_annotations)

    annotations = annotations.distinct()

    annotations = annotations.annotate(
        filters="PASS",
        csq_analysis=hl.sorted(annotations.csq_analysis.split(","),
                               lambda c: consequence_term_rank(c))[0],
        csq_worst=hl.sorted(annotations.csq_worst.split(","),
                            lambda c: consequence_term_rank(c))[0],
        canonical_transcript_id=annotations.transcript_id,
        hgvsc_canonical=annotations.hgvsc,
        hgvsp_canonical=annotations.hgvsp,
    )

    annotations = annotations.annotate(
        locus=hl.locus(
            annotations.v.split(":")[0], hl.int(annotations.v.split(":")[1])),
        alleles=annotations.v.split(":")[2:4],
    )

    annotations = annotations.annotate(
        variant_id=variant_id(annotations.locus, annotations.alleles),
        chrom=annotations.locus.contig,
        pos=annotations.locus.position,
        xpos=x_position(annotations.locus),
        alt=annotations.alleles[1],
        ref=annotations.alleles[0],
    )

    annotations = annotations.drop("locus", "alleles")

    annotations = annotations.annotate(groups=hl.struct())
    for group_name in analysis_groups:
        results = hl.read_table(f"temp_{group_name}.ht")
        annotations = annotations.annotate(groups=annotations.groups.annotate(
            **{group_name: results[annotations.key]}))

    annotations = annotations.key_by().drop("v")

    return annotations
Example #10
def tx_annotate_mt(mt,
                   gtex,
                   tx_annotation_type,
                   tissues_to_filter=v7_tissues_to_drop,
                   gene_maximums_ht_path=gtex_v7_gene_maximums_ht_path,
                   filter_to_csqs=all_coding_csqs,
                   filter_to_genes=None,
                   gene_column_in_mt=None,
                   filter_to_homs=False,
                   out_tx_annotation_tsv=None,
                   out_tx_annotation_ht=None):
    """
    Annotate variants in the input MatrixTable with transcript-based expression values across GTEx tissues. Returns the annotated MatrixTable.

    :param MatrixTable mt: Input variant file
    :param Table gtex: Input GTEx summary Table, must have a transcript_id field to key by
    :param str tx_annotation_type: One of ["expression", "proportion"]. Select proportion if you'd like the
    tx_annotation values to be normalized by max expression of the gene
    :param None or list filter_to_genes: Default None. If you'd like to filter the mt to specific
    genes before annotating (decreases time), feed in a list or set of gene IDs or symbols
    (see gene_column_in_mt).
    :param str gene_column_in_mt: Must be set if filter_to_genes != None.
    Column in matrix table that contains gene information within vep.transcript_consequences.
    often ["gene_id", "gene_symbol"]
    :param None or list filter_to_csqs: Default None. If you'd like to filter the mt before annotating
    (decreases time) feed in a list or set of consequence terms.
    Example = ["stop_gained","splice_donor_variant", "splice_acceptor_variant","frameshift_variant"]
    :param None or str out_tx_annotation_tsv: Default None.
    If you'd like to write out the results table as a tsv, provide a tsv path
    :param None or str out_tx_annotation_ht: Default None.
    If you'd like to write out the results table as a Hail 0.2 table, provide a .ht path
    :param bool filter_to_homs: Default False
    If True, filter to variants with at least one homozygote in dataset
    :return: Input MatrixTable with rows annotated with worst_csq, ensg, LOFTEE LOF, LOFTEE LOF flag,
    and transcript-aware expression by GTEx tissue
    :rtype: MatrixTable
    """

    #check_inputs(**locals())

    gtex_table = gtex.key_by("transcript_id")

    #mt = process_consequences(mt, penalize_flags=False)
    mt_exploded = mt.distinct_by_row()
    mt_exploded = mt_exploded.annotate_rows(vep=mt_exploded.vep.annotate(
        transcript_consequences=mt_exploded.vep.transcript_consequences.map(
            add_most_severe_consequence_to_consequence)))

    # Explode the mt for the transcript consequences to be able to key by transcript ID
    mt_exploded = mt_exploded.explode_rows(
        mt_exploded.vep.transcript_consequences)

    mt_kt = mt_exploded.rows()
    # Currently testing: restrict to protein-coding transcript consequences
    mt_kt = mt_kt.filter(
        mt_kt.vep.transcript_consequences.biotype == "protein_coding")

    if filter_to_genes:
        print("Filtering to genes of interest")
        mt_kt = filter_table_to_gene_list(mt_kt, filter_to_genes,
                                          gene_column_in_mt)

    if filter_to_csqs:
        print("Filtering to csqs in %s" % (",".join(filter_to_csqs)))
        mt_kt = filter_table_to_csqs(mt_kt, filter_to_csqs)

    if filter_to_homs:
        print(
            "Filtering to variants with at least 1 homozygote sample in dataset"
        )
        #mt_kt = mt_kt.filter(mt_kt.info.Hom[mt_kt.a_index - 1] > 0)
        idx = mt_kt.globals.freq_index_dict['gnomad']
        mt_kt = mt_kt.filter(mt_kt.freq[idx].homozygote_count >= 1)

    # Annotate mt with the gtex values (ie. join them)
    mt_kt = mt_kt.annotate(
        tx_data=gtex_table[mt_kt.vep.transcript_consequences.transcript_id])

    # Group by gene, worst_csq and variant, and do a pairwise-sum
    grouped_table = (mt_kt.group_by(
        csq=mt_kt.vep.transcript_consequences.most_severe_consequence,
        ensg=mt_kt.vep.transcript_consequences.gene_id,
        symbol=mt_kt.vep.transcript_consequences.gene_symbol,
        locus=mt_kt.locus,
        alleles=mt_kt.alleles,
        lof=mt_kt.vep.transcript_consequences.lof,
        lof_flag=mt_kt.vep.transcript_consequences.lof_flags).aggregate(
            tx_annotation=hl.agg.array_sum(mt_kt.tx_data.agg_expression)))

    # Expand the columns from the arrays and add tissues as headers
    #tissue_ids = gtex.tissue.collect()
    # gtex no longer has a .tissue field, so derive the tissue IDs from the values array
    # (ideally these would be saved as a global on the table)
    tissue_ids = sorted([y.tissue for y in gtex.values.take(1)[0]])
    d = {tiss: i for i, tiss in enumerate(tissue_ids)}

    tx_annotation_table = grouped_table.annotate(
        **{
            tissue_id.replace("-", "_").replace(" ", "_").replace("(", "_").
            replace(")", "_"): grouped_table.tx_annotation[d[tissue_id]]
            for tissue_id in tissue_ids
        })

    tx_annotation_table = tx_annotation_table.drop(
        tx_annotation_table.tx_annotation)

    # First of all do you want proportions or expression?
    if tx_annotation_type == "proportion":
        print("Returning expression proportion")
        gene_maximums_ht = hl.read_table(gene_maximums_ht_path)
        tx_annotation_table = get_expression_proportion(
            tx_annotation_table, tissues_to_filter, gene_maximums_ht)

    #You can write the output that is exploded by variants-ensg-csq-symbol-LOFTEE-LOFTEEflag
    # and has a value for each tissue as column, either as a TSV or a KT

    if out_tx_annotation_tsv:
        print("Writing tsv file to %s" % out_tx_annotation_tsv)
        tx_annotation_table.export(out_tx_annotation_tsv)

    if out_tx_annotation_ht:
        print("Writing Table to %s" % out_tx_annotation_ht)
        tx_annotation_table.write(out_tx_annotation_ht)

    tx_annotation_table = tx_annotation_table.key_by(
        tx_annotation_table.locus, tx_annotation_table.alleles)
    tx_annotation_table = tx_annotation_table.collect_by_key('tx_annotation')
    mt = mt.annotate_rows(**tx_annotation_table[mt.locus, mt.alleles])

    return mt
Example #11
def prepare_mitochondrial_variants(path, mnvs_path=None):
    ds = hl.read_table(path)

    haplogroups = hl.eval(ds.globals.hap_order)

    ds = ds.annotate(hl_hist=ds.hl_hist.annotate(
        bin_edges=ds.hl_hist.bin_edges.map(
            lambda n: hl.float(hl.format("%.2f", n)))))

    filter_names = hl.dict({
        "artifact_prone_site": "Artifact-prone site",
        "indel_stack": "Indel stack",
        "npg": "No passing genotype"
    })

    ds = ds.select(
        # ID
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome=ds.locus.dtype.reference_genome.name,
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        # Quality
        filters=ds.filters.map(lambda f: filter_names.get(f, f)),
        qual=ds.qual,
        genotype_quality_metrics=[
            hl.struct(name="Depth", alt=ds.dp_hist_alt, all=ds.dp_hist_all)
        ],
        genotype_quality_filters=[
            hl.struct(
                name="Base Quality",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.base_qual_hist),
            ),
            hl.struct(
                name="Contamination",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.contamination_hist),
            ),
            hl.struct(
                name="Heteroplasmy below 10%",
                filtered=hl.struct(
                    bin_edges=ds.hl_hist.bin_edges,
                    bin_freq=ds.heteroplasmy_below_10_percent_hist),
            ),
            hl.struct(name="Position",
                      filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                         bin_freq=ds.position_hist)),
            hl.struct(
                name="Strand Bias",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.strand_bias_hist),
            ),
            hl.struct(
                name="Weak Evidence",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.weak_evidence_hist),
            ),
        ],
        site_quality_metrics=[
            hl.struct(name="Mean Depth", value=nullify_nan(ds.dp_mean)),
            hl.struct(name="Mean MQ", value=nullify_nan(ds.mq_mean)),
            hl.struct(name="Mean TLOD", value=nullify_nan(ds.tlod_mean)),
        ],
        # Frequency
        an=ds.AN,
        ac_hom=ds.AC_hom,
        ac_het=ds.AC_het,
        excluded_ac=ds.excluded_AC,
        # Heteroplasmy
        common_low_heteroplasmy=ds.common_low_heteroplasmy,
        heteroplasmy_distribution=ds.hl_hist,
        max_heteroplasmy=ds.max_hl,
        # Haplogroups
        hapmax_af_hom=ds.hapmax_AF_hom,
        hapmax_af_het=ds.hapmax_AF_het,
        faf_hapmax_hom=ds.faf_hapmax_hom,
        haplogroup_defining=ds.hap_defining_variant,
        haplogroups=[
            hl.struct(
                id=haplogroup,
                an=ds.hap_AN[i],
                ac_het=ds.hap_AC_het[i],
                ac_hom=ds.hap_AC_hom[i],
                faf_hom=ds.hap_faf_hom[i],
                heteroplasmy_distribution=ds.hap_hl_hist[i],
            ) for i, haplogroup in enumerate(haplogroups)
        ],
        # Other
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom),
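        # flags: or_missing yields a missing value when the condition is false, and the
        # filter keeps only the defined flag strings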
        flags=hl.set([
            hl.or_missing(ds.common_low_heteroplasmy,
                          "common_low_heteroplasmy")
        ]).filter(hl.is_defined),
        mitotip_score=ds.mitotip_score,
        mitotip_trna_prediction=ds.mitotip_trna_prediction,
        pon_ml_probability_of_pathogenicity=ds.pon_ml_probability_of_pathogenicity,
        pon_mt_trna_prediction=ds.pon_mt_trna_prediction,
        variant_collapsed=ds.variant_collapsed,
        vep=ds.vep,
    )

    if mnvs_path:
        mnvs = hl.import_table(mnvs_path,
                               types={
                                   "pos": hl.tint,
                                   "ref": hl.tstr,
                                   "alt": hl.tstr,
                                   "AC_hom_MNV": hl.tint
                               })
        mnvs = mnvs.key_by(
            locus=hl.locus("chrM",
                           mnvs.pos,
                           reference_genome=ds.locus.dtype.reference_genome),
            alleles=[mnvs.ref, mnvs.alt],
        )
        ds = ds.annotate(ac_hom_mnv=hl.or_else(mnvs[ds.key].AC_hom_MNV, 0))
        ds = ds.annotate(
            flags=hl.if_else(ds.ac_hom_mnv > 0, ds.flags.add("mnv"), ds.flags))

    return ds
Example #12
# path for Julia's sample metadata file
jul_metadata_path = (
    'gs://hgdp_tgp/output/gnomad_v3.1_sample_qc_metadata_hgdp_tgp_subset.ht')

# path for variant qc info
var_metadata_path = 'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht'

# path for Konrad's densified matrix table
dense_mt_path = 'gs://hgdp_tgp/output/tgp_hgdp.mt'

# reading in Alicia's sample metadata file (Note: this file uses the 'v3.1::' prefix as done in gnomAD)
sample_meta = hl.import_table(sample_metadata_path, impute=True)

# reading in Julia's sample metadata file
jul_meta = hl.read_table(jul_metadata_path)

# reading in variant qc information
var_meta = hl.read_table(var_metadata_path)

# reading in densified matrix table
dense_mt = hl.read_matrix_table(dense_mt_path)

# These bits below were written by Tim Poterba to help troubleshoot unflattening a ht with nested structure
# dict to hold struct names as well as nested field names
d = {}

# Getting just the row field names
row = sample_meta.row_value

# returns a dict with the struct names as keys and their inner field names as values
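# a hedged sketch of the missing step (assumed, not from the original script):
# map each top-level struct field name to the names of its nested fields
for name in row:
    if isinstance(row[name].dtype, hl.tstruct):
        d[name] = list(row[name])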
Example #13
 def compatible_checkpoint(obj, path):
     obj.write(path, overwrite=True)
     return hl.read_table(path)
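 # usage sketch (hypothetical, not from the original source):
 # ht = compatible_checkpoint(ht, new_temp_file(suffix='ht'))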
Example #14
        f'Invalid sex argument "{sex}" - must be one of {{"both_sexes", "female", "male"}}.'
    )
if contig not in set(['autosomes', 'chrX', 'chrXY']):
    raise ValueError(
        f'Invalid contig argument "{contig}" - must be one of {{"autosomes", "chrX", "chrXY"}}.'
    )

try:
    dilution = sys.argv[4]
except IndexError:
    dilution = False
else:
    dilution = True

ht_phenotypes = hl.read_table(
    f'gs://ukb31063-mega-gwas/biomarkers/pipelines/ukb31063.biomarkers_gwas.{sex}.pipeline_{pipeline}.ht'
)
ht_covariates = hl.read_table(
    f'gs://ukb31063/hail/ukb31063.neale_gwas_covariates.{sex}.ht')
ht_variants = hl.read_table(
    'gs://ukb31063/hail/ukb31063.neale_gwas_variants.ht')

if dilution:
    ht = hl.read_table(f'gs://ukb31063/hail/ukb31063.biomarkers_gwas.{sex}.ht')
    ht = ht.select('estimated_sample_dilution_factor_raw')
    ht_covariates = ht_covariates.annotate(estimated_sample_dilution_factor=ht[
        ht_covariates.s]['estimated_sample_dilution_factor_raw'])

if contig == 'autosomes':
    contig_expr = 'chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}'
else:
Example #15
def load_dataset(name,
                 version,
                 reference_genome,
                 config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------

    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load
        (see available versions in documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`"""

    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = set([dataset['name'] for dataset in datasets])
    if name not in names:
        raise ValueError(
            '{} is not a dataset available in the repository.'.format(
                repr(name)))

    versions = set([
        dataset['version'] for dataset in datasets if dataset['name'] == name
    ])
    if version not in versions:
        raise ValueError("""Version {0} not available for dataset {1}.
                            Available versions: {{{2}}}.""".format(
            repr(version), repr(name), repr('","'.join(versions))))

    reference_genomes = set([
        dataset['reference_genome'] for dataset in datasets
        if dataset['name'] == name
    ])
    if reference_genome not in reference_genomes:
        raise ValueError(
            """Reference genome build {0} not available for dataset {1}.
                            Available reference genome builds: {{'{2}'}}.""".
            format(repr(reference_genome), repr(name), '\',\''.join(
                (reference_genomes))))

    path = [
        dataset['path'] for dataset in datasets if all([
            dataset['name'] == name, dataset['version'] == version,
            dataset['reference_genome'] == reference_genome
        ])
    ][0].strip('/')

    if path.endswith('.ht'):
        dataset = hl.read_table(path)
    else:
        if not path.endswith('.mt'):
            raise ValueError(
                'Invalid path {}: can only load datasets with .ht or .mt extensions.'
                .format(repr(path)))
        dataset = hl.read_matrix_table(path)

    return dataset
Example #16
    # need to create the spark cluster first before initialising hail
    sc = pyspark.SparkContext()
    # Define the hail persistent storage directory
    tmp_dir = "hdfs://spark-master:9820/"
    temp_dir = "file:///home/ubuntu/data/tmp"
    plot_dir = "/home/ubuntu/data/tmp"
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment;
    # you may use your own credentials here from the .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])
    n_partitions = 500
    omni = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_omni2.5.hg38.ht'
    omni_ht = hl.read_table(omni)
    mills = f'{temp_dir}/ddd-elgh-ukbb/training_sets/Mills_and_1000G_gold_standard.indels.hg38.ht'
    mills_ht = hl.read_table(mills)
    thousand_genomes = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_phase1.snps.high_confidence.hg38.ht'
    thousand_genomes_ht = hl.read_table(thousand_genomes)
    hapmap = f'{temp_dir}/ddd-elgh-ukbb/training_sets/hapmap_3.3.hg38.ht'
    hapmap_ht = hl.read_table(hapmap)
    # ANNOTATION TABLES:
    truth_data_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht')
    trio_stats_table = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
    #inbreeding_ht = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
    allele_data_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_allele_data.ht')
    allele_counts_ht = hl.read_table(
Example #17
def get_overlapping_phenos(apcdr_ukb, gwas_phenos, gwas_biomarkers,
                           pheno_table, overwrite):
    # get which phenotypes exist in apcdr data
    pheno_gwas = hl.import_table(apcdr_ukb)
    pheno_gwas = {
        row['pheno_code']: row['ukb_code']
        for row in pheno_gwas.collect()
    }
    incl_whr = False
    if 'WHR' in pheno_gwas:
        del pheno_gwas['WHR']
        incl_whr = True

    if 'EA' in pheno_gwas:
        educ = hl.import_table(
            'gs://ukb-diverse-pops/Phenotypes/Everyone/PHESANT_final_output/January_2020_plus_pharma_and_updated_codings/phesant_output_multi_ancestry_combined_both_sexes_no_sex_specific_educ_att_years.tsv',
            missing='',
            impute=True,
            min_partitions=100,
            key='userId',
            types={
                'userId': hl.tstr,
                'EDUC_ATT_CAT_ORD': hl.tint
            })

    # read ukb data
    ht_phenotypes = hl.import_table(gwas_phenos,
                                    force_bgz=True,
                                    missing='',
                                    impute=True,
                                    min_partitions=100,
                                    types={'s': hl.tstr},
                                    key='s')

    phenotype_cols = set(ht_phenotypes.row)
    irnt = []
    raw_phenos = []
    biomarker = []
    for pheno in pheno_gwas.values():
        if pheno + '_irnt' in phenotype_cols:
            irnt.append(pheno)
        elif pheno in phenotype_cols:
            raw_phenos.append(pheno)
        elif pheno + '_0' in phenotype_cols:
            raw_phenos.append(pheno + '_0')
        else:
            biomarker.append(pheno)

    print(pheno_gwas.values())
    if incl_whr:
        irnt.append('whr')

        ht_phenotypes = ht_phenotypes.annotate(whr=ht_phenotypes['48_raw'] /
                                               ht_phenotypes['49_raw'])
        ht_phenotypes = irnt_funct(ht_phenotypes.whr, 'whr_irnt').key_by('s')

    if 'EA' in pheno_gwas:
        ht_phenotypes = ht_phenotypes.annotate(
            ea=educ[ht_phenotypes.key].EDUC_ATT_CAT_ORD)
        raw_phenos.append('ea')
        print('adding EA')

    # now select phenotypes that are in apcdr data
    ht_phenos = ht_phenotypes.select(*[x + '_irnt' for x in irnt] + raw_phenos)
    ht_phenos.show()

    # filter biomarkers to codes
    biomarkers = [
        'cholesterol_irnt', 'hdl_cholesterol_irnt', 'ldl_irnt',
        'triglycerides_irnt', 'albumin_irnt', 'alkaline_phosphatase_irnt',
        'alanine_aminotransferase_irnt', 'aspartate_aminotransferase_irnt',
        'direct_bilirubin_irnt', 'gamma_glutamyltransferase_irnt',
        'glycated_haemoglobin_irnt'
    ]
    if gwas_biomarkers:
        ht_biomarkers = hl.read_table(gwas_biomarkers).select(*biomarkers)

        # now join biomarkers with phenotypes
        ht_all_phenos = ht_phenos.join(ht_biomarkers)
        ht_all_phenos.write(pheno_table, overwrite=overwrite)
    else:
        ht_phenos.write(pheno_table, overwrite=overwrite)
Example #18
def main(args):
    # get phenotypes that overlap with APCDR dataset
    if args.write_phenos:
        get_overlapping_phenos(args.pheno_ukb_codes, args.gwas_phenos,
                               args.gwas_biomarkers, args.pheno_table,
                               args.overwrite)

    ht_phenos = hl.read_table(args.pheno_table)

    ht_covariates = hl.read_table(args.gwas_covariates)
    ht_variants = hl.read_table(args.gwas_variants)
    ht_samples = hl.import_table(args.gwas_samples,
                                 types={'s': hl.tstr},
                                 key='s')

    contig = 'autosomes'
    contig_expr = 'chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}'

    # import ukb bgen files
    mt = hl.import_bgen(
        path=
        f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
        sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
        entry_fields=['dosage'],
        variants=ht_variants)

    # add phenotype and covariate info
    mt = mt.annotate_cols(phenotypes=ht_phenos[mt.s],
                          covariates=ht_covariates[mt.s])

    # filter keeping samples in gwas, i.e. exclude holdout samples
    # mt = mt.filter_cols(ht_samples[mt.s].in_gwas == 'TRUE')
    disc_or_target = 'target'
    if disc_or_target == 'target':
        mt = mt.filter_cols(ht_samples[mt.s].in_gwas == 'FALSE')  # target
    else:
        mt = mt.filter_cols(ht_samples[mt.s].in_gwas == 'TRUE')  # discovery

    # mt.write(args.mt, overwrite=args.overwrite)
    #
    # mt = hl.read_matrix_table(args.mt)

    phenotypes = list(mt['phenotypes'].keys())

    pheno1 = phenotypes[0:10]
    pheno2 = phenotypes[10:20]
    pheno3 = phenotypes[20:30]
    pheno4 = phenotypes[30:len(phenotypes)]
    pheno_leftover = ['whr_irnt', 'glycated_haemoglobin_irnt']

    # run_grouped_regressions(mt, args.holdout_ss_output, pheno1, 'pheno1')
    # run_grouped_regressions(mt, args.holdout_ss_output, pheno2, 'pheno2')
    # run_grouped_regressions(mt, args.holdout_ss_output, pheno3, 'pheno3')
    # run_grouped_regressions(mt, args.holdout_ss_output, pheno4, 'pheno4')
    # run_grouped_regressions(mt, args.holdout_ss_output, pheno_leftover, 'pheno_leftover')
    if disc_or_target == 'target':
        run_grouped_regressions(mt, args.holdout_ss_output, phenotypes,
                                'target_holdout2')
    else:
        run_grouped_regressions(mt, args.holdout_ss_output, phenotypes,
                                'discovery')
Example #19
0
import argparse

import hail as hl

p = argparse.ArgumentParser()
p.add_argument("--input-url", required=True)
p.add_argument("--genes-url", required=True)
p.add_argument("--output-url", required=True)
args = p.parse_args()

hl.init(log="/tmp/hail.log")

ds = hl.read_table(args.input_url)

ds = ds.annotate(analysis_group="meta")

genes = hl.read_table(args.genes_url)
genes = genes.key_by("gene_id")
ds = ds.annotate(chrom=genes[ds.gene_id].chrom, pos=genes[ds.gene_id].start)

ds.write(args.output_url)
Example #20
0
import hail as hl

ht_samples = hl.read_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_samples.ht')
ht_relationships = hl.read_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_sample_relationships.ht')

mt = hl.import_vcf(
    'gs://hail-datasets-raw-data/1000_Genomes/1000_Genomes_phase3_chrY_GRCh37.vcf.bgz',
    reference_genome='GRCh37')

mt = mt.annotate_cols(**ht_samples[mt.s])
mt = mt.annotate_cols(**ht_relationships[mt.s])

mt_split = hl.split_multi(mt)
mt_split = mt_split.select_entries(
    GT=hl.downcode(mt_split.GT, mt_split.a_index))
mt_split = mt_split.annotate_rows(info=hl.struct(
    DP=mt_split.info.DP,
    END=mt_split.info.END,
    SVTYPE=mt_split.info.SVTYPE,
    AA=mt_split.info.AA,
    AC=mt_split.info.AC[mt_split.a_index - 1],
    AF=mt_split.info.AF[mt_split.a_index - 1],
    NS=mt_split.info.NS,
    AN=mt_split.info.AN,
    EAS_AF=mt_split.info.EAS_AF[mt_split.a_index - 1],
    EUR_AF=mt_split.info.EUR_AF[mt_split.a_index - 1],
    AFR_AF=mt_split.info.AFR_AF[mt_split.a_index - 1],
    AMR_AF=mt_split.info.AMR_AF[mt_split.a_index - 1],
    SAS_AF=mt_split.info.SAS_AF[mt_split.a_index - 1]))
Example #21
0
def main():

    print("main")

    run_hash = "91ba5f38"
    ht=hl.read_table(f'{lustre_dir}/variant_qc/models/{run_hash}_score_binning.ht')

    mt = hl.read_matrix_table(
        f'{lustre_dir}/MegaWESSanger_cohorts_sampleQC_filtered.mt')


    table_cohort = hl.import_table(
        f"{lustre_dir}/sanger_cohorts_corrected_ukbb_july_2020.tsv", delimiter="\t").key_by('s')

    mt = mt.annotate_cols(cohort=table_cohort[mt.s].cohort)
    df = pd.read_csv(
        f"{lustre_dir}/sanger_cohorts_corrected_ukbb_july_2020.tsv", sep="\t")
    cohorts_array = df.cohort.unique()

    mt = mt.annotate_rows(
        MAF_cohorts=hl.agg.group_by(mt.cohort,
                                    hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AF))
    )
    mt = mt.annotate_rows(
        AN_cohorts=hl.agg.group_by(mt.cohort,
                                   hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AN))
    )

    mt = mt.annotate_rows(
        AC_cohorts=hl.agg.group_by(mt.cohort,
                                   hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AC))
    )

    mt = mt.annotate_rows(
        missingness_cohorts=hl.agg.group_by(mt.cohort, hl.min(
            (hl.agg.count_where(hl.is_missing(mt['GT']))) / mt.count_rows()*2))

    )

    mt = mt.annotate_rows(
        info=mt.info.annotate(cohort_names=mt.MAF_cohorts.keys())
    )
    mt = mt.annotate_rows(
        info=mt.info.annotate(MAF_cohorts_values=mt.MAF_cohorts.values())
    )

    mt = mt.annotate_rows(
        info=mt.info.annotate(AN_cohorts_values=mt.AN_cohorts.values())
    )

    mt = mt.annotate_rows(
        info=mt.info.annotate(AC_cohorts=mt.AC_cohorts.values())
    )

    mt = mt.annotate_rows(
        info=mt.info.annotate(
            missingness_cohorts_values=mt.missingness_cohorts.values())
    )

    mt = mt.annotate_rows(
        Variant_Type=hl.cond(
            hl.is_snp(mt.alleles[0], mt.alleles[1]), "SNP",
            hl.cond(
                hl.is_insertion(mt.alleles[0], mt.alleles[1]), "INDEL",
                hl.cond(
                    hl.is_deletion(mt.alleles[0], mt.alleles[1]), "INDEL",
                    "Other"))))
    mt = mt.annotate_rows(
        info=mt.info.annotate(
            rf_probability=ht[mt.row_key].rf_probability['TP'])
    )
    mt = mt.annotate_rows(
        info=mt.info.annotate(score=ht[mt.row_key].score)
    )
    mt = mt.annotate_rows(
        info=mt.info.annotate(bin=ht[mt.row_key].bin)
    )
    
    filter_column_annotation = (
        hl.case()
        .when(((mt.Variant_Type == "SNP") & (mt.info.bin <= SNV_PASS_BIN)), "PASS")
        .when(((mt.Variant_Type == "INDEL") & (mt.info.bin <= INDEL_PASS_BIN)), "PASS")
        .default(".")  # not pass for rest
    )

    # mt_annotated = mt.annotate_rows(mt.filters=filter_column_annotation)
    mt1 = mt.annotate_rows(filtercol=filter_column_annotation)
    mt_fail = mt1.filter_rows(mt1.filtercol == ".")
    print(mt_fail.count())

    mt2 = mt1.annotate_rows(filters=mt1.filters.add(mt1.filtercol))
    mt_fail2 = mt2.filter_rows(mt2.filters.contains("."))
    mt_pass = mt2.filter_rows(mt2.filters.contains("PASS"))
    print(f'Failed:{mt_fail2.count()}')
    print(f'Passed:{mt_pass.count()}')

    mt2 = mt2.checkpoint(
        f'{lustre_dir}/variant_qc/megaWES_final_after_RF_{run_hash}.mt', overwrite=True)
    # Remove GT entries, then drop all sample columns
    mt1 = mt2.select_entries()
    mt_fin = mt1.filter_cols(mt1['s'] == 'sample')

    chroms = [*range(1, 23), "X", "Y"]
    chromosomes = ["chr" + str(c) for c in chroms]
    for chromosome in chromosomes:
        print(chromosome)
        mt = mt_fin.filter_rows(mt_fin.locus.contig == chromosome)
        mt.write(f'{lustre_dir}/final_matrixtables_VCFs/{chromosome}_after_RF_{run_hash}_NOSAMPLES_GT.mt',
                 overwrite=True)
        hl.export_vcf(
            mt,
            f'{lustre_dir}/final_matrixtables_VCFs/VCFs/{chromosome}_after_RF_{run_hash}_LOCI_only',
            parallel='separate_header')
Example #22
0
def compute_from_full_mt(chr20: bool, overwrite: bool):
    mt = get_gnomad_data('exomes', adj=True, release_samples=True)
    freq_ht = hl.read_table(annotations_ht_path('exomes', 'frequencies'))
    vep_ht = hl.read_table(annotations_ht_path('exomes', 'vep'))
    rf_ht = hl.read_table(annotations_ht_path('exomes', 'rf'))

    if chr20:
        mt, freq_ht, vep_ht, rf_ht = filter_to_chr20([mt, freq_ht, vep_ht, rf_ht])

    vep_ht = vep_ht.annotate(
        vep=get_worst_gene_csq_code_expr(vep_ht.vep).values()
    )

    freq_ht = freq_ht.select(
        freq=freq_ht.freq[:10],
        popmax=freq_ht.popmax
    )
    freq_meta = hl.eval(freq_ht.globals.freq_meta)
    freq_dict = {f['pop']: i for i, f in enumerate(freq_meta[:10]) if 'pop' in f}
    freq_dict['all'] = 0
    freq_dict = hl.literal(freq_dict)
    mt = mt.annotate_rows(
        **freq_ht[mt.row_key],
        vep=vep_ht[mt.row_key].vep,
        filters=rf_ht[mt.row_key].filters
    )
    mt = mt.filter_rows(
        (mt.freq[0].AF <= MAX_FREQ) &
        (hl.len(mt.vep) > 0) &
        (hl.len(mt.filters) == 0)
    )

    mt = mt.filter_entries(mt.GT.is_non_ref())
    mt = mt.select_entries(
        is_het=mt.GT.is_het()
    )

    mt = mt.explode_rows(mt.vep)
    mt = mt.transmute_rows(**mt.vep)

    mt = mt.annotate_cols(
        pop=['all', mt.meta.pop]
    )
    mt = mt.explode_cols(mt.pop)

    mt = mt.group_rows_by(
        'gene_id'
    ).aggregate_rows(
        gene_symbol=hl.agg.take(mt.gene_symbol, 1)[0]
    ).aggregate(
        counts=hl.agg.filter(
            hl.if_else(
                mt.pop == 'all',
                hl.is_defined(mt.popmax) & (mt.popmax.AF <= MAX_FREQ),
                mt.freq[freq_dict[mt.pop]].AF <= MAX_FREQ
            ),
            hl.agg.group_by(
                hl.if_else(
                    mt.pop == 'all',
                    mt.popmax.AF > 0.001,
                    mt.freq[freq_dict[mt.pop]].AF > 0.001
                ),
                hl.struct(
                    hom_csq=hl.agg.filter(~mt.is_het, hl.agg.min(mt.csq)),
                    het_csq=hl.agg.filter(mt.is_het, hl.agg.min(mt.csq)),
                    het_het_csq=hl.sorted(
                        hl.array(
                            hl.agg.filter(mt.is_het, hl.agg.counter(mt.csq))
                        ),
                        key=lambda x: x[0]
                    ).scan(
                        lambda i, j: (j[0], i[1] + j[1]),
                        (0, 0)
                    ).find(
                        lambda x: x[1] > 1
                    )[0]
                )
            )
        )
    )

    mt = mt.annotate_entries(
        counts=hl.struct(
            all=hl.struct(
                hom_csq=hl.min(mt.counts.get(True).hom_csq, mt.counts.get(False).hom_csq),
                het_csq=hl.min(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq),
                het_het_csq=hl.min(
                    mt.counts.get(True).het_het_csq,
                    mt.counts.get(False).het_het_csq,
                    hl.or_missing(
                        hl.is_defined(mt.counts.get(True).het_csq) & hl.is_defined(mt.counts.get(False).het_csq),
                        hl.max(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq)
                    )
                ),
            ),
            af_le_0_001=mt.counts.get(False)
        )
    )

    mt = mt.checkpoint('gs://gnomad-tmp/compound_hets/het_and_hom_per_gene{}.1.mt'.format(
        '.chr20' if chr20 else ''
    ), overwrite=True)

    gene_ht = mt.annotate_rows(
        row_counts=hl.flatten([
            hl.array(
                hl.agg.group_by(
                    mt.pop,
                    hl.struct(
                        csq=csq,
                        af=af,
                        n_hom=hl.agg.count_where(mt.counts[af].hom_csq == csq_i),
                        n_het=hl.agg.count_where(mt.counts[af].het_csq == csq_i),
                        n_het_het=hl.agg.count_where(mt.counts[af].het_het_csq == csq_i)
                    )
                )
            ).filter(
                lambda x: (x[1].n_het > 0) | (x[1].n_hom > 0) | (x[1].n_het_het > 0)
            ).map(
                lambda x: x[1].annotate(
                    pop=x[0]
                )
            )
            for csq_i, csq in enumerate(CSQ_CODES)
            for af in ['all', 'af_le_0_001']
        ])
    ).rows()

    gene_ht = gene_ht.explode('row_counts')
    gene_ht = gene_ht.select(
        'gene_symbol',
        **gene_ht.row_counts
    )

    gene_ht.describe()

    gene_ht = gene_ht.checkpoint(
        'gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.ht'.format(
            '.chr20' if chr20 else ''
        ),
        overwrite=overwrite
    )

    gene_ht.flatten().export('gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.tsv.gz'.format(
        '.chr20' if chr20 else ''
    ))
Example #23
0
def main(args):
    # Set paths for data access based on command line parameters
    root = './data'
    
    context_ht_path = f'{root}/context/Homo_sapiens_assembly19.fasta.snps_only.vep_20181129.ht'
    processed_genomes_ht_path = f'{root}/model/genomes_processed.ht'
    processed_exomes_ht_path = f'{root}/model/exomes_processed.ht'
    mutation_rate_ht_path = f'{root}/model/mutation_rate_methylation_bins.ht'
    po_coverage_ht_path = f'{root}/model/prop_observed_by_coverage_no_common_pass_filtered_bins.ht'
    po_ht_path = f'{root}/{{subdir}}/prop_observed_{{subdir}}.ht'
    raw_constraint_ht_path = f'{root}/{{subdir}}/constraint_{{subdir}}.ht'
    final_constraint_ht_path = f'{root}/{{subdir}}/constraint_final_{{subdir}}.ht'
    possible_variants_ht_path = f'{root}/model/possible_data/possible_transcript_pop_{args.model}.ht'
    

    po_output_path = po_ht_path.format(subdir=args.model)
    output_path = raw_constraint_ht_path.format(subdir=args.model)
    final_path = final_constraint_ht_path.format(subdir=args.model)

    # Sets method for aggregation, will need to be changed for custom analysis
    MODEL_KEYS = {
        'worst_csq': ['gene'],
        'tx_annotation': ['gene', 'expressed'],
        'standard': ['gene', 'transcript', 'canonical']
    }
    
    if args.test:
        ht = load_or_import_po(po_output_path, args.overwrite)
        run_tests(ht)

    if args.get_proportion_observed:
        # Build a model for methylation-dependent mutation rate and apply it to get proportion of variants observed
        # Also need to incorporate genomes and v3 if possible

        print('Running aggregation of variants by grouping variables')
        # Tables of observed mutations in exomes
        full_exome_ht = prepare_ht(hl.read_table(processed_exomes_ht_path), args.trimers) 

        # filter into X, Y and autosomal regions
        exome_ht = full_exome_ht.filter(full_exome_ht.locus.in_autosome_or_par())
        exome_x_ht = hl.filter_intervals(full_exome_ht, [hl.parse_locus_interval('X')])
        exome_x_ht = exome_x_ht.filter(exome_x_ht.locus.in_x_nonpar())
        exome_y_ht = hl.filter_intervals(full_exome_ht, [hl.parse_locus_interval('Y')])
        exome_y_ht = exome_y_ht.filter(exome_y_ht.locus.in_y_nonpar())

        # Modelling results of estimated mutation rates for genes, coverage, methylation level and base context
        possible_variants_ht = hl.read_table(possible_variants_ht_path)

        # Set chosen groupings to aggregate on
        groupings = ['gene','annotation','modifier']

        # Apply model; aggregate by chosen groupings & get proportion observed; write to file
        po_exome_ht, po_exome_x_ht, po_exome_y_ht = [
            get_proportion_observed(ht, possible_variants_ht, groupings)
            for ht in (exome_ht, exome_x_ht, exome_y_ht)]
        po_exome_ht.write(po_output_path, overwrite=args.overwrite)
        po_exome_x_ht.write(po_output_path.replace('.ht', '_x.ht'), overwrite=args.overwrite)
        po_exome_y_ht.write(po_output_path.replace('.ht', '_y.ht'), overwrite=args.overwrite)

       
    if args.aggregate:
        print('Running aggregation by gene')
        # read PO hail tables for autosomes, X and Y chromosomes and join them
        print(f'Reading hail table from {po_output_path}')
        # Autosomes
        ht = load_or_import(po_output_path, args.overwrite)
        # X chromosome
        ht_x = load_or_import(po_output_path.replace('.ht','_x.ht'), args.overwrite)
        # Y chromosome
        ht_y = load_or_import(po_output_path.replace('.ht','_y.ht'), args.overwrite)
        # Combine into one table
        ht = ht.union(ht_x).union(ht_y)
        # group by gene/transcript and calculate summary stats
        ht = finalize_dataset(ht, keys=MODEL_KEYS[args.model])
        # write hail table to output path
        ht.write(output_path, args.overwrite)
        hl.read_table(output_path).export(output_path.replace('.ht', '.txt.bgz'))

    if args.summarise:
        print('Finalising summary stats')
        # write summary stats to output path
        ht = load_or_import(output_path, args.overwrite)
        mut_types = ('lof', 'mis', 'syn','mis_pphen','mis_non_pphen')
        output_var_types = zip(('obs', 'exp', 'oe', 'oe', 'oe'),
                                ('', '', '', '_lower', '_upper'))
        output_vars = product(mut_types,output_var_types)
        ht.select(
            'gene','transcript','canonical',
            *[f'{t}_{m}{ci}' for m, (t, ci) in output_vars],
            #*[f'{m}_z' for m in mut_types[:3]], 
            'pLI', 'pRec', 'pNull', 
            #gene_issues=ht.constraint_flag
        ).select_globals().write(final_path, overwrite=args.overwrite)
        hl.read_table(final_path).export(final_path.replace('.ht', '.txt.bgz'))
Example #24
0
def compute_from_vp_mt(chr20: bool, overwrite: bool):
    meta = get_gnomad_meta('exomes')
    vp_mt = hl.read_matrix_table(full_mt_path('exomes'))
    vp_mt = vp_mt.filter_cols(meta[vp_mt.col_key].release)
    ann_ht = hl.read_table(vp_ann_ht_path('exomes'))
    phase_ht = hl.read_table(phased_vp_count_ht_path('exomes'))

    if chr20:
        vp_mt, ann_ht, phase_ht = filter_to_chr20([vp_mt, ann_ht, phase_ht])

    vep1_expr = get_worst_gene_csq_code_expr(ann_ht.vep1)
    vep2_expr = get_worst_gene_csq_code_expr(ann_ht.vep2)
    ann_ht = ann_ht.select(
        'snv1',
        'snv2',
        is_singleton_vp=(ann_ht.freq1['all'].AC < 2) & (ann_ht.freq2['all'].AC < 2),
        pop_af=hl.dict(
            ann_ht.freq1.key_set().intersection(ann_ht.freq2.key_set())
                .map(
                lambda pop: hl.tuple([pop, hl.max(ann_ht.freq1[pop].AF, ann_ht.freq2[pop].AF)])
            )
        ),
        popmax_af=hl.max(ann_ht.popmax1.AF, ann_ht.popmax2.AF, filter_missing=False),
        filtered=(hl.len(ann_ht.filters1) > 0) | (hl.len(ann_ht.filters2) > 0),
        vep=vep1_expr.keys().filter(
            lambda k: vep2_expr.contains(k)
        ).map(
            lambda k: vep1_expr[k].annotate(
                csq=hl.max(vep1_expr[k].csq, vep2_expr[k].csq)
            )
        )
    )

    vp_mt = vp_mt.annotate_cols(
        pop=meta[vp_mt.col_key].pop
    )
    vp_mt = vp_mt.annotate_rows(
        **ann_ht[vp_mt.row_key],
        phase_info=phase_ht[vp_mt.row_key].phase_info
    )

    vp_mt = vp_mt.filter_rows(
        ~vp_mt.filtered
    )

    vp_mt = vp_mt.filter_entries(
        vp_mt.GT1.is_het() & vp_mt.GT2.is_het() & vp_mt.adj1 & vp_mt.adj2
    )

    vp_mt = vp_mt.select_entries(
        x=True
    )

    vp_mt = vp_mt.annotate_cols(
        pop=['all', vp_mt.pop]
    )
    vp_mt = vp_mt.explode_cols('pop')

    vp_mt = vp_mt.explode_rows('vep')
    vp_mt = vp_mt.transmute_rows(
        **vp_mt.vep
    )

    def get_grouped_phase_agg():
        return hl.agg.group_by(
            hl.case()
                .when(~vp_mt.is_singleton_vp & (vp_mt.phase_info[vp_mt.pop].em.adj.p_chet > CHET_THRESHOLD), 1)
                .when(~vp_mt.is_singleton_vp & (vp_mt.phase_info[vp_mt.pop].em.adj.p_chet < SAME_HAP_THRESHOLD), 2)
                .default(3)
            ,
            hl.agg.min(vp_mt.csq)
        )

    vp_mt = vp_mt.group_rows_by(
        'gene_id',
        'gene_symbol'
    ).aggregate(
        all=hl.agg.filter(
            vp_mt.x &
            hl.if_else(
                vp_mt.pop == 'all',
                hl.is_defined(vp_mt.popmax_af) &
                (vp_mt.popmax_af <= MAX_FREQ),
                vp_mt.pop_af[vp_mt.pop] <= MAX_FREQ
            ),
            get_grouped_phase_agg()
        ),
        af_le_0_001=hl.agg.filter(
            hl.if_else(
                vp_mt.pop == 'all',
                hl.is_defined(vp_mt.popmax_af) &
                (vp_mt.popmax_af <= 0.001),
                vp_mt.pop_af[vp_mt.pop] <= 0.001
            )
            & vp_mt.x,
            get_grouped_phase_agg()
        )
    )

    vp_mt = vp_mt.checkpoint('gs://gnomad-tmp/compound_hets/chet_per_gene{}.2.mt'.format(
        '.chr20' if chr20 else ''
    ), overwrite=True)

    gene_ht = vp_mt.annotate_rows(
        row_counts=hl.flatten([
            hl.array(
                hl.agg.group_by(
                    vp_mt.pop,
                    hl.struct(
                        csq=csq,
                        af=af,
                        # TODO: Review this
                        # These will only keep the worst csq -- now maybe it'd be better to keep either
                        # - the worst csq for chet or
                        # - the worst csq for both chet and same_hap
                        n_worst_chet=hl.agg.count_where(vp_mt[af].get(1) == csq_i),
                        n_chet=hl.agg.count_where((vp_mt[af].get(1) == csq_i) & (vp_mt[af].get(2, 9) >= csq_i) & (vp_mt[af].get(3, 9) >= csq_i)),
                        n_same_hap=hl.agg.count_where((vp_mt[af].get(2) == csq_i) & (vp_mt[af].get(1, 9) > csq_i) & (vp_mt[af].get(3, 9) >= csq_i)),
                        n_unphased=hl.agg.count_where((vp_mt[af].get(3) == csq_i) & (vp_mt[af].get(1, 9) > csq_i) & (vp_mt[af].get(2, 9) > csq_i))
                    )
                )
            ).filter(
                lambda x: (x[1].n_chet > 0) | (x[1].n_same_hap > 0) | (x[1].n_unphased > 0)
            ).map(
                lambda x: x[1].annotate(
                    pop=x[0]
                )
            )
            for csq_i, csq in enumerate(CSQ_CODES)
            for af in ['all', 'af_le_0_001']
        ])
    ).rows()

    gene_ht = gene_ht.explode('row_counts')
    gene_ht = gene_ht.select(
        **gene_ht.row_counts
    )

    gene_ht.describe()
    gene_ht = gene_ht.checkpoint(
        'gs://gnomad-lfran/compound_hets/chet_per_gene{}.ht'.format(
            '.chr20' if chr20 else ''
        ),
        overwrite=overwrite
    )

    gene_ht.flatten().export(
        'gs://gnomad-lfran/compound_hets/chet_per_gene{}.tsv.gz'.format(
            '.chr20' if chr20 else ''
        )
    )
Example #25
0
def get_baselevel_expression_for_genes(
        mt,
        gtex,
        gene_list=None,
        get_proportions=None,
        gene_maximums_ht_path=gtex_v7_gene_maximums_ht_path):
    gtex_table = gtex.key_by("transcript_id")

    if gene_list:
        genes = hl.literal(gene_list)

        # Filter context_ht to genes of interest
        mt = mt.annotate_rows(in_gene_of_interest=genes.find(
            lambda x: mt.vep.transcript_consequences.any(
                lambda tc: tc.gene_symbol == x)))
        mt = mt.filter_rows(mt.in_gene_of_interest != "NA")

    # Need to modify process consequences to ignore splice variants, because these can occur on intronic regions

    all_coding_minus_splice = list(
        set(all_coding_csqs) - set([
            'splice_acceptor_variant', 'splice_donor_variant',
            'splice_region_variant'
        ]))

    def add_most_severe_consequence_to_consequence_minus_splice(
            tc: hl.expr.StructExpression) -> hl.expr.StructExpression:
        """
        Copied from gnomad_hail but slight change
        """

        csqs = hl.literal(all_coding_minus_splice)
        return tc.annotate(most_severe_consequence=csqs.find(
            lambda c: tc.consequence_terms.contains(c)))

    # Add worst consequence within transcript consequences
    mt = (mt.annotate_rows(vep=mt.vep.annotate(
        transcript_consequences=mt.vep.transcript_consequences.map(
            add_most_severe_consequence_to_consequence_minus_splice))))

    # Explode on transcript consequences
    mt = mt.explode_rows(mt.vep.transcript_consequences)
    mt_kt = mt.rows()

    # Filter to positions in the CDS regions
    cds_intervals = hl.import_bed(
        "gs://gnomad-public/papers/2019-tx-annotation/data/other_data/gencode.v19.CDS.Hail.021519.bed"
    )
    mt_kt = mt_kt.annotate(in_cds=hl.is_defined(cds_intervals[mt_kt.locus]))
    mt_kt = mt_kt.filter(mt_kt.in_cds)

    # Filter to protein coding transcripts only
    mt_kt = mt_kt.filter(
        mt_kt.vep.transcript_consequences.biotype == "protein_coding")

    # Filter to coding variants to only evalute those effects
    mt_kt = filter_table_to_csqs(mt_kt, all_coding_minus_splice)

    # To avoid double counting transcripts at a given base, key by transcript and position and dedup
    mt_kt = mt_kt.key_by(mt_kt.locus,
                         mt_kt.vep.transcript_consequences.transcript_id)
    mt_kt = mt_kt.distinct()

    # Annotate mt with the gtex values (ie. join them)
    mt_kt = mt_kt.annotate(
        tx_data=gtex_table[mt_kt.vep.transcript_consequences.transcript_id])

    ## Group by gene, symbol and position
    ht_sum_of_bases = mt_kt.group_by(
        locus=mt_kt.locus,
        ensg=mt_kt.vep.transcript_consequences.gene_id,
        symbol=mt_kt.vep.transcript_consequences.gene_symbol).aggregate(
            sum_per_base=hl.agg.array_sum(mt_kt.tx_data.agg_expression))

    tissue_ids = sorted([
        y.tissue.replace("-", "_").replace(" ", "_").replace("(", "_").replace(")", "_")
        for y in gtex.values.take(1)[0]
    ])
    d = {tiss: i for i, tiss in enumerate(tissue_ids)}

    ht_sum_of_bases = ht_sum_of_bases.annotate(**{
        tissue: ht_sum_of_bases.sum_per_base[d[tissue]]
        for tissue in tissue_ids
    })

    if get_proportions:
        gene_maximums_ht = hl.read_table(gene_maximums_ht_path)
        ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus)
        ht_sum_of_bases = ht_sum_of_bases.annotate(alleles="filler")
        ht_sum_of_bases = get_expression_proportion(
            tx_table=ht_sum_of_bases,
            tissues_to_filter=["sum_per_base"],
            gene_maximum_ht=gene_maximums_ht)
        ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus)
        ht_sum_of_bases = ht_sum_of_bases.drop(ht_sum_of_bases.alleles)

    return ht_sum_of_bases
Example #26
0
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # S3 credentials are required to access the datasets in the farm flexible compute S3 environment;
    # you may use your own here, taken from the .s3fg file in your home directory.
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    bed_to_exclude_pca = hl.import_bed(
        f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')

    cohorts_pop = hl.import_table(
        "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb_elgh_labels.tsv",
        delimiter="\t").key_by('s')

    pca_scores = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/elgh_labels/pop_assignments_test.ht")
    # pca_loadings = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/pca_loadings.ht")
    logger.info("assign population pcs")
    # population_assignment_table = assign_population_pcs(
    #    pca_scores, pca_loadings, known_col="known_pop")

    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.pca_scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(
        f"{tmp_dir}/ddd-elgh-ukbb/pop_assignments_test_minprob_0.5.ht",
        overwrite=True)
    pop_ht.export(
Example #27
0
 def read_clump_ht(f):
     ht = hl.read_table(f)
     ht = ht.drop('idx')
     return ht
Example #28
0
                                        'locus':
                                        hl.tlocus(reference_genome='GRCh38'),
                                        'alleles':
                                        hl.tarray(hl.tstr)
                                    })
ht_final_variants = ht_final_variants.key_by(ht_final_variants.locus,
                                             ht_final_variants.alleles)

ht_final_pruned_variants = hl.import_table(FINAL_PRUNED_VARIANTS,
                                           no_header=True)
ht_final_pruned_variants = ht_final_pruned_variants.annotate(
    **hl.parse_variant(ht_final_pruned_variants.f0, reference_genome='GRCh38'))
ht_final_pruned_variants = ht_final_pruned_variants.key_by(
    ht_final_pruned_variants.locus, ht_final_pruned_variants.alleles)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)
impute_sex_annotations = hl.read_table(IMPUTESEX_TABLE)
annotation_annotations = hl.read_table(ANNOTATION_TABLE)

mt = hl.read_matrix_table(MT)
mt = mt.drop('a_index', 'qual', 'info', 'filters', 'was_split')

mt = mt.filter_cols(hl.is_defined(ht_final_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_final_variants[mt.row_key]))

mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key])
mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key])
mt = mt.annotate_rows(annotation=annotation_annotations[mt.row_key])

mt = hl.variant_qc(mt, name='qc')
Example #29
0
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from genome-wide association study
    (GWAS) summary statistics.

    Given a set or multiple sets of GWAS summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.
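
    As a rough numeric illustration of the model above (every value below is
    invented purely for this sketch and is not taken from any real study)::

        N = 50_000      # samples in the GWAS
        M = 1_000_000   # variants used to estimate h_g^2
        h2 = 0.2        # SNP-heritability
        a = 1e-6        # per-sample contribution of confounding
        l_j = 100.0     # LD score of variant j

        expected_chi_sq = 1 + N * a + (N * h2 / M) * l_j
        # 1 + 0.05 + 1.0 = 2.05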

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = ld_score_all_phenos_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = ld_score_one_pheno_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = ld_score_one_pheno_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.
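
    For example, once the regression has run, the point estimates for the
    first phenotype can be pulled out of the returned table as follows (a
    sketch, reusing the ``ht_results`` table from the Examples above)::

        first = ht_results.collect()[0]
        print(first.phenotype,
              first.intercept.estimate,
              first.snp_heritability.estimate)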

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies (GWAS).
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs))
            or (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus)
                            & hl.is_defined(ds.__alleles)
                            & hl.is_defined(ds.__w_initial)
                            & hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus)
                            & hl.is_defined(ds.__alleles)
                            & hl.is_defined(ds.__w_initial)
                            & hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y)
                                         & (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0]
                                                 + mt.__step1_betas[1]
                                                 * mt.__x_floor) ** 2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                                hl.agg.linreg(y=mt.__y,
                                              x=[1.0, mt.__x],
                                              weight=mt.__w).beta),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected))
                       - hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)) ** 2
                       / n_blocks)
            / (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0 / (mt.__w_initial_floor
                   * 2.0 * (mt.__step2_betas[0]
                            + mt.__step2_betas[1]
                            * mt.__x_floor) ** 2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n) / M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                                hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                              x=[mt.__x],
                                              weight=mt.__w).beta[0]),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected ** 2)
            - hl.sum(mt.__step2_block_betas_bias_corrected) ** 2
            / n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0 / (mt.__w_initial_floor
                                 * 2.0 * (mt.__initial_betas[0]
                                          + mt.__initial_betas[1]
                                          * mt.__x_floor) ** 2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x)
             / hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c
                   * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1]
                                            - (n_blocks - 1)
                                            * mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected ** 2)
             - hl.sum(mt.__final_block_betas_bias_corrected) ** 2
             / n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M / hl.agg.mean(mt.__n)) ** 2
                                   * mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
Example #30
0
p.add_argument("--index-type", help="Elasticsearch index type", required=True)
p.add_argument("--num-shards",
               help="Number of elasticsearch shards",
               default=1,
               type=int)
p.add_argument("--es-block-size",
               help="Elasticsearch block size to use when exporting",
               default=200,
               type=int)
args = p.parse_args()

hl.init(log="/tmp/hail.log")

print("\n=== Importing Hail table ===")

ds = hl.read_table(args.ht_url)

print("\n=== Exporting to Elasticsearch ===")

es = ElasticsearchClient(args.host, args.port)
es.export_table_to_elasticsearch(
    ds,
    index_name=args.index_name,
    index_type_name=args.index_type,
    block_size=args.es_block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    export_globals_to_index_meta=True,
    verbose=True,
)
Example #31
0
File: misc.py  Project: tpoterba/hail
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...        s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always
    be perfectly maximal.
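
    As a rough, single-machine Python sketch of that greedy idea (tie-breaking
    and self-edge handling omitted; purely illustrative, not the actual
    distributed implementation), where the returned set corresponds to the
    removed vertices, i.e. the ``keep=False`` side::

        from collections import defaultdict

        def greedily_removed_nodes(edges):
            # edges: iterable of (i, j) pairs describing an undirected graph.
            # Returns the set of nodes removed so that no edges remain among
            # the nodes that are left.
            adj = defaultdict(set)
            for i, j in edges:
                if i != j:
                    adj[i].add(j)
                    adj[j].add(i)
            removed = set()
            while any(adj[v] for v in adj):
                v = max(adj, key=lambda u: len(adj[u]))  # highest degree
                for u in adj[v]:
                    adj[u].discard(v)
                adj[v].clear()
                removed.add(v)
            return removed

        greedily_removed_nodes([(1, 2), (2, 3), (3, 1), (3, 4)])  # {1, 3}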

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field; this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """

    if i.dtype != j.dtype:
        raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. "
                         "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
            "expression of '{}'".format(
                source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

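    # Checkpoint the edge list to a temporary file so it is computed only once
    # before being collected for the independent-set computation below.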
    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

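    # Compute the maximal independent set on the JVM (Graph.pyMaximalIndependentSet)
    # from the collected edges; the result is a set-typed expression of node values.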
    mis_nodes = construct_expr(JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet(
        Env.spark_backend('maximal_independent_set')._to_java_ir(edges.collect(_localize=False)._ir),
        node_t._parsable_string(),
        joption(tie_breaker_str))),
                               hl.tset(node_t))

    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    nodes = nodes.annotate_globals(mis_nodes=mis_nodes)
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        return nodes.key_by('node')
    return nodes
def main(args):
    ########################################################################
    ### initialize
    phenos = ['crc', 't2d', 'glaucoma', 'afib', 'ra']
    renamed = {
        's': 's',
        'CRC': 'crc',
        'T2D': 't2d',
        'Glaucoma': 'glaucoma',
        'AFib': 'afib',
        'RA': 'ra'
    }
    phenotype = 'ALL5cc'
    sumstats_text_file = args.dirname + args.basename + 'ALL5cc.clumped'
    prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '-pt-sumstats-locus-allele-keyed.kt'
    contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype

    contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}

    bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'

    start = time.time()
    # large block size because we read very little data (due to filtering & ignoring genotypes)
    # hl.init(branching_factor=10, min_block_size=2000)
    hl.init()

    ################################################################################
    ### set up the sumstats table (chr, bp for union SNPs)
    if args.generate_prs_loci_table:
        t = hl.import_table(sumstats_text_file, delimiter=r'\s+', impute=True)
        t = t.select(locus=hl.locus(hl.str(t.CHR), t.BP))
        t = t.key_by('locus')
        t.write(prs_loci_table_location, overwrite=True)

    ss = hl.read_table(prs_loci_table_location)

    ################################################################################
    ### Get true phenotypes from UKBB
    if args.pheno_table:
        phenotypes = hl.import_table(
            'gs://mkanai/disparities/ukb31063.phecode_5diseases.both_sexes.tsv.bgz',
            key='s',
            impute=True,
            types={'s': hl.tstr})
        phenotypes = phenotypes.rename(renamed)

        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            key='s',
            impute=True,
            types={'s': hl.tstr})

        samples = covariates.annotate(**phenotypes[covariates.s])

        # Write pheno/covar/sample info table
        for pheno in phenos:
            gwas_holdout = hl.import_table(
                'gs://mkanai/disparities/ukbb/pheno_31063_holdout_gwas_' +
                pheno + '.info.txt.gz',
                delimiter=r'\s+').key_by('s')

            samples = samples.annotate(
                **{
                    pheno + '_holdout':
                    gwas_holdout[samples.s].gwas_holdout == 'holdout'
                })

        samples.write(
            'gs://mkanai/disparities/pheno_31063_holdout_gwas_cc_phenos.ht',
            args.overwrite)

    if args.ss_tables:
        # Write ss info
        for pheno in phenos:
            print(pheno)
            ss = hl.import_table(args.dirname + args.basename + pheno +
                                 '.*.bgz',
                                 delimiter=r'\s+',
                                 impute=True,
                                 types={
                                     'beta': hl.tfloat,
                                     'pval': hl.tfloat,
                                     'pos': hl.tint,
                                     'nCompleteSamples': hl.tint,
                                     'AC': hl.tfloat,
                                     'ytx': hl.tfloat,
                                     'se': hl.tfloat,
                                     'tstat': hl.tfloat
                                 })
            ss = ss.key_by(locus=hl.locus(hl.str(ss.chr), hl.int(
                ss.pos))).repartition(200)

            ss.write(args.dirname + args.basename + pheno + '.ht', True)

    ################################################################################
    ### Run the PRS using phenotype-specific clump variants
    if args.write_bgen:
        mt_all = hl.import_bgen(
            bgen_files,
            entry_fields=['dosage'],
            sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
            variants=ss.locus)

        samples = hl.read_table(
            'gs://mkanai/disparities/pheno_31063_holdout_gwas_cc_phenos.ht')
        mt_all = mt_all.annotate_cols(**samples[
            mt_all.s])  # ok that phenos keyed on userId not s?

        mt_all.repartition(5000, shuffle=False).write(
            args.dirname + args.basename + 'ALL5cc.mt', args.overwrite)

    mt_all = hl.read_matrix_table(args.dirname + args.basename + 'ALL5cc.mt')

    for pheno in phenos:  #[6:len(phenos)]:
        print(pheno)
        ss = hl.read_table(args.dirname + args.basename + pheno + '.ht')
        """
        To add:
        - Filter only to samples in holdout GWAS
        - Filter to rows in phenotype-specific clump file
        - Build PRS for 10 p-value thresholds
        - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
        """
        # filter to only samples held out from GWAS
        mt = mt_all.filter_cols(mt_all[pheno + '_holdout'])

        mt = mt.annotate_rows(ss=ss[mt.locus])
        mt = annotate_beta(mt, mt.ss)

        # p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2, 's6': .05, 's7': .1, 's8': .2, 's9': .5, 's10': 1}
        p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2}

        pheno_clump = specific_clumps(args.dirname + args.basename + pheno +
                                      '.clumped')

        mt = mt.filter_rows(hl.is_defined(pheno_clump[mt.locus]))
        print(mt.count())

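        # One PRS per p-value threshold: the sum of effect size * dosage over
        # clumped variants whose p-value falls below the threshold.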
        annot_expr = {
            k: hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.pval < v))
            for k, v in p_max.items()
        }

        mt = mt.annotate_cols(**annot_expr)

        mt.cols().write(args.dirname + 'UKB_' + pheno + '_PRS.ht',
                        stage_locally=True,
                        overwrite=True)
        ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_PRS.ht')
        ht_out = ht.drop(*[x for x in list(ht.row) if 'holdout' in x],
                         *[x for x in phenos if pheno not in x])

        output_location = args.dirname + 'UKB_' + pheno + '_PRS.txt.bgz'
        ht_out.export(output_location)
    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))
Example #33
0
 def test_write_stage_locally(self):
     t = hl.utils.range_table(5)
     f = new_temp_file(suffix='ht')
     t.write(f, stage_locally=True)
     t2 = hl.read_table(f)
     self.assertTrue(t._same(t2))
Example #34
0
def annotate_transcript_consequences(variants_path,
                                     transcripts_path,
                                     mane_transcripts_path=None):
    ds = hl.read_table(variants_path)

    most_severe_consequence = ds.vep.most_severe_consequence

    transcript_consequences = ds.vep.transcript_consequences

    # Drop irrelevant consequences
    transcript_consequences = transcript_consequences.map(
        lambda c: c.annotate(consequence_terms=c.consequence_terms.filter(
            lambda t: ~OMIT_CONSEQUENCE_TERMS.contains(t)))).filter(
                lambda c: c.consequence_terms.size() > 0)

    # Add/transmute derived fields
    transcript_consequences = transcript_consequences.map(
        lambda c: c.annotate(major_consequence=hl.sorted(
            c.consequence_terms, key=consequence_term_rank)[0])
    ).map(lambda c: c.annotate(
        domains=c.domains.map(lambda domain: domain.db + ":" + domain.name),
        hgvsc=c.hgvsc.split(":")[-1],
        hgvsp=hgvsp_from_consequence_amino_acids(c),
        is_canonical=hl.bool(c.canonical),
    ))

    transcript_consequences = transcript_consequences.map(lambda c: c.select(
        "biotype",
        "consequence_terms",
        "domains",
        "gene_id",
        "gene_symbol",
        "hgvsc",
        "hgvsp",
        "is_canonical",
        "lof_filter",
        "lof_flags",
        "lof",
        "major_consequence",
        "polyphen_prediction",
        "sift_prediction",
        "transcript_id",
    ))

    transcripts = hl.read_table(transcripts_path)

    transcript_info = hl.dict([
        (row.transcript_id, row.transcript_info)
        for row in transcripts.select(transcript_info=hl.struct(
            transcript_version=transcripts.transcript_version,
            gene_version=transcripts.gene.gene_version,
        )).collect()
    ])

    transcript_consequences = transcript_consequences.map(
        lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id)))

    if mane_transcripts_path:
        mane_transcripts = hl.read_table(mane_transcripts_path)

        mane_transcripts = hl.dict([(row.gene_id, row.drop("gene_id"))
                                    for row in mane_transcripts.collect()])

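        # A transcript is flagged MANE Select when its Ensembl ID matches the MANE
        # record for its gene; the version flag additionally requires matching Ensembl
        # versions, in which case the RefSeq ID and version are carried over.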
        transcript_consequences = transcript_consequences.map(
            lambda csq: csq.annotate(**hl.rbind(
                mane_transcripts.get(csq.gene_id),
                lambda mane_transcript: (hl.case().when(
                    (mane_transcript.ensembl_id == csq.transcript_id)
                    & (mane_transcript.ensembl_version == csq.
                       transcript_version),
                    hl.struct(
                        is_mane_select=True,
                        is_mane_select_version=True,
                        refseq_id=mane_transcript.refseq_id,
                        refseq_version=mane_transcript.refseq_version,
                    ),
                ).when(
                    mane_transcript.ensembl_id == csq.transcript_id,
                    hl.struct(
                        is_mane_select=True,
                        is_mane_select_version=False,
                        refseq_id=hl.null(hl.tstr),
                        refseq_version=hl.null(hl.tstr),
                    ),
                ).default(
                    hl.struct(
                        is_mane_select=False,
                        is_mane_select_version=False,
                        refseq_id=hl.null(hl.tstr),
                        refseq_version=hl.null(hl.tstr),
                    ))),
            )))

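        # Sort transcripts: protein-coding first, then those matching the most
        # severe consequence, then MANE Select, then canonical transcripts.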
        transcript_consequences = hl.sorted(
            transcript_consequences,
            lambda c: (
                hl.if_else(
                    c.biotype == "protein_coding", 0, 1, missing_false=True),
                hl.if_else(c.major_consequence == most_severe_consequence,
                           0,
                           1,
                           missing_false=True),
                hl.if_else(c.is_mane_select, 0, 1, missing_false=True),
                hl.if_else(c.is_canonical, 0, 1, missing_false=True),
            ),
        )

    else:
        transcript_consequences = hl.sorted(
            transcript_consequences,
            lambda c: (
                hl.if_else(
                    c.biotype == "protein_coding", 0, 1, missing_false=True),
                hl.if_else(c.major_consequence == most_severe_consequence,
                           0,
                           1,
                           missing_false=True),
                hl.if_else(c.is_canonical, 0, 1, missing_false=True),
            ),
        )

    ds = ds.annotate(
        transcript_consequences=transcript_consequences).drop("vep")

    return ds
Example #35
0
def main(args):
    ########################################################################
    ### initialize
    which_beta = 'beta' + args.which_beta
    if args.method == 'metal':
        end_dir = 'metal/'
        clumps = args.dirname + end_dir + 'BBJ_UKBB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_beta_' + args.which_beta + '_' + args.iter + '.metal.clumped'
        ss_filename = args.dirname + end_dir + 'BBJ_UKBB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_beta_' + args.which_beta + '_' + args.iter + '.tsv'
        out_base = args.dirname + end_dir + which_beta + '_draw_' + args.iter + '_spike_' + args.which_beta + '_metal_PRS'
    elif args.method == 'mama':
        ld = args.ld + '/'
        analysis = args.analysis + '/'
        end_dir = 'mama/ld_true/'
        clumps = args.dirname + end_dir + ld + analysis + 'draw_' + args.iter + '_spike_' + args.which_beta + '_mama_2.clumped'
        ss_filename = args.dirname + end_dir + ld + analysis + 'draw_' + args.iter + '_spike_' + args.which_beta + '_mama_2.txt'
        # this ss_filename has different headers

        out_base = args.dirname + end_dir + ld + analysis + 'draw_' + args.iter + '_spike_' + args.which_beta + '_mama_2_PRS'
    else:
        end_dir = 'ukbb_only/'
        clumps = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '.clumped'
        ss_filename = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '.tsv.gz'
        out_base = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '_gwas_PRS'

    clump_table_location = args.dirname + 'keytables/ukb-' + args.basename + '-pt-sumstats-locus-allele-keyed.kt'

    contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}

    bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr22_v3.bgen'

    start = time.time()
    # large block size because we read very little data (due to filtering & ignoring genotypes)
    hl.init(branching_factor=10, min_block_size=2000)
    # set min_block_size only in import_bgen

    ################################################################################
    ### set up the sumstats table (chr, bp for union SNPs)
    if args.read_clumps:
        clump_file = hl.import_table(clumps, delimiter=r'\s+', impute=True)
        clump_file = clump_file.select(
            locus=hl.locus(hl.str(clump_file.CHR), clump_file.BP))
        clump_file = clump_file.key_by('locus')
        clump_file.write(clump_table_location, overwrite=True)

    clump_file = hl.read_table(clump_table_location)

    ################################################################################
    ### Write ss info, process so sumstats are uniform across MAMA, METAL, and gwas
    if args.ss_tables:
        #ss = hl.import_table(args.dirname + args.basename + '.tsv.gz',
        ss = hl.import_table(
            ss_filename,
            #'BBJ_UKBB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.which_beta + 'beta_01_9.tsv'
            #  # for mama case
            delimiter=r'\s+',
            impute=True,
            types={'BP': hl.tint})
        if args.method != 'mama' and args.method != 'metal':
            ss = ss.rename({
                'chr': 'CHR',
                'pos': 'BP',
                'rsid': 'SNP',
                'ref': 'A1',
                'alt': 'A2',
                'maf': 'FRQ',
                'p_value_beta_' + args.which_beta: 'MAMA_PVAL',
                'standard_error_beta_' + args.which_beta: 'MAMA_SE',
                'beta_beta_' + args.which_beta: 'MAMA_BETA'
            })
        ss = ss.key_by(
            locus=hl.locus(hl.str(ss.CHR), hl.int(ss.BP))).repartition(200)
        ss = ss.annotate(A1=ss.A1.upper(), A2=ss.A2.upper())

        ss.write(args.dirname + args.basename + '_sep.ht', True)

    ss = hl.read_table(args.dirname + args.basename + '_sep.ht')

    ################################################################################
    ### Run the PRS using phenotype-specific clump variants
    if args.write_bgen:
        mt_all = hl.import_bgen(
            bgen_files, ['dosage'],
            sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
            variants=clump_file.locus)

        samples = hl.import_table(args.dirname +
                                  'ukb_not_in_simulation_rand5000.inds',
                                  types={
                                      's': hl.tstr
                                  }).key_by('s')
        mt = mt_all.filter_cols(hl.is_defined(samples[mt_all.s]))

        mt.repartition(5000, shuffle=False).write(
            args.dirname + args.basename + '.mt', True)

    mt = hl.read_matrix_table(args.dirname + args.basename + '.mt')
    true_ss = hl.read_table(args.dirname +
                            'BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht')
    """
    To add:
    - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
    """
    # annotate variants with the processed summary statistics
    mt = mt.annotate_rows(ss=ss[mt.locus])
    mt = annotate_beta(mt, mt.ss)

    p_max = {
        's1': 5e-8,
        's2': 1e-6,
        's3': 1e-4,
        's4': 1e-3,
        's5': 1e-2,
        's6': .05,
        's7': .1,
        's8': .2,
        's9': .5,
        's10': 1
    }

    pheno_clump = specific_clumps(clumps)

    mt = mt.filter_rows(pheno_clump.get(mt.locus, False))
    print(mt.count())

    # Divide by the genotype standard deviation, sqrt(2 * FRQ * (1 - FRQ)), to convert
    # standardized MAMA betas back to the allelic scale (METAL betas are already allelic).
    if args.betas_are_standardized:
        annot_expr = {
            k: hl.agg.sum(mt.beta / hl.sqrt(2 * hl.float(mt.ss.FRQ) *
                                            (1 - hl.float(mt.ss.FRQ))) *
                          mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
            for k, v in p_max.items()
        }
    else:
        annot_expr = {
            k: hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
            for k, v in p_max.items()
        }

    mt = mt.annotate_cols(**annot_expr, **true_ss[mt.s])

    mt.key_cols_by().cols().write(out_base + '.ht',
                                  stage_locally=True,
                                  overwrite=True)
    ht = hl.read_table(out_base + '.ht')

    output_location = out_base + '.txt.bgz'
    ht.export(output_location)
    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))
Example #37
0
    # S3 credentials are required to access the datasets in the farm flexible compute S3 environment;
    # you may use your own here, from the .s3fg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    bed_to_exclude_pca = hl.import_bed(
        f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')
    cohorts_pop = hl.import_table(
        "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb_elgh_labels_updated.tsv", delimiter="\t").key_by('s')
    # Read mt
    mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/new_labels/chr1_chr20_ldpruned_updated.mt")
    # pca_scores_pop
    pca_scores_pop = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020.ht")

    ''' # pca_scores_superpop
    pca_scores_superpop = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020_superpops.ht")

    # annotate mt with pop and superpop
    mt = mt.annotate_cols(assigned_pop=pca_scores_pop[mt.s].pop)
    mt = mt.annotate_cols(assigned_superpop=pca_scores_superpop[mt.s].pop)

    # do sample_qc
    # calculate and annotate with metric heterozygosity
    mt_with_sampleqc = hl.sample_qc(mt, name='sample_qc')

    mt_with_sampleqc = mt_with_sampleqc.annotate_cols(sample_qc=mt_with_sampleqc.sample_qc.annotate(
        heterozygosity_rate=mt_with_sampleqc.sample_qc.n_het/mt_with_sampleqc.sample_qc.n_called))
Example #38
0
    ht_mfi['chrom'],
    hl.str(ht_mfi['position']), ht_mfi['allele1_ref'], ht_mfi['allele2_alt']
]),
                                            delimiter=':'))

# prep to merge with GWAS variant list
ht_mfi = ht_mfi.key_by('variant')
ht_mfi = ht_mfi.annotate(maf=hl.float(ht_mfi.maf), info=hl.float(ht_mfi.info))
ht_mfi = ht_mfi.select('varid', 'rsid', 'maf', 'info')

#######
# load GWAS variant list
#######

# get GWAS variant list
ht_sites = hl.read_table('gs://ukb31063/ukb31063.neale_gwas_variants.ht')
ht_sites = ht_sites.annotate(
    variant=hl.variant_str(ht_sites.locus, ht_sites.alleles))
ht_sites = ht_sites.key_by('variant')

########
# merge and save
########

# get final merged file with maf/info of the gwas variants
ht = ht_mfi.join(ht_sites, how='inner')
ht = ht.select('locus', 'alleles', 'varid', 'rsid', 'maf', 'info')
print(ht.count())

# save both ht and tsv
ht.write('gs://ukb31063/ukb31063.neale_gwas_variants.imputed_v3.mfi.ht',
Example #39
0
File: datasets.py    Project: bcajes/hail
def load_dataset(name,
                 version,
                 reference_genome,
                 config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------

    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load
        (see available versions in documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`"""

    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = set([dataset['name'] for dataset in datasets])
    if name not in names:
        raise ValueError('{} is not a dataset available in the repository.'.format(repr(name)))

    versions = set([dataset['version'] for dataset in datasets if dataset['name']==name])
    if version not in versions:
        raise ValueError("""Version {0} not available for dataset {1}.
                            Available versions: {{{2}}}.""".format(repr(version), 
                                                                   repr(name),
                                                                   repr('","'.join(versions))))

    reference_genomes = set([dataset['reference_genome'] for dataset in datasets if dataset['name']==name])
    if reference_genome not in reference_genomes:
        raise ValueError("""Reference genome build {0} not available for dataset {1}.
                            Available reference genome builds: {{'{2}'}}.""".format(repr(reference_genome),
                                                                                    repr(name), 
                                                                                    '\',\''.join((reference_genomes))))

    path = [dataset['path'] for dataset in datasets if all([dataset['name']==name,
                                                            dataset['version']==version,
                                                            dataset['reference_genome']==reference_genome])][0].strip('/')

    if path.endswith('.ht'):
        dataset = hl.read_table(path)
    else:
        if not path.endswith('.mt'):
            raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))
        dataset = hl.read_matrix_table(path)

    return dataset
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    # save relatedness estimates for pc_relate global populations
    ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_global = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_global_matrix.csv', 'analysis')
    pc_relate_global.to_csv(filename, index=False)

    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)

    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path('pc_relate_global_maximal_independent_set.csv',
                           'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for pc_relate NFE samples
    ht = hl.read_table(PC_RELATE_ESTIMATE_NFE)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_nfe = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_nfe_matrix.csv', 'analysis')
    pc_relate_nfe.to_csv(filename, index=False)
    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path('pc_relate_nfe_maximal_independent_set.csv',
                           'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for KING NFE samples
    mt = hl.read_matrix_table(KING_ESTIMATE_NFE)
    ht = mt.entries()
    # remove entries where samples are identical, then keep related pairs
    related_samples = ht.filter(ht.s_1 != ht.s)
    related_samples = related_samples.filter(related_samples.phi > 0.1)
    king_nfe = pd.DataFrame({
        'i_s': related_samples.s_1.collect(),
        'j_s': related_samples.s.collect(),
        'kin': related_samples.phi.collect(),
    })
    filename = output_path('king_nfe_matrix_90k.csv', 'analysis')
    king_nfe.to_csv(filename, index=False)
    # save KING NFE maximal independent set
    second_degree_related_samples = ht.filter(
        (ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    struct = hl.struct(i=second_degree_related_samples.s_1,
                       j=second_degree_related_samples.s)
    struct = struct.annotate(phi=second_degree_related_samples.phi)
    related_samples_to_remove = hl.maximal_independent_set(
        struct.i,
        struct.j,
        False  # pylint: disable=E1101
    )
    related_samples = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()})
    filename = output_path(
        'king_90k_related_samples_maximal_independent_set.csv', 'analysis')
    related_samples.to_csv(filename, index=False)