def test_read_stored_globals(self):
    ds = self.get_vds()
    ds = ds.annotate_globals(x=5, baz='foo')
    f = new_temp_file(suffix='vds')
    ds.write(f)
    t = hl.read_table(f + '/globals')
    self.assertTrue(ds.globals_table()._same(t))
def test_backward_compatability(self):
    import os

    all_values_table, all_values_matrix_table = create_all_values_datasets()

    table_dir = resource('backward_compatability/1.0.0/table')
    matrix_table_dir = resource('backward_compatability/1.0.0/matrix_table')

    n = 0

    i = 0
    f = os.path.join(table_dir, '{}.ht'.format(i))
    while os.path.exists(f):
        ds = hl.read_table(f)
        self.assertTrue(ds._same(all_values_table))
        i += 1
        f = os.path.join(table_dir, '{}.ht'.format(i))
        n += 1

    i = 0
    f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
    while os.path.exists(f):
        ds = hl.read_matrix_table(f)
        self.assertTrue(ds._same(all_values_matrix_table))
        i += 1
        f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
        n += 1

    self.assertEqual(n, 8)
def test_codecs_table(self):
    from hail.utils.java import scala_object
    codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
    rt = self.get_vds().rows()
    temp = new_temp_file(suffix='ht')
    for codec in codecs:
        rt.write(temp, overwrite=True, _codec_spec=codec.toString())
        rt2 = hl.read_table(temp)
        self.assertTrue(rt._same(rt2))
def test_fix3307_read_mt_wrong(self):
    mt = hl.import_vcf(resource('sample2.vcf'))
    mt = hl.split_multi_hts(mt)
    mt.write('/tmp/foo.mt', overwrite=True)
    mt2 = hl.read_matrix_table('/tmp/foo.mt')
    t = hl.read_table('/tmp/foo.mt/rows')
    self.assertTrue(mt.rows()._same(t))
    self.assertTrue(mt2.rows()._same(t))
    self.assertTrue(mt._same(mt2))
def test_large_number_of_fields(tmpdir):
    mt = hl.utils.range_table(100)
    mt = mt.annotate(**{str(k): k for k in range(1000)})
    f = tmpdir.join("foo.mt")
    assert_time(lambda: mt.count(), 5)
    assert_time(lambda: mt.write(str(f)), 5)
    mt = assert_time(lambda: hl.read_table(str(f)), 5)
    assert_time(lambda: mt.count(), 5)
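# The test above calls an `assert_time` helper that is not defined in this
# excerpt. A minimal sketch of what such a timing guard might look like
# (the name and behavior are assumptions based on how it is called):
import time

def assert_time(f, max_duration):
    # Run f, require that it finishes within max_duration seconds, and return its result.
    start = time.time()
    result = f()
    elapsed = time.time() - start
    assert elapsed < max_duration, f'took {elapsed:.2f}s, expected under {max_duration}s'
    return result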
def read_expression(path):
    """Read an :class:`Expression` written with :meth:`.experimental.write_expression`.

    Example
    -------
    >>> hl.experimental.write_expression(hl.array([1, 2]), 'output/test_expression.he')
    >>> expression = hl.experimental.read_expression('output/test_expression.he')
    >>> hl.eval(expression)
    [1, 2]

    Parameters
    ----------
    path : :obj:`str`
        File to read.

    Returns
    -------
    :class:`Expression`
    """
    return hl.read_table(path).index_globals().expr
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants # block variants for each phenotype n_phenotypes = mt.count_cols() mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=[ hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta) for i in range(n_blocks)]) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ 
mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n)/M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=[ hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) for i in range(n_blocks)]) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0/(mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
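# A quick numeric illustration (not part of the function above) of the LD score
# regression model E[chi_j^2] = 1 + N*a + (N * h_g^2 / M) * l_j, using assumed
# values for every quantity:
N = 50_000      # assumed study sample size
M = 1_000_000   # assumed number of variants used to estimate h_g^2
h2 = 0.2        # assumed SNP-heritability
a = 1e-6        # assumed confounding contribution per sample
l_j = 100.0     # assumed LD score of variant j

expected_chi_sq = 1 + N * a + (N * h2 / M) * l_j
print(expected_chi_sq)  # 1 + 0.05 + 1.0 = 2.05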
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near `maximal independent set
    <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...     pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...         s=related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----
    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those in
    the maximal independent set, or those in the complement of this set. This
    is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but the set may not always be
    perfectly maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field, this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """

    if i.dtype != j.dtype:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to have same type. "
            "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError(
            "'maximal_independent_set' expects an expression of 'Table'. Found {}".format(
                "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

    mis_nodes = Env.hail().utils.Graph.maximalIndependentSet(
        edges._jt.collect(),
        node_t._parsable_string(),
        joption(tie_breaker_str))

    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    # avoid serializing `mis_nodes` from java to python and back to java
    nodes = Table._from_java(
        nodes._jt.annotateGlobal(mis_nodes, hl.tset(node_t)._parsable_string(), 'mis_nodes'))
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        return nodes.key_by('node')
    return nodes
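# The docstring above requires tie_breaker(l, r) == -tie_breaker(r, l). A tiny
# plain-Python illustration of that property, using the usual integer ordering
# `l - r` as the tie breaker (the helper name and values are arbitrary):
def _int_tie_breaker(l, r):
    return l - r

for l, r in [(1, 2), (3, 3), (5, 0)]:
    assert _int_tie_breaker(l, r) == -_int_tie_breaker(r, l)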
def prepare_variant_results(table_urls):
    annotations = None
    analysis_groups = []

    for annotations_table_url, results_table_url in table_urls:
        group_annotations = hl.import_table(
            annotations_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "in_analysis": hl.tbool,
                "gene_id": hl.tstr,
                "gene_name": hl.tstr,
                "transcript_id": hl.tstr,
                "hgvsc": hl.tstr,
                "hgvsp": hl.tstr,
                "csq_analysis": hl.tstr,
                "csq_worst": hl.tstr,
                "mpc": hl.tfloat,
                "polyphen": hl.tstr,
            },
        )

        group_results = hl.import_table(
            results_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "analysis_group": hl.tstr,
                "ac_case": hl.tint,
                "an_case": hl.tstr,
                "af_case": hl.tstr,
                "ac_ctrl": hl.tint,
                "an_ctrl": hl.tstr,
                "af_ctrl": hl.tstr,
            },
        )

        groups_in_table = group_results.aggregate(
            hl.agg.collect_as_set(group_results.analysis_group))
        assert len(groups_in_table) == 1, groups_in_table
        group_name = groups_in_table.pop()
        analysis_groups.append(group_name)

        group_results = group_results.annotate(
            an_case=hl.int(group_results.an_case),
            af_case=hl.float(group_results.af_case),
            an_ctrl=hl.int(group_results.an_ctrl),
            af_ctrl=hl.float(group_results.af_ctrl),
            in_analysis=group_annotations[group_results.v].in_analysis,
        )
        group_results.drop("analysis_group").write(f"temp_{group_name}.ht")

        group_annotations = group_annotations.drop("in_analysis")
        if annotations is None:
            annotations = group_annotations
        else:
            annotations = annotations.union(group_annotations)

    annotations = annotations.distinct()

    annotations = annotations.annotate(
        filters="PASS",
        csq_analysis=hl.sorted(annotations.csq_analysis.split(","),
                               lambda c: consequence_term_rank(c))[0],
        csq_worst=hl.sorted(annotations.csq_worst.split(","),
                            lambda c: consequence_term_rank(c))[0],
        canonical_transcript_id=annotations.transcript_id,
        hgvsc_canonical=annotations.hgvsc,
        hgvsp_canonical=annotations.hgvsp,
    )

    annotations = annotations.annotate(
        locus=hl.locus(annotations.v.split(":")[0], hl.int(annotations.v.split(":")[1])),
        alleles=annotations.v.split(":")[2:4],
    )

    annotations = annotations.annotate(
        variant_id=variant_id(annotations.locus, annotations.alleles),
        chrom=annotations.locus.contig,
        pos=annotations.locus.position,
        xpos=x_position(annotations.locus),
        alt=annotations.alleles[1],
        ref=annotations.alleles[0],
    )

    annotations = annotations.drop("locus", "alleles")

    annotations = annotations.annotate(groups=hl.struct())
    for group_name in analysis_groups:
        results = hl.read_table(f"temp_{group_name}.ht")
        annotations = annotations.annotate(groups=annotations.groups.annotate(
            **{group_name: results[annotations.key]}))

    annotations = annotations.key_by().drop("v")

    return annotations
def tx_annotate_mt(mt, gtex, tx_annotation_type, tissues_to_filter=v7_tissues_to_drop, gene_maximums_ht_path=gtex_v7_gene_maximums_ht_path, filter_to_csqs=all_coding_csqs, filter_to_genes=None, gene_column_in_mt=None, filter_to_homs=False, out_tx_annotation_tsv=None, out_tx_annotation_ht=None): """ Annotate variants in the input MatrixTable with transcript-based expression values accross GTEx. Returns Table. :param MatrixTable mt: Input variant file :param MatrixTable gtex: Input GTEx summary MatrixTable, must have transcript_id column to key by :param str tx_annotation_type: One of ["expression", "proportion"]. Select proportion if you'd like the tx_annotation values to be normalized by max expression of the gene :param None or list filter_to_csqs: Default None. If you'd like to filter the mt before annotating (decreases time) feed in a list or set of consequence terms. :param str gene_column_in_mt: Must be set if filter_to_genes != None. Column in matrix table that contains gene information within vep.transcript_consequences. often ["gene_id", "gene_symbol"] :param None or list filter_to_csqs: Default None. If you'd like to filter the mt before annotating (decreases time) feed in a list or set of consequence terms. Example = ["stop_gained","splice_donor_variant", "splice_acceptor_variant","frameshift_variant"] :param None or str out_tx_annotation_tsv: Default None. If you'd like to write out the results table as a tsv, provide a tsv path :param None or str out_tx_annotation_ht: Default None. If you'd like to write out the results table as a Hail 0.2 table, provide a .ht path :param bool filter_to_homs: Default False If True, filter to variants with at least one homozygote in dataset :return: Table with columns: variant, worst_csq, ensg, LOFTEE LOF, LOFTEE LOF Flag, transcript-aware expression by GTEx Tissue :rtype: Table with variants annotated with transcript-aware tissue expression """ #check_inputs(**locals()) gtex_table = gtex.key_by("transcript_id") #mt = process_consequences(mt, penalize_flags=False) mt_exploded = mt.distinct_by_row() mt_exploded = mt_exploded.annotate_rows(vep=mt_exploded.vep.annotate( transcript_consequences=mt_exploded.vep.transcript_consequences.map( add_most_severe_consequence_to_consequence))) # Explode the mt for the transcript consequences to be able to key by transcript ID mt_exploded = mt_exploded.explode_rows( mt_exploded.vep.transcript_consequences) mt_kt = mt_exploded.rows() # Currently testing removal of protein coding transcripts mt_kt = mt_kt.filter( mt_kt.vep.transcript_consequences.biotype == "protein_coding") if filter_to_genes: print("Filtering to genes of interest") mt_kt = filter_table_to_gene_list(mt_kt, filter_to_genes, gene_column_in_mt) if filter_to_csqs: print("Filtering to csqs in %s" % (",".join(filter_to_csqs))) mt_kt = filter_table_to_csqs(mt_kt, filter_to_csqs) if filter_to_homs: print( "Filtering to variants with at least 1 homozygote sample in dataset" ) #mt_kt = mt_kt.filter(mt_kt.info.Hom[mt_kt.a_index - 1] > 0) idx = mt_kt.globals.freq_index_dict['gnomad'] mt_kt = mt_kt.filter(mt_kt.freq[idx].homozygote_count >= 1) # Annotate mt with the gtex values (ie. 
join them) mt_kt = mt_kt.annotate( tx_data=gtex_table[mt_kt.vep.transcript_consequences.transcript_id]) # Group by gene, worst_csq and variant, and do a pairwise-sum grouped_table = (mt_kt.group_by( csq=mt_kt.vep.transcript_consequences.most_severe_consequence, ensg=mt_kt.vep.transcript_consequences.gene_id, symbol=mt_kt.vep.transcript_consequences.gene_symbol, locus=mt_kt.locus, alleles=mt_kt.alleles, lof=mt_kt.vep.transcript_consequences.lof, lof_flag=mt_kt.vep.transcript_consequences.lof_flags).aggregate( tx_annotation=hl.agg.array_sum(mt_kt.tx_data.agg_expression))) # Expand the columns from the arrays and add tissues as headers #tissue_ids = gtex.tissue.collect() # Since gtex no longer has .tissue just a new way to do this, i probably want to save it as a global at some point tissue_ids = sorted([y.tissue for y in gtex.values.take(1)[0]]) d = {tiss: i for i, tiss in enumerate(tissue_ids)} tx_annotation_table = grouped_table.annotate( **{ tissue_id.replace("-", "_").replace(" ", "_").replace("(", "_"). replace(")", "_"): grouped_table.tx_annotation[d[tissue_id]] for tissue_id in tissue_ids }) tx_annotation_table = tx_annotation_table.drop( tx_annotation_table.tx_annotation) # First of all do you want proportions or expression? if tx_annotation_type == "proportion": print("Returning expression proportion") gene_maximums_ht = hl.read_table(gene_maximums_ht_path) tx_annotation_table = get_expression_proportion( tx_annotation_table, tissues_to_filter, gene_maximums_ht) #You can write the output that is exploded by variants-ensg-csq-symbol-LOFTEE-LOFTEEflag # and has a value for each tissue as column, either as a TSV or a KT if out_tx_annotation_tsv: print("Writing tsv file to %s" % out_tx_annotation_tsv) tx_annotation_table.export(out_tx_annotation_tsv) if out_tx_annotation_ht: print("Writing Table to %s" % out_tx_annotation_ht) tx_annotation_table.write(out_tx_annotation_ht) tx_annotation_table = tx_annotation_table.key_by( tx_annotation_table.locus, tx_annotation_table.alleles) tx_annotation_table = tx_annotation_table.collect_by_key('tx_annotation') mt = mt.annotate_rows(**tx_annotation_table[mt.locus, mt.alleles]) return mt
def prepare_mitochondrial_variants(path, mnvs_path=None): ds = hl.read_table(path) haplogroups = hl.eval(ds.globals.hap_order) ds = ds.annotate(hl_hist=ds.hl_hist.annotate( bin_edges=ds.hl_hist.bin_edges.map( lambda n: hl.float(hl.format("%.2f", n))))) filter_names = hl.dict({ "artifact_prone_site": "Artifact-prone site", "indel_stack": "Indel stack", "npg": "No passing genotype" }) ds = ds.select( # ID variant_id=variant_id(ds.locus, ds.alleles), reference_genome=ds.locus.dtype.reference_genome.name, chrom=normalized_contig(ds.locus.contig), pos=ds.locus.position, ref=ds.alleles[0], alt=ds.alleles[1], rsid=ds.rsid, # Quality filters=ds.filters.map(lambda f: filter_names.get(f, f)), qual=ds.qual, genotype_quality_metrics=[ hl.struct(name="Depth", alt=ds.dp_hist_alt, all=ds.dp_hist_all) ], genotype_quality_filters=[ hl.struct( name="Base Quality", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.base_qual_hist), ), hl.struct( name="Contamination", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.contamination_hist), ), hl.struct( name="Heteroplasmy below 10%", filtered=hl.struct( bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.heteroplasmy_below_10_percent_hist), ), hl.struct(name="Position", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.position_hist)), hl.struct( name="Strand Bias", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.strand_bias_hist), ), hl.struct( name="Weak Evidence", filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.weak_evidence_hist), ), ], site_quality_metrics=[ hl.struct(name="Mean Depth", value=nullify_nan(ds.dp_mean)), hl.struct(name="Mean MQ", value=nullify_nan(ds.mq_mean)), hl.struct(name="Mean TLOD", value=nullify_nan(ds.tlod_mean)), ], # Frequency an=ds.AN, ac_hom=ds.AC_hom, ac_het=ds.AC_het, excluded_ac=ds.excluded_AC, # Heteroplasmy common_low_heteroplasmy=ds.common_low_heteroplasmy, heteroplasmy_distribution=ds.hl_hist, max_heteroplasmy=ds.max_hl, # Haplogroups hapmax_af_hom=ds.hapmax_AF_hom, hapmax_af_het=ds.hapmax_AF_het, faf_hapmax_hom=ds.faf_hapmax_hom, haplogroup_defining=ds.hap_defining_variant, haplogroups=[ hl.struct( id=haplogroup, an=ds.hap_AN[i], ac_het=ds.hap_AC_het[i], ac_hom=ds.hap_AC_hom[i], faf_hom=ds.hap_faf_hom[i], heteroplasmy_distribution=ds.hap_hl_hist[i], ) for i, haplogroup in enumerate(haplogroups) ], # Other age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom), flags=hl.set([ hl.or_missing(ds.common_low_heteroplasmy, "common_low_heteroplasmy") ]).filter(hl.is_defined), mitotip_score=ds.mitotip_score, mitotip_trna_prediction=ds.mitotip_trna_prediction, pon_ml_probability_of_pathogenicity=ds. pon_ml_probability_of_pathogenicity, pon_mt_trna_prediction=ds.pon_mt_trna_prediction, variant_collapsed=ds.variant_collapsed, vep=ds.vep, ) if mnvs_path: mnvs = hl.import_table(mnvs_path, types={ "pos": hl.tint, "ref": hl.tstr, "alt": hl.tstr, "AC_hom_MNV": hl.tint }) mnvs = mnvs.key_by( locus=hl.locus("chrM", mnvs.pos, reference_genome=ds.locus.dtype.reference_genome), alleles=[mnvs.ref, mnvs.alt], ) ds = ds.annotate(ac_hom_mnv=hl.or_else(mnvs[ds.key].AC_hom_MNV, 0)) ds = ds.annotate( flags=hl.if_else(ds.ac_hom_mnv > 0, ds.flags.add("mnv"), ds.flags)) return ds
# path for Julia's sample metadata file
jul_metadata_path = (
    'gs://hgdp_tgp/output/gnomad_v3.1_sample_qc_metadata_hgdp_tgp_subset.ht')

# path for variant qc info
var_metadata_path = 'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht'

# path for Konrad's densified matrix table
dense_mt_path = 'gs://hgdp_tgp/output/tgp_hgdp.mt'

# reading in Alicia's sample metadata file (Note: this file uses the 'v3.1::' prefix as done in gnomAD)
sample_meta = hl.import_table(sample_metadata_path, impute=True)

# reading in Julia's sample metadata file
jul_meta = hl.read_table(jul_metadata_path)

# reading in variant qc information
var_meta = hl.read_table(var_metadata_path)

# reading in densified matrix table
dense_mt = hl.read_matrix_table(dense_mt_path)

# These bits below were written by Tim Poterba to help troubleshoot unflattening a ht with nested structure
# dict to hold struct names as well as nested field names
d = {}

# Getting just the row field names
row = sample_meta.row_value

# returns a dict with the struct names as keys and their inner field names as values
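# The loop that fills `d` is not included in this excerpt. A minimal sketch of
# what it might do, assuming each top-level row field that is itself a struct
# should map to the names of the fields nested inside it:
for name, typ in row.dtype.items():
    if isinstance(typ, hl.tstruct):
        d[name] = list(typ)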
def compatible_checkpoint(obj, path):
    obj.write(path, overwrite=True)
    return hl.read_table(path)
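# A usage sketch of the helper above; the table and output path are placeholders:
ht = hl.utils.range_table(10)
ht = compatible_checkpoint(ht, 'output/example_checkpoint.ht')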
        f'Invalid sex argument "{sex}" - must be one of {{"both_sexes", "female", "male"}}.')

if contig not in set(['autosomes', 'chrX', 'chrXY']):
    raise ValueError(
        f'Invalid contig argument "{contig}" - must be one of {{"autosomes", "chrX", "chrXY"}}.')

try:
    dilution = sys.argv[4]
except:
    dilution = False
else:
    dilution = True

ht_phenotypes = hl.read_table(
    f'gs://ukb31063-mega-gwas/biomarkers/pipelines/ukb31063.biomarkers_gwas.{sex}.pipeline_{pipeline}.ht')
ht_covariates = hl.read_table(
    f'gs://ukb31063/hail/ukb31063.neale_gwas_covariates.{sex}.ht')
ht_variants = hl.read_table(
    'gs://ukb31063/hail/ukb31063.neale_gwas_variants.ht')

if dilution:
    ht = hl.read_table(f'gs://ukb31063/hail/ukb31063.biomarkers_gwas.{sex}.ht')
    ht = ht.select('estimated_sample_dilution_factor_raw')
    ht_covariates = ht_covariates.annotate(
        estimated_sample_dilution_factor=ht[ht_covariates.s]['estimated_sample_dilution_factor_raw'])

if contig == 'autosomes':
    contig_expr = 'chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}'
else:
def load_dataset(name,
                 version,
                 reference_genome,
                 config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------
    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load (see available versions in
        documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`
    """

    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = set([dataset['name'] for dataset in datasets])
    if name not in names:
        raise ValueError(
            '{} is not a dataset available in the repository.'.format(repr(name)))

    versions = set([dataset['version'] for dataset in datasets
                    if dataset['name'] == name])
    if version not in versions:
        raise ValueError("""Version {0} not available for dataset {1}.
                            Available versions: {{{2}}}.""".format(
                                repr(version),
                                repr(name),
                                repr('","'.join(versions))))

    reference_genomes = set([dataset['reference_genome'] for dataset in datasets
                             if dataset['name'] == name])
    if reference_genome not in reference_genomes:
        raise ValueError("""Reference genome build {0} not available for dataset {1}.
                            Available reference genome builds: {{'{2}'}}.""".format(
                                repr(reference_genome),
                                repr(name),
                                '\',\''.join((reference_genomes))))

    path = [dataset['path'] for dataset in datasets
            if all([dataset['name'] == name,
                    dataset['version'] == version,
                    dataset['reference_genome'] == reference_genome])][0].strip('/')

    if path.endswith('.ht'):
        dataset = hl.read_table(path)
    else:
        if not path.endswith('.mt'):
            raise ValueError(
                'Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))
        dataset = hl.read_matrix_table(path)

    return dataset
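# For reference, load_dataset only reads the 'name', 'version',
# 'reference_genome', and 'path' keys from each entry in datasets.json.
# An illustrative (not authoritative) entry with placeholder values:
example_entry = {
    'name': '1000_genomes',
    'version': 'phase3',
    'reference_genome': 'GRCh38',
    'path': 'gs://hail-datasets/1000_genomes.phase3.GRCh38.mt'  # placeholder path
}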
# need to create spark cluster first before initialising hail
sc = pyspark.SparkContext()

# Define the hail persistent storage directory
tmp_dir = "hdfs://spark-master:9820/"
temp_dir = "file:///home/ubuntu/data/tmp"
plot_dir = "/home/ubuntu/data/tmp"
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")

# s3 credentials required for user to access the datasets in farm flexible compute s3 environment
# you may use your own here from your .s3cfg file in your home directory
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

n_partitions = 500

omni = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_omni2.5.hg38.ht'
omni_ht = hl.read_table(omni)
mills = f'{temp_dir}/ddd-elgh-ukbb/training_sets/Mills_and_1000G_gold_standard.indels.hg38.ht'
mills_ht = hl.read_table(mills)
thousand_genomes = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_phase1.snps.high_confidence.hg38.ht'
thousand_genomes_ht = hl.read_table(thousand_genomes)
hapmap = f'{temp_dir}/ddd-elgh-ukbb/training_sets/hapmap_3.3.hg38.ht'
hapmap_ht = hl.read_table(hapmap)

# ANNOTATION TABLES:
truth_data_ht = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht')
trio_stats_table = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
# inbreeding_ht = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
allele_data_ht = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_allele_data.ht')
allele_counts_ht = hl.read_table(
def get_overlapping_phenos(apcdr_ukb, gwas_phenos, gwas_biomarkers, pheno_table, overwrite): # get which phenotypes exist in apcdr data pheno_gwas = hl.import_table(apcdr_ukb) pheno_gwas = { row['pheno_code']: row['ukb_code'] for row in pheno_gwas.collect() } incl_whr = False if 'WHR' in pheno_gwas: del pheno_gwas['WHR'] incl_whr = True if 'EA' in pheno_gwas: educ = hl.import_table( 'gs://ukb-diverse-pops/Phenotypes/Everyone/PHESANT_final_output/January_2020_plus_pharma_and_updated_codings/phesant_output_multi_ancestry_combined_both_sexes_no_sex_specific_educ_att_years.tsv', missing='', impute=True, min_partitions=100, key='userId', types={ 'userId': hl.tstr, 'EDUC_ATT_CAT_ORD': hl.tint }) # read ukb data ht_phenotypes = hl.import_table(gwas_phenos, force_bgz=True, missing='', impute=True, min_partitions=100, types={'s': hl.tstr}, key='s') phenotype_cols = set(ht_phenotypes.row) irnt = [] raw_phenos = [] biomarker = [] for pheno in pheno_gwas.values(): if pheno + '_irnt' in phenotype_cols: irnt.append(pheno) elif pheno in phenotype_cols: raw_phenos.append(pheno) elif pheno + '_0' in phenotype_cols: raw_phenos.append(pheno + '_0') else: biomarker.append(pheno) print(pheno_gwas.values()) if incl_whr: irnt.append('whr') ht_phenotypes = ht_phenotypes.annotate(whr=ht_phenotypes['48_raw'] / ht_phenotypes['49_raw']) ht_phenotypes = irnt_funct(ht_phenotypes.whr, 'whr_irnt').key_by('s') if 'EA' in pheno_gwas: ht_phenotypes = ht_phenotypes.annotate( ea=educ[ht_phenotypes.key].EDUC_ATT_CAT_ORD) raw_phenos.append('ea') print('adding EA') # now select phenotypes that are in apcdr data ht_phenos = ht_phenotypes.select(*[x + '_irnt' for x in irnt] + raw_phenos) ht_phenos.show() # filter biomarkers to codes biomarkers = [ 'cholesterol_irnt', 'hdl_cholesterol_irnt', 'ldl_irnt', 'triglycerides_irnt', 'albumin_irnt', 'alkaline_phosphatase_irnt', 'alanine_aminotransferase_irnt', 'aspartate_aminotransferase_irnt', 'direct_bilirubin_irnt', 'gamma_glutamyltransferase_irnt', 'glycated_haemoglobin_irnt' ] if gwas_biomarkers: ht_biomarkers = hl.read_table(gwas_biomarkers).select(*biomarkers) # now join biomarkers with phenotypes ht_all_phenos = ht_phenos.join(ht_biomarkers) ht_all_phenos.write(pheno_table, overwrite=args.overwrite) else: ht_phenos.write(pheno_table, overwrite=args.overwrite)
def main(args): # get phenotypes that overlap with APCDR dataset if args.write_phenos: get_overlapping_phenos(args.pheno_ukb_codes, args.gwas_phenos, args.gwas_biomarkers, args.pheno_table, args.overwrite) ht_phenos = hl.read_table(args.pheno_table) ht_covariates = hl.read_table(args.gwas_covariates) ht_variants = hl.read_table(args.gwas_variants) ht_samples = hl.import_table(args.gwas_samples, types={'s': hl.tstr}, key='s') contig = 'autosomes' contig_expr = 'chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}' # import ukb bgen files mt = hl.import_bgen( path= f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen', sample_file=f'gs://ukb31063/ukb31063.{contig}.sample', entry_fields=['dosage'], variants=ht_variants) # add phenotype and covariate info mt = mt.annotate_cols(phenotypes=ht_phenos[mt.s], covariates=ht_covariates[mt.s]) # filter keeping samples in gwas, i.e. exclude holdout samples # mt = mt.filter_cols(ht_samples[mt.s].in_gwas == 'TRUE') disc_or_target = 'target' if disc_or_target == 'target': mt = mt.filter_cols(ht_samples[mt.s].in_gwas == 'FALSE') # target else: mt = mt.filter_cols(ht_samples[mt.s].in_gwas == 'TRUE') # discovery # mt.write(args.mt, overwrite=args.overwrite) # # mt = hl.read_matrix_table(args.mt) phenotypes = list(mt['phenotypes'].keys()) pheno1 = phenotypes[0:10] pheno2 = phenotypes[10:20] pheno3 = phenotypes[20:30] pheno4 = phenotypes[30:len(phenotypes)] pheno_leftover = ['whr_irnt', 'glycated_haemoglobin_irnt'] # run_grouped_regressions(mt, args.holdout_ss_output, pheno1, 'pheno1') # run_grouped_regressions(mt, args.holdout_ss_output, pheno2, 'pheno2') # run_grouped_regressions(mt, args.holdout_ss_output, pheno3, 'pheno3') # run_grouped_regressions(mt, args.holdout_ss_output, pheno4, 'pheno4') # run_grouped_regressions(mt, args.holdout_ss_output, pheno_leftover, 'pheno_leftover') if disc_or_target == 'target': run_grouped_regressions(mt, args.holdout_ss_output, phenotypes, 'target_holdout2') else: run_grouped_regressions(mt, args.holdout_ss_output, phenotypes, 'discovery')
import argparse

import hail as hl

p = argparse.ArgumentParser()
p.add_argument("--input-url", required=True)
p.add_argument("--genes-url", required=True)
p.add_argument("--output-url", required=True)
args = p.parse_args()

hl.init(log="/tmp/hail.log")

ds = hl.read_table(args.input_url)

ds = ds.annotate(analysis_group="meta")

genes = hl.read_table(args.genes_url)
genes = genes.key_by("gene_id")
ds = ds.annotate(chrom=genes[ds.gene_id].chrom, pos=genes[ds.gene_id].start)

ds.write(args.output_url)
import hail as hl

ht_samples = hl.read_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_samples.ht')
ht_relationships = hl.read_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_sample_relationships.ht')

mt = hl.import_vcf(
    'gs://hail-datasets-raw-data/1000_Genomes/1000_Genomes_phase3_chrY_GRCh37.vcf.bgz',
    reference_genome='GRCh37')

mt = mt.annotate_cols(**ht_samples[mt.s])
mt = mt.annotate_cols(**ht_relationships[mt.s])

mt_split = hl.split_multi(mt)
mt_split = mt_split.select_entries(
    GT=hl.downcode(mt_split.GT, mt_split.a_index))
mt_split = mt_split.annotate_rows(info=hl.struct(
    DP=mt_split.info.DP,
    END=mt_split.info.END,
    SVTYPE=mt_split.info.SVTYPE,
    AA=mt_split.info.AA,
    AC=mt_split.info.AC[mt_split.a_index - 1],
    AF=mt_split.info.AF[mt_split.a_index - 1],
    NS=mt_split.info.NS,
    AN=mt_split.info.AN,
    EAS_AF=mt_split.info.EAS_AF[mt_split.a_index - 1],
    EUR_AF=mt_split.info.EUR_AF[mt_split.a_index - 1],
    AFR_AF=mt_split.info.AFR_AF[mt_split.a_index - 1],
    AMR_AF=mt_split.info.AMR_AF[mt_split.a_index - 1],
    SAS_AF=mt_split.info.SAS_AF[mt_split.a_index - 1],
def main(): print("main") run_hash = "91ba5f38" ht=hl.read_table(f'{lustre_dir}/variant_qc/models/{run_hash}_score_binning.ht') mt = hl.read_matrix_table( f'{lustre_dir}/MegaWESSanger_cohorts_sampleQC_filtered.mt') table_cohort = hl.import_table( f"{lustre_dir}/sanger_cohorts_corrected_ukbb_july_2020.tsv", delimiter="\t").key_by('s') mt = mt.annotate_cols(cohort=table_cohort[mt.s].cohort) df = pd.read_csv( f"{lustre_dir}/sanger_cohorts_corrected_ukbb_july_2020.tsv", sep="\t") cohorts_array = df.cohort.unique() mt = mt.annotate_rows( MAF_cohorts=hl.agg.group_by(mt.cohort, hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AF)) ) mt = mt.annotate_rows( AN_cohorts=hl.agg.group_by(mt.cohort, hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AN)) ) mt = mt.annotate_rows( AC_cohorts=hl.agg.group_by(mt.cohort, hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AC)) ) mt = mt.annotate_rows( missingness_cohorts=hl.agg.group_by(mt.cohort, hl.min( (hl.agg.count_where(hl.is_missing(mt['GT']))) / mt.count_rows()*2)) ) mt = mt.annotate_rows( info=mt.info.annotate(cohort_names=mt.MAF_cohorts.keys()) ) mt = mt.annotate_rows( info=mt.info.annotate(MAF_cohorts_values=mt.MAF_cohorts.values()) ) mt = mt.annotate_rows( info=mt.info.annotate(AN_cohorts_values=mt.AN_cohorts.values()) ) mt = mt.annotate_rows( info=mt.info.annotate(AC_cohorts=mt.AC_cohorts.values()) ) mt = mt.annotate_rows( info=mt.info.annotate( missingness_cohorts_values=mt.missingness_cohorts.values()) ) mt = mt.annotate_rows( Variant_Type=hl.cond((hl.is_snp(mt.alleles[0], mt.alleles[1])), "SNP", hl.cond( hl.is_insertion( mt.alleles[0], mt.alleles[1]), "INDEL", hl.cond(hl.is_deletion(mt.alleles[0], mt.alleles[1]), "INDEL", "Other")))) mt = mt.annotate_rows( info=mt.info.annotate( rf_probability=ht[mt.row_key].rf_probability['TP']) ) mt = mt.annotate_rows( info=mt.info.annotate(score=ht[mt.row_key].score) ) mt = mt.annotate_rows( info=mt.info.annotate(bin=ht[mt.row_key].bin) ) filter_column_annotation = ( hl.case() .when(((mt.Variant_Type == "SNP") & (mt.info.bin <= SNV_PASS_BIN)), "PASS") .when(((mt.Variant_Type == "INDEL") & (mt.info.bin <= INDEL_PASS_BIN)), "PASS") .default(".") # not pass for rest ) # mt_annotated = mt.annotate_rows(mt.filters=filter_column_annotation) mt1 = mt.annotate_rows( filtercol=((filter_column_annotation)) ) mt_fail = mt1.filter_rows(mt1.filtercol == ".") print(mt_fail.count()) mt2 = mt1.annotate_rows(filters=mt1.filters.add(mt1.filtercol)) mt_fail2 = mt2.filter_rows(mt2.filters.contains(".")) mt_pass = mt2.filter_rows(mt2.filters.contains("PASS")) print(f'Failed:{mt_fail2.count()}') print(f'Passed:{mt_pass.count()}') mt2 = mt2.checkpoint( f'{lustre_dir}/variant_qc/megaWES_final_after_RF_{run_hash}.mt', overwrite=True) #Remove gt and entries and samples mt1 = mt2.select_entries() mt_fin = mt2.filter_cols(mt2['s'] == 'sample') chroms=[*range(1,23),"X","Y"] chromosomes=["chr"+ str(chr) for chr in chroms] for chromosome in chromosomes: print(chromosome) mt=mt_fin.filter_rows(mt_fin.locus.contig==chromosome) mt.write(f'{lustre_dir}/final_matrixtables_VCFs/{chromosome}_after_RF_{run_hash}_NOSAMPLES_GT.mt',overwrite=True) hl.export_vcf( mt, f'{lustre_dir}/final_matrixtables_VCFs/VCFs/{chromosome}_after_RF_{run_hash}_LOCI_only',parallel='separate_header')
def compute_from_full_mt(chr20: bool, overwrite: bool): mt = get_gnomad_data('exomes', adj=True, release_samples=True) freq_ht = hl.read_table(annotations_ht_path('exomes', 'frequencies')) vep_ht = hl.read_table(annotations_ht_path('exomes', 'vep')) rf_ht = hl.read_table(annotations_ht_path('exomes', 'rf')) if chr20: mt, freq_ht, vep_ht, rf_ht = filter_to_chr20([mt, freq_ht, vep_ht, rf_ht]) vep_ht = vep_ht.annotate( vep=get_worst_gene_csq_code_expr(vep_ht.vep).values() ) freq_ht = freq_ht.select( freq=freq_ht.freq[:10], popmax=freq_ht.popmax ) freq_meta = hl.eval(freq_ht.globals.freq_meta) freq_dict = {f['pop']: i for i, f in enumerate(freq_meta[:10]) if 'pop' in f} freq_dict['all'] = 0 freq_dict = hl.literal(freq_dict) mt = mt.annotate_rows( **freq_ht[mt.row_key], vep=vep_ht[mt.row_key].vep, filters=rf_ht[mt.row_key].filters ) mt = mt.filter_rows( (mt.freq[0].AF <= MAX_FREQ) & (hl.len(mt.vep) > 0) & (hl.len(mt.filters) == 0) ) mt = mt.filter_entries(mt.GT.is_non_ref()) mt = mt.select_entries( is_het=mt.GT.is_het() ) mt = mt.explode_rows(mt.vep) mt = mt.transmute_rows(**mt.vep) mt = mt.annotate_cols( pop=['all', mt.meta.pop] ) mt = mt.explode_cols(mt.pop) mt = mt.group_rows_by( 'gene_id' ).aggregate_rows( gene_symbol=hl.agg.take(mt.gene_symbol, 1)[0] ).aggregate( counts=hl.agg.filter( hl.if_else( mt.pop == 'all', hl.is_defined(mt.popmax) & (mt.popmax.AF <= MAX_FREQ), mt.freq[freq_dict[mt.pop]].AF <= MAX_FREQ ), hl.agg.group_by( hl.if_else( mt.pop == 'all', mt.popmax.AF > 0.001, mt.freq[freq_dict[mt.pop]].AF > 0.001 ), hl.struct( hom_csq=hl.agg.filter(~mt.is_het, hl.agg.min(mt.csq)), het_csq=hl.agg.filter(mt.is_het, hl.agg.min(mt.csq)), het_het_csq=hl.sorted( hl.array( hl.agg.filter(mt.is_het, hl.agg.counter(mt.csq)) ), key=lambda x: x[0] ).scan( lambda i, j: (j[0], i[1] + j[1]), (0, 0) ).find( lambda x: x[1] > 1 )[0] ) ) ) ) mt = mt.annotate_entries( counts=hl.struct( all=hl.struct( hom_csq=hl.min(mt.counts.get(True).hom_csq, mt.counts.get(False).hom_csq), het_csq=hl.min(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq), het_het_csq=hl.min( mt.counts.get(True).het_het_csq, mt.counts.get(False).het_het_csq, hl.or_missing( hl.is_defined(mt.counts.get(True).het_csq) & hl.is_defined(mt.counts.get(False).het_csq), hl.max(mt.counts.get(True).het_csq, mt.counts.get(False).het_csq) ) ), ), af_le_0_001=mt.counts.get(False) ) ) mt = mt.checkpoint('gs://gnomad-tmp/compound_hets/het_and_hom_per_gene{}.1.mt'.format( '.chr20' if chr20 else '' ), overwrite=True) gene_ht = mt.annotate_rows( row_counts=hl.flatten([ hl.array( hl.agg.group_by( mt.pop, hl.struct( csq=csq, af=af, n_hom=hl.agg.count_where(mt.counts[af].hom_csq == csq_i), n_het=hl.agg.count_where(mt.counts[af].het_csq == csq_i), n_het_het=hl.agg.count_where(mt.counts[af].het_het_csq == csq_i) ) ) ).filter( lambda x: (x[1].n_het > 0) | (x[1].n_hom > 0) | (x[1].n_het_het > 0) ).map( lambda x: x[1].annotate( pop=x[0] ) ) for csq_i, csq in enumerate(CSQ_CODES) for af in ['all', 'af_le_0_001'] ]) ).rows() gene_ht = gene_ht.explode('row_counts') gene_ht = gene_ht.select( 'gene_symbol', **gene_ht.row_counts ) gene_ht.describe() gene_ht = gene_ht.checkpoint( 'gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.ht'.format( '.chr20' if chr20 else '' ), overwrite=overwrite ) gene_ht.flatten().export('gs://gnomad-lfran/compound_hets/het_and_hom_per_gene{}.tsv.gz'.format( '.chr20' if chr20 else '' ))
def main(args): # Set paths for data access based on command line parameters root = './data' context_ht_path = f'{root}/context/Homo_sapiens_assembly19.fasta.snps_only.vep_20181129.ht' processed_genomes_ht_path = f'{root}/model/genomes_processed.ht' processed_exomes_ht_path = f'{root}/model/exomes_processed.ht' mutation_rate_ht_path = f'{root}/model/mutation_rate_methylation_bins.ht' po_coverage_ht_path = f'{root}/model/prop_observed_by_coverage_no_common_pass_filtered_bins.ht' po_ht_path = f'{root}/{{subdir}}/prop_observed_{{subdir}}.ht' raw_constraint_ht_path = f'{root}/{{subdir}}/constraint_{{subdir}}.ht' final_constraint_ht_path = f'{root}/{{subdir}}/constraint_final_{{subdir}}.ht' possible_variants_ht_path = f'{root}/model/possible_data/possible_transcript_pop_{args.model}.ht' po_output_path = po_ht_path.format(subdir=args.model) output_path = raw_constraint_ht_path.format(subdir=args.model) final_path = final_constraint_ht_path.format(subdir=args.model) # Sets method for aggregation, will need to be changed for custom analysis MODEL_KEYS = { 'worst_csq': ['gene'], 'tx_annotation': ['gene', 'expressed'], 'standard': ['gene', 'transcript', 'canonical'] } if args.test: ht = load_or_import_po(po_output_path, args.overwrite) run_tests(ht) if args.get_proportion_observed: # Build a model for methylation-dependent mutation rate and apply it to get proportion of variants observed # Also need to incorporate genomes and v3 if possible print('Running aggregation of variants by grouping variables') # Tables of observed mutations in exomes full_exome_ht = prepare_ht(hl.read_table(processed_exomes_ht_path), args.trimers) # filter into X, Y and autosomal regions exome_ht = full_exome_ht.filter(full_exome_ht.locus.in_autosome_or_par()) exome_x_ht = hl.filter_intervals(full_exome_ht, [hl.parse_locus_interval('X')]) exome_x_ht = exome_x_ht.filter(exome_x_ht.locus.in_x_nonpar()) exome_y_ht = hl.filter_intervals(full_exome_ht, [hl.parse_locus_interval('Y')]) exome_y_ht = exome_y_ht.filter(exome_y_ht.locus.in_y_nonpar()) # Modelling results of estimated mutation rates for genes, coverage, methylation level and base context possible_variants_ht = hl.read_table(possible_variants_ht_path) # Set chosen groupings to aggregate on groupings = ['gene','annotation','modifier'] # Apply model; aggregate by chosen groupings & get proportion observed; write to file po_exome_ht, po_exome_x_ht, po_exome_y_ht = \ [get_proportion_observed(ht, possible_variants_ht, groupings) for ht in (exome_ht, exome_x_ht, exome_y_ht)] po_exome_ht.write(po_output_path, overwrite=args.overwrite) po_exome_x_ht.write(po_output_path.replace('.ht', '_x.ht'), overwrite=args.overwrite) po_exome_y_ht.write(po_output_path.replace('.ht', '_y.ht'), overwrite=args.overwrite) if args.aggregate: print('Running aggregation by gene') # read PO hail tables for autosomes, X and Y chromosomes and join them print(f'Reading hail table from {po_output_path}') # Autosomes ht = load_or_import(po_output_path, args.overwrite) # X chromosome ht_x = load_or_import(po_output_path.replace('.ht','_x.ht'), args.overwrite) # Y chromosome ht_y = load_or_import(po_output_path.replace('.ht','_y.ht'), args.overwrite) # Combine into one table ht = ht.union(ht_x).union(ht_y) # group by gene/transcript and calculate summary stats ht = finalize_dataset(ht, keys=MODEL_KEYS[args.model]) # write hail table to output path ht.write(output_path, args.overwrite) hl.read_table(output_path).export(output_path.replace('.ht', '.txt.bgz')) if args.summarise: print('Finalising summary 
stats') # write summary stats to output path ht = load_or_import(output_path, args.overwrite) mut_types = ('lof', 'mis', 'syn','mis_pphen','mis_non_pphen') output_var_types = zip(('obs', 'exp', 'oe', 'oe', 'oe'), ('', '', '', '_lower', '_upper')) output_vars = product(mut_types,output_var_types) ht.select( 'gene','transcript','canonical', *[f'{t}_{m}{ci}' for m, (t, ci) in output_vars], #*[f'{m}_z' for m in mut_types[:3]], 'pLI', 'pRec', 'pNull', #gene_issues=ht.constraint_flag ).select_globals().write(final_path, overwrite=args.overwrite) hl.read_table(final_path).export(final_path.replace('.ht', '.txt.bgz'))
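This driver calls load_or_import (and load_or_import_po) helpers that are not part of this excerpt. A plausible minimal version, assuming the helper simply prefers an already-written Hail table and otherwise imports the matching exported flat file, is sketched below; the real pipeline's helper may behave differently:

import hail as hl

def load_or_import(path, overwrite):
    # Hypothetical helper (not shown in this excerpt): reuse the Hail table at
    # `path` when it already exists and we are not overwriting; otherwise fall
    # back to importing the .txt.bgz export written alongside it.
    if hl.hadoop_exists(path) and not overwrite:
        return hl.read_table(path)
    return hl.import_table(path.replace('.ht', '.txt.bgz'), impute=True)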
def compute_from_vp_mt(chr20: bool, overwrite: bool): meta = get_gnomad_meta('exomes') vp_mt = hl.read_matrix_table(full_mt_path('exomes')) vp_mt = vp_mt.filter_cols(meta[vp_mt.col_key].release) ann_ht = hl.read_table(vp_ann_ht_path('exomes')) phase_ht = hl.read_table(phased_vp_count_ht_path('exomes')) if chr20: vp_mt, ann_ht, phase_ht = filter_to_chr20([vp_mt, ann_ht, phase_ht]) vep1_expr = get_worst_gene_csq_code_expr(ann_ht.vep1) vep2_expr = get_worst_gene_csq_code_expr(ann_ht.vep2) ann_ht = ann_ht.select( 'snv1', 'snv2', is_singleton_vp=(ann_ht.freq1['all'].AC < 2) & (ann_ht.freq2['all'].AC < 2), pop_af=hl.dict( ann_ht.freq1.key_set().intersection(ann_ht.freq2.key_set()) .map( lambda pop: hl.tuple([pop, hl.max(ann_ht.freq1[pop].AF, ann_ht.freq2[pop].AF)]) ) ), popmax_af=hl.max(ann_ht.popmax1.AF, ann_ht.popmax2.AF, filter_missing=False), filtered=(hl.len(ann_ht.filters1) > 0) | (hl.len(ann_ht.filters2) > 0), vep=vep1_expr.keys().filter( lambda k: vep2_expr.contains(k) ).map( lambda k: vep1_expr[k].annotate( csq=hl.max(vep1_expr[k].csq, vep2_expr[k].csq) ) ) ) vp_mt = vp_mt.annotate_cols( pop=meta[vp_mt.col_key].pop ) vp_mt = vp_mt.annotate_rows( **ann_ht[vp_mt.row_key], phase_info=phase_ht[vp_mt.row_key].phase_info ) vp_mt = vp_mt.filter_rows( ~vp_mt.filtered ) vp_mt = vp_mt.filter_entries( vp_mt.GT1.is_het() & vp_mt.GT2.is_het() & vp_mt.adj1 & vp_mt.adj2 ) vp_mt = vp_mt.select_entries( x=True ) vp_mt = vp_mt.annotate_cols( pop=['all', vp_mt.pop] ) vp_mt = vp_mt.explode_cols('pop') vp_mt = vp_mt.explode_rows('vep') vp_mt = vp_mt.transmute_rows( **vp_mt.vep ) def get_grouped_phase_agg(): return hl.agg.group_by( hl.case() .when(~vp_mt.is_singleton_vp & (vp_mt.phase_info[vp_mt.pop].em.adj.p_chet > CHET_THRESHOLD), 1) .when(~vp_mt.is_singleton_vp & (vp_mt.phase_info[vp_mt.pop].em.adj.p_chet < SAME_HAP_THRESHOLD), 2) .default(3) , hl.agg.min(vp_mt.csq) ) vp_mt = vp_mt.group_rows_by( 'gene_id', 'gene_symbol' ).aggregate( all=hl.agg.filter( vp_mt.x & hl.if_else( vp_mt.pop == 'all', hl.is_defined(vp_mt.popmax_af) & (vp_mt.popmax_af <= MAX_FREQ), vp_mt.pop_af[vp_mt.pop] <= MAX_FREQ ), get_grouped_phase_agg() ), af_le_0_001=hl.agg.filter( hl.if_else( vp_mt.pop == 'all', hl.is_defined(vp_mt.popmax_af) & (vp_mt.popmax_af <= 0.001), vp_mt.pop_af[vp_mt.pop] <= 0.001 ) & vp_mt.x, get_grouped_phase_agg() ) ) vp_mt = vp_mt.checkpoint('gs://gnomad-tmp/compound_hets/chet_per_gene{}.2.mt'.format( '.chr20' if chr20 else '' ), overwrite=True) gene_ht = vp_mt.annotate_rows( row_counts=hl.flatten([ hl.array( hl.agg.group_by( vp_mt.pop, hl.struct( csq=csq, af=af, # TODO: Review this # These will only kept the worst csq -- now maybe it'd be better to keep either # - the worst csq for chet or # - the worst csq for both chet and same_hap n_worst_chet=hl.agg.count_where(vp_mt[af].get(1) == csq_i), n_chet=hl.agg.count_where((vp_mt[af].get(1) == csq_i) & (vp_mt[af].get(2, 9) >= csq_i) & (vp_mt[af].get(3, 9) >= csq_i)), n_same_hap=hl.agg.count_where((vp_mt[af].get(2) == csq_i) & (vp_mt[af].get(1, 9) > csq_i) & (vp_mt[af].get(3, 9) >= csq_i)), n_unphased=hl.agg.count_where((vp_mt[af].get(3) == csq_i) & (vp_mt[af].get(1, 9) > csq_i) & (vp_mt[af].get(2, 9) > csq_i)) ) ) ).filter( lambda x: (x[1].n_chet > 0) | (x[1].n_same_hap > 0) | (x[1].n_unphased > 0) ).map( lambda x: x[1].annotate( pop=x[0] ) ) for csq_i, csq in enumerate(CSQ_CODES) for af in ['all', 'af_le_0_001'] ]) ).rows() gene_ht = gene_ht.explode('row_counts') gene_ht = gene_ht.select( **gene_ht.row_counts ) gene_ht.describe() gene_ht = gene_ht.checkpoint( 
'gs://gnomad-lfran/compound_hets/chet_per_gene{}.ht'.format( '.chr20' if chr20 else '' ), overwrite=overwrite ) gene_ht.flatten().export( 'gs://gnomad-lfran/compound_hets/chet_per_gene{}.tsv.gz'.format( '.chr20' if chr20 else '' ) )
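Both compute_from_vp_mt and compute_from_full_mt above depend on module-level constants (CSQ_CODES, MAX_FREQ, CHET_THRESHOLD, SAME_HAP_THRESHOLD) and on the get_worst_gene_csq_code_expr helper, none of which appear in this excerpt. The one property the code itself guarantees is that consequence codes are ordered so that a lower code is more severe, since the worst csq is always taken with hl.agg.min. The values below are purely illustrative placeholders to make the filters concrete, not the pipeline's actual settings:

# Illustrative placeholders only -- the real pipeline defines these elsewhere.
CSQ_CODES = ['lof', 'damaging_missense', 'missense_variant', 'synonymous_variant']  # index is the csq code; lower index == more severe
MAX_FREQ = 0.05           # allele-frequency ceiling applied before the per-gene aggregation
CHET_THRESHOLD = 0.5      # EM P(compound het) above which a variant pair is treated as in trans
SAME_HAP_THRESHOLD = 0.1  # EM P(compound het) below which a pair is treated as same-haplotype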
def get_baselevel_expression_for_genes( mt, gtex, gene_list=None, get_proportions=None, gene_maximums_ht_path=gtex_v7_gene_maximums_ht_path): gtex_table = gtex.key_by("transcript_id") if gene_list: genes = hl.literal(gene_list) # Filter context_ht to genes of interest mt = mt.annotate_rows(in_gene_of_interest=genes.find( lambda x: mt.vep.transcript_consequences.any(lambda tc: tc. gene_symbol == x))) mt = mt.filter_rows(mt.in_gene_of_interest != "NA") # Need to modify process consequences to ignore splice variants, because these can occur on intronic regions all_coding_minus_splice = list( set(all_coding_csqs) - set([ 'splice_acceptor_variant', 'splice_donor_variant', 'splice_region_variant' ])) def add_most_severe_consequence_to_consequence_minus_splice( tc: hl.expr.StructExpression) -> hl.expr.StructExpression: """ Copied from gnomad_hail but slight change """ csqs = hl.literal(all_coding_minus_splice) return tc.annotate(most_severe_consequence=csqs.find( lambda c: tc.consequence_terms.contains(c))) # Add worst consequence within transcript consequences mt = (mt.annotate_rows(vep=mt.vep.annotate( transcript_consequences=mt.vep.transcript_consequences.map( add_most_severe_consequence_to_consequence_minus_splice)))) # Explode on transcript consequences mt = mt.explode_rows(mt.vep.transcript_consequences) mt_kt = mt.rows() # Filter to positions in the CDS regions cds_intervals = hl.import_bed( "gs://gnomad-public/papers/2019-tx-annotation/data/other_data/gencode.v19.CDS.Hail.021519.bed" ) mt_kt = mt_kt.annotate(in_cds=hl.is_defined(cds_intervals[mt_kt.locus])) mt_kt = mt_kt.filter(mt_kt.in_cds) # Filter to protein coding transcripts only mt_kt = mt_kt.filter( mt_kt.vep.transcript_consequences.biotype == "protein_coding") # Filter to coding variants to only evalute those effects mt_kt = filter_table_to_csqs(mt_kt, all_coding_minus_splice) # To avoid double counting transcripts at a given base, key by transcript and position and dedup mt_kt = mt_kt.key_by(mt_kt.locus, mt_kt.vep.transcript_consequences.transcript_id) mt_kt = mt_kt.distinct() # Annotate mt with the gtex values (ie. join them) mt_kt = mt_kt.annotate( tx_data=gtex_table[mt_kt.vep.transcript_consequences.transcript_id]) ## Group by gene, symbol and position ht_sum_of_bases = mt_kt.group_by( locus=mt_kt.locus, ensg=mt_kt.vep.transcript_consequences.gene_id, symbol=mt_kt.vep.transcript_consequences.gene_symbol).aggregate( sum_per_base=hl.agg.array_sum(mt_kt.tx_data.agg_expression)) tissue_ids = sorted([ y.tissue.replace("-", "_").replace(" ", "_").replace("(", "_").replace(")", "_") for y in gtex.values.take(1)[0] ]) d = {tiss: i for i, tiss in enumerate(tissue_ids)} ht_sum_of_bases = ht_sum_of_bases.annotate(**{ tissue: ht_sum_of_bases.sum_per_base[d[tissue]] for tissue in tissue_ids }) if get_proportions: gene_maximums_ht = hl.read_table(gene_maximums_ht_path) ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus) ht_sum_of_bases = ht_sum_of_bases.annotate(alleles="filler") ht_sum_of_bases = get_expression_proportion( tx_table=ht_sum_of_bases, tissues_to_filter=["sum_per_base"], gene_maximum_ht=gene_maximums_ht) ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus) ht_sum_of_bases = ht_sum_of_bases.drop(ht_sum_of_bases.alleles) return ht_sum_of_bases
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38") # S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment # you may use your own credentials here, from the .s3cfg file in your home directory hadoop_config = sc._jsc.hadoopConfiguration() hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"]) hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"]) bed_to_exclude_pca = hl.import_bed( f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38') cohorts_pop = hl.import_table( "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb_elgh_labels.tsv", delimiter="\t").key_by('s') pca_scores = hl.read_table( f"{temp_dir}/ddd-elgh-ukbb/elgh_labels/pop_assignments_test.ht") # pca_loadings = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/pca_loadings.ht") logger.info("assign population pcs") # population_assignment_table = assign_population_pcs( # pca_scores, pca_loadings, known_col="known_pop") pop_ht, pop_clf = assign_population_pcs(pca_scores, pca_scores.pca_scores, known_col="known_pop", n_estimators=100, prop_train=0.8, min_prob=0.5) pop_ht.write( f"{tmp_dir}/ddd-elgh-ukbb/pop_assignments_test_minprob_0.5.ht", overwrite=True) pop_ht.export(
def read_clump_ht(f): ht = hl.read_table(f) ht = ht.drop('idx') return ht
'locus': hl.tlocus(reference_genome='GRCh38'), 'alleles': hl.tarray(hl.tstr) }) ht_final_variants = ht_final_variants.key_by(ht_final_variants.locus, ht_final_variants.alleles) ht_final_pruned_variants = hl.import_table(FINAL_PRUNED_VARIANTS, no_header=True) ht_final_pruned_variants = ht_final_pruned_variants.annotate( **hl.parse_variant(ht_final_pruned_variants.f0, reference_genome='GRCh38')) ht_final_pruned_variants = ht_final_pruned_variants.key_by( ht_final_pruned_variants.locus, ht_final_pruned_variants.alleles) sample_annotations = hl.read_table(PHENOTYPES_TABLE) impute_sex_annotations = hl.read_table(IMPUTESEX_TABLE) annotation_annotations = hl.read_table(ANNOTATION_TABLE) mt = hl.read_matrix_table(MT) mt = mt.drop('a_index', 'qual', 'info', 'filters', 'was_split') mt = mt.filter_cols(hl.is_defined(ht_final_samples[mt.col_key])) mt = mt.filter_rows(hl.is_defined(ht_final_variants[mt.row_key])) mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key]) mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key]) mt = mt.annotate_rows(annotation=annotation_annotations[mt.row_key]) mt = hl.variant_qc(mt, name='qc')
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from genome-wide association study (GWAS) summary statistics. Given a set or multiple sets of GWAS summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = ld_score_all_phenos_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies (GWAS). n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor) ** 2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=hl.agg.array_agg( lambda i: hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta), hl.range(n_blocks))) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)) ** 2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + + mt.__step2_betas[1] * mt.__x_floor) ** 2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], mt.__step2_h2 * 
hl.agg.mean(mt.__n) / M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=hl.agg.array_agg( lambda i: hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]), hl.range(n_blocks))) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected ** 2) - hl.sum(mt.__step2_block_betas_bias_corrected) ** 2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0 / (mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + + mt.__initial_betas[1] * mt.__x_floor) ** 2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected ** 2) - hl.sum(mt.__final_block_betas_bias_corrected) ** 2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M / hl.agg.mean(mt.__n)) ** 2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
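The block-jackknife bookkeeping in the code above is easier to follow with the estimator written out. Each leave-one-block-out fit :math:`\hat{\theta}_{(i)}` is converted into a bias-corrected pseudo-value (the ``n_blocks * beta - (n_blocks - 1) * block_beta`` expressions), the reported jackknife mean and variance are taken over those pseudo-values, and the slope is then rescaled into a heritability estimate exactly as in the model stated in the docstring:

.. math::

    \tilde{\theta}_i = n\,\hat{\theta} - (n - 1)\,\hat{\theta}_{(i)}, \qquad
    \hat{\theta}_{\mathrm{jack}} = \frac{1}{n} \sum_{i=1}^{n} \tilde{\theta}_i, \qquad
    \widehat{\mathrm{Var}}(\hat{\theta}) =
        \frac{\sum_{i=1}^{n} \tilde{\theta}_i^2 - \bigl(\sum_{i=1}^{n} \tilde{\theta}_i\bigr)^2 / n}{n\,(n - 1)}, \qquad
    \hat{h}_g^2 = \frac{M}{\bar{N}}\,\hat{\beta}_1

where :math:`n` is ``n_blocks``, :math:`M` is the number of reference-panel variants, and :math:`\bar{N}` is the mean sample size.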
p.add_argument("--index-type", help="Elasticsearch index type", required=True) p.add_argument("--num-shards", help="Number of elasticsearch shards", default=1, type=int) p.add_argument("--es-block-size", help="Elasticsearch block size to use when exporting", default=200, type=int) args = p.parse_args() hl.init(log="/tmp/hail.log") print("\n=== Importing Hail table ===") ds = hl.read_table(args.ht_url) print("\n=== Exporting to Elasticsearch ===") es = ElasticsearchClient(args.host, args.port) es.export_table_to_elasticsearch( ds, index_name=args.index_name, index_type_name=args.index_type, block_size=args.es_block_size, num_shards=args.num_shards, delete_index_before_exporting=True, export_globals_to_index_meta=True, verbose=True, )
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table: """Return a table containing the vertices in a near `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_ of an undirected graph whose edges are given by a two-column table. Examples -------- Run PC-relate and compute pairs of closely related individuals: >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin') >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125) Starting from the above pairs, prune individuals from a dataset until no close relationships remain: >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False) >>> result = dataset.filter_cols( ... hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False) Starting from the above pairs, prune individuals from a dataset until no close relationships remain, preferring to keep cases over controls: >>> samples = dataset.cols() >>> pairs_with_case = pairs.key_by( ... i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case), ... j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case)) >>> def tie_breaker(l, r): ... return hl.cond(l.is_case & ~r.is_case, -1, ... hl.cond(~l.is_case & r.is_case, 1, 0)) >>> related_samples_to_remove = hl.maximal_independent_set( ... pairs_with_case.i, pairs_with_case.j, False, tie_breaker) >>> result = dataset.filter_cols(hl.is_defined( ... related_samples_to_remove.key_by( ... s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False) Notes ----- The vertex set of the graph is implicitly all the values realized by `i` and `j` on the rows of this table. Each row of the table corresponds to an undirected edge between the vertices given by evaluating `i` and `j` on that row. An undirected edge may appear multiple times in the table and will not affect the output. Vertices with self-edges are removed as they are not independent of themselves. The expressions for `i` and `j` must have the same type. The value of `keep` determines whether the vertices returned are those in the maximal independent set, or those in the complement of this set. This is useful if you need to filter a table without removing vertices that don't appear in the graph at all. This method implements a greedy algorithm which iteratively removes a vertex of highest degree until the graph contains no edges. The greedy algorithm always returns an independent set, but the set may not always be perfectly maximal. `tie_breaker` is a Python function taking two arguments---say `l` and `r`---each of which is an :class:`Expression` of the same type as `i` and `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an ordering on nodes. A pair of nodes can be ordered in one of three ways, and `tie_breaker` must encode the relationship as follows: - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer - if ``l == r`` then ``tie_breaker`` evaluates to 0 - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer For example, the usual ordering on the integers is defined by: ``l - r``. The `tie_breaker` function must satisfy the following property: ``tie_breaker(l, r) == -tie_breaker(r, l)``. When multiple nodes have the same degree, this algorithm will order the nodes according to ``tie_breaker`` and remove the *largest* node. Parameters ---------- i : :class:`.Expression` Expression to compute one endpoint of an edge. j : :class:`.Expression` Expression to compute another endpoint of an edge. 
keep : :obj:`bool` If ``True``, return vertices in set. If ``False``, return vertices removed. tie_breaker : function Function used to order nodes with equal degree. keyed : :obj:`bool` If ``True``, key the resulting table by the `node` field, this requires a sort. Returns ------- :class:`.Table` Table with the set of independent vertices. The table schema is one row field `node` which has the same type as input expressions `i` and `j`. """ if i.dtype != j.dtype: raise ValueError("'maximal_independent_set' expects arguments `i` and `j` to have same type. " "Found {} and {}.".format(i.dtype, j.dtype)) source = i._indices.source if not isinstance(source, Table): raise ValueError("'maximal_independent_set' expects an expression of 'Table'. Found {}".format( "expression of '{}'".format( source.__class__) if source is not None else 'scalar expression')) if i._indices.source != j._indices.source: raise ValueError( "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. " "Found\n{}\n{}".format(i, j)) node_t = i.dtype if tie_breaker: wrapped_node_t = ttuple(node_t) l = construct_variable('l', wrapped_node_t) r = construct_variable('r', wrapped_node_t) tie_breaker_expr = hl.int64(tie_breaker(l[0], r[0])) t, _ = source._process_joins(i, j, tie_breaker_expr) tie_breaker_str = str(tie_breaker_expr._ir) else: t, _ = source._process_joins(i, j) tie_breaker_str = None edges = t.select(__i=i, __j=j).key_by().select('__i', '__j') edges_path = new_temp_file() edges.write(edges_path) edges = hl.read_table(edges_path) mis_nodes = construct_expr(JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet( Env.spark_backend('maximal_independent_set')._to_java_ir(edges.collect(_localize=False)._ir), node_t._parsable_string(), joption(tie_breaker_str))), hl.tset(node_t)) nodes = edges.select(node = [edges.__i, edges.__j]) nodes = nodes.explode(nodes.node) nodes = nodes.annotate_globals(mis_nodes=mis_nodes) nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep) nodes = nodes.select_globals() if keyed: return nodes.key_by('node') return nodes
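The greedy strategy the docstring describes (repeatedly drop a highest-degree vertex until no edges remain) is compact enough to sketch in plain Python. This is only an illustration of the idea; the function itself delegates to a JVM implementation and additionally honours `tie_breaker`, which the toy version below ignores:

from collections import defaultdict

def greedy_mis_complement(edges):
    # Toy sketch: repeatedly remove a highest-degree vertex until no edges
    # remain, returning the REMOVED vertices (the complement of the
    # independent set, i.e. what keep=False gives you).
    adj = defaultdict(set)
    removed = set()
    for i, j in edges:
        if i == j:
            removed.add(i)  # self-edges: a vertex is not independent of itself
        else:
            adj[i].add(j)
            adj[j].add(i)
    for v in removed:       # drop self-edged vertices and their incident edges
        for u in adj.pop(v, set()):
            adj[u].discard(v)
    while any(adj.values()):
        v = max(adj, key=lambda u: len(adj[u]))  # highest-degree vertex
        removed.add(v)
        for u in adj.pop(v):
            adj[u].discard(v)
    return removed

greedy_mis_complement([(1, 2), (2, 3), (2, 4)])  # {2}: removing the hub clears all edges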
def main(args): ######################################################################## ### initialize phenos = ['crc', 't2d', 'glaucoma', 'afib', 'ra'] renamed = { 's': 's', 'CRC': 'crc', 'T2D': 't2d', 'Glaucoma': 'glaucoma', 'AFib': 'afib', 'RA': 'ra' } phenotype = 'ALL5cc' sumstats_text_file = args.dirname + args.basename + 'ALL5cc.clumped' prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '-pt-sumstats-locus-allele-keyed.kt' contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype contigs = {'0{}'.format(x): str(x) for x in range(1, 10)} bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen' start = time.time() # large block size because we read very little data (due to filtering & ignoring genotypes) # hl.init(branching_factor=10, min_block_size=2000) hl.init() ################################################################################ ### set up the sumstats table (chr, bp for union SNPs) if (args.generate_prs_loci_table): t = hl.import_table(sumstats_text_file, delimiter='\s+', impute=True) t = t.select(locus=hl.locus(hl.str(t.CHR), t.BP)) t = t.key_by('locus') t.write(prs_loci_table_location, overwrite=True) ss = hl.read_table(prs_loci_table_location) ################################################################################ ### Get true phenotypes from UKBB if args.pheno_table: phenotypes = hl.import_table( 'gs://mkanai/disparities/ukb31063.phecode_5diseases.both_sexes.tsv.bgz', key='s', impute=True, types={'s': hl.tstr}) phenotypes = phenotypes.rename(renamed) covariates = hl.import_table( 'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv', key='s', impute=True, types={'s': hl.tstr}) samples = covariates.annotate(**phenotypes[covariates.s]) # Write pheno/covar/sample info table for pheno in phenos: gwas_holdout = hl.import_table( 'gs://mkanai/disparities/ukbb/pheno_31063_holdout_gwas_' + pheno + '.info.txt.gz', delimiter='\s+').key_by('s') samples = samples.annotate( **{ pheno + '_holdout': gwas_holdout[samples.s].gwas_holdout == 'holdout' }) samples.write( 'gs://mkanai/disparities/pheno_31063_holdout_gwas_cc_phenos.ht', args.overwrite) if args.ss_tables: # Write ss info for pheno in phenos: print(pheno) ss = hl.import_table(args.dirname + args.basename + pheno + '.*.bgz', delimiter='\s+', impute=True, types={ 'beta': hl.tfloat, 'pval': hl.tfloat, 'pos': hl.tint, 'nCompleteSamples': hl.tint, 'AC': hl.tfloat, 'ytx': hl.tfloat, 'se': hl.tfloat, 'tstat': hl.tfloat }) ss = ss.key_by(locus=hl.locus(hl.str(ss.chr), hl.int( ss.pos))).repartition(200) ss.write(args.dirname + args.basename + pheno + '.ht', True) ################################################################################ ### Run the PRS using phenotype-specific clump variants if args.write_bgen: mt_all = hl.import_bgen( bgen_files, entry_fields=['dosage'], sample_file='gs://phenotype_31063/ukb31063.autosomes.sample', variants=ss.locus) samples = hl.read_table( 'gs://mkanai/disparities/pheno_31063_holdout_gwas_cc_phenos.ht') mt_all = mt_all.annotate_cols(**samples[ mt_all.s]) # ok that phenos keyed on userId not s? 
mt_all.repartition(5000, shuffle=False).write( args.dirname + args.basename + 'ALL5cc.mt', args.overwrite) mt_all = hl.read_matrix_table(args.dirname + args.basename + 'ALL5cc.mt') for pheno in phenos: #[6:len(phenos)]: print(pheno) ss = hl.read_table(args.dirname + args.basename + pheno + '.ht') """ To add: - Filter only to samples in holdout GWAS - Filter to rows in phenotype-specific clump file - Build PRS for 10 p-value thresholds - Also fix nt1/nt2 to A1 and A2 (check) from sumstats. """ # filter to only samples held out from GWAS mt = mt_all.filter_cols(mt_all[pheno + '_holdout']) mt = mt.annotate_rows(ss=ss[mt.locus]) mt = annotate_beta(mt, mt.ss) # p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2, 's6': .05, 's7': .1, 's8': .2, 's9': .5, 's10': 1} p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2} pheno_clump = specific_clumps(args.dirname + args.basename + pheno + '.clumped') mt = mt.filter_rows(hl.is_defined(pheno_clump[mt.locus])) print(mt.count()) annot_expr = { k: hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.pval < v)) for k, v in p_max.items() } mt = mt.annotate_cols(**annot_expr) mt.cols().write(args.dirname + 'UKB_' + pheno + '_PRS.ht', stage_locally=True, overwrite=True) ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_PRS.ht') ht_out = ht.drop(*[x for x in list(ht.row) if 'holdout' in x], *[x for x in phenos if pheno not in x]) output_location = args.dirname + 'UKB_' + pheno + '_PRS.txt.bgz' ht_out.export(output_location) end = time.time() print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))
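The annot_expr aggregation at the end of the loop is a standard clumping-and-thresholding polygenic score: for each p-value cutoff :math:`T` in ``p_max``, a sample's score is the dosage-weighted sum of effect sizes over the clumped variants passing that cutoff,

.. math::

    \mathrm{PRS}_i(T) = \sum_{j \in \text{clumped variants}} \hat{\beta}_j \, d_{ij} \, \mathbf{1}\!\left[p_j < T\right]

where :math:`d_{ij}` is the imputed dosage of variant :math:`j` in sample :math:`i`; this is exactly what ``hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.pval < v))`` computes per column.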
def test_write_stage_locally(self): t = hl.utils.range_table(5) f = new_temp_file(suffix='ht') t.write(f, stage_locally=True) t2 = hl.read_table(f) self.assertTrue(t._same(t2))
def annotate_transcript_consequences(variants_path, transcripts_path, mane_transcripts_path=None): ds = hl.read_table(variants_path) most_severe_consequence = ds.vep.most_severe_consequence transcript_consequences = ds.vep.transcript_consequences # Drop irrelevant consequences transcript_consequences = transcript_consequences.map( lambda c: c.annotate(consequence_terms=c.consequence_terms.filter( lambda t: ~OMIT_CONSEQUENCE_TERMS.contains(t)))).filter( lambda c: c.consequence_terms.size() > 0) # Add/transmute derived fields transcript_consequences = transcript_consequences.map( lambda c: c.annotate(major_consequence=hl.sorted( c.consequence_terms, key=consequence_term_rank)[0]) ).map(lambda c: c.annotate( domains=c.domains.map(lambda domain: domain.db + ":" + domain.name), hgvsc=c.hgvsc.split(":")[-1], hgvsp=hgvsp_from_consequence_amino_acids(c), is_canonical=hl.bool(c.canonical), )) transcript_consequences = transcript_consequences.map(lambda c: c.select( "biotype", "consequence_terms", "domains", "gene_id", "gene_symbol", "hgvsc", "hgvsp", "is_canonical", "lof_filter", "lof_flags", "lof", "major_consequence", "polyphen_prediction", "sift_prediction", "transcript_id", )) transcripts = hl.read_table(transcripts_path) transcript_info = hl.dict([ (row.transcript_id, row.transcript_info) for row in transcripts.select(transcript_info=hl.struct( transcript_version=transcripts.transcript_version, gene_version=transcripts.gene.gene_version, )).collect() ]) transcript_consequences = transcript_consequences.map( lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id))) if mane_transcripts_path: mane_transcripts = hl.read_table(mane_transcripts_path) mane_transcripts = hl.dict([(row.gene_id, row.drop("gene_id")) for row in mane_transcripts.collect()]) transcript_consequences = transcript_consequences.map( lambda csq: csq.annotate(**hl.rbind( mane_transcripts.get(csq.gene_id), lambda mane_transcript: (hl.case().when( (mane_transcript.ensembl_id == csq.transcript_id) & (mane_transcript.ensembl_version == csq. transcript_version), hl.struct( is_mane_select=True, is_mane_select_version=True, refseq_id=mane_transcript.refseq_id, refseq_version=mane_transcript.refseq_version, ), ).when( mane_transcript.ensembl_id == csq.transcript_id, hl.struct( is_mane_select=True, is_mane_select_version=False, refseq_id=hl.null(hl.tstr), refseq_version=hl.null(hl.tstr), ), ).default( hl.struct( is_mane_select=False, is_mane_select_version=False, refseq_id=hl.null(hl.tstr), refseq_version=hl.null(hl.tstr), ))), ))) transcript_consequences = hl.sorted( transcript_consequences, lambda c: ( hl.if_else( c.biotype == "protein_coding", 0, 1, missing_false=True), hl.if_else(c.major_consequence == most_severe_consequence, 0, 1, missing_false=True), hl.if_else(c.is_mane_select, 0, 1, missing_false=True), hl.if_else(c.is_canonical, 0, 1, missing_false=True), ), ) else: transcript_consequences = hl.sorted( transcript_consequences, lambda c: ( hl.if_else( c.biotype == "protein_coding", 0, 1, missing_false=True), hl.if_else(c.major_consequence == most_severe_consequence, 0, 1, missing_false=True), hl.if_else(c.is_canonical, 0, 1, missing_false=True), ), ) ds = ds.annotate( transcript_consequences=transcript_consequences).drop("vep") return ds
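The final hl.sorted calls work because Hail compares tuple keys lexicographically, so each 0/1 flag acts as a successive tie-breaker: protein-coding transcripts come first, then transcripts whose major_consequence matches the overall most_severe_consequence, then MANE Select transcripts (when available), then canonical ones. A small self-contained illustration of that ordering behaviour (the records are made-up stand-ins for transcript consequences):

import hail as hl

records = hl.array([
    hl.struct(name='non_coding', protein_coding=False, canonical=True),
    hl.struct(name='coding_non_canonical', protein_coding=True, canonical=False),
    hl.struct(name='coding_canonical', protein_coding=True, canonical=True),
])
ordered = hl.sorted(
    records,
    key=lambda r: (hl.if_else(r.protein_coding, 0, 1),   # earlier tuple elements dominate
                   hl.if_else(r.canonical, 0, 1)))
print(hl.eval(ordered.map(lambda r: r.name)))
# ['coding_canonical', 'coding_non_canonical', 'non_coding']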
def main(args): ######################################################################## ### initialize which_beta = 'beta' + args.which_beta if args.method == 'metal': end_dir = 'metal/' clumps = args.dirname + end_dir + 'BBJ_UKBB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_beta_' + args.which_beta + '_' + args.iter + '.metal.clumped' ss_filename = args.dirname + end_dir + 'BBJ_UKBB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_beta_' + args.which_beta + '_' + args.iter + '.tsv' out_base = args.dirname + end_dir + which_beta + '_draw_' + args.iter + '_spike_' + args.which_beta + '_metal_PRS' elif args.method == 'mama': ld = args.ld + '/' analysis = args.analysis + '/' end_dir = 'mama/ld_true/' clumps = args.dirname + end_dir + ld + analysis + 'draw_' + args.iter + '_spike_' + args.which_beta + '_mama_2.clumped' ss_filename = args.dirname + end_dir + ld + analysis + 'draw_' + args.iter + '_spike_' + args.which_beta + '_mama_2.txt' # this ss_filename has different headers out_base = args.dirname + end_dir + ld + analysis + 'draw_' + args.iter + '_spike_' + args.which_beta + '_mama_2_PRS' else: end_dir = 'ukbb_only/' clumps = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '.clumped' ss_filename = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '.tsv.gz' out_base = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '_gwas_PRS' clump_table_location = args.dirname + 'keytables/ukb-' + args.basename + '-pt-sumstats-locus-allele-keyed.kt' contigs = {'0{}'.format(x): str(x) for x in range(1, 10)} bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr22_v3.bgen' start = time.time() # large block size because we read very little data (due to filtering & ignoring genotypes) hl.init(branching_factor=10, min_block_size=2000) # set min_block_size only in import_bgen ################################################################################ ### set up the sumstats table (chr, bp for union SNPs) if args.read_clumps: clump_file = hl.import_table(clumps, delimiter='\s+', impute=True) clump_file = clump_file.select( locus=hl.locus(hl.str(clump_file.CHR), clump_file.BP)) clump_file = clump_file.key_by('locus') clump_file.write(clump_table_location, overwrite=True) clump_file = hl.read_table(clump_table_location) ################################################################################ ### Write ss info, process so sumstats are uniform across MAMA, METAL, and gwas if args.ss_tables: #ss = hl.import_table(args.dirname + args.basename + '.tsv.gz', ss = hl.import_table( ss_filename, #'BBJ_UKBB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.which_beta + 'beta_01_9.tsv' # # for mama case delimiter='\s+', impute=True, types={'BP': hl.tint}) if args.method != 'mama' and args.method != 'metal': ss = ss.rename({ 'chr': 'CHR', 'pos': 'BP', 'rsid': 'SNP', 'ref': 'A1', 'alt': 'A2', 'maf': 'FRQ', 'p_value_beta_' + args.which_beta: 'MAMA_PVAL', 'standard_error_beta_' + args.which_beta: 'MAMA_SE', 'beta_beta_' + args.which_beta: 'MAMA_BETA' }) ss = ss.key_by( locus=hl.locus(hl.str(ss.CHR), hl.int(ss.BP))).repartition(200) ss = ss.annotate(A1=ss.A1.upper(), A2=ss.A2.upper()) ss.write(args.dirname + args.basename + '_sep.ht', True) ss = hl.read_table(args.dirname + args.basename + '_sep.ht') ################################################################################ ### Run the PRS using phenotype-specific clump variants if 
args.write_bgen: mt_all = hl.import_bgen( bgen_files, ['dosage'], sample_file='gs://phenotype_31063/ukb31063.autosomes.sample', variants=clump_file.locus) samples = hl.import_table(args.dirname + 'ukb_not_in_simulation_rand5000.inds', types={ 's': hl.tstr }).key_by('s') mt = mt_all.filter_cols(hl.is_defined(samples[mt_all.s])) mt.repartition(5000, shuffle=False).write( args.dirname + args.basename + '.mt', True) mt = hl.read_matrix_table(args.dirname + args.basename + '.mt') true_ss = hl.read_table(args.dirname + 'BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht') """ To add: - Also fix nt1/nt2 to A1 and A2 (check) from sumstats. """ # filter to only samples held out from GWAS mt = mt.annotate_rows(ss=ss[mt.locus]) mt = annotate_beta(mt, mt.ss) p_max = { 's1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2, 's6': .05, 's7': .1, 's8': .2, 's9': .5, 's10': 1 } pheno_clump = specific_clumps(clumps) mt = mt.filter_rows(pheno_clump.get(mt.locus, False)) print(mt.count()) # divide by sd's of frequencies to get standardized betas back to allelic scale for MAMA betas (only, not METAL) # sqrt(2pq) if args.betas_are_standardized: annot_expr = { k: hl.agg.sum(mt.beta / hl.sqrt(2 * hl.float(mt.ss.FRQ) * (1 - hl.float(mt.ss.FRQ))) * mt.dosage * hl.int(mt.ss.MAMA_PVAL < v)) for k, v in p_max.items() } else: annot_expr = { k: hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.MAMA_PVAL < v)) for k, v in p_max.items() } mt = mt.annotate_cols(**annot_expr, **true_ss[mt.s]) mt.key_cols_by().cols().write(out_base + '.ht', stage_locally=True, overwrite=True) ht = hl.read_table(out_base + '.ht') output_location = out_base + '.txt.bgz' ht.export(output_location) end = time.time() print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))
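The betas_are_standardized branch implements the "sqrt(2pq)" comment: an effect estimated per standardized genotype is put back on the allelic (per-dosage) scale by dividing by the genotype standard deviation under Hardy-Weinberg equilibrium before it is multiplied by dosages,

.. math::

    \hat{\beta}_{\text{allelic}} = \frac{\hat{\beta}_{\text{std}}}{\sqrt{2\,p\,(1 - p)}}

with :math:`p` the allele frequency (``FRQ``); as the comment notes, this rescaling is applied to the MAMA betas only, while METAL betas are used as-is.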
# S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment # you may use your own credentials here, from the .s3cfg file in your home directory hadoop_config = sc._jsc.hadoopConfiguration() hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"]) hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"]) bed_to_exclude_pca = hl.import_bed( f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38') cohorts_pop = hl.import_table( "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb_elgh_labels_updated.tsv", delimiter="\t").key_by('s') # Read mt mt = hl.read_matrix_table( f"{temp_dir}/ddd-elgh-ukbb/new_labels/chr1_chr20_ldpruned_updated.mt") # pca_scores_pop pca_scores_pop = hl.read_table( f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020.ht") ''' # pca_scores_superpop pca_scores_superpop = hl.read_table( f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020_superpops.ht") # annotate mt with pop and superpop mt = mt.annotate_cols(assigned_pop=pca_scores_pop[mt.s].pop) mt = mt.annotate_cols(assigned_superpop=pca_scores_superpop[mt.s].pop) # do sample_qc # calculate and annotate with metric heterozygosity mt_with_sampleqc = hl.sample_qc(mt, name='sample_qc') mt_with_sampleqc = mt_with_sampleqc.annotate_cols(sample_qc=mt_with_sampleqc.sample_qc.annotate( heterozygosity_rate=mt_with_sampleqc.sample_qc.n_het/mt_with_sampleqc.sample_qc.n_called))
ht_mfi['chrom'], hl.str(ht_mfi['position']), ht_mfi['allele1_ref'], ht_mfi['allele2_alt'] ]), delimiter=':')) # prep to merge with GWAS variant list ht_mfi = ht_mfi.key_by('variant') ht_mfi = ht_mfi.annotate(maf=hl.float(ht_mfi.maf), info=hl.float(ht_mfi.info)) ht_mfi = ht_mfi.select('varid', 'rsid', 'maf', 'info') ####### # load GWAS variant list ####### # get GWAS variant list ht_sites = hl.read_table('gs://ukb31063/ukb31063.neale_gwas_variants.ht') ht_sites = ht_sites.annotate( variant=hl.variant_str(ht_sites.locus, ht_sites.alleles)) ht_sites = ht_sites.key_by('variant') ######## # merge and save ######## # get final merged file with maf/info of the gwas variants ht = ht_mfi.join(ht_sites, how='inner') ht = ht.select('locus', 'alleles', 'varid', 'rsid', 'maf', 'info') print(ht.count()) # save both ht and tsv ht.write('gs://ukb31063/ukb31063.neale_gwas_variants.imputed_v3.mfi.ht',
def load_dataset(name, version, reference_genome, config_file='gs://hail-datasets/datasets.json'): """Load a genetic dataset from Hail's repository. Example ------- >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes', # doctest: +SKIP ... version='phase3', ... reference_genome='GRCh38') Parameters ---------- name : :obj:`str` Name of the dataset to load. version : :obj:`str` Version of the named dataset to load (see available versions in documentation). reference_genome : `GRCh37` or `GRCh38` Reference genome build. Returns ------- :class:`.Table` or :class:`.MatrixTable`""" with hl.hadoop_open(config_file, 'r') as f: datasets = json.load(f) names = set([dataset['name'] for dataset in datasets]) if name not in names: raise ValueError('{} is not a dataset available in the repository.'.format(repr(name))) versions = set([dataset['version'] for dataset in datasets if dataset['name']==name]) if version not in versions: raise ValueError("""Version {0} not available for dataset {1}. Available versions: {{{2}}}.""".format(repr(version), repr(name), repr('","'.join(versions)))) reference_genomes = set([dataset['reference_genome'] for dataset in datasets if dataset['name']==name]) if reference_genome not in reference_genomes: raise ValueError("""Reference genome build {0} not available for dataset {1}. Available reference genome builds: {{'{2}'}}.""".format(repr(reference_genome), repr(name), '\',\''.join((reference_genomes)))) path = [dataset['path'] for dataset in datasets if all([dataset['name']==name, dataset['version']==version, dataset['reference_genome']==reference_genome])][0].strip('/') if path.endswith('.ht'): dataset = hl.read_table(path) else: if not path.endswith('.mt'): raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path))) dataset = hl.read_matrix_table(path) return dataset
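load_dataset only assumes that each record in the JSON config carries name, version, reference_genome and path fields. A hypothetical entry that json.load(f) would yield is shown below; the path is an illustrative placeholder, not the actual registry contents:

# Hypothetical shape of one record in datasets.json (path is a placeholder).
datasets = [
    {
        "name": "1000_genomes",
        "version": "phase3",
        "reference_genome": "GRCh38",
        "path": "gs://hail-datasets/1000_genomes.phase3.GRCh38.mt",
    },
]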
def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') # save relatedness estimates for pc_relate global populations ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL) related_samples = ht.filter(ht.kin > 0.1) pc_relate_global = pd.DataFrame({ 'i_s': related_samples.i.s.collect(), 'j_s': related_samples.j.s.collect(), 'kin': related_samples.kin.collect(), }) filename = output_path(f'pc_relate_global_matrix.csv', 'analysis') pc_relate_global.to_csv(filename, index=False) # get maximal independent set pairs = ht.filter(ht['kin'] >= 0.125) related_samples_to_remove = hl.maximal_independent_set( pairs.i, pairs.j, False) related_samples = pd.DataFrame( {'removed_individual': related_samples_to_remove.node.s.collect()}) filename = output_path(f'pc_relate_global_maximal_independent_set.csv', 'analysis') related_samples.to_csv(filename, index=False) # save relatedness estimates for pc_relate NFE samples ht = hl.read_table(PC_RELATE_ESTIMATE_NFE) related_samples = ht.filter(ht.kin > 0.1) pc_relate_nfe = pd.DataFrame({ 'i_s': related_samples.i.s.collect(), 'j_s': related_samples.j.s.collect(), 'kin': related_samples.kin.collect(), }) filename = output_path(f'pc_relate_nfe_matrix.csv', 'analysis') pc_relate_nfe.to_csv(filename, index=False) # get maximal independent set pairs = ht.filter(ht['kin'] >= 0.125) related_samples_to_remove = hl.maximal_independent_set( pairs.i, pairs.j, False) related_samples = pd.DataFrame( {'removed_individual': related_samples_to_remove.node.s.collect()}) filename = output_path(f'pc_relate_nfe_maximal_independent_set.csv', 'analysis') related_samples.to_csv(filename, index=False) # save relatedness estimates for KING NFE samples mt = hl.read_matrix_table(KING_ESTIMATE_NFE) ht = mt.entries() # remove entries where samples are identical, then keep pairs with phi > 0.1 related_samples = ht.filter(ht.s_1 != ht.s) related_samples = related_samples.filter(related_samples.phi > 0.1) king_nfe = pd.DataFrame({ 'i_s': related_samples.s_1.collect(), 'j_s': related_samples.s.collect(), 'kin': related_samples.phi.collect(), }) filename = output_path(f'king_nfe_matrix_90k.csv', 'analysis') king_nfe.to_csv(filename, index=False) # save KING NFE maximal independent set second_degree_related_samples = ht.filter( (ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True) struct = hl.struct(i=second_degree_related_samples.s_1, j=second_degree_related_samples.s) struct = struct.annotate(phi=second_degree_related_samples.phi) related_samples_to_remove = hl.maximal_independent_set( struct.i, struct.j, False # pylint: disable=E1101 ) related_samples = pd.DataFrame( {'related_individual': related_samples_to_remove.node.collect()}) filename = output_path( f'king_90k_related_samples_maximal_independent_set.csv', 'analysis') related_samples.to_csv(filename, index=False)