def get_test_genotypes_bm(chrom, genotype_bm_path):
    """Write a dosage BlockMatrix for a fixed set of test samples.

    Rows are restricted to variants present in the meta-analysis results and
    columns to the samples listed in two pre-built sample tables (cases and
    controls) under ``scratch_dir``.

    Parameters
    ----------
    chrom
        ``'all'`` loads genotypes through ``get_filtered_mt_with_x()``; any
        other value is forwarded to ``get_filtered_mt(chrom=...)``.
    genotype_bm_path
        Destination of the BlockMatrix; an existing one is overwritten.
    """
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    #
    if chrom == 'all':
        genotypes = get_filtered_mt_with_x()
    else:
        genotypes = get_filtered_mt(chrom=chrom, entry_fields=('dosage', ))

    # Keep only variants that also appear in the meta-analysis results.
    in_meta = hl.is_defined(meta_mt.rows()[genotypes.row_key])
    genotypes = genotypes.filter_rows(in_meta)

    # if not hl.hadoop_is_file(f'{genotype_samples_ht_path}/_SUCCESS'):
    #     samples = mt.s.take(10)
    #     mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
    #     mt = mt.key_cols_by(userId=hl.int32(mt.s))
    #     mt.cols().add_index().write(genotype_samples_ht_path, overwrite=True)
    # else:
    #     samples_ht = hl.read_table(genotype_samples_ht_path)
    controls = hl.read_table(f'{scratch_dir}/genotype_samples_n10.ht')
    cases = hl.read_table(f'{scratch_dir}/genotype_samples_n10_cases.ht')
    samples_ht = cases.union(controls)

    # The sample tables are looked up by the int32 form of the sample id.
    selected = genotypes.filter_cols(
        hl.is_defined(samples_ht[hl.int32(genotypes.s)]))
    selected = selected.key_cols_by(userId=hl.int32(selected.s))
    print(selected.count())

    # Drop all non-key row/column fields before writing the entries.
    slim = selected.select_cols().select_rows().repartition(1000)
    BlockMatrix.write_from_entry_expr(slim.dosage, genotype_bm_path,
                                      overwrite=True)
def make_sumstats_bm(sumstats_bm_path, high_quality):
    """Write a BlockMatrix of clumped effect sizes for 'Type 2 diabetes'.

    Meta-analysis results are joined with clumping results, split per
    population combination, exploded by p-value threshold and restricted to
    the columns whose description is 'Type 2 diabetes' with ``p_threshold``
    equal to 1. Each written entry is the meta-analysis BETA when the variant
    has a clump entry and its p-value is below the threshold, otherwise 0.0.

    Parameters
    ----------
    sumstats_bm_path
        Destination of the BlockMatrix; an existing one is overwritten.
    high_quality
        Forwarded to ``get_clumping_results_path`` as ``high_quality_only``.
    """
    clump_path = get_clumping_results_path(high_quality_only=high_quality)

    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    clump_mt = hl.read_matrix_table(clump_path).rename({'pop': 'clump_pops'})

    joined = all_axis_join(meta_mt, clump_mt)
    for col_field, entry_field in (('clump_pops', 'plink_clump'),
                                   ('meta_analysis_data', 'meta_analysis')):
        joined = separate_results_mt_by_pop(joined, col_field, entry_field,
                                            skip_drop=True)

    # Keep columns where both sources refer to the same population set.
    joined = joined.filter_cols(joined.meta_analysis_data.pop == joined.clump_pops)
    exploded = explode_by_p_threshold(joined).unfilter_entries()

    is_t2d = exploded.description == 'Type 2 diabetes'
    is_unit_threshold = exploded.p_threshold == 1
    target = exploded.filter_cols(is_t2d & is_unit_threshold)

    beta = target.meta_analysis.BETA
    in_clump = hl.is_defined(target.plink_clump.TOTAL)
    below_threshold = hl.int(target.meta_analysis.Pvalue < target.p_threshold)
    weight = hl.or_else(beta * in_clump * below_threshold, 0.0)

    BlockMatrix.write_from_entry_expr(weight, sumstats_bm_path, overwrite=True)
def test_from_entry_expr(self):
    """Entry expressions give the same matrix whether inline, a field, cast, or written to disk."""
    dataset = get_dataset()
    dataset = dataset.annotate_entries(
        x=hl.or_else(dataset.GT.n_alt_alleles(), 0)).cache()

    def as_numpy(entry_expr):
        return BlockMatrix.from_entry_expr(entry_expr, block_size=32).to_numpy()

    # Reference: the inline (non-field) expression.
    expected = as_numpy(hl.or_else(dataset.GT.n_alt_alleles(), 0))

    # The stored field and its float64 cast must match the reference.
    from_field = as_numpy(dataset.x)
    from_cast = as_numpy(hl.float64(dataset.x))
    for candidate in (from_field, from_cast):
        self._assert_eq(expected, candidate)

    # Round trip through write_from_entry_expr / read.
    bm_path = new_temp_file()
    BlockMatrix.write_from_entry_expr(dataset.x, bm_path, block_size=32)
    round_tripped = BlockMatrix.read(bm_path).to_numpy()
    self._assert_eq(expected, round_tripped)
def test_from_entry_expr(self):
    """Check from_entry_expr and write_from_entry_expr agree on the same entries."""
    source = get_dataset()
    alt_count = hl.or_else(source.GT.n_alt_alleles(), 0)
    source = source.annotate_entries(x=alt_count).cache()

    # Inline expression, stored field, and float64 cast of the field.
    entry_exprs = [
        hl.or_else(source.GT.n_alt_alleles(), 0),
        source.x,
        hl.float64(source.x),
    ]
    baseline, *others = [
        BlockMatrix.from_entry_expr(expr, block_size=32).to_numpy()
        for expr in entry_exprs
    ]
    for other in others:
        self._assert_eq(baseline, other)

    # The directory is not created up front; the write creates it.
    with hl.TemporaryDirectory(ensure_exists=False) as path:
        BlockMatrix.write_from_entry_expr(source.x, path, block_size=32)
        from_disk = BlockMatrix.read(path).to_numpy()
        self._assert_eq(baseline, from_disk)
def test_write_from_entry_expr_overwrite(self):
    """Writing to an existing path fails unless overwrite=True is passed."""
    tiny = hl.balding_nichols_model(1, 1, 1)
    tiny = tiny.select_entries(x=tiny.GT.n_alt_alleles())
    expected = BlockMatrix.from_entry_expr(tiny.x)

    # Field expression.
    field_path = new_temp_file()
    BlockMatrix.write_from_entry_expr(tiny.x, field_path)
    with self.assertRaises(FatalError):
        BlockMatrix.write_from_entry_expr(tiny.x, field_path)

    BlockMatrix.write_from_entry_expr(tiny.x, field_path, overwrite=True)
    self._assert_eq(BlockMatrix.read(field_path), expected)

    # non-field expressions currently take a separate code path
    expr_path = new_temp_file()
    BlockMatrix.write_from_entry_expr(tiny.x + 1, expr_path)
    with self.assertRaises(FatalError):
        BlockMatrix.write_from_entry_expr(tiny.x + 1, expr_path)

    BlockMatrix.write_from_entry_expr(tiny.x + 2, expr_path, overwrite=True)
    self._assert_eq(BlockMatrix.read(expr_path), expected + 2)
def test_write_from_entry_expr_overwrite(self):
    """A second write to the same path raises FatalError; overwrite=True replaces it."""
    mt = hl.balding_nichols_model(1, 1, 1)
    mt = mt.select_entries(x=mt.GT.n_alt_alleles())
    bm = BlockMatrix.from_entry_expr(mt.x)

    # (first write, overwriting write, matrix expected after the overwrite)
    scenarios = [
        (mt.x, mt.x, bm),
        # non-field expressions currently take a separate code path
        (mt.x + 1, mt.x + 2, bm + 2),
    ]

    for initial, replacement, expected in scenarios:
        with hl.TemporaryDirectory(ensure_exists=False) as path:
            BlockMatrix.write_from_entry_expr(initial, path)
            with self.assertRaises(FatalError):
                BlockMatrix.write_from_entry_expr(initial, path)

            BlockMatrix.write_from_entry_expr(replacement, path, overwrite=True)
            self._assert_eq(BlockMatrix.read(path), expected)
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified. ``None`` and an empty list both mean "univariate only".
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation.

    Raises
    ------
    ValueError
        If the expressions do not all come from the same MatrixTable.
    """

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source
    mt_coord_expr = (mt_locus_expr if coord_expr is None
                     else coord_expr._indices.source)

    # Normalise `annotation_exprs` to a plain Python list exactly once. The
    # branches below then test the list rather than the argument itself, so a
    # single Hail expression (documented as valid input) is never asked for a
    # Python truth value.
    no_annotations = annotation_exprs is None or (
        isinstance(annotation_exprs, (list, tuple))
        and len(annotation_exprs) == 0)
    annotations = [] if no_annotations else wrap_to_list(annotation_exprs)

    sources = ([mt_locus_expr, mt_coord_expr]
               + [x._indices.source for x in annotations])
    if not all([mt == source for source in sources]):
        raise ValueError("ld_score: entry_expr, locus_expr, coord_expr "
                         "(if specified), and annotation_exprs (if "
                         "specified) must come from same MatrixTable.")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    # Write the windowed matrix out and read it back before it is reused.
    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotations:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*annotations).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        # Reshape to long form (one row per variant/annotation pair), then
        # pivot into a variants x annotations matrix.
        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
               .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = dict(enumerate(cols))

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    # The exported TSV has no header; the imported columns f0, f1, ... are
    # renamed to the annotation names by position.
    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({f'f{i}': name
                                  for i, name in col_idxs.items()})

    # Attach the scores to the loci by row index.
    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
import hail as hl
from hail.linalg import BlockMatrix

# Dataset name, reused for the BlockMatrix path and the metadata struct.
DATASET_NAME = '1000_Genomes_phase3_European_autosomes_maf_gt_001'
SOURCE_MT_PATH = 'gs://hail-datasets-hail-data/1000_Genomes_phase3_autosomes.GRCh37.mt'
BM_PATH = f'gs://hail-datasets-hail-data/{DATASET_NAME}.bm'

# Restrict to EUR samples, then keep variants whose first two allele
# frequencies both exceed 0.001.
mt = hl.read_matrix_table(SOURCE_MT_PATH)
mt = mt.filter_cols(mt.super_population == 'EUR')
mt = hl.variant_qc(mt)
mt = mt.filter_rows((mt.variant_qc.AF[0] > 0.001)
                    & (mt.variant_qc.AF[1] > 0.001))

# Write alt-allele counts, mean-imputed but neither centered nor normalized.
BlockMatrix.write_from_entry_expr(
    entry_expr=mt.GT.n_alt_alleles(),
    path=BM_PATH,
    mean_impute=True,
    center=False,
    normalize=False,
    block_size=4096,
    overwrite=True)

# Read the matrix back to record its dimensions.
bm = BlockMatrix.read(BM_PATH)

metadata = hl.struct(name=DATASET_NAME,
                     reference_genome='GRCh37',
                     n_rows=bm.n_rows,
                     n_cols=bm.n_cols,
                     block_size=bm.block_size)
def _test_linear_mixed_model_low_rank(self):
    """Check the low-rank LinearMixedModel path against NumPy on simulated data.

    Genotypes are simulated with the Balding-Nichols model, a phenotype is
    drawn from a multivariate normal, and the model is built from a low-rank
    eigendecomposition. The test asserts that the low-rank and full-rank
    decompositions agree, that the fitted effect sizes for the culprit
    variants average close to 1, and that the Hail and NumPy
    alternative-model fits give nearly identical p-values.

    The order of the ``prng`` draws matters: reordering them changes the
    simulated data even though the seed is fixed.
    """
    seed = 0
    n_populations = 8
    fst = n_populations * [.9]
    n_samples = 500
    n_variants = 200
    n_orig_markers = 100
    n_culprits = 10
    n_covariates = 3
    sigma_sq = 1
    tau_sq = 1

    from numpy.random import RandomState
    prng = RandomState(seed)

    # Covariates: a column of ones followed by n_covariates - 1 normal draws.
    x = np.hstack((np.ones(shape=(n_samples, 1)),
                   prng.normal(size=(n_samples, n_covariates - 1))))

    mt = hl.balding_nichols_model(n_populations=n_populations,
                                  n_samples=n_samples,
                                  n_variants=n_variants,
                                  fst=fst,
                                  af_dist=hl.rand_unif(0.1, 0.9, seed=seed),
                                  seed=seed)

    pa_t_path = utils.new_temp_file(suffix='bm')
    a_t_path = utils.new_temp_file(suffix='bm')

    BlockMatrix.write_from_entry_expr(mt.GT.n_alt_alleles(), a_t_path)

    # a is the transpose of the matrix written above; g takes its last
    # n_orig_markers columns and is standardized before building the kernel.
    a = BlockMatrix.read(a_t_path).T.to_numpy()
    g = a[:, -n_orig_markers:]
    g_std = self._filter_and_standardize_cols(g)

    # The filter step may drop columns, so count what remains.
    n_markers = g_std.shape[1]

    k = (g_std @ g_std.T) * n_samples / n_markers

    beta = np.arange(n_covariates)
    beta_stars = np.array([1] * n_culprits)

    # Phenotype: the first n_culprits columns of a each get effect 1, the
    # covariates get effects 0..n_covariates-1; covariance is
    # sigma_sq * k + tau_sq * I.
    y = prng.multivariate_normal(
        np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
        sigma_sq * k + tau_sq * np.eye(n_samples))

    # low rank computation of S, P
    l = g_std.T @ g_std
    sl, v = np.linalg.eigh(l)
    # Keep only eigenvalues above 1e-10 (eigh returns them in ascending order).
    n_eigenvectors = int(np.sum(sl > 1e-10))
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n_samples / n_markers)
    p = (g_std @ (v / np.sqrt(sl))).T

    # compare with full rank S, P
    sk0, uk = np.linalg.eigh(k)
    sk = sk0[-n_eigenvectors:]
    pk = uk[:, -n_eigenvectors:].T
    assert np.allclose(sk, s)
    # Eigenvectors are compared by absolute value, so sign flips are ignored.
    assert np.allclose(np.abs(pk), np.abs(p))

    # build and fit model
    py = p @ y
    px = p @ x
    pa = p @ a

    model = LinearMixedModel(py, px, s, y, x)
    assert model.n == n_samples
    assert model.f == n_covariates
    assert model.r == n_eigenvectors
    assert model.low_rank

    model.fit()

    # check effect sizes tend to be near 1 for first n_marker alternative models
    BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
    df_lmm = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()
    assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

    # compare NumPy and Hail LMM per alternative
    df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()
    assert np.min(df_numpy['chi_sq']) > 0

    # Rows containing any NaN, per implementation.
    na_numpy = df_numpy.isna().any(axis=1)
    na_lmm = df_lmm.isna().any(axis=1)

    # A few NaN rows are tolerated, and the two implementations may disagree
    # on at most 5 of them.
    assert na_numpy.sum() <= 10
    assert na_lmm.sum() <= 10
    assert np.logical_xor(na_numpy, na_lmm).sum() <= 5

    # Compare p-values only where both implementations produced a value.
    mask = ~(na_numpy | na_lmm)

    lmm_vs_numpy_p_value = np.sort(np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

    assert lmm_vs_numpy_p_value[10] < 1e-12  # 10 least p-values differences
    assert lmm_vs_numpy_p_value[-1] < 1e-8  # all p-values
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified. ``None`` and an empty list both mean "univariate only".
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation.

    Raises
    ------
    ValueError
        If the expressions do not all come from the same MatrixTable.
    """

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source
    mt_coord_expr = (mt_locus_expr if coord_expr is None
                     else coord_expr._indices.source)

    # Normalise `annotation_exprs` to a plain Python list exactly once. The
    # branches below then test the list rather than the argument itself, so a
    # single Hail expression (documented as valid input) is never asked for a
    # Python truth value.
    no_annotations = annotation_exprs is None or (
        isinstance(annotation_exprs, (list, tuple))
        and len(annotation_exprs) == 0)
    annotations = [] if no_annotations else wrap_to_list(annotation_exprs)

    sources = ([mt_locus_expr, mt_coord_expr]
               + [x._indices.source for x in annotations])
    if not all([mt == source for source in sources]):
        raise ValueError("ld_score: entry_expr, locus_expr, coord_expr "
                         "(if specified), and annotation_exprs (if "
                         "specified) must come from same MatrixTable.")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    # Write the windowed matrix out and read it back before it is reused.
    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotations:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*annotations).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        # Reshape to long form (one row per variant/annotation pair), then
        # pivot into a variants x annotations matrix.
        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
               .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = dict(enumerate(cols))

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    # The exported TSV has no header; the imported columns f0, f1, ... are
    # renamed to the annotation names by position.
    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({f'f{i}': name
                                  for i, name in col_idxs.items()})

    # Attach the scores to the loci by row index.
    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
def main(args):
    """Build the covariate-adjusted LD matrix and LD scores for one population.

    Each stage is controlled by a flag on ``args``:

    * ``write_mt`` - build and checkpoint the filtered dosage MatrixTable and
      write the variant index table; otherwise read the checkpoint.
    * ``write_bm`` - convert the dosages to a mean-imputed BlockMatrix.
    * ``compute_ld_matrix`` - normalize, project out covariates, and write
      the windowed LD matrix; otherwise read the existing LD matrix.
    * ``write_ldsc_hm3_snplist`` - write the HM3 SNP lists.
    * ``compute_ldscore`` - compute and export LD scores.

    Other attributes read: ``pop``, ``overwrite``, ``radius`` and
    ``ld_score_radius``.
    """
    pop = args.pop
    num_pcs = 10
    basic_covars = ['sex', 'age', 'age2', 'age_sex', 'age2_sex']
    covariates = basic_covars + [f'PC{x}' for x in range(1, num_pcs + 1)]

    tmp_mt_path = f'{temp_bucket_7day}/{pop}.mt'
    tmp_bm_path = f'{temp_bucket_7day}/{pop}.bm'

    if args.write_mt:
        mt = get_filtered_mt(chrom='all', pop=pop, entry_fields=['dosage'], min_mac=19, filter_mac_instead_of_ac=True)
        mt_x = get_filtered_mt(chrom='X', pop=pop, entry_fields=['dosage'], min_mac=19, filter_mac_instead_of_ac=True)
        mt = mt.union_rows(mt_x)
        mt = mt.annotate_rows(AF=hl.agg.mean(mt.dosage) / 2)
        mt = mt.checkpoint(tmp_mt_path, overwrite=args.overwrite)
        # Second element of count(); stored below as the n_samples global.
        n = mt.count()[1]

        # write variant indexes
        ht = mt.rows().select().add_index()
        ht = ht.annotate_globals(n_samples=n, pop=pop)
        ht.write(get_ld_variant_index_path(pop), overwrite=args.overwrite)
    else:
        mt = hl.read_matrix_table(tmp_mt_path)
        n = mt.count()[1]

    if args.write_bm:
        # convert mt to bm
        BlockMatrix.write_from_entry_expr(mt.dosage,
                                          tmp_bm_path,
                                          mean_impute=True,
                                          center=False,
                                          normalize=False,
                                          overwrite=args.overwrite)
    # NOTE(review): the flattened source does not show whether this read was
    # nested under `if args.write_bm`. It is placed at function level here so
    # that `bm` is defined on every path that reaches the LD-matrix stage;
    # confirm against the original layout.
    bm = BlockMatrix.read(tmp_bm_path)

    if args.compute_ld_matrix:
        print(f'BlockMatrix shape: {bm.shape}')

        # mean-center and normalize bm
        bm_norm = normalize_bm(bm)
        bm_norm = checkpoint_tmp(bm_norm)

        # take covariates (with intercept), make hat bms for FWL projection
        cov = mt.cols().select(*covariates).to_pandas().drop(['s'], axis=1)
        cov['Intercept'] = 1.0
        # hat1 = C and hat2 = (C^T C)^-1 C^T, so hat1 @ hat2 is the
        # projection onto the covariate columns.
        hat1 = cov.values
        hat2 = np.dot(np.linalg.inv(np.dot(cov.transpose(), cov)), cov.transpose())
        bm_hat1 = checkpoint_tmp(BlockMatrix.from_numpy(hat1))
        bm_hat2 = checkpoint_tmp(BlockMatrix.from_numpy(hat2))

        # Covariate adjustment; done in three checkpointed steps because the
        # matrix operations are huge
        bm_Z = checkpoint_tmp(bm_norm @ bm_hat1)
        bm_Z = checkpoint_tmp(bm_Z @ bm_hat2)
        bm_Z = checkpoint_tmp(bm_norm - bm_Z)

        # compute ld matrix with a specified radius
        bm_ldadj = (bm_Z @ bm_Z.T) / n
        starts_and_stops = hl.linalg.utils.locus_windows(mt.locus, radius=args.radius, _localize=False)
        bm_ldadj = bm_ldadj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False)

        # sparsify to a triangle matrix
        bm_ldadj = bm_ldadj.sparsify_triangle()
        bm_ldadj = bm_ldadj.checkpoint(get_ld_matrix_path(pop), overwrite=args.overwrite, force_row_major=True)
    else:
        bm_ldadj = BlockMatrix.read(get_ld_matrix_path(pop))

    if args.write_ldsc_hm3_snplist:
        # Note: currently, this writes snplists for all the populations at once
        write_ldsc_hm3_snplist(overwrite=args.overwrite)

    if args.compute_ldscore:
        ht_ldscore = copmute_ldscore(mt.rows(),
                                     bm_ldadj,
                                     n,
                                     radius=args.ld_score_radius,
                                     out_name=get_ld_score_ht_path(pop),
                                     overwrite=args.overwrite)
        export_ldscore(ht_ldscore, pop)
def main(args):
    """Run the polygenic risk score (PRS) pipeline stages selected on ``args``.

    Stages, each guarded by a flag:

    * ``prepare_sumstats_matrix`` - combine meta-analysis and per-population
      summary statistics, join with clumping results, explode by p-value
      threshold, and write the column table and the weights BlockMatrix.
    * ``prepare_genotype_matrix`` - write the sample table and the dosage
      BlockMatrix for variants present in the meta-analysis results.
    * ``compute_prs`` - multiply transposed genotypes by the weights.
    * ``create_prs_mt`` - convert the PRS BlockMatrix to an annotated
      MatrixTable.
    * ``assess_prs`` - regress phenotypes on scores per population and export
      the fit statistics.

    Other attributes read: ``high_quality``, ``max_pops`` and ``overwrite``.

    NOTE(review): nesting was reconstructed from a flattened source; confirm
    the block boundaries against the original file.
    """
    hl.init(default_reference='GRCh37', log='/prs.log',
            spark_conf={'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO', 'spark.hadoop.fs.gs.requester.pays.project.id': 'ukbb-diversepops-neale'})

    if args.prepare_sumstats_matrix:
        # get meta mt and separate by pop combo
        meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
        meta_mt = separate_results_mt_by_pop(meta_mt, 'meta_analysis_data', 'meta_analysis')
        meta_mt = meta_mt.annotate_cols(clump_pops=meta_mt.meta_analysis_data.pop)
        meta_mt = meta_mt.key_cols_by('clump_pops', *meta_mt.col_key)

        # get sumstats mt and separate by pop combo
        ss_mt = get_final_sumstats_mt_for_export()
        ss_mt = separate_results_mt_by_pop(ss_mt, 'pheno_data', 'summary_stats')
        # Wrap the single population in an array so the field has the same
        # shape as the meta-analysis clump_pops.
        ss_mt = ss_mt.annotate_cols(clump_pops=hl.array([ss_mt.pheno_data.pop]))
        ss_mt = ss_mt.key_cols_by(*meta_mt.col_key)

        # join meta results and sumstats mt
        # NOTE: union_cols() requires the same entry fields schema
        meta_mt = meta_mt.select_entries(BETA = meta_mt.meta_analysis.BETA,
                                         Pvalue = meta_mt.meta_analysis.Pvalue).select_cols().select_rows()
        ss_mt = ss_mt.select_entries(BETA = ss_mt.summary_stats.BETA,
                                     Pvalue = ss_mt.summary_stats.Pvalue).select_cols().select_rows()
        mt = meta_mt.union_cols(ss_mt)

        # filter to distinct cols
        # NOTE: distinct_by_col() does not allow a col key of type `list`
        mt = mt.annotate_cols(clump_pops_str = hl.delimit(mt.clump_pops)).key_cols_by('clump_pops_str', *[k for k in mt.col_key if k!='clump_pops']).distinct_by_col()
        # NOTE(review): this second distinct_by_col() looks redundant with the
        # one chained on the previous line - confirm before removing.
        mt = mt.distinct_by_col()

        # ensure that betas are not missing
        # (missing BETA/Pvalue are filled from the summary statistics, which
        # are re-keyed to match mt's column key)
        ss_mt = ss_mt.annotate_cols(clump_pops_str = hl.delimit(ss_mt.clump_pops)).key_cols_by('clump_pops_str', *[k for k in ss_mt.col_key if k!='clump_pops'])
        mt = mt.annotate_entries(BETA = hl.or_else(mt.BETA, ss_mt[mt.row_key, mt.col_key].BETA),
                                 Pvalue = hl.or_else(mt.Pvalue, ss_mt[mt.row_key, mt.col_key].Pvalue))

        # read clump mt and separate by pop combo
        clump_mt = hl.read_matrix_table(get_clumping_results_path(high_quality=args.high_quality, max_pops=args.max_pops))
        if args.max_pops:
            # if max_pops=True, the clump_mt is already separated by pop
            # these steps are necessary to make downstream code usable for both max_pops=True/False
            clump_mt = clump_mt.annotate_entries(plink_clump = hl.struct(TOTAL = clump_mt.TOTAL))
            clump_mt = clump_mt.annotate_cols(pop_index = 0)
        else:
            clump_mt = separate_results_mt_by_pop(clump_mt, 'clump_pops', 'plink_clump', skip_drop=True)

        clump_mt = clump_mt.annotate_cols(clump_pops_str = hl.delimit(clump_mt.clump_pops))
        clump_mt = clump_mt.drop('clump_pops').key_cols_by(*mt.col_key)

        # join sumstats/meta-analysis with clump mt
        mt = all_axis_join(mt, clump_mt)

        # Keep only columns that have a pop_index after the join.
        mt = mt.filter_cols(hl.is_defined(mt.pop_index))

        print(f'\n\nMatrix dimensions (before explode by p-threshold): {mt.count()}\n')
        mt = explode_by_p_threshold(mt).unfilter_entries()

        # Write pheno data for later use
        mt.add_col_index('idx').key_cols_by('idx').cols().write(
            get_clump_sumstats_col_ht_path(high_quality=args.high_quality,
                                           max_pops=args.max_pops),
            args.overwrite)
        # Entry = BETA when the variant has a clump entry and its p-value is
        # below the column's threshold; 0.0 otherwise (including missing).
        BlockMatrix.write_from_entry_expr(
            hl.or_else(mt.BETA * hl.is_defined(mt.plink_clump.TOTAL) * hl.int(mt.Pvalue < mt.p_threshold), 0.0),
            get_clump_sumstats_bm_path(high_quality=args.high_quality,
                                       max_pops=args.max_pops),
            args.overwrite)
        # 2020-06-25 01:49:32 Hail: INFO: Wrote all 7078 blocks of 28987534 x 3530 matrix with block size 4096.
        # If clump_mt is significantly smaller than meta_mt, consider putting that on the left of the join,
        # then filter the genotype matrix to only those SNPs (pilot would go from 28.9M -> 21.2M)

    if args.prepare_genotype_matrix:
        meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
        mt = get_filtered_mt_with_x()
        mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
        # Write sample data for later use
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
        mt.cols().add_index().write(genotype_samples_ht_path, args.overwrite)
        BlockMatrix.write_from_entry_expr(mt.dosage, genotype_bm_path, args.overwrite)
        # 2020-06-25 19:18:14 Hail: INFO: Wrote all 764424 blocks of 28987534 x 441345 matrix with block size 4096.

    if args.compute_prs:
        sumstats_bm = BlockMatrix.read(get_clump_sumstats_bm_path(high_quality=args.high_quality,
                                                                  max_pops=args.max_pops))
        genotype_bm = BlockMatrix.read(genotype_bm_path)
        mul_splits = 197 # sumstats_bm.shape[1]//10000*10
        sum_splits = 20 #int(mul_splits/10)
        assert mul_splits>10 # if not more than 10, sum_splits is not necessary
        prs_bm = tree_matmul_tree_matsum(genotype_bm.T, sumstats_bm, mul_splits=mul_splits,
                                         sum_splits=sum_splits,
                                         path_prefix = f'{temp_bucket}/prs/tree_matmul{"_max_pops" if args.max_pops else ""}',
                                         read_if_exists = True)
        prs_bm.write(get_prs_bm_path(high_quality=args.high_quality,
                                     max_pops=args.max_pops),
                     args.overwrite)

    if args.create_prs_mt:
        prs_bm = BlockMatrix.read(get_prs_bm_path(high_quality=args.high_quality,
                                                  max_pops=args.max_pops))
        pheno_ht = hl.read_table(get_clump_sumstats_col_ht_path(high_quality=args.high_quality,
                                                                max_pops=args.max_pops)).key_by('idx')
        samples_ht = hl.read_table(genotype_samples_ht_path).key_by('idx')
        # 10k partitions for 370 GB table (441k x 108k) = 37 MB/partition
        # 5014 partitions for 240 GB table (441k x 72k) = 48 MB/partition (max_pops)
        n_partitions = 15000 #int(1000*(pheno_ht.count()/72*5)//1000) # or hard code
        mt = BlockMatrix.to_matrix_table_row_major(prs_bm, n_partitions=n_partitions).rename({'element': 'score'})
        # Both lookup tables are keyed by 'idx', the index added when they
        # were written; columns get the phenotype fields and rows the sample
        # fields.
        mt = mt.annotate_cols(**pheno_ht[mt.col_key]).key_cols_by(*PHENO_KEY_FIELDS)
        mt = mt.annotate_rows(**samples_ht[mt.row_key]).key_rows_by('userId')
        mt.write(get_prs_mt_path(high_quality=args.high_quality,
                                 max_pops=args.max_pops),
                 args.overwrite)

    if args.assess_prs:
        prs_mt = hl.read_matrix_table(get_prs_mt_path(high_quality=args.high_quality,
                                                      max_pops=args.max_pops))
        pheno_mt = get_ukb_pheno_mt()  # TODO: fix all phenos to new keying scheme
        # Re-key so biomarker phenotypes use the "irnt" modifier.
        pheno_mt = pheno_mt.key_cols_by(
            **pheno_mt.col_key.annotate(modifier=hl.if_else(pheno_mt.trait_type == "biomarkers", "irnt", pheno_mt.modifier)))
        mt = prs_mt.annotate_entries(**pheno_mt[prs_mt.row_key, prs_mt.col_key])
        mt = mt.annotate_cols(description = pheno_mt.cols()[mt.col_key].description)
        for pop in POPS:
            mt_pop = mt.filter_rows(mt.pop==pop)
            # Per column: linear regression of both_sexes on an intercept and
            # the score.
            mt_pop = mt_pop.annotate_cols(prs_corr=hl.agg.linreg(mt_pop.both_sexes, [1.0, mt_pop.score]))
            cols = mt_pop.cols()
            cols.select('description',
                        'p_threshold',
                        clump_pops_str=hl.delimit(cols.clump_pops,'-'),
                        prs_corr_r2=cols.prs_corr.multiple_r_squared,
                        prs_corr_pval=cols.prs_corr.p_value[1],
                        prs_corr_n=cols.prs_corr.n).export(f'gs://ukbb-diverse-temp-30day/prs/assess_prs{"_max_pops" if args.max_pops else ""}.{pop}.tsv.gz')