def test_aggregate_ir(self):
    ds = (hl.utils.range_matrix_table(5, 5)
          .annotate_globals(g1=5)
          .annotate_entries(e1=3))

    x = [("col_idx", lambda e: ds.aggregate_cols(e)),
         ("row_idx", lambda e: ds.aggregate_rows(e))]

    for name, f in x:
        r = f(hl.struct(x=agg.sum(ds[name]) + ds.g1,
                        y=agg.filter(ds[name] % 2 != 0, agg.sum(ds[name] + 2)) + ds.g1,
                        z=agg.sum(ds.g1 + ds[name]) + ds.g1,
                        mean=agg.mean(ds[name])))
        self.assertEqual(convert_struct_to_dict(r),
                         {u'x': 15, u'y': 13, u'z': 40, u'mean': 2.0})

        r = f(5)
        self.assertEqual(r, 5)

        r = f(hl.null(hl.tint32))
        self.assertEqual(r, None)

        r = f(agg.filter(ds[name] % 2 != 0, agg.sum(ds[name] + 2)) + ds.g1)
        self.assertEqual(r, 13)

    # Four odd/odd entries, each contributing e1 + g1 + row_idx + col_idx:
    # (3 + 5) * 4 + (1+1) + (1+3) + (3+1) + (3+3) = 48, plus g1 = 53.
    r = ds.aggregate_entries(
        agg.filter((ds.row_idx % 2 != 0) & (ds.col_idx % 2 != 0),
                   agg.sum(ds.e1 + ds.g1 + ds.row_idx + ds.col_idx)) + ds.g1)
    self.assertEqual(r, 53)
def test_aggregate_ir(self):
    kt = hl.utils.range_table(10).annotate_globals(g1=5)

    r = kt.aggregate(hl.struct(x=agg.sum(kt.idx) + kt.g1,
                               y=agg.filter(kt.idx % 2 != 0, agg.sum(kt.idx + 2)) + kt.g1,
                               z=agg.sum(kt.g1 + kt.idx) + kt.g1))
    self.assertEqual(convert_struct_to_dict(r), {u'x': 50, u'y': 40, u'z': 100})

    r = kt.aggregate(5)
    self.assertEqual(r, 5)

    r = kt.aggregate(hl.null(hl.tint32))
    self.assertEqual(r, None)

    r = kt.aggregate(agg.filter(kt.idx % 2 != 0, agg.sum(kt.idx + 2)) + kt.g1)
    self.assertEqual(r, 40)
def hwe_normalize(call_expr):
    mt = matrix_table_source('hwe_normalize/call_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                          __n_called=agg.count_where(hl.is_defined(mt.__gt)))
    # Drop monomorphic sites: variants with no alternate alleles or no
    # reference alleles have zero variance and cannot be normalized.
    mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

    n_variants = mt.count_rows()
    if n_variants == 0:
        raise FatalError(
            "hwe_normalize: found 0 variants after filtering out monomorphic sites.")
    info(f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites.")

    mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
    mt = mt.annotate_rows(
        __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2))
    mt = mt.unfilter_entries()

    # Missing genotypes are imputed at the variant mean (normalized value 0).
    normalized_gt = hl.or_else(
        (mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev,
        0.0)
    return normalized_gt
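# A minimal usage sketch (hedged, not part of the library): hwe_normalize
# returns an entry expression, so it composes directly with methods such as
# hl.pca. `hl.balding_nichols_model` is used here only to fabricate a GT field.
def _example_hwe_normalize_usage():
    mt = hl.balding_nichols_model(n_populations=3, n_samples=100, n_variants=1000)
    eigenvalues, scores, _ = hl.pca(hwe_normalize(mt.GT), k=5)
    return eigenvalues, scores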
def test_select_cols(self):
    mt = hl.utils.range_matrix_table(3, 5, n_partitions=4)
    mt = mt.annotate_entries(e=mt.col_idx * mt.row_idx)
    mt = mt.annotate_globals(g=1)
    mt = mt.annotate_cols(sum=agg.sum(mt.e + mt.col_idx + mt.row_idx + mt.g) + mt.col_idx + mt.g,
                          count=agg.count_where(mt.e % 2 == 0),
                          foo=agg.count())

    result = convert_struct_to_dict(mt.cols().collect()[-2])
    self.assertEqual(result, {'col_idx': 3, 'sum': 28, 'count': 2, 'foo': 3})
def hwe_normalized_pca(dataset, k=10, compute_loadings=False, as_array=False):
    """Run principal component analysis (PCA) on the Hardy-Weinberg-normalized
    call matrix.

    Examples
    --------

    >>> eigenvalues, scores, loadings = methods.hwe_normalized_pca(dataset, k=5)

    Notes
    -----
    Variants that are all homozygous reference or all homozygous variant are
    removed before evaluation.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    as_array : :obj:`bool`
        If ``True``, return scores and loadings as an array field. If ``False``,
        return one field per element (`PC1`, `PC2`, ... `PCk`).

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    dataset = dataset.annotate_rows(AC=agg.sum(dataset.GT.num_alt_alleles()),
                                    n_called=agg.count_where(functions.is_defined(dataset.GT)))
    dataset = dataset.filter_rows(
        (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called)).persist()

    n_variants = dataset.count_rows()
    if n_variants == 0:
        raise FatalError(
            "Cannot run PCA: found 0 variants after filtering out monomorphic sites.")
    info("Running PCA using {} variants.".format(n_variants))

    # `bind` evaluates the mean genotype once per variant and reuses it; missing
    # genotypes normalize to 0 (mean imputation).
    entry_expr = functions.bind(
        dataset.AC / dataset.n_called,
        lambda mean_gt: functions.cond(
            functions.is_defined(dataset.GT),
            (dataset.GT.num_alt_alleles() - mean_gt)
            / functions.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2),
            0))
    result = pca(entry_expr, k, compute_loadings, as_array)
    dataset.unpersist()
    return result
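# A hedged numpy rendering of `entry_expr` above (assumption: a dense genotype
# matrix `C`, samples by variants, with np.nan marking missing calls), meant
# only to make the normalization concrete:
def _normalized_entries(C):
    import numpy as np
    n, m = C.shape
    mean_gt = np.nanmean(C, axis=0)  # per-variant mean genotype, i.e. 2 * allele frequency
    M = (C - mean_gt) / np.sqrt(mean_gt * (2 - mean_gt) * m / 2)
    return np.where(np.isnan(M), 0.0, M)  # mean-impute missing entries as 0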
def test_aggregate2(self):
    schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)

    rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
            {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}]

    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(
        kt.group_by(status=kt.status)
          .aggregate(
              x1=agg.collect(kt.qPheno * 2),
              x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
              x3=agg.min(kt.qPheno),
              x4=agg.max(kt.qPheno),
              x5=agg.sum(kt.qPheno),
              x6=agg.product(hl.int64(kt.qPheno)),
              x7=agg.count(),
              x8=agg.count_where(kt.qPheno == 3),
              x9=agg.fraction(kt.qPheno == 1),
              x10=agg.stats(hl.float64(kt.qPheno)),
              x11=agg.hardy_weinberg_test(kt.GT),
              x13=agg.inbreeding(kt.GT, 0.1),
              x14=agg.call_stats(kt.GT, ["A", "T"]),
              x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
              x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
              x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
              x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
              x19=agg.take(kt.GT, 1, ordering=-kt.qPheno)
          ).take(1)[0])

    expected = {u'status': 0,
                u'x13': {u'n_called': 2, u'expected_homs': 1.64,
                         u'f_stat': -1.777777777777777, u'observed_homs': 1},
                u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4,
                         u'homozygote_count': [1, 0]},
                u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'},
                u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0,
                         u'stdev': 5.0, u'n': 2, u'mean': 8.0},
                u'x8': 1, u'x9': 0.0, u'x16': u'apple',
                u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5},
                u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39,
                u'x7': 2, u'x4': 13, u'x5': 16, u'x17': [], u'x18': [],
                u'x19': [hl.Call([0, 1])]}

    self.maxDiff = None
    self.assertDictEqual(result, expected)
def test_query(self):
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32,
                        e=hl.tstr, f=hl.tarray(hl.tint32))

    rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}]

    kt = hl.Table.parallelize(rows, schema)
    results = kt.aggregate(hl.Struct(q1=agg.sum(kt.b),
                                     q2=agg.count(),
                                     q3=agg.collect(kt.e),
                                     q4=agg.collect(agg.filter((kt.d >= 5) | (kt.a == 0), kt.e))))

    self.assertEqual(results.q1, 8)
    self.assertEqual(results.q2, 3)
    self.assertEqual(set(results.q3), {"hello", "cat", "dog"})
    self.assertEqual(set(results.q4), {"hello", "cat"})
def test_aggregate1(self):
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32,
                        e=hl.tstr, f=hl.tarray(hl.tint32))

    rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]}]

    kt = hl.Table.parallelize(rows, schema)
    results = kt.aggregate(hl.Struct(q1=agg.sum(kt.b),
                                     q2=agg.count(),
                                     q3=agg.collect(kt.e),
                                     q4=agg.filter((kt.d >= 5) | (kt.a == 0), agg.collect(kt.e)),
                                     q5=agg.explode(lambda elt: agg.mean(elt), kt.f)))

    self.assertEqual(results.q1, 8)
    self.assertEqual(results.q2, 3)
    self.assertEqual(set(results.q3), {"hello", "cat", "dog"})
    self.assertEqual(set(results.q4), {"hello", "cat"})
    self.assertAlmostEqual(results.q5, 4)
def _make_tsm_from_call(call_expr,
                        block_size,
                        mean_center=False,
                        hwe_normalize=False):
    mt = matrix_table_source('_make_tsm/entry_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    if mean_center or hwe_normalize:
        mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                              __n_called=agg.count_where(hl.is_defined(mt.__gt)))
        mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

        n_variants = mt.count_rows()
        if n_variants == 0:
            raise FatalError(
                "_make_tsm: found 0 variants after filtering out monomorphic sites.")
        info(f"_make_tsm: found {n_variants} variants after filtering out monomorphic sites.")

        mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
        mt = mt.unfilter_entries()

        # Mean-center, imputing missing genotypes at the variant mean.
        mt = mt.select_entries(__x=hl.or_else(mt.__gt - mt.__mean_gt, 0.0))

        if hwe_normalize:
            mt = mt.annotate_rows(
                __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2))
            mt = mt.select_entries(__x=mt.__x / mt.__hwe_scaled_std_dev)
    else:
        mt = mt.select_entries(__x=mt.__gt)

    A, ht = mt_to_table_of_ndarray(mt.__x, block_size, return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
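# A minimal usage sketch (hedged; this is an internal helper, so the call below
# is illustrative only). `hl.balding_nichols_model` merely fabricates a GT field:
def _example_make_tsm():
    mt = hl.balding_nichols_model(n_populations=2, n_samples=50, n_variants=200)
    return _make_tsm_from_call(mt.GT, block_size=32, hwe_normalize=True)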
def test_grm(self):
    tolerance = 0.001

    def load_id_file(path):
        ids = []
        with hl.hadoop_open(path) as f:
            for l in f:
                r = l.strip().split('\t')
                self.assertEqual(len(r), 2)
                ids.append(r[1])
        return ids

    def load_rel(ns, path):
        rel = np.zeros((ns, ns))
        with hl.hadoop_open(path) as f:
            for i, l in enumerate(f):
                for j, n in enumerate(map(float, l.strip().split('\t'))):
                    rel[i, j] = n
                self.assertEqual(j, i)
            self.assertEqual(i, ns - 1)
        return rel

    def load_grm(ns, nv, path):
        m = np.zeros((ns, ns))
        with utils.hadoop_open(path) as f:
            i = 0
            for l in f:
                row = l.strip().split('\t')
                self.assertEqual(int(row[2]), nv)
                m[int(row[0]) - 1, int(row[1]) - 1] = float(row[3])
                i += 1
            self.assertEqual(i, ns * (ns + 1) / 2)
        return m

    def load_bin(ns, path):
        m = np.zeros((ns, ns))
        with utils.hadoop_open(path, 'rb') as f:
            for i in range(ns):
                for j in range(i + 1):
                    b = f.read(4)
                    self.assertEqual(len(b), 4)
                    m[i, j] = unpack('<f', bytearray(b))[0]
            left = f.read()
            self.assertEqual(len(left), 0)
        return m

    b_file = utils.new_temp_file(prefix="plink")
    rel_file = utils.new_temp_file(prefix="test", suffix="rel")
    rel_id_file = utils.new_temp_file(prefix="test", suffix="rel.id")
    grm_file = utils.new_temp_file(prefix="test", suffix="grm")
    grm_bin_file = utils.new_temp_file(prefix="test", suffix="grm.bin")
    grm_nbin_file = utils.new_temp_file(prefix="test", suffix="grm.N.bin")

    dataset = self.get_dataset()
    n_samples = dataset.count_cols()
    dataset = dataset.annotate_rows(AC=agg.sum(dataset.GT.n_alt_alleles()),
                                    n_called=agg.count_where(hl.is_defined(dataset.GT)))
    dataset = dataset.filter_rows((dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called))
    dataset = dataset.filter_rows(dataset.n_called == n_samples).persist()

    hl.export_plink(dataset, b_file, id=dataset.s)

    sample_ids = [row.s for row in dataset.cols().select('s').collect()]
    n_variants = dataset.count_rows()
    self.assertGreater(n_variants, 0)

    grm = hl.genetic_relatedness_matrix(dataset)
    grm.export_id_file(rel_id_file)

    ############
    ### rel

    p_file = utils.new_temp_file(prefix="plink")
    syscall('''plink --bfile {} --make-rel --out {}'''
            .format(utils.uri_path(b_file), utils.uri_path(p_file)),
            shell=True, stdout=DEVNULL, stderr=DEVNULL)
    self.assertEqual(load_id_file(p_file + ".rel.id"), sample_ids)

    grm.export_rel(rel_file)
    self.assertEqual(load_id_file(rel_id_file), sample_ids)
    self.assertTrue(np.allclose(load_rel(n_samples, p_file + ".rel"),
                                load_rel(n_samples, rel_file),
                                atol=tolerance))

    ############
    ### gcta-grm

    p_file = utils.new_temp_file(prefix="plink")
    syscall('''plink --bfile {} --make-grm-gz --out {}'''
            .format(utils.uri_path(b_file), utils.uri_path(p_file)),
            shell=True, stdout=DEVNULL, stderr=DEVNULL)
    self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

    grm.export_gcta_grm(grm_file)
    self.assertTrue(np.allclose(load_grm(n_samples, n_variants, p_file + ".grm.gz"),
                                load_grm(n_samples, n_variants, grm_file),
                                atol=tolerance))

    ############
    ### gcta-grm-bin

    p_file = utils.new_temp_file(prefix="plink")
    syscall('''plink --bfile {} --make-grm-bin --out {}'''
            .format(utils.uri_path(b_file), utils.uri_path(p_file)),
            shell=True, stdout=DEVNULL, stderr=DEVNULL)
    self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

    grm.export_gcta_grm_bin(grm_bin_file, grm_nbin_file)
    self.assertTrue(np.allclose(load_bin(n_samples, p_file + ".grm.bin"),
                                load_bin(n_samples, grm_bin_file),
                                atol=tolerance))
    self.assertTrue(np.allclose(load_bin(n_samples, p_file + ".grm.N.bin"),
                                load_bin(n_samples, grm_nbin_file),
                                atol=tolerance))
def grm(dataset):
    """Compute the Genetic Relatedness Matrix (GRM).

    .. include:: ../_templates/req_tvariant.rst
    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    >>> km = methods.grm(dataset)

    Notes
    -----
    The genetic relationship matrix (GRM) :math:`G` encodes genetic correlation
    between each pair of samples. It is defined by :math:`G = MM^T` where
    :math:`M` is a standardized version of the genotype matrix, computed as
    follows. Let :math:`C` be the :math:`n \\times m` matrix of raw genotypes
    in the variant dataset, with rows indexed by :math:`n` samples and columns
    indexed by :math:`m` biallelic autosomal variants; :math:`C_{ij}` is the
    number of alternate alleles of variant :math:`j` carried by sample
    :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`,
    the sample alternate allele frequency :math:`p_j` is computed as half the
    mean of the non-missing entries of column :math:`j`. Entries of :math:`M`
    are then mean-centered and variance-normalized as

    .. math::

        M_{ij} = \\frac{C_{ij} - 2p_j}{\\sqrt{2p_j(1 - p_j)m}},

    with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype
    imputation). This scaling normalizes genotype variances to a common value
    :math:`1/m` for variants in Hardy-Weinberg equilibrium and is further
    motivated in the paper `Patterson, Price and Reich, 2006
    <http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.0020190>`__.
    (The resulting amplification of signal from the low end of the allele
    frequency spectrum will also introduce noise for rare variants; common
    practice is to filter out variants with minor allele frequency below some
    cutoff.) The factor :math:`1/m` gives each sample row approximately unit
    total variance (assuming linkage equilibrium) so that the diagonal entries
    of the GRM are approximately 1. Equivalently,

    .. math::

        G_{ik} = \\frac{1}{m} \\sum_{j=1}^m \\frac{(C_{ij} - 2p_j)(C_{kj} - 2p_j)}{2 p_j (1 - p_j)}

    Warning
    -------
    Since Hardy-Weinberg normalization cannot be applied to variants that
    contain only reference alleles or only alternate alleles, all such variants
    are removed prior to calculating the GRM.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset to sample from.

    Returns
    -------
    :class:`genetics.KinshipMatrix`
        Genetic Relatedness Matrix for all samples.
    """
    dataset = dataset.annotate_rows(AC=agg.sum(dataset.GT.num_alt_alleles()),
                                    n_called=agg.count_where(functions.is_defined(dataset.GT)))
    dataset = dataset.filter_rows(
        (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called)).persist()

    n_variants = dataset.count_rows()
    if n_variants == 0:
        raise FatalError(
            "Cannot run GRM: found 0 variants after filtering out monomorphic sites.")
    info("Computing GRM using {} variants.".format(n_variants))

    # Standardize each genotype as in the docstring: mean-center by 2*p_j and
    # scale by the HWE standard deviation; missing genotypes normalize to 0.
    normalized_genotype_expr = functions.bind(
        dataset.AC / dataset.n_called,
        lambda mean_gt: functions.cond(
            functions.is_defined(dataset.GT),
            (dataset.GT.num_alt_alleles() - mean_gt)
            / functions.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2),
            0))

    bm = BlockMatrix.from_matrix_table(normalized_genotype_expr)
    dataset.unpersist()
    # bm is variants x samples, so bm.T * bm = M M^T is the samples x samples GRM.
    grm = bm.T.dot(bm)

    return KinshipMatrix._from_block_matrix(
        dataset.colkey_schema,
        grm,
        [row.s for row in dataset.cols_table().select('s').collect()],
        n_variants)
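# A hedged toy check of the identity above (made-up data; rows are samples and
# columns are variants, matching the docstring's n x m matrix C):
def _toy_grm():
    import numpy as np
    C = np.array([[0, 1, 2],
                  [1, 1, 0],
                  [2, 0, 1],
                  [0, 2, 1]], dtype=float)       # 4 samples x 3 variants
    p = C.mean(axis=0) / 2                       # sample alternate allele frequencies p_j
    m = C.shape[1]
    M = (C - 2 * p) / np.sqrt(2 * p * (1 - p) * m)
    return M @ M.T                               # G = M M^T; diagonal entries are ~1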