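# The imports and helpers below are a best-guess sketch of what this test
# module assumes is in scope: `unset` and `ensure_k_groups` are called
# throughout but defined elsewhere in the original module, and the exact
# microscopes/distributions module paths may differ between versions.
import itertools as it

import numpy as np
import numpy.ma as ma
from scipy.special import gammaln

from nose.tools import (assert_equals,
                        assert_almost_equals,
                        assert_sequence_equal)

from microscopes.common.rng import rng
from microscopes.common.recarray.dataview import \
    numpy_dataview as cxx_numpy_dataview
from microscopes.mixture.model import initialize as cxx_initialize
from microscopes.mixture.definition import model_definition
from microscopes.models import bb, bnb, nich, dm
from distributions.dbg.models import (bb as dist_bb,
                                      bnb as dist_bnb,
                                      nich as dist_nich)


def unset(s, data, r):
    # minimal sketch: remove every entity from its current group so the
    # tests below can control group assignments explicitly
    for i, yi in enumerate(data):
        s.remove_value(i, yi, r)


def ensure_k_groups(s, k, r):
    # minimal sketch: delete or create (empty) groups until exactly k remain
    groups = sorted(s.groups())
    for gid in groups[k:]:
        s.delete_group(gid)
    for _ in range(max(0, k - len(groups))):
        s.create_group(r)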
def test_operations():
    # exercise add_value/remove_value/score_value on a small mixed-type
    # (bb, bb, nich, bb) mixture model
    N = 10
    R = rng(12)

    def mkrow():
        return (np.random.choice([False, True]),
                np.random.choice([False, True]),
                np.random.random(),
                np.random.choice([False, True]))
    dtype = [('', bool), ('', bool), ('', float), ('', bool)]

    # non-masked data
    data = [mkrow() for _ in range(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # *_initialize() randomly assigns all entities to a group, so we'll have
    # to unset this assignment for this test
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N
    cxx_s.dcheck_consistency()
    assert cxx_s.ngroups() == 3 and set(cxx_s.empty_groups()) == set([0, 1, 2])

    # alternate entities between groups 0 and 1, checking invariants as we go
    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    # remove the first two entities again
    for i, yi in it.islice(enumerate(data), 2):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()

    newrow = mkrow()
    newdata = np.array([newrow], dtype=dtype)

    # placeholder sanity check: score_value() should return a result
    cxx_score = cxx_s.score_value(newdata[0], R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
def test_sample_post_pred():
    # draw posterior-predictive samples for a partially masked row and build
    # an empirical distribution over the 2**D binary outcomes
    N = 10
    R = rng(5483932)
    D = 4

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in range(D))
    dtype = [('', bool)] * D

    data = [mkrow() for _ in range(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb] * D)
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [dist_bb.EXAMPLES[0]['shared']] * D,
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    G = 3
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, G, R)

    for i, yi in enumerate(data):
        egid = i % G
        cxx_s.add_value(egid, i, yi, R)

    # sample from the posterior predictive given a randomly masked new row
    y_new_data = mkrow()
    y_new_mask = tuple(randombool() for _ in range(D))
    y_new = ma.masked_array(
        np.array([y_new_data], dtype=dtype),
        mask=[y_new_mask])[0]

    n_samples = 1000
    cxx_samples = np.hstack(
        [cxx_s.sample_post_pred(y_new, R)[1] for _ in range(n_samples)])

    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        dist = np.zeros(len(idmap))
        for s in samples:
            dist[idmap[tuple(s)]] += 1.0
        dist /= dist.sum()
        return dist

    # placeholder sanity check on the empirical distribution
    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
def score_dataset(counts):
    # assign every row of `counts` to a single group and return the joint
    # data score under a dirichlet-multinomial (dm) feature model
    M, K = counts.shape
    Y = np.array([(y,) for y in counts], dtype=[('', int, (K,))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(M, [dm(K)])
    prior = {'alphas': [1.] * K}
    s = cxx_initialize(
        defn, view, r,
        feature_hps=[prior],
        assignment=[0] * M)
    assert_equals(s.groups(), [0])
    return s.score_data(None, None, r)
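# Hypothetical usage sketch for score_dataset() (an assumption, not part of
# the original tests): with every row assigned to one group, the dm joint
# data score is exchangeable in the rows, so permuting them should leave
# the score unchanged.
def _check_score_dataset_permutation_invariance():
    counts = np.random.randint(low=0, high=10, size=(5, 4))
    s0 = score_dataset(counts)
    # np.random.permutation shuffles along the first axis (the rows)
    s1 = score_dataset(np.random.permutation(counts))
    assert_almost_equals(s0, s1, places=5)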
def test_masked_operations():
    # same exercise as test_operations(), but with randomly masked entries
    N = 10
    R = rng(2347785)

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(),
                np.random.randint(1, 10),
                np.random.random())

    def mkmask():
        return (randombool(), randombool(), randombool())

    data = [mkrow() for _ in range(N)]
    data = np.array(data, dtype=dtype)
    mask = [mkmask() for _ in range(N)]
    data = ma.masked_array(data, mask=mask)

    defn = model_definition(N, [bb, bnb, nich])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 10.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bnb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # see the comment in test_operations() about unsetting the random
    # initial assignment
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    for i, yi in enumerate(data):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()
def test_dm_cxx():
    # the dm sufficient statistics should simply be the per-category
    # column sums of the data
    K = 4
    Y = np.array([
        ([0, 1, 2, 5],),
        ([1, 0, 1, 2],),
        ([0, 2, 9, 9],),
    ], dtype=[('', int, (K,))])
    Y_np = np.vstack([y[0] for y in Y])

    cxx_view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(K)])
    prior = {'alphas': [1.] * K}

    cxx_s = cxx_initialize(
        defn, cxx_view, r,
        feature_hps=[prior],
        assignment=[0] * Y.shape[0])

    counts = cxx_s.get_suffstats(0, 0)['counts']
    assert_sequence_equal(counts, list(Y_np.sum(axis=0)))
def test_betabin_equiv():
    # a dm with K=2 categories is equivalent to a beta-binomial model;
    # betabin_like is taken from pymc:
    # https://github.com/pymc-devs/pymc/blob/
    # a7ab153f2b58d81824a56166747c678d7f421bde/pymc/distributions/discrete.py#L84
    def betabin_like(value, alpha, beta, n):
        return (gammaln(alpha + beta)
                - gammaln(alpha)
                - gammaln(beta)
                + gammaln(n + 1)
                - gammaln(value + 1)
                - gammaln(n - value + 1)
                + gammaln(alpha + value)
                + gammaln(n + beta - value)
                - gammaln(beta + alpha + n))

    # this N refers to the number of trials in the binomial distribution
    N = 10

    # this M refers to the dataset size
    M = 100

    # hyperparams of the beta distribution
    alpha, beta = 1., 2.

    heads = np.random.randint(low=0, high=N + 1, size=M)
    tails = N - heads
    data = np.vstack((heads, tails)).T

    Y = np.array([(y,) for y in data], dtype=[('', int, (2,))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(2)])
    prior = {'alphas': [alpha, beta]}

    s = cxx_initialize(
        defn, view, r,
        feature_hps=[prior],
        assignment=[0] * Y.shape[0])
    assert_equals(s.groups(), [0])

    def all_indices(N):
        # enumerate all (i, j) with i + j == N
        for i, j in it.product(range(0, N + 1), repeat=2):
            if (i + j) == N:
                yield i, j

    all_data = [(list(ij),) for ij in all_indices(N)]
    Y_test = np.array(all_data, dtype=[('', int, (2,))])

    # the actual score is simply a betabin using the updated alpha, beta
    alpha1, beta1 = np.array([alpha, beta]) + data.sum(axis=0)

    def model_score(Y_value):
        _, (score,) = s.score_value(Y_value, r)
        return score

    def test_score(Y_value):
        return betabin_like(Y_value[0][0], alpha1, beta1, N)

    model_scores = np.array([model_score(y) for y in Y_test])
    test_scores = np.array([test_score(y) for y in Y_test])

    # both predictive distributions should normalize, and should agree
    assert_almost_equals(np.exp(model_scores).sum(), 1., places=2)
    assert_almost_equals(np.exp(test_scores).sum(), 1., places=2)
    assert_almost_equals(
        np.abs(model_scores - test_scores).max(), 0., places=1)
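# Hypothetical smoke-run entry point (an assumption; the original module is
# presumably driven by a test runner such as nose). Running the module
# directly should execute every test above.
if __name__ == '__main__':
    test_operations()
    test_sample_post_pred()
    _check_score_dataset_permutation_invariance()
    test_masked_operations()
    test_dm_cxx()
    test_betabin_equiv()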