def test_alpha_numeric():
    docs = [list('abcd'), list('cdef')]
    defn = model_definition(len(docs), v=6)
    prng = rng()
    s = initialize(defn, docs, prng)
    assert_equals(s.nentities(), len(docs))
    assert_equals(s.nwords(), 6)
def _test_runner_simple(defn, kc_fn):
    views = map(numpy_dataview, toy_dataset(defn))
    kc = kc_fn(defn)
    prng = rng()
    latent = model.initialize(defn, views, prng)
    r = runner.runner(defn, views, latent, kc)
    r.run(prng, 10)
def test_slice_theta_mm():
    N = 100
    data = np.array(
        [(np.random.random() < 0.8,) for _ in xrange(N)],
        dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    r = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}
    view = numpy_dataview(data)
    s = initialize(defn,
                   view,
                   cluster_hp={'alpha': 1., 'beta': 9.},
                   feature_hps=[prior],
                   r=r,
                   assignment=[0] * N)

    # by conjugacy, the exact posterior over p is
    # Beta(alpha + heads, beta + tails)
    heads = len([1 for y in data if y[0]])
    tails = N - heads
    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails

    bs = bind(s, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, 0)['p']

    rv = beta(alpha1, beta1)
    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
def test_posterior_predictive_statistic():
    N, D = 10, 4  # D needs to be even
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]

    q = ma.masked_array(
        np.array([(False,) * D], dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)])

    statistic = query.posterior_predictive_statistic(q, latents, prng)
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    statistic = query.posterior_predictive_statistic(
        q, latents, prng, merge='mode')
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    statistic = query.posterior_predictive_statistic(
        q, latents, prng, merge=['mode', 'mode', 'avg', 'avg'])
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    q = ma.masked_array(
        np.array([(False,) * D] * 3, dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)] * 3)

    statistic = query.posterior_predictive_statistic(q, latents, prng)
    assert_equals(statistic.shape, (3,))
    assert_equals(len(statistic.dtype), D)
def test_cant_serialize():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    prng = rng()
    s = initialize(defn, data, prng)
    # per the test name, serialization is expected to fail for this state;
    # the exact exception type is assumed here, so we assert broadly
    assert_raises(Exception, s.serialize)
def test_runner_multiprocessing_convergence():
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng)
               for _ in xrange(mp.cpu_count())]
    runners = [runner.runner(defn, view, latent, ['assign'])
               for latent in latents]
    r = parallel.runner(runners)
    r.run(r=prng, niters=1000)  # burnin

    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_iter():
        r.run(r=prng, niters=10)
        for latent in r.get_latents():
            yield idmap[tuple(permutation_canonical(latent.assignments()))]

    ref = [None]

    def sample_fn():
        if ref[0] is None:
            ref[0] = sample_iter()
        try:
            return next(ref[0])
        except StopIteration:
            ref[0] = None
        return sample_fn()

    assert_discrete_dist_approx(sample_fn, posterior,
                                ntries=100, kl_places=2)
def test_slice_theta_irm():
    N = 10
    defn = model_definition([N], [((0, 0), bbnc)])
    data = np.random.random(size=(N, N)) < 0.8
    view = numpy_dataview(data)
    r = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}
    s = initialize(defn,
                   [view],
                   r=r,
                   cluster_hps=[{'alpha': 2.0}],
                   relation_hps=[prior],
                   domain_assignments=[[0] * N])
    bs = bind(s, 0, [view])
    params = {0: {'p': 0.05}}

    # by conjugacy, the exact posterior over p is
    # Beta(alpha + heads, beta + tails)
    heads = len([1 for y in data.flatten() if y])
    tails = len([1 for y in data.flatten() if not y])
    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails

    def sample_fn():
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, [0, 0])['p']

    rv = beta(alpha1, beta1)
    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
def test_runner_multiprocessing_convergence():
    domains = [4]
    defn = model_definition(domains, [((0, 0), bb)])
    prng = rng()
    relations, posterior = data_with_posterior(defn, prng)
    views = map(numpy_dataview, relations)
    latents = [model.initialize(defn, views, prng)
               for _ in xrange(mp.cpu_count())]
    kc = [('assign', range(len(domains)))]
    runners = [runner.runner(defn, views, latent, kc) for latent in latents]
    r = parallel.runner(runners)
    r.run(r=prng, niters=10000)  # burnin

    product_assignments = tuple(map(list, map(permutation_iter, domains)))
    idmap = {C: i for i, C in enumerate(it.product(*product_assignments))}

    def sample_iter():
        r.run(r=prng, niters=10)
        for latent in r.get_latents():
            key = tuple(tuple(permutation_canonical(latent.assignments(i)))
                        for i in xrange(len(domains)))
            yield idmap[key]

    ref = [None]

    def sample_fn():
        if ref[0] is None:
            ref[0] = sample_iter()
        try:
            return next(ref[0])
        except StopIteration:
            ref[0] = None
        return sample_fn()

    assert_discrete_dist_approx(sample_fn, posterior,
                                ntries=100, kl_places=2)
def test_cxx_sample_post_pred_given_data():
    assert D == 5
    y_new = ma.masked_array(
        np.array([(True, False, True, True, True)],
                 dtype=[('', np.bool)] * 5),
        mask=[(False, False, True, True, True)])[0]
    _test_sample_post_pred(
        cxx_initialize, cxx_numpy_dataview, y_new, rng(543234))
def data_with_posterior(defn,
                        cluster_hp=None,
                        feature_hps=None,
                        preprocess_data_fn=None,
                        r=None):
    # XXX(stephentu): should only accept conjugate models
    if r is None:
        r = rng()
    Y_clusters, _ = sample(defn, cluster_hp, feature_hps, r)
    Y = np.hstack(Y_clusters)
    if preprocess_data_fn:
        Y = preprocess_data_fn(Y)
    data = numpy_dataview(Y)

    def score_fn(assignment):
        s = initialize(defn,
                       data,
                       r,
                       cluster_hp=cluster_hp,
                       feature_hps=feature_hps,
                       assignment=assignment)
        return s.score_joint(r)

    posterior = dist_on_all_clusterings(score_fn, defn.n())
    return Y, posterior
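# A minimal usage sketch for data_with_posterior, mirroring the convergence
# tests elsewhere in this collection: the brute-force posterior over all
# clusterings is compared against samples drawn by a Gibbs kernel. The
# bind/gibbs_assign/permutation_canonical wiring below follows the other
# tests here and is an illustrative assumption, not a fixed recipe.
def _example_data_with_posterior_usage():
    prng = rng()
    defn = model_definition(4, [bb] * 5)
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)
    s = initialize(defn, view, r=prng)
    bs = bind(s, view)

    # map each canonical clustering to its index in the posterior vector
    idmap = {C: i for i, C in enumerate(permutation_iter(defn.n()))}

    for _ in xrange(1000):  # burnin
        gibbs_assign(bs, prng)

    def sample_fn():
        for _ in xrange(10):  # thin between samples
            gibbs_assign(bs, prng)
        return idmap[tuple(permutation_canonical(bs.assignments()))]

    assert_discrete_dist_approx(sample_fn, posterior, kl_places=2)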
def test_multivariate_models_cxx():
    _test_multivariate_models(
        initialize, numpy_dataview, bind, gibbs_assign, rng())
def _test_convergence_bb_cxx(N,
                             D,
                             kernel,
                             preprocess_data_fn=None,
                             nonconj=False,
                             burnin_niters=10000,
                             skip=10,
                             ntries=50,
                             nsamples=1000,
                             kl_places=2):
    r = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D
    defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)
    Y, posterior = data_with_posterior(
        defn, cluster_hp, feature_hps, preprocess_data_fn)
    data = numpy_dataview(Y)
    s = initialize(nonconj_defn if nonconj else defn,
                   data,
                   cluster_hp=cluster_hp,
                   feature_hps=feature_hps,
                   r=r)
    bs = bind(s, data)
    wrapped_kernel = lambda s: kernel(s, r)
    _test_convergence(bs,
                      posterior,
                      wrapped_kernel,
                      burnin_niters,
                      skip,
                      ntries,
                      nsamples,
                      kl_places)
def test_explicit_exceptions():
    """ValueError should be raised for bad assignments
    """
    prng = rng()
    N, V = 3, 7
    defn = model_definition(N, V)
    data = [[0, 1, 2, 3], [0, 1, 4], [0, 1, 5, 6]]

    # We should get an error if we leave out a dish assignment for a
    # given table
    table_assignments = [[1, 2, 1, 2], [1, 1, 1], [3, 3, 3, 1]]
    dish_assignments = [[0, 1, 2], [0, 3], [0, 1, 2]]
    assert_raises(ValueError,
                  initialize,
                  defn,
                  data,
                  table_assignments=table_assignments,
                  dish_assignments=dish_assignments)

    # We should get an error if we leave out a table assignment for a
    # given word
    table_assignments = [[1, 2, 1, 2], [1, 1, 1], [3, 3, 3]]
    dish_assignments = [[0, 1, 2], [0, 3], [0, 1, 2, 1]]
    assert_raises(ValueError,
                  initialize,
                  defn,
                  data,
                  table_assignments=table_assignments,
                  dish_assignments=dish_assignments)
def test_dense_vs_sparse():
    # XXX: really belongs in irm test cases, but kernels has a nice
    # cluster enumeration iterator
    r = rng()
    n = 5
    raw = ma.array(
        np.random.choice(np.arange(20), size=(n, n)),
        mask=np.random.choice([False, True], size=(n, n)))
    dense = [relation_numpy_dataview(raw)]
    sparse = [sparse_relation_dataview(_tocsr(raw))]
    domains = [n]
    relations = [((0, 0), gp)]
    defn = irm_definition(domains, relations)

    def score_fn(data):
        def f(assignments):
            s = irm_initialize(defn, data, r=r,
                               domain_assignments=assignments)
            assign = sum(s.score_assignment(i)
                         for i in xrange(len(assignments)))
            likelihood = s.score_likelihood(r)
            return assign + likelihood
        return f

    product_assignments = tuple(map(list, map(permutation_iter, domains)))
    dense_posterior = scores_to_probs(
        np.array(map(score_fn(dense), it.product(*product_assignments))))
    sparse_posterior = scores_to_probs(
        np.array(map(score_fn(sparse), it.product(*product_assignments))))
    assert_1d_lists_almost_equals(dense_posterior, sparse_posterior,
                                  places=3)
def test_kernel_gibbs_hp():
    _test_kernel_gibbs_hp(initialize,
                          numpy_dataview,
                          bind,
                          gibbs_hp,
                          'grid_gibbs_hp_samples_pdf',
                          rng())
def test_simple():
    N, V = 10, 100
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = numpy_dataview(data)
    R = rng()
    s = initialize(defn, view, R)
    assert_equals(s.nentities(), len(data))
def test_simple():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng)
    assert_equals(s.nentities(), len(data))
def test_multi_dish_initialization():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng, initial_dishes=V)
    assert_true(s.ntopics() > 1)
def test_single_dish_initialization():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng, initial_dishes=1)
    assert_equals(s.ntopics(), 0)  # only the dummy topic
def test_state_pickle():
    defn = model_definition([5], [((0, 0), bb)])
    r = rng()
    relations = toy_dataset(defn)
    views = map(numpy_dataview, relations)
    s1 = model.initialize(defn, views, r)
    s2 = pickle.loads(pickle.dumps(s1))
    _assert_structure_equals(defn, s1, s2, views, r)
def test_convergence_simple():
    N, V = 2, 10
    defn = model_definition(N, V)
    data = [
        np.array([5, 6]),
        np.array([0, 1, 2]),
    ]
    view = numpy_dataview(data)
    prng = rng()

    scores = []
    idmap = {}
    for i, (tables, dishes) in enumerate(permutations([2, 3])):
        latent = model.initialize(defn,
                                  view,
                                  prng,
                                  table_assignments=tables,
                                  dish_assignments=dishes)
        scores.append(latent.score_assignment() + latent.score_data(prng))
        idmap[(tables, dishes)] = i
    true_dist = scores_to_probs(scores)

    def kernel(latent):
        # mutates latent in place
        doc_model = model.bind(latent, data=view)
        kernels.assign2(doc_model, prng)
        for did in xrange(latent.nentities()):
            table_model = model.bind(latent, document=did)
            kernels.assign(table_model, prng)

    latent = model.initialize(defn, view, prng)
    skip = 10

    def sample_fn():
        for _ in xrange(skip):
            kernel(latent)
        table_assignments = latent.table_assignments()
        canon_table_assignments = tuple(
            map(tuple, map(permutation_canonical, table_assignments)))
        dish_maps = latent.dish_assignments()
        dish_assignments = []
        for dm, (ta, ca) in zip(dish_maps,
                                zip(table_assignments,
                                    canon_table_assignments)):
            dish_assignment = []
            for t, c in zip(ta, ca):
                if c == len(dish_assignment):
                    dish_assignment.append(dm[t])
            dish_assignments.append(dish_assignment)
        canon_dish_assignments = tuple(
            map(tuple, map(permutation_canonical, dish_assignments)))
        return idmap[(canon_table_assignments, canon_dish_assignments)]

    assert_discrete_dist_approx(
        sample_fn, true_dist, ntries=100, nsamples=10000, kl_places=2)
def test_runner_simple():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    latent = model.initialize(defn, view, prng)
    r = runner.runner(defn, view, latent)
    r.run(prng, 1)
def test_zmatrix():
    N, D = 10, 4
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]
    zmat = query.zmatrix(latents)
    assert_equals(zmat.shape, (N, N))
def _test_convergence(domains,
                      data,
                      reg_relations,
                      brute_relations,
                      kernel,
                      burnin_niters=10000,
                      skip=10,
                      ntries=50,
                      nsamples=1000,
                      places=2):
    r = rng()
    reg_defn = irm_definition(domains, reg_relations)
    brute_defn = irm_definition(domains, brute_relations)

    def score_fn(assignments):
        s = irm_initialize(
            brute_defn, data, r=r, domain_assignments=assignments)
        assign = sum(s.score_assignment(i)
                     for i in xrange(len(assignments)))
        likelihood = s.score_likelihood(r)
        return assign + likelihood

    product_assignments = tuple(map(list, map(permutation_iter, domains)))
    posterior = scores_to_probs(
        np.array(map(score_fn, it.product(*product_assignments))))

    s = irm_initialize(reg_defn, data, r=r)
    bounded_states = [irm_bind(s, i, data) for i in xrange(len(domains))]

    # burnin
    start = time.time()
    last = start
    for i in xrange(burnin_niters):
        for bs in bounded_states:
            kernel(bs, r)
        if not ((i + 1) % 1000):
            print 'burnin finished iteration', (i + 1), \
                'in', (time.time() - last), 'seconds'
            last = time.time()
    print 'finished burnin of', burnin_niters, \
        'iters in', (time.time() - start), 'seconds'

    idmap = {C: i for i, C in enumerate(it.product(*product_assignments))}

    def sample_fn():
        for _ in xrange(skip):
            for bs in bounded_states:
                kernel(bs, r)
        key = tuple(tuple(permutation_canonical(bs.assignments()))
                    for bs in bounded_states)
        return idmap[key]

    assert_discrete_dist_approx(
        sample_fn, posterior,
        ntries=ntries, nsamples=nsamples, kl_places=places)
def test_runner_specify_basic_kernel():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    latent = model.initialize(defn, view, prng)
    r = runner.runner(defn, view, latent, ["crf"])
    r.run(prng, 1)
def test_serialize_simple():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng)
    m = s.serialize()
    s2 = deserialize(defn, m)
    assert s2.__class__ == s.__class__
def test_lda_zero_iter(self):
    # compare to a model with 0 iterations
    prng2 = rng(seed=54321)
    latent2 = model.initialize(self.defn, self.docs, prng2)
    assert latent2 is not None
    r2 = runner.runner(self.defn, self.docs, latent2)
    assert r2 is not None
    doc_topic2 = latent2.topic_distribution_by_document()
    assert doc_topic2 is not None
    assert latent2.perplexity() > self.latent.perplexity()
def test_operations():
    N = 10
    R = rng(12)

    def mkrow():
        return (np.random.choice([False, True]),
                np.random.choice([False, True]),
                np.random.random(),
                np.random.choice([False, True]))

    dtype = [('', bool), ('', bool), ('', float), ('', bool)]

    # non-masked data
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # *_initialize() randomly assigns all entities to a group, so we'll
    # have to unset this assignment for this test
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N

    cxx_s.dcheck_consistency()
    assert cxx_s.ngroups() == 3 and set(cxx_s.empty_groups()) == set([0, 1, 2])

    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    for i, yi in it.islice(enumerate(data), 2):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()

    newrow = mkrow()
    newdata = np.array([newrow], dtype=dtype)
    cxx_score = cxx_s.score_value(newdata[0], R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
def test_simple():
    domains = [5, 6]
    relations = [((0, 1), bb)]
    relsize = (domains[0], domains[1])
    raw_data = [
        ma.array(
            np.random.choice([False, True], size=relsize),
            mask=np.random.choice([False, True], size=relsize))
    ]

    def csr(raw):
        n, m = raw.shape

        def indices():
            for i, j in it.product(range(n), range(m)):
                if not raw.mask[i, j]:
                    yield i, j

        data = [raw[i, j] for i, j in indices()]
        i = list(map(op.itemgetter(0), indices()))
        j = list(map(op.itemgetter(1), indices()))
        return coo_matrix((data, (i, j)), shape=raw.shape).tocsr()

    defn = model_definition(domains, relations)
    data = map(numpy_dataview, raw_data)
    sparse_data = map(sparse_2d_dataview, map(csr, raw_data))
    r = rng()

    s = initialize(defn, data, r=r)
    assert s and bind(s, 0, data) and bind(s, 1, data)

    s1 = initialize(defn, sparse_data, r=r)
    assert s1 and bind(s1, 0, sparse_data) and bind(s1, 1, sparse_data)

    def entity_data_positions(domain, eid):
        def f(domains, reln):
            for pos0 in xrange(reln.shape[0]):
                for pos1 in xrange(reln.shape[1]):
                    if reln.mask[pos0, pos1]:
                        continue
                    if (domains[0] == domain and pos0 == eid) or \
                            (domains[1] == domain and pos1 == eid):
                        yield [pos0, pos1]
        return list(it.chain.from_iterable(
            f(domains, reln)
            for (domains, _), reln in zip(relations, raw_data)))

    def test(s):
        for did, nentities in enumerate(domains):
            for eid in xrange(nentities):
                a = entity_data_positions(did, eid)
                b = s.entity_data_positions(did, eid, data)
                assert sorted(a) == sorted(b)

    test(s)
    test(s1)
def test_sample_post_pred():
    N = 10
    R = rng(5483932)
    D = 4

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))

    dtype = [('', bool)] * D
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb] * D)
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [dist_bb.EXAMPLES[0]['shared']] * D,
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    G = 3
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, G, R)
    for i, yi in enumerate(data):
        egid = i % G
        cxx_s.add_value(egid, i, yi, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = tuple(randombool() for _ in xrange(D))
    y_new = ma.masked_array(
        np.array([y_new_data], dtype=dtype),
        mask=[y_new_mask])[0]

    n_samples = 1000
    cxx_samples = np.hstack(
        [cxx_s.sample_post_pred(y_new, R)[1] for _ in xrange(n_samples)])

    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        dist = np.zeros(len(idmap))
        for s in samples:
            dist[idmap[tuple(s)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
def bench(args, latent_fn):
    parser = argparse.ArgumentParser()
    parser.add_argument('--groups', type=int, action='append')
    parser.add_argument('--entities-per-group', type=int, action='append')
    parser.add_argument('--features', type=int, action='append')
    parser.add_argument('--target-runtime', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    args = parser.parse_args(args)
    print args

    if not args.groups:
        raise ValueError("need to specify >= 1 --groups")
    if len(args.groups) == 1:
        print "WARNING: one group will make very uninteresting graphs"
    for groups in args.groups:
        if groups <= 0:
            raise ValueError('need positive groups')

    if not args.entities_per_group:
        raise ValueError("need to specify >= 1 --entities-per-group")
    for entities_per_group in args.entities_per_group:
        if entities_per_group <= 0:
            raise ValueError('need positive entities_per_group')

    if not args.features:
        raise ValueError("need to specify >= 1 --features")
    for features in args.features:
        if features <= 0:
            raise ValueError('need positive features')

    if args.target_runtime <= 0:
        raise ValueError("--target-runtime needs to be > 0")

    vs = versions()
    vstr = 'c{}-m{}-k{}'.format(vs['common'], vs['mixturemodel'], vs['kernels'])
    print 'vstr:', vstr

    r = rng()
    target_runtime = args.target_runtime
    results = []
    grid = it.product(args.groups, args.entities_per_group, args.features)
    for groups, entities_per_group, features in grid:
        start = time.time()
        latent = latent_fn(groups, entities_per_group, features, r)
        results.append(measure(groups, target_runtime, latent, r))
        print 'finished ({}, {}, {}) in {} seconds'.format(
            groups, entities_per_group, features, time.time() - start)

    output = {
        'args': args.__dict__,
        'versions': vs,
        'cpuinfo': cpuinfo.get_cpu_info(),
        'results': results,
        'time': datetime.now().isoformat(),
    }
    with open(args.output, 'w') as fp:
        json.dump(output, fp)
def test_import_rng():
    from microscopes.common.rng import rng
    r = rng(12345)
    assert r
def test_betabin_equiv():
    # https://github.com/pymc-devs/pymc/blob/
    # a7ab153f2b58d81824a56166747c678d7f421bde/pymc/distributions/discrete.py#L84
    def betabin_like(value, alpha, beta, n):
        return (gammaln(alpha + beta)
                - gammaln(alpha)
                - gammaln(beta)
                + gammaln(n + 1)
                - gammaln(value + 1)
                - gammaln(n - value + 1)
                + gammaln(alpha + value)
                + gammaln(n + beta - value)
                - gammaln(beta + alpha + n))

    # this N refers to the number of trials in the binomial distribution
    N = 10

    # this refers to the dataset size
    M = 100

    # hyperparams of the beta dist
    alpha, beta = 1., 2.

    heads = np.random.randint(low=0, high=N + 1, size=M)
    tails = N - heads
    data = np.vstack((heads, tails)).T
    Y = np.array([(y,) for y in data], dtype=[('', np.int, (2,))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(2)])
    prior = {'alphas': [alpha, beta]}
    s = cxx_initialize(defn,
                       view,
                       r,
                       feature_hps=[prior],
                       assignment=[0] * Y.shape[0])
    assert_equals(s.groups(), [0])

    def all_indices(N):
        for i, j in it.product(range(0, N + 1), repeat=2):
            if (i + j) == N:
                yield i, j

    all_data = [(list(ij),) for ij in all_indices(N)]
    Y_test = np.array(all_data, dtype=[('', np.int, (2,))])

    # the actual score is simply a betabin using the updated alpha, beta
    alpha1, beta1 = np.array([alpha, beta]) + data.sum(axis=0)

    def model_score(Y_value):
        _, (score,) = s.score_value(Y_value, r)
        return score

    def test_score(Y_value):
        return betabin_like(Y_value[0][0], alpha1, beta1, N)

    model_scores = np.array(map(model_score, Y_test))
    test_scores = np.array(map(test_score, Y_test))

    assert_almost_equals(np.exp(model_scores).sum(), 1., places=2)
    assert_almost_equals(np.exp(test_scores).sum(), 1., places=2)
    assert_almost_equals(np.abs(model_scores - test_scores).max(), 0., places=1)
def test_stress_cxx():
    _test_stress(cxx_initialize, cxx_numpy_dataview, rng())
from multiprocessing import cpu_count

from microscopes.common.rng import rng
from microscopes.common.relation.dataview import numpy_dataview
from microscopes.models import bb as beta_bernoulli
from microscopes.irm.definition import model_definition
from microscopes.irm import model, runner, query
from microscopes.kernels import parallel
from microscopes.common.query import groups, zmatrix_heuristic_block_ordering, zmatrix_reorder

# ##Let's start by defining the model and loading the data

# In[6]:

defn = model_definition([N], [((0, 0), beta_bernoulli)])
views = [numpy_dataview(communications_relation)]
prng = rng()

# ##Next, let's initialize the model and define the runners.
#
# ##These runners are our MCMC chains. We'll use `cpu_count` to define our
# number of chains.

# In[ ]:

nchains = cpu_count()
latents = [
    model.initialize(defn, views, r=prng, cluster_hps=[{'alpha': 1e-3}])
    for _ in xrange(nchains)
]
kc = runner.default_assign_kernel_config(defn)
runners = [runner.runner(defn, views, latent, kc) for latent in latents]
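# ##With the chains defined, a typical next step is to aggregate them and
# run inference. This is a minimal sketch following the same pattern used
# elsewhere in this codebase: wrap the runners in a parallel runner, burn
# in (the `niters` value here is an illustrative choice, not prescribed),
# then read the inferred cluster assignments back out.

# In[ ]:

import itertools as it

r = parallel.runner(runners)
r.run(r=prng, niters=1000)  # burnin

infers = r.get_latents()
clusters = groups(infers[0].assignments(0), sort=True)
ordering = list(it.chain.from_iterable(clusters))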
def infinite_relational_model(corr_matrix, lag_matrix, threshold,
                              sampled_coords, window_size):
    import numpy as np
    import json
    import time
    import itertools as it
    from multiprocessing import cpu_count

    from microscopes.common.rng import rng
    from microscopes.common.relation.dataview import numpy_dataview
    from microscopes.models import bb as beta_bernoulli
    from microscopes.irm.definition import model_definition
    from microscopes.irm import model, runner, query
    from microscopes.kernels import parallel
    from microscopes.common.query import groups, zmatrix_heuristic_block_ordering, zmatrix_reorder

    # threshold the correlation matrix into a boolean adjacency graph
    graph = []
    for row in corr_matrix:
        graph.append([corr >= threshold for corr in row])
    graph = np.array(graph, dtype=np.bool)
    graph_size = len(graph)

    # conduct the Infinite Relational Model
    defn = model_definition([graph_size], [((0, 0), beta_bernoulli)])
    views = [numpy_dataview(graph)]
    prng = rng()

    nchains = cpu_count()
    latents = [
        model.initialize(defn, views, r=prng, cluster_hps=[{'alpha': 1e-3}])
        for _ in xrange(nchains)
    ]
    kc = runner.default_assign_kernel_config(defn)
    runners = [runner.runner(defn, views, latent, kc) for latent in latents]
    r = parallel.runner(runners)

    start = time.time()
    # r.run(r=prng, niters=1000)
    # r.run(r=prng, niters=100)
    r.run(r=prng, niters=20)
    print "inference took", time.time() - start, "seconds"

    infers = r.get_latents()
    clusters = groups(infers[0].assignments(0), sort=True)
    ordering = list(it.chain.from_iterable(clusters))

    z = graph.copy()
    z = z[ordering]
    z = z[:, ordering]

    corr_matrix = corr_matrix[ordering]
    corr_matrix = corr_matrix[:, ordering]
    lag_matrix = lag_matrix[ordering]
    lag_matrix = lag_matrix[:, ordering]

    cluster_sampled_coords = np.array(sampled_coords)
    cluster_sampled_coords = cluster_sampled_coords[ordering]

    response_msg = {
        'corrMatrix': corr_matrix.tolist(),
        'lagMatrix': lag_matrix.tolist(),
        'clusterMatrix': z.tolist(),
        'clusterSampledCoords': cluster_sampled_coords.tolist(),
        'nClusterList': [len(cluster) for cluster in clusters],
        'ordering': ordering,
    }
    with open("./expdata/clustermatrix-" + str(window_size) + ".json", "w") as f:
        json.dump(response_msg, f)
    return response_msg
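# An example invocation of infinite_relational_model with synthetic inputs
# (hypothetical data, shown only to illustrate the expected shapes):
# corr_matrix and lag_matrix are square numpy arrays over the sampled
# coordinates, and threshold binarizes the correlations into the relation
# that the IRM clusters.
import os

import numpy as np

n = 30
corr = np.corrcoef(np.random.random(size=(n, 100)))  # n x n correlations
lag = np.random.random(size=(n, n))
coords = [(i, i) for i in xrange(n)]

if not os.path.isdir("./expdata"):
    os.makedirs("./expdata")  # the function writes its JSON output here

result = infinite_relational_model(corr, lag, 0.5, coords, window_size=10)
print result['nClusterList']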
def test_mnist():
    import matplotlib.pylab as plt
    from PIL import Image, ImageOps

    mnist_dataset = _get_mnist_dataset()
    Y_2 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 2.)[0]]
    Y_3 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 3.)[0]]
    print 'number of twos:', Y_2.shape[0]
    print 'number of threes:', Y_3.shape[0]
    _, D = Y_2.shape
    W = int(math.sqrt(D))
    assert W * W == D

    dtype = [('', bool)] * D
    Y = np.vstack([Y_2, Y_3])
    Y = np.array(
        [tuple(y) for y in Y[np.random.permutation(np.arange(Y.shape[0]))]],
        dtype=dtype)
    view = numpy_dataview(Y)
    defn = model_definition(Y.shape[0], [bb] * D)
    r = rng()
    s = initialize(defn,
                   view,
                   cluster_hp={'alpha': 0.2},
                   feature_hps=[{'alpha': 1., 'beta': 1.}] * D,
                   r=r)
    bound_s = bind(s, view)

    indiv_prior_fn = log_exponential(1.2)
    hparams = {
        i: {
            'alpha': (indiv_prior_fn, 1.5),
            'beta': (indiv_prior_fn, 1.5),
        }
        for i in xrange(D)
    }

    def plot_clusters(s, fname, scalebysize=False):
        hps = [s.get_feature_hp(i) for i in xrange(D)]

        def prior_prob(hp):
            return hp['alpha'] / (hp['alpha'] + hp['beta'])

        def data_for_group(gid):
            suffstats = [s.get_suffstats(gid, i) for i in xrange(D)]

            def prob(hp, ss):
                top = hp['alpha'] + ss['heads']
                bot = top + hp['beta'] + ss['tails']
                return top / bot

            probs = [prob(hp, ss) for hp, ss in zip(hps, suffstats)]
            return np.array(probs)

        def scale(d, weight):
            im = d.reshape((W, W))
            newW = max(int(weight * W), 1)
            im = Image.fromarray(im)
            im = im.resize((newW, newW))
            im = ImageOps.expand(im, border=(W - newW) / 2)
            im = np.array(im)
            a, b = im.shape
            if a < W:
                im = np.append(im, np.zeros(b)[np.newaxis, :], axis=0)
            elif a > W:
                im = im[:W, :]
            assert im.shape[0] == W
            if b < W:
                im = np.append(im, np.zeros(W)[:, np.newaxis], axis=1)
            elif b > W:
                im = im[:, :W]
            assert im.shape[1] == W
            return im.flatten()

        data = [(data_for_group(g), cnt) for g, cnt in groupsbysize(s)]
        largest = max(cnt for _, cnt in data)
        data = [scale(d, cnt / float(largest)) if scalebysize else d
                for d, cnt in data]
        digits_per_row = 12
        rem = len(data) % digits_per_row
        if rem:
            fill = digits_per_row - rem
            for _ in xrange(fill):
                data.append(np.zeros(D))
        assert not (len(data) % digits_per_row)
        data = np.vstack([
            np.hstack([d.reshape((W, W)) for d in data[i:i + digits_per_row]])
            for i in xrange(0, len(data), digits_per_row)
        ])
        plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest')
        plt.savefig(fname)
        plt.close()

    def plot_hyperparams(s, fname):
        hps = [s.get_feature_hp(i) for i in xrange(D)]
        alphas = np.array([hp['alpha'] for hp in hps])
        betas = np.array([hp['beta'] for hp in hps])
        data = np.hstack([alphas.reshape((W, W)), betas.reshape((W, W))])
        plt.imshow(data, interpolation='nearest')
        plt.colorbar()
        plt.savefig(fname)
        plt.close()

    def kernel(rid):
        start0 = time.time()
        assign(bound_s, r)
        sec0 = time.time() - start0
        start1 = time.time()
        hp(bound_s, r, hparams=hparams)
        sec1 = time.time() - start1
        print 'rid=', rid, 'nclusters=', s.ngroups(), \
            'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec'
        sec_per_post_pred = sec0 / (float(view.size()) * float(s.ngroups()))
        print '  time_per_post_pred=', sec_per_post_pred, 'sec'
        return s.score_joint(r)

    # burnin
    burnin = 20
    for rid in xrange(burnin):
        print 'score:', kernel(rid)
    print 'finished burnin'

    plot_clusters(s, 'mnist_clusters.pdf')
    plot_clusters(s, 'mnist_clusters_bysize.pdf', scalebysize=True)
    plot_hyperparams(s, 'mnist_hyperparams.pdf')
    print 'groupcounts:', groupcounts(s)

    # posterior predictions
    present = D / 2
    absent = D - present
    queries = [tuple(Y_2[i]) for i in np.random.permutation(Y_2.shape[0])[:4]] + \
        [tuple(Y_3[i]) for i in np.random.permutation(Y_3.shape[0])[:4]]
    queries_masked = ma.masked_array(
        np.array(queries, dtype=[('', bool)] * D),
        mask=[(False,) * present + (True,) * absent])

    def postpred_sample(y_new):
        Y_samples = [s.sample_post_pred(y_new, r)[1] for _ in xrange(1000)]
        Y_samples = np.array([list(y) for y in np.hstack(Y_samples)])
        Y_avg = Y_samples.mean(axis=0)
        return Y_avg

    queries_masked = [postpred_sample(y) for y in queries_masked]
    data0 = np.hstack([q.reshape((W, W)) for q in queries_masked])
    data1 = np.hstack([
        np.clip(np.array(q, dtype=np.float), 0., 1.).reshape((W, W))
        for q in queries
    ])
    data = np.vstack([data0, data1])
    plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest')
    plt.savefig('mnist_predict.pdf')
    plt.close()
def test_gauss_cxx():
    import time
    _test_gauss(cxx_slice_sample, rng(int(time.time())))
def test_mnist_supervised():
    mnist_dataset = _get_mnist_dataset()
    classes = range(10)
    classmap = {c: i for i, c in enumerate(classes)}
    train_data, test_data = [], []
    for c in classes:
        Y = mnist_dataset['data'][np.where(
            mnist_dataset['target'] == float(c))[0]]
        Y_train, Y_test = train_test_split(Y, test_size=0.01)
        train_data.append(Y_train)
        test_data.append(Y_test)

    sample_size_max = 10000

    def mk_class_data(c, Y):
        n, D = Y.shape
        print 'number of digit', c, 'in training is', n
        dtype = [('', bool)] * D + [('', int)]
        inds = np.random.permutation(Y.shape[0])[:sample_size_max]
        Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]],
                     dtype=dtype)
        return Y

    Y_train = np.hstack(
        [mk_class_data(c, y) for c, y in zip(classes, train_data)])
    Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))]
    n, = Y_train.shape
    D = len(Y_train.dtype)
    print 'training data is', n, 'examples'
    print 'image dimension is', (D - 1), 'pixels'

    view = numpy_dataview(Y_train)
    defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))])
    r = rng()
    s = initialize(defn,
                   view,
                   cluster_hp={'alpha': 0.2},
                   feature_hps=[{'alpha': 1., 'beta': 1.}] * (D - 1)
                   + [{'alphas': [1. for _ in classes]}],
                   r=r)
    bound_s = bind(s, view)

    indiv_prior_fn = log_exponential(1.2)
    hparams = {
        i: {
            'alpha': (indiv_prior_fn, 1.5),
            'beta': (indiv_prior_fn, 1.5),
        }
        for i in xrange(D - 1)
    }
    hparams[D - 1] = {
        'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5)
        for idx in xrange(len(classes))
    }

    def print_prediction_results():
        results = []
        for c, Y_test in zip(classes, test_data):
            for y in Y_test:
                query = ma.masked_array(
                    np.array([tuple(y) + (0,)],
                             dtype=[('', bool)] * (D - 1) + [('', int)]),
                    mask=[(False,) * (D - 1) + (True,)])[0]
                samples = [s.sample_post_pred(query, r)[1][0][-1]
                           for _ in xrange(30)]
                samples = np.bincount(samples, minlength=len(classes))
                prediction = np.argmax(samples)
                results.append((classmap[c], prediction, samples))
            print 'finished predictions for class', c

        Y_actual = np.array([a for a, _, _ in results], dtype=np.int)
        Y_pred = np.array([b for _, b, _ in results], dtype=np.int)
        print 'accuracy:', accuracy_score(Y_actual, Y_pred)
        print 'confusion matrix:'
        print confusion_matrix(Y_actual, Y_pred)

        # AUROC for one vs all (each class)
        for i, clabel in enumerate(classes):
            Y_true = np.copy(Y_actual)
            # treat class c as the "positive" example
            positive_examples = Y_actual == i
            negative_examples = Y_actual != i
            Y_true[positive_examples] = 1
            Y_true[negative_examples] = 0
            Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results])
            cls_auc = roc_auc_score(Y_true, Y_prob)
            print 'class', clabel, 'auc=', cls_auc

        #import matplotlib.pylab as plt
        #Y_prob = np.array([c for _, _, c in results])
        #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0)
        #plt.plot(fpr, tpr)
        #plt.show()

    def kernel(rid):
        start0 = time.time()
        assign(bound_s, r)
        sec0 = time.time() - start0
        start1 = time.time()
        hp(bound_s, r, hparams=hparams)
        sec1 = time.time() - start1
        print 'rid=', rid, 'nclusters=', s.ngroups(), \
            'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec'
        sec_per_post_pred = sec0 / (float(view.size()) * float(s.ngroups()))
        print '  time_per_post_pred=', sec_per_post_pred, 'sec'

        # print group size breakdown
        sizes = [(gid, s.groupsize(gid)) for gid in s.groups()]
        sizes = sorted(sizes, key=lambda x: x[1], reverse=True)
        print '  group_sizes=', sizes

        print_prediction_results()

        # save state
        mkdirp("mnist-states")
        fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid))
        with open(fname, "w") as fp:
            fp.write(s.serialize())

    # training
    iters = 30
    for rid in xrange(iters):
        kernel(rid)
def test_compare_to_mixture_model():
    r = rng()
    N, D = 4, 5
    Y = np.random.uniform(size=(N, D)) > 0.8
    Y_rec = np.array([tuple(y) for y in Y], dtype=[('', bool)] * D)

    mm_view = rec_numpy_dataview(Y_rec)
    irm_view = relation_numpy_dataview(Y)

    mm_def = mm_definition(N, [bb] * D)
    irm_def = irm_definition([N, D], [((0, 1), bb)])

    perms = list(permutation_iter(N))
    assignment = perms[np.random.randint(0, len(perms))]

    mm_s = mm_initialize(mm_def, mm_view, r=r, assignment=assignment)
    irm_s = irm_initialize(irm_def,
                           [irm_view],
                           r=r,
                           domain_assignments=[
                               assignment,
                               range(D),
                           ])

    def assert_suff_stats_equal():
        assert set(mm_s.groups()) == set(irm_s.groups(0))
        assert irm_s.groups(1) == range(D)
        groups = mm_s.groups()
        for g in groups:
            for i in xrange(D):
                a = mm_s.get_suffstats(g, i)
                b = irm_s.get_suffstats(0, [g, i])
                if b is None:
                    b = {'heads': 0L, 'tails': 0L}
                assert a['heads'] == b['heads'] and a['tails'] == b['tails']

    assert_suff_stats_equal()
    assert_almost_equals(mm_s.score_assignment(),
                         irm_s.score_assignment(0),
                         places=3)

    bound_mm_s = mm_bind(mm_s, mm_view)
    bound_irm_s = irm_bind(irm_s, 0, [irm_view])

    # XXX: doesn't really have to be true, just is true of impl
    assert not bound_mm_s.empty_groups()
    assert not bound_irm_s.empty_groups()

    bound_mm_s.create_group(r)
    bound_irm_s.create_group(r)

    gid_a = bound_mm_s.remove_value(0, r)
    gid_b = bound_irm_s.remove_value(0, r)
    assert gid_a == gid_b
    assert_suff_stats_equal()

    x0, y0 = bound_mm_s.score_value(0, r)
    x1, y1 = bound_irm_s.score_value(0, r)
    assert x0 == x1  # XXX: not really a requirement

    # XXX: should really normalize and then check
    for a, b in zip(y0, y1):
        assert_almost_equals(a, b, places=2)
def test_crp():
    for alpha in (0.1, 1.0, 10.0):
        _test_crp(initialize, numpy_dataview, alpha=alpha, r=rng())