def _test_convergence_bb_cxx(N,
                             D,
                             kernel,
                             preprocess_data_fn=None,
                             nonconj=False,
                             burnin_niters=10000,
                             skip=10,
                             ntries=50,
                             nsamples=1000,
                             kl_places=2):
    """Run _test_convergence() for `kernel` on an N-entity, D-feature model.

    The data and the reference posterior come from data_with_posterior()
    using the `bb` definition.  When `nonconj` is true the state itself is
    initialized from the `bbnc` definition instead.
    """
    prng = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D

    conj_defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)

    # the posterior is always computed against the `bb` definition
    Y, posterior = data_with_posterior(
        conj_defn, cluster_hp, feature_hps, preprocess_data_fn)
    view = numpy_dataview(Y)

    if nonconj:
        init_defn = nonconj_defn
    else:
        init_defn = conj_defn
    state = initialize(
        init_defn,
        view,
        cluster_hp=cluster_hp,
        feature_hps=feature_hps,
        r=prng)
    bound = bind(state, view)

    def wrapped_kernel(s):
        # adapt kernel(state, rng) to the one-argument form handed on below
        return kernel(s, prng)

    _test_convergence(
        bound, posterior, wrapped_kernel,
        burnin_niters, skip, ntries, nsamples, kl_places)
def _test_convergence_bb_cxx(N,
                             D,
                             kernel,
                             preprocess_data_fn=None,
                             nonconj=False,
                             burnin_niters=10000,
                             skip=10,
                             ntries=50,
                             nsamples=1000,
                             kl_places=2):
    """Run _test_convergence() for `kernel` on an N-entity, D-feature model.

    The data and the reference posterior come from data_with_posterior()
    using the `bb` definition.  When `nonconj` is true the state itself is
    initialized from the `bbnc` definition instead.
    """
    prng = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D

    conj_defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)

    # the posterior is always computed against the `bb` definition
    Y, posterior = data_with_posterior(
        conj_defn, cluster_hp, feature_hps, preprocess_data_fn)
    view = numpy_dataview(Y)

    if nonconj:
        init_defn = nonconj_defn
    else:
        init_defn = conj_defn
    state = initialize(
        init_defn,
        view,
        cluster_hp=cluster_hp,
        feature_hps=feature_hps,
        r=prng)
    bound = bind(state, view)

    def wrapped_kernel(s):
        # adapt kernel(state, rng) to the one-argument form handed on below
        return kernel(s, prng)

    _test_convergence(
        bound, posterior, wrapped_kernel,
        burnin_niters, skip, ntries, nsamples, kl_places)
def test_get_set_params():
    """Round-trip cluster and feature hyperparameters through set/get."""
    defn = model_definition(1, [bb, bnb, gp, nich])
    row_dtype = [('', bool), ('', int), ('', int), ('', float)]
    single_row = np.array([(True, 3, 5, 10.)], dtype=row_dtype)
    state = initialize(defn=defn, data=numpy_dataview(single_row), r=rng())

    state.set_cluster_hp({'alpha': 3.0})
    assert_dict_almost_equals(state.get_cluster_hp(), {'alpha': 3.0})

    # one hyperparameter dict per feature, in the order given to defn
    bb_hp = {'alpha': 1.2, 'beta': 4.3}
    bnb_hp = {'alpha': 1., 'beta': 1., 'r': 1}
    gp_hp = {'alpha': 1., 'inv_beta': 1.}
    nich_hp = {'mu': 30., 'kappa': 1., 'sigmasq': 1., 'nu': 1.}

    for fid, expected in enumerate([bb_hp, bnb_hp, gp_hp, nich_hp]):
        state.set_feature_hp(fid, expected)
        assert_dict_almost_equals(state.get_feature_hp(fid), expected)
def _make_one_feature_bb_mm(initialize_fn, dataview, Nk, K, alpha, beta, r):
    """Build a one-feature `bb` state from K sampled clusters of Nk rows each.

    Returns the (state, view) pair; the state is initialized with the
    assignment produced by data_with_assignment().
    """
    # XXX: the rng parameter passed does not get threaded through the
    # random *data* generation

    # use the py_bb for sampling
    py_bb = bb.py_desc()._model_module
    shared = py_bb.Shared()
    shared.load({'alpha': alpha, 'beta': beta})

    # all K samplers are created before any data is drawn
    samplers = []
    for _ in xrange(K):
        sampler = py_bb.Sampler()
        sampler.init(shared)
        samplers.append(sampler)

    Y_clustered = tuple(
        np.array([(sampler.eval(shared),) for _ in xrange(Nk)],
                 dtype=[('', bool)])
        for sampler in samplers)

    Y, assignment = data_with_assignment(Y_clustered)
    view = dataview(Y)
    defn = model_definition(Y.shape[0], [bb])
    state = initialize_fn(
        defn,
        view,
        cluster_hp={'alpha': 2.},
        feature_hps=[{'alpha': alpha, 'beta': beta}],
        r=r,
        assignment=assignment)
    return state, view
def test_runner_multiprocessing_convergence():
    """Check that parallel 'assign' chains sample the reference posterior."""
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)

    # one chain per CPU reported by multiprocessing
    nchains = mp.cpu_count()
    latents = [model.initialize(defn, view, prng) for _ in xrange(nchains)]
    runners = [
        runner.runner(defn, view, latent, ['assign']) for latent in latents
    ]
    r = parallel.runner(runners)
    r.run(r=prng, niters=1000)  # burnin

    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_stream():
        # advance every chain by 10 iterations, then hand out one sample per
        # chain before advancing again
        while True:
            r.run(r=prng, niters=10)
            for latent in r.get_latents():
                canonical = permutation_canonical(latent.assignments())
                yield idmap[tuple(canonical)]

    stream = sample_stream()

    def sample_fn():
        return next(stream)

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100, kl_places=2)
def test_posterior_predictive_statistic():
    """Check result shape/dtype of query.posterior_predictive_statistic().

    Exercises the default merge, a single merge name, and a per-feature merge
    list, for both a one-row and a three-row masked query.
    """
    N, D = 10, 4  # D needs to be even
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]

    # explicit floor division: tuple repetition needs an int, and `/` only
    # yields one under Python 2 semantics
    half = D // 2
    row_dtype = [('', bool)] * D
    row_mask = (False,) * half + (True,) * half

    def make_query(nrows):
        # second half of every row is masked
        return ma.masked_array(
            np.array([(False,) * D] * nrows, dtype=row_dtype),
            mask=[row_mask] * nrows)

    def check(statistic, nrows):
        assert_equals(statistic.shape, (nrows,))
        assert_equals(len(statistic.dtype), D)

    q = make_query(1)
    check(query.posterior_predictive_statistic(q, latents, prng), 1)
    check(
        query.posterior_predictive_statistic(q, latents, prng, merge='mode'),
        1)
    # one merge name per feature; derived from D instead of spelled out
    per_feature_merge = ['mode'] * half + ['avg'] * half
    check(
        query.posterior_predictive_statistic(
            q, latents, prng, merge=per_feature_merge),
        1)

    q = make_query(3)
    check(query.posterior_predictive_statistic(q, latents, prng), 3)
def test_slice_theta_mm():
    """Compare samples of suffstat 'p' under theta() with a Beta reference.

    All N entities are placed in group 0, and the reference distribution is
    beta(prior alpha + heads, prior beta + tails).
    """
    N = 100
    flips = [(np.random.random() < 0.8,) for _ in xrange(N)]
    data = np.array(flips, dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    prng = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}
    view = numpy_dataview(data)
    state = initialize(
        defn,
        view,
        cluster_hp={'alpha': 1., 'beta': 9.},
        feature_hps=[prior],
        r=prng,
        assignment=[0] * N)

    heads = sum(1 for y in data if y[0])
    tails = N - heads
    rv = beta(prior['alpha'] + heads, prior['beta'] + tails)

    bound = bind(state, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        theta(bound, prng, tparams=params)
        suffstats = state.get_suffstats(0, 0)
        return suffstats['p']

    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
def _test_stress(initialize_fn, dataview, R):
    """Apply 100 random group/value mutations, checking consistency each time."""
    N = 20
    D = 2
    raw = np.random.random(size=(N, D)) < 0.8
    Y = np.array([tuple(row) for row in raw], dtype=[('', bool)] * D)
    view = dataview(Y)
    defn = model_definition(N, [bb] * D)
    s = initialize_fn(defn, view, cluster_hp={'alpha': 2.0}, r=R)

    CHANGE_GROUP = 1
    CHANGE_VALUE = 2

    def change_group():
        # delete one empty group when more than one exists; otherwise add a
        # new group
        egroups = s.empty_groups()
        if len(egroups) > 1:
            s.delete_group(egroups[0])
        else:
            s.create_group(R)

    def change_value():
        eid = np.random.randint(N)
        if s.assignments()[eid] != -1:
            s.remove_value(eid, Y[eid], R)
            return
        # unassigned entity: add to random group
        egid = np.random.choice(s.groups())
        s.add_value(egid, eid, Y[eid], R)

    for _ in xrange(100):
        assert len(s.groups()) >= 1
        choice = np.random.choice([CHANGE_GROUP, CHANGE_VALUE])
        if choice == CHANGE_GROUP:
            change_group()
        else:
            change_value()
        s.dcheck_consistency()
def test_crp_empirical():
    """Compare _sample_crp() draws with probabilities from score_assignment()."""
    N = 4
    alpha = 2.5
    defn = model_definition(N, [bb])
    Y = np.array([(True,)] * N, dtype=[('', bool)])
    view = numpy_dataview(Y)
    prng = rng()

    def crp_score(assignment):
        state = initialize(
            defn,
            view,
            r=prng,
            cluster_hp={'alpha': alpha},
            assignment=assignment)
        return state.score_assignment()

    scores = np.array([crp_score(perm) for perm in permutation_iter(N)])
    dist = scores_to_probs(scores)

    # index of each assignment, in the same order the scores were computed
    idmap = {}
    for idx, perm in enumerate(permutation_iter(N)):
        idmap[perm] = idx

    def sample_fn():
        drawn = _sample_crp(N, alpha)
        return idmap[tuple(permutation_canonical(drawn))]

    assert_discrete_dist_approx(sample_fn, dist, ntries=100)
def test_slice_theta_mm():
    """Compare samples of suffstat 'p' under theta() with a Beta reference.

    All N entities are placed in group 0, and the reference distribution is
    beta(prior alpha + heads, prior beta + tails).
    """
    N = 100
    flips = [(np.random.random() < 0.8,) for _ in xrange(N)]
    data = np.array(flips, dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    prng = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}
    view = numpy_dataview(data)
    state = initialize(
        defn,
        view,
        cluster_hp={'alpha': 1., 'beta': 9.},
        feature_hps=[prior],
        r=prng,
        assignment=[0] * N)

    heads = sum(1 for y in data if y[0])
    tails = N - heads
    rv = beta(prior['alpha'] + heads, prior['beta'] + tails)

    bound = bind(state, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        theta(bound, prng, tparams=params)
        suffstats = state.get_suffstats(0, 0)
        return suffstats['p']

    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
def _make_one_feature_bb_mm(initialize_fn, dataview, Nk, K, alpha, beta, r):
    """Build a one-feature `bb` state from K sampled clusters of Nk rows each.

    Returns the (state, view) pair; the state is initialized with the
    assignment produced by data_with_assignment().
    """
    # XXX: the rng parameter passed does not get threaded through the
    # random *data* generation

    # use the py_bb for sampling
    py_bb = bb.py_desc()._model_module
    shared = py_bb.Shared()
    shared.load({'alpha': alpha, 'beta': beta})

    # all K samplers are created before any data is drawn
    samplers = []
    for _ in xrange(K):
        sampler = py_bb.Sampler()
        sampler.init(shared)
        samplers.append(sampler)

    Y_clustered = tuple(
        np.array([(sampler.eval(shared),) for _ in xrange(Nk)],
                 dtype=[('', bool)])
        for sampler in samplers)

    Y, assignment = data_with_assignment(Y_clustered)
    view = dataview(Y)
    defn = model_definition(Y.shape[0], [bb])
    state = initialize_fn(
        defn,
        view,
        cluster_hp={'alpha': 2.},
        feature_hps=[{'alpha': alpha, 'beta': beta}],
        r=r,
        assignment=assignment)
    return state, view
def test_operations():
    """Exercise add/remove/score on a C++ state built from unmasked data."""
    N = 10
    R = rng(12)

    def coin():
        return np.random.choice([False, True])

    def mkrow():
        # draw order: bool, bool, float, bool
        first = coin()
        second = coin()
        real = np.random.random()
        last = coin()
        return (first, second, real, last)

    dtype = [('', bool), ('', bool), ('', float), ('', bool)]

    # non-masked data
    data = np.array([mkrow() for _ in xrange(N)], dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    bb_shared = dist_bb.EXAMPLES[0]['shared']
    nich_shared = dist_nich.EXAMPLES[0]['shared']
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 2.0},
        feature_hps=[bb_shared, bb_shared, nich_shared, bb_shared],
        r=R)

    # *_initialize() randomly assigns all entities to a group, so we'll have to
    # unset this assignment for this test
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N
    cxx_s.dcheck_consistency()
    assert cxx_s.ngroups() == 3
    assert set(cxx_s.empty_groups()) == {0, 1, 2}

    # alternate entities between groups 0 and 1
    for eid, row in enumerate(data):
        cxx_s.add_value(eid % 2, eid, row, R)
        cxx_s.dcheck_consistency()

    # take the first two entities back out
    for eid, row in enumerate(data[:2]):
        cxx_s.remove_value(eid, row, R)
        cxx_s.dcheck_consistency()

    newdata = np.array([mkrow()], dtype=dtype)
    cxx_score = cxx_s.score_value(newdata[0], R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
def test_model_definition_pickle():
    """A pickled model definition keeps its size and its model names."""
    defn = model_definition(10, [bb, niw(3)])
    restored = pickle.loads(pickle.dumps(defn))

    assert_equals(defn.n(), restored.n())
    assert_equals(len(defn.models()), len(restored.models()))

    pairs = zip(defn.models(), restored.models())
    for original_model, restored_model in pairs:
        assert_equals(original_model.name(), restored_model.name())
def test_model_definition_copy():
    """Shallow and deep copies are new objects that report the same n()."""
    defn = model_definition(10, [bb, niw(3)])
    shallow = copy.copy(defn)
    deep = copy.deepcopy(defn)

    # identity: both copies are distinct objects, and the deep copy also has
    # its own _models
    assert_is_not(defn, shallow)
    assert_is_not(defn, deep)
    assert_is_not(defn._models, deep._models)

    # value: n() is unchanged by copying
    for duplicate in (shallow, deep):
        assert_equals(defn.n(), duplicate.n())
def test_sample_sanity():
    """Sanity-check the shape of what sample() returns."""
    # just a sanity check
    defn = model_definition(10, [bb, bnb, gp, nich, dd(5), niw(4)])
    clusters, samplers = sample(defn)
    assert_equals(len(clusters), len(samplers))

    for members in clusters:
        # no cluster is empty
        assert_true(len(members) > 0)
        for value in members:
            # every value has one entry per model
            assert_equals(len(value), len(defn.models()))
def test_zmatrix():
    """query.zmatrix() over 10 latents returns an N x N array."""
    N, D = 10, 4
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)

    latents = []
    for _ in xrange(10):
        latents.append(model.initialize(defn, view, prng))

    zmat = query.zmatrix(latents)
    expected_shape = (N, N)
    assert_equals(zmat.shape, expected_shape)
def test_sample_post_pred():
    """Draw posterior-predictive samples for a masked row from a C++ state."""
    N = 10
    R = rng(5483932)
    D = 4

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))

    dtype = [('', bool)] * D
    data = np.array([mkrow() for _ in xrange(N)], dtype=dtype)

    defn = model_definition(N, [bb] * D)
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 2.0},
        feature_hps=[dist_bb.EXAMPLES[0]['shared']] * D,
        r=R)

    G = 3
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, G, R)

    # spread the entities round-robin over the G groups
    for eid, row in enumerate(data):
        cxx_s.add_value(eid % G, eid, row, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = mkrow()
    y_new = ma.masked_array(
        np.array([y_new_data], dtype=dtype), mask=[y_new_mask])[0]

    n_samples = 1000
    draws = []
    for _ in xrange(n_samples):
        draws.append(cxx_s.sample_post_pred(y_new, R)[1])
    cxx_samples = np.hstack(draws)

    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        # normalized histogram over all 2**D boolean rows
        dist = np.zeros(len(idmap))
        for sample in samples:
            dist[idmap[tuple(sample)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
def _test_cluster_hp_inference(initialize_fn, prior_fn, grid_min, grid_max, grid_n, dataview, bind_fn, init_inf_kernel_state_fn, inf_kernel_fn, map_actual_postprocess_fn, prng, burnin=1000, nsamples=1000, skip=10, trials=100, places=2): print '_test_cluster_hp_inference: burnin', burnin, 'nsamples', nsamples, \ 'skip', skip, 'trials', trials, 'places', places N = 1000 D = 5 # create random binary data, doesn't really matter what the values are Y = np.random.random(size=(N, D)) < 0.5 Y = np.array([tuple(y) for y in Y], dtype=[('', np.bool)] * D) view = dataview(Y) defn = model_definition(N, [bb] * D) latent = initialize_fn(defn, view, r=prng) model = bind_fn(latent, view) def score_alpha(alpha): prev_alpha = latent.get_cluster_hp()['alpha'] latent.set_cluster_hp({'alpha': alpha}) score = prior_fn(alpha) + latent.score_assignment() latent.set_cluster_hp({'alpha': prev_alpha}) return score def sample_fn(): for _ in xrange(skip - 1): inf_kernel_fn(model, opaque, prng) inf_kernel_fn(model, opaque, prng) return latent.get_cluster_hp()['alpha'] alpha0 = np.random.uniform(grid_min, grid_max) print 'start alpha:', alpha0 latent.set_cluster_hp({'alpha': alpha0}) opaque = init_inf_kernel_state_fn(latent) for _ in xrange(burnin): inf_kernel_fn(model, opaque, prng) print 'finished burnin of', burnin, 'iterations' print 'grid_min', grid_min, 'grid_max', grid_max assert_1d_cont_dist_approx_emp(sample_fn, score_alpha, grid_min, grid_max, grid_n, trials, nsamples, places)
def test_operations():
    """Exercise add/remove/score on a C++ state built from unmasked data."""
    N = 10
    R = rng(12)

    def coin():
        return np.random.choice([False, True])

    def mkrow():
        # draw order: bool, bool, float, bool
        first = coin()
        second = coin()
        real = np.random.random()
        last = coin()
        return (first, second, real, last)

    dtype = [('', bool), ('', bool), ('', float), ('', bool)]

    # non-masked data
    data = np.array([mkrow() for _ in xrange(N)], dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    bb_shared = dist_bb.EXAMPLES[0]['shared']
    nich_shared = dist_nich.EXAMPLES[0]['shared']
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 2.0},
        feature_hps=[bb_shared, bb_shared, nich_shared, bb_shared],
        r=R)

    # *_initialize() randomly assigns all entities to a group, so we'll have to
    # unset this assignment for this test
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N
    cxx_s.dcheck_consistency()
    assert cxx_s.ngroups() == 3
    assert set(cxx_s.empty_groups()) == {0, 1, 2}

    # alternate entities between groups 0 and 1
    for eid, row in enumerate(data):
        cxx_s.add_value(eid % 2, eid, row, R)
        cxx_s.dcheck_consistency()

    # take the first two entities back out
    for eid, row in enumerate(data[:2]):
        cxx_s.remove_value(eid, row, R)
        cxx_s.dcheck_consistency()

    newdata = np.array([mkrow()], dtype=dtype)
    cxx_score = cxx_s.score_value(newdata[0], R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
def _test_sample_post_pred(initialize_fn, dataview, y_new, r): defn = model_definition(N, [bb] * D) data = [tuple(row) for row in (np.random.random(size=(N, D)) < 0.8)] data = np.array(data, dtype=[('', bool)] * D) s = initialize_fn(defn=defn, data=dataview(data), cluster_hp={'alpha': 2.}, feature_hps=[{ 'alpha': 1., 'beta': 1. }] * D, r=r) n_samples = 10000 Y_samples = [s.sample_post_pred(None, r)[1] for _ in xrange(n_samples)] Y_samples = np.hstack(Y_samples) empty_groups = list(s.empty_groups()) if len(empty_groups): for egid in empty_groups[1:]: s.delete_group(egid) else: s.create_group(r) assert len(s.empty_groups()) == 1 def score_post_pred(y): # XXX: the C++ API can only handle structural arrays for now y = np.array([y], dtype=[('', bool)] * D)[0] _, scores = s.score_value(y, r) return logsumexp(scores) scores = np.array( list(map(score_post_pred, it.product([False, True], repeat=D)))) scores = np.exp(scores) assert_almost_equals(scores.sum(), 1.0, places=3) # lazy man idmap = {y: i for i, y in enumerate(it.product([False, True], repeat=D))} smoothing = 1e-5 sample_hist = np.zeros(len(idmap), dtype=np.int) for y in Y_samples: sample_hist[idmap[tuple(y)]] += 1. sample_hist = np.array(sample_hist, dtype=np.float) + smoothing sample_hist /= sample_hist.sum() #print 'actual', scores #print 'emp', sample_hist kldiv = KL_discrete(scores, sample_hist) print 'KL:', kldiv assert kldiv <= 0.005
def test_sample_post_pred():
    """Draw posterior-predictive samples for a masked row from a C++ state."""
    N = 10
    R = rng(5483932)
    D = 4

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))

    dtype = [('', bool)] * D
    data = np.array([mkrow() for _ in xrange(N)], dtype=dtype)

    defn = model_definition(N, [bb] * D)
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 2.0},
        feature_hps=[dist_bb.EXAMPLES[0]['shared']] * D,
        r=R)

    G = 3
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, G, R)

    # spread the entities round-robin over the G groups
    for eid, row in enumerate(data):
        cxx_s.add_value(eid % G, eid, row, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = mkrow()
    y_new = ma.masked_array(
        np.array([y_new_data], dtype=dtype), mask=[y_new_mask])[0]

    n_samples = 1000
    draws = []
    for _ in xrange(n_samples):
        draws.append(cxx_s.sample_post_pred(y_new, R)[1])
    cxx_samples = np.hstack(draws)

    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        # normalized histogram over all 2**D boolean rows
        dist = np.zeros(len(idmap))
        for sample in samples:
            dist[idmap[tuple(sample)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
def test_runner_multyvac():
    """Run two chains twice through the 'multyvac' parallel backend."""
    defn = model_definition(10, [bb, nich, niw(3)])
    Y = toy_dataset(defn)
    view = numpy_dataview(Y)
    kc = runner.default_kernel_config(defn)
    prng = rng()

    runners = []
    latents = [model.initialize(defn, view, prng) for _ in xrange(2)]
    for latent in latents:
        runners.append(runner.runner(defn, view, latent, kc))

    r = parallel.runner(
        runners, backend='multyvac', layer='perf', core='f2')

    # two consecutive runs of 1000 iterations each
    for _ in xrange(2):
        r.run(r=prng, niters=1000)
def _test_sample_post_pred(initialize_fn, dataview, y_new, r): defn = model_definition(N, [bb] * D) data = [tuple(row) for row in (np.random.random(size=(N, D)) < 0.8)] data = np.array(data, dtype=[('', bool)] * D) s = initialize_fn( defn=defn, data=dataview(data), cluster_hp={'alpha': 2.}, feature_hps=[{'alpha': 1., 'beta': 1.}] * D, r=r) n_samples = 10000 Y_samples = [s.sample_post_pred(None, r)[1] for _ in xrange(n_samples)] Y_samples = np.hstack(Y_samples) empty_groups = list(s.empty_groups()) if len(empty_groups): for egid in empty_groups[1:]: s.delete_group(egid) else: s.create_group(r) assert len(s.empty_groups()) == 1 def score_post_pred(y): # XXX: the C++ API can only handle structural arrays for now y = np.array([y], dtype=[('', bool)] * D)[0] _, scores = s.score_value(y, r) return logsumexp(scores) scores = np.array( list(map(score_post_pred, it.product([False, True], repeat=D)))) scores = np.exp(scores) assert_almost_equals(scores.sum(), 1.0, places=3) # lazy man idmap = {y: i for i, y in enumerate(it.product([False, True], repeat=D))} smoothing = 1e-5 sample_hist = np.zeros(len(idmap), dtype=np.int) for y in Y_samples: sample_hist[idmap[tuple(y)]] += 1. sample_hist = np.array(sample_hist, dtype=np.float) + smoothing sample_hist /= sample_hist.sum() #print 'actual', scores #print 'emp', sample_hist kldiv = KL_discrete(scores, sample_hist) print 'KL:', kldiv assert kldiv <= 0.005
def _test_scalar_hp_inference(view, prior_fn, w, grid_min, grid_max, grid_n, likelihood_model, scalar_hp_key, burnin=1000, nsamples=1000, every=10, trials=100, places=2): """ view must be 1D """ r = rng() hparams = {0: {scalar_hp_key: (prior_fn, w)}} def score_fn(scalar): d = latent.get_feature_hp(0) prev_scalar = d[scalar_hp_key] d[scalar_hp_key] = scalar latent.set_feature_hp(0, d) score = prior_fn(scalar) + latent.score_data(0, None, r) d[scalar_hp_key] = prev_scalar latent.set_feature_hp(0, d) return score defn = model_definition(len(view), [likelihood_model]) latent = initialize(defn, view, r=r) model = bind(latent, view) def sample_fn(): for _ in xrange(every): slice_hp(model, r, hparams=hparams) return latent.get_feature_hp(0)[scalar_hp_key] for _ in xrange(burnin): slice_hp(model, r, hparams=hparams) print 'finished burnin of', burnin, 'iterations' print 'grid_min', grid_min, 'grid_max', grid_max assert_1d_cont_dist_approx_emp(sample_fn, score_fn, grid_min, grid_max, grid_n, trials, nsamples, places)
def test_runner_multiprocessing():
    """A default parallel runner can be run twice in a row."""
    defn = model_definition(10, [bb, nich, niw(3)])
    Y = toy_dataset(defn)
    view = numpy_dataview(Y)
    kc = runner.default_kernel_config(defn)
    prng = rng()

    # one chain per CPU reported by multiprocessing
    nchains = mp.cpu_count()
    latents = [model.initialize(defn, view, prng) for _ in xrange(nchains)]
    runners = []
    for latent in latents:
        runners.append(runner.runner(defn, view, latent, kc))
    r = parallel.runner(runners)

    # check it is restartable
    for _ in xrange(2):
        r.run(r=prng, niters=10)
def run_dpgmm(niter=1000, datadir="../../", nfeatures=13):
    """Run parallel 'assign' chains on a DP-GMM and pickle the runner.

    niter: number of iterations passed to the parallel runner.
    datadir: passed to load_data() and prepended verbatim to the output file
        name, so it needs to end with a path separator.
    nfeatures: number of leading columns of `fscaled_full` that are modelled.

    The parallel runner is pickled to datadir + "grs1915_dpgmm.pkl".
    """
    ranking = [10, 6, 7, 26, 5, 8, 4, 19, 12, 23, 24, 33, 28, 25, 14, 3, 0,
               1, 21, 30, 11, 31, 13, 9, 22, 2, 27, 29, 32, 17, 18, 20, 16,
               15]

    features, labels, lc, hr, tstart, \
        features_lb, labels_lb, lc_lb, hr_lb, \
        fscaled, fscaled_lb, fscaled_full, labels_all = \
        load_data(datadir, tseg=1024.0, log_features=None, ranking=ranking)

    # NOTE(review): the three values below are never used afterwards; kept
    # in case convert_labels_to_physical() has side effects -- confirm and
    # drop if it does not.
    labels_phys = feature_engineering.convert_labels_to_physical(labels)
    labels_phys_lb = feature_engineering.convert_labels_to_physical(labels_lb)
    labels_all_phys = np.hstack([labels_phys["train"],
                                 labels_phys["val"],
                                 labels_phys["test"]])

    # honour the nfeatures parameter (previously a hard-coded 13, which is
    # also the default)
    fscaled_small = fscaled_full[:, :nfeatures]
    nsamples, ndim = fscaled_small.shape

    nchains = 8

    # The random state object
    prng = rng()

    # Define a DP-GMM whose Gaussian has one dimension per selected feature
    defn = model_definition(nsamples, [normal_inverse_wishart(ndim)])

    fscaled_rec = np.array([(list(f),) for f in fscaled_small],
                           dtype=[('', np.float32, ndim)])

    # Create a wrapper around the numpy recarray which
    # data-microscopes understands
    view = numpy_dataview(fscaled_rec)

    # Initialize nchains start points randomly in the state space
    latents = [model.initialize(defn, view, prng) for _ in xrange(nchains)]

    # Create a runner for each chain
    runners = [runner.runner(defn, view, latent, kernel_config=['assign'])
               for latent in latents]
    r = parallel.runner(runners)

    r.run(r=prng, niters=niter)

    # pickle data is binary, so the file is opened in binary mode
    with open(datadir + "grs1915_dpgmm.pkl", "wb") as f:
        pickle.dump(r, f)
def score_dataset(counts):
    """Return score_data() for `counts` with every row assigned to group 0.

    `counts` must have a two-element `.shape` (M, K); each of its M rows
    becomes one record holding a length-K integer vector, modelled by dm(K)
    with all 'alphas' set to 1.
    """
    M, K = counts.shape
    # builtin int instead of np.int: the alias was deprecated and later
    # removed from numpy, and always referred to the builtin
    Y = np.array([(y,) for y in counts], dtype=[('', int, (K,))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(M, [dm(K)])
    prior = {'alphas': [1.] * K}
    s = cxx_initialize(
        defn,
        view,
        r,
        feature_hps=[prior],
        assignment=[0] * M)
    # a single group holds everything
    assert_equals(s.groups(), [0])
    return s.score_data(None, None, r)
def _test_crp(initialize_fn, dataview, alpha, r):
    """exp(score_assignment()) summed over permutation_iter(6) is ~1."""
    N = 6
    defn = model_definition(N, [bb])
    Y = np.array([(True,)] * N, dtype=[('', bool)])
    view = dataview(Y)

    log_probs = []
    for assignment in permutation_iter(N):
        state = initialize_fn(
            defn,
            view,
            r=r,
            cluster_hp={'alpha': alpha},
            assignment=assignment)
        log_probs.append(state.score_assignment())

    probs = np.exp(np.array(log_probs))
    assert_almost_equals(probs.sum(), 1.0, places=3)
def test_masked_operations():
    """Add and then remove every row of a masked dataset on a C++ state."""
    N = 10
    R = rng(2347785)
    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    def mkmask():
        return tuple(randombool() for _ in xrange(3))

    # all rows are drawn before any mask is drawn
    rows = np.array([mkrow() for _ in xrange(N)], dtype=dtype)
    masks = [mkmask() for _ in xrange(N)]
    data = ma.masked_array(rows, mask=masks)

    defn = model_definition(N, [bb, bnb, nich])
    feature_hps = [
        dist.EXAMPLES[0]['shared'] for dist in (dist_bb, dist_bnb, dist_nich)
    ]
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 10.0},
        feature_hps=feature_hps,
        r=R)

    # clear the assignment made at initialization before adding values back
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    # alternate entities between groups 0 and 1
    for eid, row in enumerate(data):
        cxx_s.add_value(eid % 2, eid, row, R)
        cxx_s.dcheck_consistency()

    for eid, row in enumerate(data):
        cxx_s.remove_value(eid, row, R)
        cxx_s.dcheck_consistency()
def score_dataset(counts):
    """Return score_data() for `counts` with every row assigned to group 0.

    `counts` must have a two-element `.shape` (M, K); each of its M rows
    becomes one record holding a length-K integer vector, modelled by dm(K)
    with all 'alphas' set to 1.
    """
    M, K = counts.shape
    # builtin int instead of np.int: the alias was deprecated and later
    # removed from numpy, and always referred to the builtin
    Y = np.array([(y,) for y in counts], dtype=[('', int, (K,))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(M, [dm(K)])
    prior = {'alphas': [1.] * K}
    s = cxx_initialize(
        defn,
        view,
        r,
        feature_hps=[prior],
        assignment=[0] * M)
    # a single group holds everything
    assert_equals(s.groups(), [0])
    return s.score_data(None, None, r)
def test_get_set_params():
    """Round-trip cluster and feature hyperparameters through set/get."""
    defn = model_definition(1, [bb, bnb, gp, nich])
    row_dtype = [('', bool), ('', int), ('', int), ('', float)]
    single_row = np.array([(True, 3, 5, 10.)], dtype=row_dtype)
    state = initialize(defn=defn, data=numpy_dataview(single_row), r=rng())

    state.set_cluster_hp({'alpha': 3.0})
    assert_dict_almost_equals(state.get_cluster_hp(), {'alpha': 3.0})

    # one hyperparameter dict per feature, in the order given to defn
    bb_hp = {'alpha': 1.2, 'beta': 4.3}
    bnb_hp = {'alpha': 1., 'beta': 1., 'r': 1}
    gp_hp = {'alpha': 1., 'inv_beta': 1.}
    nich_hp = {'mu': 30., 'kappa': 1., 'sigmasq': 1., 'nu': 1.}

    for fid, expected in enumerate([bb_hp, bnb_hp, gp_hp, nich_hp]):
        state.set_feature_hp(fid, expected)
        assert_dict_almost_equals(state.get_feature_hp(fid), expected)
def _test_crp(initialize_fn, dataview, alpha, r):
    """exp(score_assignment()) summed over permutation_iter(6) is ~1."""
    N = 6
    defn = model_definition(N, [bb])
    Y = np.array([(True,)] * N, dtype=[('', bool)])
    view = dataview(Y)

    log_probs = []
    for assignment in permutation_iter(N):
        state = initialize_fn(
            defn,
            view,
            r=r,
            cluster_hp={'alpha': alpha},
            assignment=assignment)
        log_probs.append(state.score_assignment())

    probs = np.exp(np.array(log_probs))
    assert_almost_equals(probs.sum(), 1.0, places=3)
def latent(groups, entities_per_group, features, r):
    """Return a bound state with `groups` equal-sized groups plus one new group.

    Entities are assigned to groups in contiguous runs of
    `entities_per_group`; the data are random booleans over `features`
    columns.
    """
    N = groups * entities_per_group
    defn = model_definition(N, [bb] * features)

    # generate fake data
    raw = np.random.random(size=(N, features)) <= 0.5
    records = np.array(
        [tuple(row) for row in raw], dtype=[('', bool)] * features)
    view = numpy_dataview(records)

    # assign entities to their respective groups
    assignment = [
        gid for gid in xrange(groups) for _ in xrange(entities_per_group)
    ]

    state = initialize(defn, view, r, assignment=assignment)
    bound = bind(state, view)
    bound.create_group(r)  # perftest() doesnt modify group assignments
    return bound
def test_runner_convergence():
    """Check that a single 'assign' runner samples the reference posterior."""
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)
    start = model.initialize(defn, view, prng)
    r = runner.runner(defn, view, start, ['assign'])
    r.run(r=prng, niters=1000)  # burnin

    idmap = {}
    for idx, perm in enumerate(permutation_iter(N)):
        idmap[perm] = idx

    def sample_fn():
        # advance 10 iterations, then report the current assignment's index
        r.run(r=prng, niters=10)
        assignments = r.get_latent().assignments()
        return idmap[tuple(permutation_canonical(assignments))]

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100)
def _test_serializer(initialize_fn, deserialize_fn, dataview):
    """Round-trip a state through serialize()/deserialize_fn and through pickle.

    Only checks that both round trips produce a non-None object.
    """
    N = 10
    R = rng()

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    # (the never-called mkmask helper was removed; the data here is unmasked)
    data = np.array([mkrow() for _ in xrange(N)], dtype=dtype)

    defn = model_definition(N, [bb, bnb, nich])
    init_args = {
        'defn': defn,
        'data': dataview(data),
        'cluster_hp': {'alpha': 10.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bnb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    state = initialize_fn(**init_args)

    # library serialization
    raw = state.serialize()
    state1 = deserialize_fn(defn, raw)
    assert state1 is not None

    # pickle round trip of an object created in this function
    bstr = pickle.dumps(state)
    state2 = pickle.loads(bstr)
    assert state2 is not None
def test_masked_operations():
    """Add and then remove every row of a masked dataset on a C++ state."""
    N = 10
    R = rng(2347785)
    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    def mkmask():
        return tuple(randombool() for _ in xrange(3))

    # all rows are drawn before any mask is drawn
    rows = np.array([mkrow() for _ in xrange(N)], dtype=dtype)
    masks = [mkmask() for _ in xrange(N)]
    data = ma.masked_array(rows, mask=masks)

    defn = model_definition(N, [bb, bnb, nich])
    feature_hps = [
        dist.EXAMPLES[0]['shared'] for dist in (dist_bb, dist_bnb, dist_nich)
    ]
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 10.0},
        feature_hps=feature_hps,
        r=R)

    # clear the assignment made at initialization before adding values back
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    # alternate entities between groups 0 and 1
    for eid, row in enumerate(data):
        cxx_s.add_value(eid % 2, eid, row, R)
        cxx_s.dcheck_consistency()

    for eid, row in enumerate(data):
        cxx_s.remove_value(eid, row, R)
        cxx_s.dcheck_consistency()
def test_posterior_predictive():
    """Check the result shape of query.posterior_predictive().

    A query of k masked rows over the latents must return an array of shape
    (k, len(latents)); checked for k = 1 and k = 3.
    """
    N, D = 10, 4  # D needs to be even
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]

    # explicit floor division: tuple repetition needs an int, and `/` only
    # yields one under Python 2 semantics
    half = D // 2
    row_dtype = [('', bool)] * D
    row_mask = (False,) * half + (True,) * half

    for nrows in (1, 3):
        # second half of every row is masked
        q = ma.masked_array(
            np.array([(False,) * D] * nrows, dtype=row_dtype),
            mask=[row_mask] * nrows)
        samples = query.posterior_predictive(q, latents, prng)
        assert_equals(samples.shape, (nrows, len(latents)))
def test_crp_empirical():
    """Compare _sample_crp() draws with probabilities from score_assignment()."""
    N = 4
    alpha = 2.5
    defn = model_definition(N, [bb])
    Y = np.array([(True,)] * N, dtype=[('', bool)])
    view = numpy_dataview(Y)
    prng = rng()

    def crp_score(assignment):
        state = initialize(
            defn,
            view,
            r=prng,
            cluster_hp={'alpha': alpha},
            assignment=assignment)
        return state.score_assignment()

    scores = np.array([crp_score(perm) for perm in permutation_iter(N)])
    dist = scores_to_probs(scores)

    # index of each assignment, in the same order the scores were computed
    idmap = {}
    for idx, perm in enumerate(permutation_iter(N)):
        idmap[perm] = idx

    def sample_fn():
        drawn = _sample_crp(N, alpha)
        return idmap[tuple(permutation_canonical(drawn))]

    assert_discrete_dist_approx(sample_fn, dist, ntries=100)
def test_dm_cxx():
    """The 'counts' suffstat of group 0 equals the column sums of the data."""
    K = 4
    # builtin int instead of np.int: the alias was deprecated and later
    # removed from numpy, and always referred to the builtin
    Y = np.array([
        ([0, 1, 2, 5],),
        ([1, 0, 1, 2],),
        ([0, 2, 9, 9],),
    ], dtype=[('', int, (K,))])
    # vstack is given a real list; passing it a generator is deprecated
    Y_np = np.vstack([y[0] for y in Y])

    cxx_view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(K)])
    prior = {'alphas': [1.] * K}
    # every row goes into group 0
    cxx_s = cxx_initialize(
        defn,
        cxx_view,
        r,
        feature_hps=[prior],
        assignment=[0] * Y.shape[0])

    counts = cxx_s.get_suffstats(0, 0)['counts']
    assert_sequence_equal(counts, list(Y_np.sum(axis=0)))
def _test_serializer(initialize_fn, deserialize_fn, dataview):
    """Round-trip a state through serialize()/deserialize_fn and through pickle.

    Only checks that both round trips produce a non-None object.
    """
    N = 10
    R = rng()

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    # (the never-called mkmask helper was removed; the data here is unmasked)
    data = np.array([mkrow() for _ in xrange(N)], dtype=dtype)

    defn = model_definition(N, [bb, bnb, nich])
    init_args = {
        'defn': defn,
        'data': dataview(data),
        'cluster_hp': {'alpha': 10.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bnb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    state = initialize_fn(**init_args)

    # library serialization
    raw = state.serialize()
    state1 = deserialize_fn(defn, raw)
    assert state1 is not None

    # pickle round trip of an object created in this function
    bstr = pickle.dumps(state)
    state2 = pickle.loads(bstr)
    assert state2 is not None
def test_dm_cxx():
    """The 'counts' suffstat of group 0 equals the column sums of the data."""
    K = 4
    # builtin int instead of np.int: the alias was deprecated and later
    # removed from numpy, and always referred to the builtin
    Y = np.array([
        ([0, 1, 2, 5],),
        ([1, 0, 1, 2],),
        ([0, 2, 9, 9],),
    ], dtype=[('', int, (K,))])
    # vstack is given a real list; passing it a generator is deprecated
    Y_np = np.vstack([y[0] for y in Y])

    cxx_view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(K)])
    prior = {'alphas': [1.] * K}
    # every row goes into group 0
    cxx_s = cxx_initialize(
        defn,
        cxx_view,
        r,
        feature_hps=[prior],
        assignment=[0] * Y.shape[0])

    counts = cxx_s.get_suffstats(0, 0)['counts']
    assert_sequence_equal(counts, list(Y_np.sum(axis=0)))
def _test_runner_kernel_config(kc_fn, models):
    """Check that running with kc_fn's kernel config changes the assignments.

    Up to 5 fresh initializations are tried; the test passes as soon as one
    10-iteration run ends with assignments different from its start.
    """
    defn = model_definition(10, models)
    Y = toy_dataset(defn)
    view = numpy_dataview(Y)
    kc = kc_fn(defn)
    prng = rng()

    for _ in xrange(5):
        latent = model.initialize(defn, view, prng)
        before = latent.assignments()
        r = runner.runner(defn, view, latent, kc)
        r.run(r=prng, niters=10)
        after = r.get_latent().assignments()
        # XXX: it should be very unlikely the assignments are all equal
        if before == after:
            continue
        return  # success

    assert_true(False)  # exceeded ntries
def _test_multivariate_models(initialize_fn, dataview, bind, gibbs_assign, R):
    """Smoke-test a model mixing a `bb` feature with a 3-dimensional `niw`.

    Ten random rows (one bool, one length-3 float vector) are generated, a
    state is initialized over them, bound to the data view, and
    ``gibbs_assign`` is applied ten times.  Nothing is asserted.

    Args:
        initialize_fn: state constructor, called with the definition, the
            view and the ``cluster_hp`` / ``feature_hps`` / ``r`` keywords.
        dataview: wraps the numpy record array.
        bind: called as ``bind(state, view)``.
        gibbs_assign: called as ``gibbs_assign(bound_state, R)``.
        R: random number generator forwarded to the calls above.
    """
    # XXX: this test only checks that the operations don't crash
    dim = 3
    nrows = 10

    # niw hyperparameters; psi is assembled as Q * diag(1.0, 0.5, 0.2) * Q^T.
    mu = np.ones(dim)
    kappa = 0.3
    Q = random_orthonormal_matrix(dim)
    spectrum = np.diag([1.0, 0.5, 0.2])
    psi = np.dot(Q, np.dot(spectrum, Q.T))
    nu = 6

    def genrow():
        flag = np.random.choice([False, True])
        vec = [np.random.uniform(-3.0, 3.0) for _ in xrange(dim)]
        return (flag, vec)

    rows = [genrow() for _ in xrange(nrows)]
    X = np.array(rows, dtype=[('', bool), ('', float, (dim, ))])
    view = dataview(X)

    defn = model_definition(nrows, [bb, niw(dim)])
    bb_hp = {'alpha': 2., 'beta': 2.}
    niw_hp = {'mu': mu, 'kappa': kappa, 'psi': psi, 'nu': nu}
    s = initialize_fn(defn,
                      view,
                      cluster_hp={'alpha': 2.},
                      feature_hps=[bb_hp, niw_hp],
                      r=R)

    bound_s = bind(s, view)
    for _ in xrange(10):
        gibbs_assign(bound_s, R)
def _test_multivariate_models(initialize_fn, dataview, bind, gibbs_assign, R):
    """Exercise initialization and assignment on a [bb, niw(3)] model.

    The data are ten random (bool, length-3 float vector) records.  After
    initializing and binding the state, ``gibbs_assign(bound_state, R)`` is
    invoked ten times.  There are no assertions.
    """
    # XXX: this test only checks that the operations don't crash
    mu = np.ones(3)
    kappa = 0.3
    Q = random_orthonormal_matrix(3)
    inner = np.dot(np.diag([1.0, 0.5, 0.2]), Q.T)
    psi = np.dot(Q, inner)
    nu = 6

    N = 10

    def genrow():
        indicator = np.random.choice([False, True])
        coords = []
        for _ in xrange(3):
            coords.append(np.random.uniform(-3.0, 3.0))
        return (indicator, coords)

    record_dtype = [('', bool), ('', float, (3,))]
    X = np.array([genrow() for _ in xrange(N)], dtype=record_dtype)
    view = dataview(X)

    defn = model_definition(N, [bb, niw(3)])
    init_kwargs = {
        'cluster_hp': {'alpha': 2.},
        'feature_hps': [
            {'alpha': 2., 'beta': 2.},
            {'mu': mu, 'kappa': kappa, 'psi': psi, 'nu': nu},
        ],
        'r': R,
    }
    s = initialize_fn(defn, view, **init_kwargs)
    bound_s = bind(s, view)

    sweeps = 10
    for _ in xrange(sweeps):
        gibbs_assign(bound_s, R)
def test_mnist_supervised(): mnist_dataset = _get_mnist_dataset() classes = range(10) classmap = {c: i for i, c in enumerate(classes)} train_data, test_data = [], [] for c in classes: Y = mnist_dataset['data'][ np.where(mnist_dataset['target'] == float(c))[0]] Y_train, Y_test = train_test_split(Y, test_size=0.01) train_data.append(Y_train) test_data.append(Y_test) sample_size_max = 10000 def mk_class_data(c, Y): n, D = Y.shape print 'number of digit', c, 'in training is', n dtype = [('', bool)] * D + [('', int)] inds = np.random.permutation(Y.shape[0])[:sample_size_max] Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], dtype=dtype) return Y Y_train = np.hstack([mk_class_data(c, y) for c, y in zip(classes, train_data)]) Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] n, = Y_train.shape D = len(Y_train.dtype) print 'training data is', n, 'examples' print 'image dimension is', (D - 1), 'pixels' view = numpy_dataview(Y_train) defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) r = rng() s = initialize(defn, view, cluster_hp={'alpha': 0.2}, feature_hps=[{'alpha': 1., 'beta': 1.}] * (D - 1) + [{'alphas': [1. 
for _ in classes]}], r=r) bound_s = bind(s, view) indiv_prior_fn = log_exponential(1.2) hparams = { i: { 'alpha': (indiv_prior_fn, 1.5), 'beta': (indiv_prior_fn, 1.5), } for i in xrange(D - 1)} hparams[D - 1] = { 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) for idx in xrange(len(classes)) } def print_prediction_results(): results = [] for c, Y_test in zip(classes, test_data): for y in Y_test: query = ma.masked_array( np.array([tuple(y) + (0,)], dtype=[('', bool)] * (D - 1) + [('', int)]), mask=[(False,) * (D - 1) + (True,)])[0] samples = [ s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)] samples = np.bincount(samples, minlength=len(classes)) prediction = np.argmax(samples) results.append((classmap[c], prediction, samples)) print 'finished predictions for class', c Y_actual = np.array([a for a, _, _ in results], dtype=np.int) Y_pred = np.array([b for _, b, _ in results], dtype=np.int) print 'accuracy:', accuracy_score(Y_actual, Y_pred) print 'confusion matrix:' print confusion_matrix(Y_actual, Y_pred) # AUROC for one vs all (each class) for i, clabel in enumerate(classes): Y_true = np.copy(Y_actual) # treat class c as the "positive" example positive_examples = Y_actual == i negative_examples = Y_actual != i Y_true[positive_examples] = 1 Y_true[negative_examples] = 0 Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) cls_auc = roc_auc_score(Y_true, Y_prob) print 'class', clabel, 'auc=', cls_auc #import matplotlib.pylab as plt #Y_prob = np.array([c for _, _, c in results]) #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) #plt.plot(fpr, tpr) #plt.show() def kernel(rid): start0 = time.time() assign(bound_s, r) sec0 = time.time() - start0 start1 = time.time() hp(bound_s, r, hparams=hparams) sec1 = time.time() - start1 print 'rid=', rid, 'nclusters=', s.ngroups(), \ 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) print ' time_per_post_pred=', sec_per_post_pred, 
'sec' # print group size breakdown sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] sizes = sorted(sizes, key=lambda x: x[1], reverse=True) print ' group_sizes=', sizes print_prediction_results() # save state mkdirp("mnist-states") fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) with open(fname, "w") as fp: fp.write(s.serialize()) # training iters = 30 for rid in xrange(iters): kernel(rid)
def test_mnist_supervised(n): mnist_dataset = _get_mnist_dataset() classes = range(10) classmap = {c: i for i, c in enumerate(classes)} train_data, test_data = [], [] for c in classes: Y = mnist_dataset['data'][np.where( mnist_dataset['target'] == float(c))[0]] Y_train, Y_test = train_test_split(Y, test_size=0.01) train_data.append(Y_train) test_data.append(Y_test) sample_size_max = n def mk_class_data(c, Y): n, D = Y.shape print 'number of digit', c, 'in training is', n dtype = [('', bool)] * D + [('', int)] inds = np.random.permutation(Y.shape[0])[:sample_size_max] Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], dtype=dtype) return Y Y_train = np.hstack( [mk_class_data(c, y) for c, y in zip(classes, train_data)]) Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] n, = Y_train.shape D = len(Y_train.dtype) print 'training data is', n, 'examples' print 'image dimension is', (D - 1), 'pixels' view = numpy_dataview(Y_train) defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) r = rng() s = initialize(defn, view, cluster_hp={'alpha': 0.2}, feature_hps=[{ 'alpha': 1., 'beta': 1. }] * (D - 1) + [{ 'alphas': [1. 
for _ in classes] }], r=r) bound_s = bind(s, view) indiv_prior_fn = log_exponential(1.2) hparams = { i: { 'alpha': (indiv_prior_fn, 1.5), 'beta': (indiv_prior_fn, 1.5), } for i in xrange(D - 1) } hparams[D - 1] = { 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) for idx in xrange(len(classes)) } def print_prediction_results(): results = [] for c, Y_test in zip(classes, test_data): for y in Y_test: query = ma.masked_array( np.array([tuple(y) + (0, )], dtype=[('', bool)] * (D - 1) + [('', int)]), mask=[(False, ) * (D - 1) + (True, )])[0] samples = [ s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30) ] samples = np.bincount(samples, minlength=len(classes)) prediction = np.argmax(samples) results.append((classmap[c], prediction, samples)) print 'finished predictions for class', c Y_actual = np.array([a for a, _, _ in results], dtype=np.int) Y_pred = np.array([b for _, b, _ in results], dtype=np.int) print 'accuracy:', accuracy_score(Y_actual, Y_pred) print 'confusion matrix:' print confusion_matrix(Y_actual, Y_pred) # AUROC for one vs all (each class) for i, clabel in enumerate(classes): Y_true = np.copy(Y_actual) # treat class c as the "positive" example positive_examples = Y_actual == i negative_examples = Y_actual != i Y_true[positive_examples] = 1 Y_true[negative_examples] = 0 Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) cls_auc = roc_auc_score(Y_true, Y_prob) print 'class', clabel, 'auc=', cls_auc #import matplotlib.pylab as plt #Y_prob = np.array([c for _, _, c in results]) #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) #plt.plot(fpr, tpr) #plt.show() def kernel(rid): start0 = time.time() assign(bound_s, r) sec0 = time.time() - start0 start1 = time.time() hp(bound_s, r, hparams=hparams) sec1 = time.time() - start1 print 'rid=', rid, 'nclusters=', s.ngroups(), \ 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) print ' time_per_post_pred=', 
sec_per_post_pred, 'sec' # training iters = 30 for rid in xrange(iters): kernel(rid) # print group size breakdown sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] sizes = sorted(sizes, key=lambda x: x[1], reverse=True) print ' group_sizes=', sizes #print_prediction_results() # save state mkdirp("mnist-states") fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) with open(fname, "w") as fp: fp.write(s.serialize())