def test_slice_theta_mm():
    """Check the ``theta`` kernel against a reference beta distribution.

    All N observations are placed in a single group (assignment
    ``[0] * N``). After each ``theta`` call the 'p' entry of suffstats
    ``(0, 0)`` is read back and the draws are compared with
    ``beta(alpha1, beta1)``.
    """
    N = 100
    prior = {'alpha': 1.0, 'beta': 9.0}

    observations = [(np.random.random() < 0.8,) for _ in xrange(N)]
    data = np.array(observations, dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    r = rng()
    view = numpy_dataview(data)
    s = initialize(
        defn,
        view,
        cluster_hp={'alpha': 1., 'beta': 9.},
        feature_hps=[prior],
        r=r,
        assignment=[0] * N)

    # Reference parameters: prior values plus the observed counts.
    heads = sum(1 for row in data if row[0])
    tails = N - heads
    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails
    rv = beta(alpha1, beta1)

    bs = bind(s, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        # run the kernel once, then read the current 'p' value
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, 0)['p']

    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
def _test_convergence_bb_cxx(N, D, kernel, preprocess_data_fn=None,
                             nonconj=False, burnin_niters=10000, skip=10,
                             ntries=50, nsamples=1000, kl_places=2):
    """Run the shared convergence check for `kernel` on an N x D model.

    The data and reference posterior always come from the ``bb`` definition;
    the state itself is built from the ``bbnc`` definition when `nonconj`
    is true. The remaining arguments are forwarded to ``_test_convergence``.
    """
    r = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D

    defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)
    Y, posterior = data_with_posterior(
        defn, cluster_hp, feature_hps, preprocess_data_fn)
    data = numpy_dataview(Y)

    if nonconj:
        state_defn = nonconj_defn
    else:
        state_defn = defn
    s = initialize(state_defn,
                   data,
                   cluster_hp=cluster_hp,
                   feature_hps=feature_hps,
                   r=r)
    bs = bind(s, data)

    def wrapped_kernel(state):
        # close over the shared random state so callers only pass the model
        return kernel(state, r)

    _test_convergence(bs, posterior, wrapped_kernel, burnin_niters,
                      skip, ntries, nsamples, kl_places)
def test_get_set_params():
    """Round-trip the cluster and per-feature hyperparameters of a state."""
    defn = model_definition(1, [bb, bnb, gp, nich])
    row = (True, 3, 5, 10.)
    data = np.array(
        [row],
        dtype=[('', bool), ('', int), ('', int), ('', float)])
    s = initialize(defn=defn, data=numpy_dataview(data), r=rng())

    cluster_hp = {'alpha': 3.0}
    s.set_cluster_hp(cluster_hp)
    assert_dict_almost_equals(s.get_cluster_hp(), {'alpha': 3.0})

    # one hyperparameter dict per feature, in the order given to defn
    bb_hp = {'alpha': 1.2, 'beta': 4.3}
    bnb_hp = {'alpha': 1., 'beta': 1., 'r': 1}
    gp_hp = {'alpha': 1., 'inv_beta': 1.}
    nich_hp = {'mu': 30., 'kappa': 1., 'sigmasq': 1., 'nu': 1.}

    for fid, expected in enumerate([bb_hp, bnb_hp, gp_hp, nich_hp]):
        s.set_feature_hp(fid, expected)
        assert_dict_almost_equals(s.get_feature_hp(fid), expected)
def test_runner_multiprocessing_convergence():
    """Check that samples from the parallel runner match the posterior.

    One chain is created per CPU. After burn-in, each batch of 10
    iterations yields one sample per chain: the index of that chain's
    canonical assignment vector.
    """
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)

    nchains = mp.cpu_count()
    latents = [model.initialize(defn, view, prng) for _ in xrange(nchains)]
    runners = [runner.runner(defn, view, latent, ['assign'])
               for latent in latents]
    r = parallel.runner(runners)
    r.run(r=prng, niters=1000)  # burnin

    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_stream():
        # Endless lazy stream: advance every chain, then emit one sample
        # per chain before advancing again.
        while True:
            r.run(r=prng, niters=10)
            for latent in r.get_latents():
                key = tuple(permutation_canonical(latent.assignments()))
                yield idmap[key]

    stream = sample_stream()

    def sample_fn():
        return next(stream)

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100, kl_places=2)
def _test_convergence_bb_cxx(N, D, kernel, preprocess_data_fn=None,
                             nonconj=False, burnin_niters=10000, skip=10,
                             ntries=50, nsamples=1000, kl_places=2):
    """Run the shared convergence check for `kernel` on an N x D model.

    The data and reference posterior always come from the ``bb`` definition;
    the state itself is built from the ``bbnc`` definition when `nonconj`
    is true. The remaining arguments are forwarded to ``_test_convergence``.
    """
    r = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D

    defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)
    Y, posterior = data_with_posterior(
        defn, cluster_hp, feature_hps, preprocess_data_fn)
    data = numpy_dataview(Y)

    if nonconj:
        state_defn = nonconj_defn
    else:
        state_defn = defn
    s = initialize(state_defn,
                   data,
                   cluster_hp=cluster_hp,
                   feature_hps=feature_hps,
                   r=r)
    bs = bind(s, data)

    def wrapped_kernel(state):
        # close over the shared random state so callers only pass the model
        return kernel(state, r)

    _test_convergence(bs, posterior, wrapped_kernel, burnin_niters,
                      skip, ntries, nsamples, kl_places)
def test_slice_theta_mm():
    """Check the ``theta`` kernel against a reference beta distribution.

    All N observations are placed in a single group (assignment
    ``[0] * N``). After each ``theta`` call the 'p' entry of suffstats
    ``(0, 0)`` is read back and the draws are compared with
    ``beta(alpha1, beta1)``.
    """
    N = 100
    prior = {'alpha': 1.0, 'beta': 9.0}

    observations = [(np.random.random() < 0.8,) for _ in xrange(N)]
    data = np.array(observations, dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    r = rng()
    view = numpy_dataview(data)
    s = initialize(
        defn,
        view,
        cluster_hp={'alpha': 1., 'beta': 9.},
        feature_hps=[prior],
        r=r,
        assignment=[0] * N)

    # Reference parameters: prior values plus the observed counts.
    heads = sum(1 for row in data if row[0])
    tails = N - heads
    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails
    rv = beta(alpha1, beta1)

    bs = bind(s, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        # run the kernel once, then read the current 'p' value
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, 0)['p']

    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
def crp_score(assignment):
    """Return ``score_assignment()`` of a fresh state built on `assignment`.

    `defn`, `view`, `r` and `alpha` are taken from the enclosing scope.
    """
    init_kwargs = {
        'r': r,
        'cluster_hp': {'alpha': alpha},
        'assignment': assignment,
    }
    state = initialize(defn, view, **init_kwargs)
    return state.score_assignment()
def test_posterior_predictive_statistic():
    """Check the shape and dtype of ``query.posterior_predictive_statistic``.

    Queries have the first half of their D fields unmasked and the second
    half masked. The statistic is checked for a single-row query (default
    merge, a single merge name, and a per-feature merge list) and for a
    three-row query.
    """
    N, D = 10, 4  # D needs to be even
    # Floor division keeps the repetition count an int even under true
    # division (tuple * float raises TypeError).
    half = D // 2
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]

    row_values = (False,) * D
    row_mask = (False,) * half + (True,) * half
    query_dtype = [('', bool)] * D

    def check(statistic, nrows):
        # one record per query row, one field per feature
        assert_equals(statistic.shape, (nrows,))
        assert_equals(len(statistic.dtype), D)

    q = ma.masked_array(
        np.array([row_values], dtype=query_dtype),
        mask=[row_mask])

    check(query.posterior_predictive_statistic(q, latents, prng), 1)
    check(query.posterior_predictive_statistic(
        q, latents, prng, merge='mode'), 1)
    check(query.posterior_predictive_statistic(
        q, latents, prng, merge=['mode', 'mode', 'avg', 'avg']), 1)

    q = ma.masked_array(
        np.array([row_values] * 3, dtype=query_dtype),
        mask=[row_mask] * 3)
    check(query.posterior_predictive_statistic(q, latents, prng), 3)
def score_fn(assignment):
    """Return ``score_joint(r)`` of a fresh state built on `assignment`.

    `defn`, `data`, `r`, `cluster_hp` and `feature_hps` come from the
    enclosing scope.
    """
    state = initialize(
        defn,
        data,
        r,
        cluster_hp=cluster_hp,
        feature_hps=feature_hps,
        assignment=assignment)
    joint = state.score_joint(r)
    return joint
def test_zmatrix():
    """``query.zmatrix`` over 10 latents must return an N x N matrix."""
    N, D = 10, 4
    nlatents = 10

    defn = model_definition(N, [bb] * D)
    view = numpy_dataview(toy_dataset(defn))
    prng = rng()

    latents = []
    for _ in xrange(nlatents):
        latents.append(model.initialize(defn, view, prng))

    zmat = query.zmatrix(latents)
    assert_equals(zmat.shape, (N, N))
def test_runner_multyvac():
    """Run two chains through the 'multyvac' parallel backend, twice."""
    defn = model_definition(10, [bb, nich, niw(3)])
    view = numpy_dataview(toy_dataset(defn))
    kc = runner.default_kernel_config(defn)
    prng = rng()

    runners = []
    for _ in xrange(2):
        latent = model.initialize(defn, view, prng)
        runners.append(runner.runner(defn, view, latent, kc))

    r = parallel.runner(
        runners, backend='multyvac', layer='perf', core='f2')

    # two consecutive runs on the same parallel runner
    for _ in xrange(2):
        r.run(r=prng, niters=1000)
def _test_scalar_hp_inference(view, prior_fn, w, grid_min, grid_max, grid_n, likelihood_model, scalar_hp_key, burnin=1000, nsamples=1000, every=10, trials=100, places=2): """ view must be 1D """ r = rng() hparams = {0: {scalar_hp_key: (prior_fn, w)}} def score_fn(scalar): d = latent.get_feature_hp(0) prev_scalar = d[scalar_hp_key] d[scalar_hp_key] = scalar latent.set_feature_hp(0, d) score = prior_fn(scalar) + latent.score_data(0, None, r) d[scalar_hp_key] = prev_scalar latent.set_feature_hp(0, d) return score defn = model_definition(len(view), [likelihood_model]) latent = initialize(defn, view, r=r) model = bind(latent, view) def sample_fn(): for _ in xrange(every): slice_hp(model, r, hparams=hparams) return latent.get_feature_hp(0)[scalar_hp_key] for _ in xrange(burnin): slice_hp(model, r, hparams=hparams) print 'finished burnin of', burnin, 'iterations' print 'grid_min', grid_min, 'grid_max', grid_max assert_1d_cont_dist_approx_emp(sample_fn, score_fn, grid_min, grid_max, grid_n, trials, nsamples, places)
def test_runner_multiprocessing():
    """Run one chain per CPU through the default parallel backend, twice."""
    defn = model_definition(10, [bb, nich, niw(3)])
    view = numpy_dataview(toy_dataset(defn))
    kc = runner.default_kernel_config(defn)
    prng = rng()

    runners = []
    for _ in xrange(mp.cpu_count()):
        latent = model.initialize(defn, view, prng)
        runners.append(runner.runner(defn, view, latent, kc))

    r = parallel.runner(runners)

    # check it is restartable
    for _ in xrange(2):
        r.run(r=prng, niters=10)
def run_dpgmm(niter=1000, datadir="../../", nfeatures=13):
    """Run a DP-GMM on the leading scaled features and pickle the runner.

    Parameters
    ----------
    niter : int
        Number of iterations passed to the parallel runner.
    datadir : str
        Passed to ``load_data`` and prepended by plain string concatenation
        to the output file name ``grs1915_dpgmm.pkl``, so it should end
        with a path separator.
    nfeatures : int
        Number of leading columns of the full scaled feature matrix to
        cluster on (previously ignored in favour of a hard-coded 13).
    """
    ranking = [10, 6, 7, 26, 5, 8, 4, 19, 12, 23, 24, 33, 28, 25, 14, 3, 0,
               1, 21, 30, 11, 31, 13, 9, 22, 2, 27, 29, 32, 17, 18, 20, 16,
               15]

    # Only `fscaled_full` is used below; the full unpacking is kept so a
    # change in what load_data returns still fails loudly here.
    features, labels, lc, hr, tstart, \
        features_lb, labels_lb, lc_lb, hr_lb, \
        fscaled, fscaled_lb, fscaled_full, labels_all = \
        load_data(datadir, tseg=1024.0, log_features=None, ranking=ranking)

    fscaled_small = fscaled_full[:, :nfeatures]
    nsamples, ndim = fscaled_small.shape

    nchains = 8

    # The random state object
    prng = rng()

    # Define a DP-GMM whose Gaussian has one dimension per selected feature
    defn = model_definition(nsamples, [normal_inverse_wishart(ndim)])

    fscaled_rec = np.array([(list(f),) for f in fscaled_small],
                           dtype=[('', np.float32, ndim)])

    # Create a wrapper around the numpy recarray which
    # data-microscopes understands
    view = numpy_dataview(fscaled_rec)

    # Initialize nchains start points randomly in the state space
    latents = [model.initialize(defn, view, prng) for _ in xrange(nchains)]

    # Create a runner for each chain
    runners = [runner.runner(defn, view, latent, kernel_config=['assign'])
               for latent in latents]
    r = parallel.runner(runners)

    r.run(r=prng, niters=niter)

    # Pickle output is a byte stream; write it in binary mode.
    with open(datadir + "grs1915_dpgmm.pkl", "wb") as f:
        pickle.dump(r, f)
def test_get_set_params():
    """Round-trip the cluster and per-feature hyperparameters of a state."""
    defn = model_definition(1, [bb, bnb, gp, nich])
    row = (True, 3, 5, 10.)
    data = np.array(
        [row],
        dtype=[('', bool), ('', int), ('', int), ('', float)])
    s = initialize(defn=defn, data=numpy_dataview(data), r=rng())

    cluster_hp = {'alpha': 3.0}
    s.set_cluster_hp(cluster_hp)
    assert_dict_almost_equals(s.get_cluster_hp(), {'alpha': 3.0})

    # one hyperparameter dict per feature, in the order given to defn
    bb_hp = {'alpha': 1.2, 'beta': 4.3}
    bnb_hp = {'alpha': 1., 'beta': 1., 'r': 1}
    gp_hp = {'alpha': 1., 'inv_beta': 1.}
    nich_hp = {'mu': 30., 'kappa': 1., 'sigmasq': 1., 'nu': 1.}

    for fid, expected in enumerate([bb_hp, bnb_hp, gp_hp, nich_hp]):
        s.set_feature_hp(fid, expected)
        assert_dict_almost_equals(s.get_feature_hp(fid), expected)
def _test_scalar_hp_inference(view, prior_fn, w, grid_min, grid_max, grid_n, likelihood_model, scalar_hp_key, burnin=1000, nsamples=1000, every=10, trials=100, places=2): """ view must be 1D """ r = rng() hparams = {0: {scalar_hp_key: (prior_fn, w)}} def score_fn(scalar): d = latent.get_feature_hp(0) prev_scalar = d[scalar_hp_key] d[scalar_hp_key] = scalar latent.set_feature_hp(0, d) score = prior_fn(scalar) + latent.score_data(0, None, r) d[scalar_hp_key] = prev_scalar latent.set_feature_hp(0, d) return score defn = model_definition(len(view), [likelihood_model]) latent = initialize(defn, view, r=r) model = bind(latent, view) def sample_fn(): for _ in xrange(every): slice_hp(model, r, hparams=hparams) return latent.get_feature_hp(0)[scalar_hp_key] for _ in xrange(burnin): slice_hp(model, r, hparams=hparams) print 'finished burnin of', burnin, 'iterations' print 'grid_min', grid_min, 'grid_max', grid_max assert_1d_cont_dist_approx_emp(sample_fn, score_fn, grid_min, grid_max, grid_n, trials, nsamples, places)
def test_runner_convergence():
    """Check that samples from a single runner match the posterior.

    After burn-in, every sample is the index of the canonical assignment
    vector reached after 10 more iterations.
    """
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)

    initial = model.initialize(defn, view, prng)
    r = runner.runner(defn, view, initial, ['assign'])
    r.run(r=prng, niters=1000)  # burnin

    idmap = {}
    for index, perm in enumerate(permutation_iter(N)):
        idmap[perm] = index

    def sample_fn():
        r.run(r=prng, niters=10)
        assignments = r.get_latent().assignments()
        canonical = tuple(permutation_canonical(assignments))
        return idmap[canonical]

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100)
def latent(groups, entities_per_group, features, r):
    """Build a bound state with `groups` equal-sized groups of random bools.

    Entities are assigned to groups in contiguous runs of
    `entities_per_group`; one extra group is created before returning.
    """
    N = groups * entities_per_group
    defn = model_definition(N, [bb] * features)

    # generate fake data
    Y = np.random.random(size=(N, features)) <= 0.5
    records = np.array(
        [tuple(row) for row in Y],
        dtype=[('', bool)] * features)
    view = numpy_dataview(records)

    # assign entities to their respective groups
    assignment = [g
                  for g in xrange(groups)
                  for _ in xrange(entities_per_group)]

    state = initialize(defn, view, r, assignment=assignment)
    bound = bind(state, view)
    bound.create_group(r)  # perftest() doesnt modify group assignments

    return bound
def test_posterior_predictive():
    """Check the shape of ``query.posterior_predictive`` samples.

    Queries have the first half of their D fields unmasked and the second
    half masked. The result must have one row per query row and one column
    per latent.
    """
    N, D = 10, 4  # D needs to be even
    # Floor division keeps the repetition count an int even under true
    # division (tuple * float raises TypeError).
    half = D // 2
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]

    row_values = (False,) * D
    row_mask = (False,) * half + (True,) * half
    query_dtype = [('', bool)] * D

    # single-row query
    q = ma.masked_array(
        np.array([row_values], dtype=query_dtype),
        mask=[row_mask])
    samples = query.posterior_predictive(q, latents, prng)
    assert_equals(samples.shape, (1, len(latents)))

    # three-row query
    q = ma.masked_array(
        np.array([row_values] * 3, dtype=query_dtype),
        mask=[row_mask] * 3)
    samples = query.posterior_predictive(q, latents, prng)
    assert_equals(samples.shape, (3, len(latents)))
def _test_runner_kernel_config(kc_fn, models):
    """Check that a runner using ``kc_fn(defn)`` changes the assignments.

    Up to five fresh initialisations are tried. The test passes as soon as
    10 iterations change the assignment vector, and fails otherwise.
    """
    defn = model_definition(10, models)
    view = numpy_dataview(toy_dataset(defn))
    kc = kc_fn(defn)
    prng = rng()

    max_tries = 5
    for _ in xrange(max_tries):
        latent = model.initialize(defn, view, prng)
        before = latent.assignments()
        r = runner.runner(defn, view, latent, kc)
        r.run(r=prng, niters=10)
        after = r.get_latent().assignments()

        # XXX: it should be very unlikely the assignments are all equal
        if before == after:
            continue
        return  # success

    assert_true(False)  # exceeded ntries
def test_mnist_supervised(n): mnist_dataset = _get_mnist_dataset() classes = range(10) classmap = {c: i for i, c in enumerate(classes)} train_data, test_data = [], [] for c in classes: Y = mnist_dataset['data'][np.where( mnist_dataset['target'] == float(c))[0]] Y_train, Y_test = train_test_split(Y, test_size=0.01) train_data.append(Y_train) test_data.append(Y_test) sample_size_max = n def mk_class_data(c, Y): n, D = Y.shape print 'number of digit', c, 'in training is', n dtype = [('', bool)] * D + [('', int)] inds = np.random.permutation(Y.shape[0])[:sample_size_max] Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], dtype=dtype) return Y Y_train = np.hstack( [mk_class_data(c, y) for c, y in zip(classes, train_data)]) Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] n, = Y_train.shape D = len(Y_train.dtype) print 'training data is', n, 'examples' print 'image dimension is', (D - 1), 'pixels' view = numpy_dataview(Y_train) defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) r = rng() s = initialize(defn, view, cluster_hp={'alpha': 0.2}, feature_hps=[{ 'alpha': 1., 'beta': 1. }] * (D - 1) + [{ 'alphas': [1. 
for _ in classes] }], r=r) bound_s = bind(s, view) indiv_prior_fn = log_exponential(1.2) hparams = { i: { 'alpha': (indiv_prior_fn, 1.5), 'beta': (indiv_prior_fn, 1.5), } for i in xrange(D - 1) } hparams[D - 1] = { 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) for idx in xrange(len(classes)) } def print_prediction_results(): results = [] for c, Y_test in zip(classes, test_data): for y in Y_test: query = ma.masked_array( np.array([tuple(y) + (0, )], dtype=[('', bool)] * (D - 1) + [('', int)]), mask=[(False, ) * (D - 1) + (True, )])[0] samples = [ s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30) ] samples = np.bincount(samples, minlength=len(classes)) prediction = np.argmax(samples) results.append((classmap[c], prediction, samples)) print 'finished predictions for class', c Y_actual = np.array([a for a, _, _ in results], dtype=np.int) Y_pred = np.array([b for _, b, _ in results], dtype=np.int) print 'accuracy:', accuracy_score(Y_actual, Y_pred) print 'confusion matrix:' print confusion_matrix(Y_actual, Y_pred) # AUROC for one vs all (each class) for i, clabel in enumerate(classes): Y_true = np.copy(Y_actual) # treat class c as the "positive" example positive_examples = Y_actual == i negative_examples = Y_actual != i Y_true[positive_examples] = 1 Y_true[negative_examples] = 0 Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) cls_auc = roc_auc_score(Y_true, Y_prob) print 'class', clabel, 'auc=', cls_auc #import matplotlib.pylab as plt #Y_prob = np.array([c for _, _, c in results]) #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) #plt.plot(fpr, tpr) #plt.show() def kernel(rid): start0 = time.time() assign(bound_s, r) sec0 = time.time() - start0 start1 = time.time() hp(bound_s, r, hparams=hparams) sec1 = time.time() - start1 print 'rid=', rid, 'nclusters=', s.ngroups(), \ 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) print ' time_per_post_pred=', 
sec_per_post_pred, 'sec' # training iters = 30 for rid in xrange(iters): kernel(rid) # print group size breakdown sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] sizes = sorted(sizes, key=lambda x: x[1], reverse=True) print ' group_sizes=', sizes #print_prediction_results() # save state mkdirp("mnist-states") fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) with open(fname, "w") as fp: fp.write(s.serialize())
def crp_score(assignment):
    """Return ``score_assignment()`` of a fresh state built on `assignment`.

    `defn`, `view`, `r` and `alpha` are taken from the enclosing scope.
    """
    init_kwargs = {
        'r': r,
        'cluster_hp': {'alpha': alpha},
        'assignment': assignment,
    }
    state = initialize(defn, view, **init_kwargs)
    return state.score_assignment()
def test_mnist(): import matplotlib.pylab as plt from PIL import Image, ImageOps mnist_dataset = _get_mnist_dataset() Y_2 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 2.)[0]] Y_3 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 3.)[0]] print 'number of twos:', Y_2.shape[0] print 'number of threes:', Y_3.shape[0] _, D = Y_2.shape W = int(math.sqrt(D)) assert W * W == D dtype = [('', bool)] * D Y = np.vstack([Y_2, Y_3]) Y = np.array( [tuple(y) for y in Y[np.random.permutation(np.arange(Y.shape[0]))]], dtype=dtype) view = numpy_dataview(Y) defn = model_definition(Y.shape[0], [bb] * D) r = rng() s = initialize( defn, view, cluster_hp={'alpha': 0.2}, feature_hps=[{'alpha': 1., 'beta': 1.}] * D, r=r) bound_s = bind(s, view) indiv_prior_fn = log_exponential(1.2) hparams = { i: { 'alpha': (indiv_prior_fn, 1.5), 'beta': (indiv_prior_fn, 1.5), } for i in xrange(D)} def plot_clusters(s, fname, scalebysize=False): hps = [s.get_feature_hp(i) for i in xrange(D)] def prior_prob(hp): return hp['alpha'] / (hp['alpha'] + hp['beta']) def data_for_group(gid): suffstats = [s.get_suffstats(gid, i) for i in xrange(D)] def prob(hp, ss): top = hp['alpha'] + ss['heads'] bot = top + hp['beta'] + ss['tails'] return top / bot probs = [prob(hp, ss) for hp, ss in zip(hps, suffstats)] return np.array(probs) def scale(d, weight): im = d.reshape((W, W)) newW = max(int(weight * W), 1) im = Image.fromarray(im) im = im.resize((newW, newW)) im = ImageOps.expand(im, border=(W - newW) / 2) im = np.array(im) a, b = im.shape #print 'a,b:', a, b if a < W: im = np.append(im, np.zeros(b)[np.newaxis, :], axis=0) elif a > W: im = im[:W, :] assert im.shape[0] == W if b < W: #print 'current:', im.shape im = np.append(im, np.zeros(W)[:, np.newaxis], axis=1) elif b > W: im = im[:, :W] assert im.shape[1] == W return im.flatten() data = [(data_for_group(g), cnt) for g, cnt in groupsbysize(s)] largest = max(cnt for _, cnt in data) data = [scale(d, cnt / float(largest)) if scalebysize else d 
for d, cnt in data] digits_per_row = 12 rem = len(data) % digits_per_row if rem: fill = digits_per_row - rem for _ in xrange(fill): data.append(np.zeros(D)) assert not (len(data) % digits_per_row) #rows = len(data) / digits_per_row data = np.vstack([np.hstack([d.reshape((W, W)) for d in data[i:i + digits_per_row]]) for i in xrange(0, len(data), digits_per_row)]) #print 'saving figure', fname plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') plt.savefig(fname) plt.close() def plot_hyperparams(s, fname): hps = [s.get_feature_hp(i) for i in xrange(D)] alphas = np.array([hp['alpha'] for hp in hps]) betas = np.array([hp['beta'] for hp in hps]) data = np.hstack([alphas.reshape((W, W)), betas.reshape((W, W))]) plt.imshow(data, interpolation='nearest') plt.colorbar() plt.savefig(fname) plt.close() def kernel(rid): start0 = time.time() assign(bound_s, r) sec0 = time.time() - start0 start1 = time.time() hp(bound_s, r, hparams=hparams) sec1 = time.time() - start1 print 'rid=', rid, 'nclusters=', s.ngroups(), \ 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) print ' time_per_post_pred=', sec_per_post_pred, 'sec' return s.score_joint(r) # burnin burnin = 20 for rid in xrange(burnin): print 'score:', kernel(rid) print 'finished burnin' plot_clusters(s, 'mnist_clusters.pdf') plot_clusters(s, 'mnist_clusters_bysize.pdf', scalebysize=True) plot_hyperparams(s, 'mnist_hyperparams.pdf') print 'groupcounts:', groupcounts(s) # posterior predictions present = D / 2 absent = D - present queries = [tuple(Y_2[i]) for i in np.random.permutation(Y_2.shape[0])[:4]] + \ [tuple(Y_3[i]) for i in np.random.permutation(Y_3.shape[0])[:4]] queries_masked = ma.masked_array( np.array(queries, dtype=[('', bool)] * D), mask=[(False,) * present + (True,) * absent]) def postpred_sample(y_new): Y_samples = [s.sample_post_pred(y_new, r)[1] for _ in xrange(1000)] Y_samples = np.array([list(y) for y in np.hstack(Y_samples)]) Y_avg 
= Y_samples.mean(axis=0) return Y_avg queries_masked = [postpred_sample(y) for y in queries_masked] data0 = np.hstack([q.reshape((W, W)) for q in queries_masked]) data1 = np.hstack( [np.clip(np.array(q, dtype=np.float), 0., 1.).reshape((W, W)) for q in queries]) data = np.vstack([data0, data1]) plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') plt.savefig('mnist_predict.pdf') plt.close()
def test_mnist_supervised(): mnist_dataset = _get_mnist_dataset() classes = range(10) classmap = {c: i for i, c in enumerate(classes)} train_data, test_data = [], [] for c in classes: Y = mnist_dataset['data'][ np.where(mnist_dataset['target'] == float(c))[0]] Y_train, Y_test = train_test_split(Y, test_size=0.01) train_data.append(Y_train) test_data.append(Y_test) sample_size_max = 10000 def mk_class_data(c, Y): n, D = Y.shape print 'number of digit', c, 'in training is', n dtype = [('', bool)] * D + [('', int)] inds = np.random.permutation(Y.shape[0])[:sample_size_max] Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]], dtype=dtype) return Y Y_train = np.hstack([mk_class_data(c, y) for c, y in zip(classes, train_data)]) Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))] n, = Y_train.shape D = len(Y_train.dtype) print 'training data is', n, 'examples' print 'image dimension is', (D - 1), 'pixels' view = numpy_dataview(Y_train) defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))]) r = rng() s = initialize(defn, view, cluster_hp={'alpha': 0.2}, feature_hps=[{'alpha': 1., 'beta': 1.}] * (D - 1) + [{'alphas': [1. 
for _ in classes]}], r=r) bound_s = bind(s, view) indiv_prior_fn = log_exponential(1.2) hparams = { i: { 'alpha': (indiv_prior_fn, 1.5), 'beta': (indiv_prior_fn, 1.5), } for i in xrange(D - 1)} hparams[D - 1] = { 'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5) for idx in xrange(len(classes)) } def print_prediction_results(): results = [] for c, Y_test in zip(classes, test_data): for y in Y_test: query = ma.masked_array( np.array([tuple(y) + (0,)], dtype=[('', bool)] * (D - 1) + [('', int)]), mask=[(False,) * (D - 1) + (True,)])[0] samples = [ s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)] samples = np.bincount(samples, minlength=len(classes)) prediction = np.argmax(samples) results.append((classmap[c], prediction, samples)) print 'finished predictions for class', c Y_actual = np.array([a for a, _, _ in results], dtype=np.int) Y_pred = np.array([b for _, b, _ in results], dtype=np.int) print 'accuracy:', accuracy_score(Y_actual, Y_pred) print 'confusion matrix:' print confusion_matrix(Y_actual, Y_pred) # AUROC for one vs all (each class) for i, clabel in enumerate(classes): Y_true = np.copy(Y_actual) # treat class c as the "positive" example positive_examples = Y_actual == i negative_examples = Y_actual != i Y_true[positive_examples] = 1 Y_true[negative_examples] = 0 Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results]) cls_auc = roc_auc_score(Y_true, Y_prob) print 'class', clabel, 'auc=', cls_auc #import matplotlib.pylab as plt #Y_prob = np.array([c for _, _, c in results]) #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0) #plt.plot(fpr, tpr) #plt.show() def kernel(rid): start0 = time.time() assign(bound_s, r) sec0 = time.time() - start0 start1 = time.time() hp(bound_s, r, hparams=hparams) sec1 = time.time() - start1 print 'rid=', rid, 'nclusters=', s.ngroups(), \ 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) print ' time_per_post_pred=', sec_per_post_pred, 
'sec' # print group size breakdown sizes = [(gid, s.groupsize(gid)) for gid in s.groups()] sizes = sorted(sizes, key=lambda x: x[1], reverse=True) print ' group_sizes=', sizes print_prediction_results() # save state mkdirp("mnist-states") fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid)) with open(fname, "w") as fp: fp.write(s.serialize()) # training iters = 30 for rid in xrange(iters): kernel(rid)
def test_mnist(): import matplotlib.pylab as plt from PIL import Image, ImageOps mnist_dataset = _get_mnist_dataset() Y_2 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 2.)[0]] Y_3 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 3.)[0]] print 'number of twos:', Y_2.shape[0] print 'number of threes:', Y_3.shape[0] _, D = Y_2.shape W = int(math.sqrt(D)) assert W * W == D dtype = [('', bool)] * D Y = np.vstack([Y_2, Y_3]) Y = np.array( [tuple(y) for y in Y[np.random.permutation(np.arange(Y.shape[0]))]], dtype=dtype) view = numpy_dataview(Y) defn = model_definition(Y.shape[0], [bb] * D) r = rng() s = initialize(defn, view, cluster_hp={'alpha': 0.2}, feature_hps=[{ 'alpha': 1., 'beta': 1. }] * D, r=r) bound_s = bind(s, view) indiv_prior_fn = log_exponential(1.2) hparams = { i: { 'alpha': (indiv_prior_fn, 1.5), 'beta': (indiv_prior_fn, 1.5), } for i in xrange(D) } def plot_clusters(s, fname, scalebysize=False): hps = [s.get_feature_hp(i) for i in xrange(D)] def prior_prob(hp): return hp['alpha'] / (hp['alpha'] + hp['beta']) def data_for_group(gid): suffstats = [s.get_suffstats(gid, i) for i in xrange(D)] def prob(hp, ss): top = hp['alpha'] + ss['heads'] bot = top + hp['beta'] + ss['tails'] return top / bot probs = [prob(hp, ss) for hp, ss in zip(hps, suffstats)] return np.array(probs) def scale(d, weight): im = d.reshape((W, W)) newW = max(int(weight * W), 1) im = Image.fromarray(im) im = im.resize((newW, newW)) im = ImageOps.expand(im, border=(W - newW) / 2) im = np.array(im) a, b = im.shape #print 'a,b:', a, b if a < W: im = np.append(im, np.zeros(b)[np.newaxis, :], axis=0) elif a > W: im = im[:W, :] assert im.shape[0] == W if b < W: #print 'current:', im.shape im = np.append(im, np.zeros(W)[:, np.newaxis], axis=1) elif b > W: im = im[:, :W] assert im.shape[1] == W return im.flatten() data = [(data_for_group(g), cnt) for g, cnt in groupsbysize(s)] largest = max(cnt for _, cnt in data) data = [ scale(d, cnt / float(largest)) if scalebysize else 
d for d, cnt in data ] digits_per_row = 12 rem = len(data) % digits_per_row if rem: fill = digits_per_row - rem for _ in xrange(fill): data.append(np.zeros(D)) assert not (len(data) % digits_per_row) #rows = len(data) / digits_per_row data = np.vstack([ np.hstack([d.reshape((W, W)) for d in data[i:i + digits_per_row]]) for i in xrange(0, len(data), digits_per_row) ]) #print 'saving figure', fname plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') plt.savefig(fname) plt.close() def plot_hyperparams(s, fname): hps = [s.get_feature_hp(i) for i in xrange(D)] alphas = np.array([hp['alpha'] for hp in hps]) betas = np.array([hp['beta'] for hp in hps]) data = np.hstack([alphas.reshape((W, W)), betas.reshape((W, W))]) plt.imshow(data, interpolation='nearest') plt.colorbar() plt.savefig(fname) plt.close() def kernel(rid): start0 = time.time() assign(bound_s, r) sec0 = time.time() - start0 start1 = time.time() hp(bound_s, r, hparams=hparams) sec1 = time.time() - start1 print 'rid=', rid, 'nclusters=', s.ngroups(), \ 'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec' sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups()))) print ' time_per_post_pred=', sec_per_post_pred, 'sec' return s.score_joint(r) # burnin burnin = 20 for rid in xrange(burnin): print 'score:', kernel(rid) print 'finished burnin' plot_clusters(s, 'mnist_clusters.pdf') plot_clusters(s, 'mnist_clusters_bysize.pdf', scalebysize=True) plot_hyperparams(s, 'mnist_hyperparams.pdf') print 'groupcounts:', groupcounts(s) # posterior predictions present = D / 2 absent = D - present queries = [tuple(Y_2[i]) for i in np.random.permutation(Y_2.shape[0])[:4]] + \ [tuple(Y_3[i]) for i in np.random.permutation(Y_3.shape[0])[:4]] queries_masked = ma.masked_array(np.array(queries, dtype=[('', bool)] * D), mask=[(False, ) * present + (True, ) * absent]) def postpred_sample(y_new): Y_samples = [s.sample_post_pred(y_new, r)[1] for _ in xrange(1000)] Y_samples = np.array([list(y) for y in np.hstack(Y_samples)]) 
Y_avg = Y_samples.mean(axis=0) return Y_avg queries_masked = [postpred_sample(y) for y in queries_masked] data0 = np.hstack([q.reshape((W, W)) for q in queries_masked]) data1 = np.hstack([ np.clip(np.array(q, dtype=np.float), 0., 1.).reshape((W, W)) for q in queries ]) data = np.vstack([data0, data1]) plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest') plt.savefig('mnist_predict.pdf') plt.close()