def test_two_views_column_partition_normal__ci_(lovecat):
    D = retrieve_normal_dataset()

    engine = Engine(
        D.T,
        outputs=[5, 0, 1, 2, 3, 4],
        cctypes=['normal'] * len(D),
        rng=gu.gen_rng(12),
        num_states=64)

    if lovecat:
        engine.transition_lovecat(N=200)
    else:
        engine.transition(N=200)

    P = engine.dependence_probability_pairwise()
    R1 = engine.row_similarity_pairwise(cols=[5, 0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3, 4])

    pu.plot_clustermap(P)
    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)

    # Expected block structure of the dependence probability matrix: two
    # independent views of three columns each.
    P_THEORY = [
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    return engine
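# A possible follow-up check, not part of the original test: compare the
# inferred dependence probabilities against P_THEORY. The 0.25 tolerance
# is an assumption, and the row/column ordering of the matrix is assumed
# to follow the `outputs` list given to the Engine.
def check_column_partition(engine, P_THEORY, atol=.25):
    P = engine.dependence_probability_pairwise()
    assert np.allclose(np.asarray(P), np.asarray(P_THEORY), atol=atol)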
def state():
    # Create an engine.
    engine = Engine(
        DATA,
        cctypes=['normal', 'categorical'],
        distargs=[None, {'k': 6}],
        num_states=4,
        rng=gu.gen_rng(212))
    engine.transition(N=15)
    # Return the state with the highest marginal score.
    marginals = engine.logpdf_score()
    ranking = np.argsort(marginals)[::-1]
    return engine.get_state(ranking[0])
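# Example usage (an assumption, not from the original): render the
# highest-scoring state. State.plot() is invoked the same way elsewhere
# in these tests.
# s = state()
# s.plot()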
def test_logpdf_score_crash():
    rng = gen_rng(8)
    # T = rng.choice([0, 1], p=[.3, .7], size=250).reshape(-1, 1)
    T = rng.normal(size=30).reshape(-1, 1)
    engine = Engine(T, cctypes=['normal'], rng=rng, num_states=4)

    logpdf_likelihood_initial = np.array(engine.logpdf_likelihood())
    logpdf_score_initial = np.array(engine.logpdf_score())
    # The score includes the log prior of the latent structure, which is
    # nonpositive, so the score is strictly below the likelihood.
    assert np.all(logpdf_score_initial < logpdf_likelihood_initial)

    engine.transition(N=100)
    engine.transition(kernels=['column_hypers', 'view_alphas'], N=10)

    logpdf_likelihood_final = np.asarray(engine.logpdf_likelihood())
    logpdf_score_final = np.asarray(engine.logpdf_score())
    assert np.all(logpdf_score_final < logpdf_likelihood_final)
    # Inference should improve the best score across states.
    assert np.max(logpdf_score_initial) < np.max(logpdf_score_final)
def launch_analysis():
    engine = Engine(
        animals.values.astype(float),
        num_states=64,
        cctypes=['categorical'] * len(animals.values[0]),
        distargs=[{'k': 2}] * len(animals.values[0]),
        rng=gu.gen_rng(7))
    engine.transition(N=900)

    # Pickle files must be opened in binary mode.
    with open('resources/animals/animals.engine', 'wb') as f:
        engine.to_pickle(f)
    with open('resources/animals/animals.engine', 'rb') as f:
        engine = Engine.from_pickle(f)

    D = engine.dependence_probability_pairwise()
    pu.plot_clustermap(D)
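# A possible extension, not in the original: the same clustermap over row
# similarity. This assumes row_similarity_pairwise defaults to all columns
# when `cols` is omitted; it is only called with an explicit `cols`
# elsewhere in these tests.
# R = engine.row_similarity_pairwise()
# pu.plot_clustermap(R)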
def test_two_views_row_partition_bernoulli__ci_(lovecat):
    D = retrieve_bernoulli_dataset()

    if lovecat:
        engine = Engine(
            D.T,
            cctypes=['categorical'] * len(D),
            distargs=[{'k': 2}] * len(D),
            Zv={0: 0, 1: 0, 2: 1, 3: 1},
            rng=gu.gen_rng(12),
            num_states=64)
        engine.transition_lovecat(
            N=100,
            kernels=[
                'row_partition_assignments',
                'row_partition_hyperparameters',
                'column_hyperparameters',
            ])
    else:
        engine = Engine(
            D.T,
            cctypes=['bernoulli'] * len(D),
            Zv={0: 0, 1: 0, 2: 1, 3: 1},
            rng=gu.gen_rng(12),
            num_states=64)
        engine.transition(
            N=100,
            kernels=['view_alphas', 'rows', 'column_hypers'])

    R1 = engine.row_similarity_pairwise(cols=[0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3])

    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)

    return engine
def run_test(args):
    n_rows = args['num_rows']
    n_iters = args['num_iters']
    n_chains = args['num_chains']

    n_per_chain = int(float(n_rows) / n_chains)

    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 9))
    axes = axes.ravel()

    k = 0
    for shape in shapes:
        print('Shape: %s' % shape)
        T_o = np.asarray(gen_function[shape](n_rows))
        T_i = []

        engine = Engine(
            T_o.T, cctypes=cctypes, distargs=distargs, num_states=n_chains)
        engine.transition(N=n_iters)

        for chain in range(n_chains):
            state = engine.get_state(chain)
            print('chain %i of %i' % (chain + 1, n_chains))
            T_i.extend(state.simulate(-1, [0, 1], N=n_per_chain))

        T_i = np.array(T_i)

        ax = axes[k]
        ax.scatter(T_o[0], T_o[1], color='blue', edgecolor='none')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_title('%s original' % shape)
        # Remember the limits of the original panel so the simulated panel
        # below is drawn on the same scale.
        xlim, ylim = ax.get_xlim(), ax.get_ylim()

        ax = axes[k + 4]
        ax.scatter(T_i[:, 0], T_i[:, 1], color='red', edgecolor='none')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_title('%s simulated' % shape)

        k += 1

    print('Done.')
    return fig
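# Example invocation (the argument values are assumptions; the keys match
# those read at the top of run_test):
# fig = run_test({'num_rows': 1000, 'num_iters': 200, 'num_chains': 8})
# fig.savefig('shapes.png')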
def get_engine():
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    T, Zv, Zc = tu.gen_data_table(
        20, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(0))
    T = T.T
    # Make some nan cells for evidence.
    T[5, 0] = T[5, 1] = T[5, 2] = T[5, 3] = np.nan
    T[8, 4] = np.nan
    engine = Engine(
        T, cctypes=cctypes, distargs=distargs,
        num_states=6, rng=gu.gen_rng(0))
    engine.transition(N=2)
    return engine
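# Hedged usage sketch, not from the original: rows 5 and 8 have missing
# cells, so logpdf queries against those cells are legal even for observed
# rows (logpdf on a fully observed cell raises ValueError elsewhere in
# these tests). The query value 1.5 is arbitrary.
# engine = get_engine()
# logp = engine.logpdf(5, {0: 1.5})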
def generate_gpmcc_posteriors(cctype, distargs, D_train, iters, seconds):
    """Learns gpmcc on D_train for `seconds` and simulates NUM_TEST samples."""
    # Learning and posterior simulation.
    engine = Engine(
        D_train, cctypes=[cctype], distargs=[distargs],
        num_states=64, rng=gu.gen_rng(1))
    engine.transition(N=iters, S=seconds, progress=0)
    if iters:
        kernel = 'column_params' if cu.cctype_class(cctype).is_conditional()\
            else 'column_hypers'
        engine.transition(N=100, kernels=[kernel], progress=0)
    samples = engine.simulate(-1, [0], N=NUM_TEST)
    # Plot the 5 states with the highest scores, and return their samples.
    marginals = engine.logpdf_score()
    ranking = np.argsort(marginals)[::-1]
    for r in ranking[:5]:
        engine.get_state(r).plot()
    return [samples[i] for i in ranking[:5]]
def test_bernoulli():
    # Switch for multiprocess (0 is faster).
    multiprocess = 0

    # Create categorical data of DATA_NUM_0 zeros and DATA_NUM_1 ones.
    data = np.transpose(np.array([[0] * DATA_NUM_0 + [1] * DATA_NUM_1]))

    # Run a single chain for a few iterations.
    engine = Engine(
        data, cctypes=['categorical'], distargs=[{'k': 2}],
        rng=gu.gen_rng(0), multiprocess=multiprocess)
    engine.transition(NUM_ITER, multiprocess=multiprocess)

    # Simulate from the hypothetical row and compute the proportion of ones.
    sample = engine.simulate(-1, [0], N=NUM_SIM, multiprocess=multiprocess)[0]
    sum_b = sum(s[0] for s in sample)
    observed_prob_of_1 = float(sum_b) / float(NUM_SIM)
    true_prob_of_1 = float(DATA_NUM_1) / float(DATA_NUM_0 + DATA_NUM_1)
    # Check a 10% relative match.
    assert np.allclose(true_prob_of_1, observed_prob_of_1, rtol=.1)

    # Simulate from an observed row as a crash test.
    sample = engine.simulate(1, [0], N=1, multiprocess=multiprocess)

    # Ensure the unobserved-row probabilities are normalized.
    p0_uob = engine.logpdf(-1, {0: 0}, multiprocess=multiprocess)[0]
    p1_uob = engine.logpdf(-1, {0: 1}, multiprocess=multiprocess)[0]
    assert np.allclose(gu.logsumexp([p0_uob, p1_uob]), 0)

    # A logpdf query constraining an observed cell returns an error.
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 0}, multiprocess=multiprocess)
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 1}, multiprocess=multiprocess)
def test_entropy_bernoulli_univariate__ci_():
    rng = gen_rng(10)

    # Generate a univariate Bernoulli dataset.
    T = rng.choice([0, 1], p=[.3, .7], size=250).reshape(-1, 1)
    engine = Engine(T, cctypes=['bernoulli'], rng=rng, num_states=16)
    engine.transition(S=15)

    # Exact computation: H(X) = -(.3*log(.3) + .7*log(.7)) ~= 0.611 nats.
    entropy_exact = - (.3*np.log(.3) + .7*np.log(.7))

    # logpdf computation.
    logps = engine.logpdf_bulk([-1, -1], [{0: 0}, {0: 1}])
    entropy_logpdf = [-np.sum(np.exp(logp)*logp) for logp in logps]

    # mutual_information computation: H(X) = I(X;X).
    entropy_mi = engine.mutual_information([0], [0], N=1000)

    # Punt CLT analysis and go for 1 decimal place of agreement.
    assert np.allclose(entropy_exact, entropy_logpdf, atol=.1)
    assert np.allclose(entropy_exact, entropy_mi, atol=.1)
    assert np.allclose(entropy_logpdf, entropy_mi, atol=.05)
def test_two_views_row_partition_normal__ci_(lovecat):
    D = retrieve_normal_dataset()

    engine = Engine(
        D.T,
        cctypes=['normal'] * len(D),
        Zv={0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1},
        rng=gu.gen_rng(12),
        num_states=64)

    if lovecat:
        engine.transition_lovecat(
            N=100,
            kernels=[
                'row_partition_assignments',
                'row_partition_hyperparameters',
                'column_hyperparameters',
            ])
    else:
        engine.transition(
            N=100,
            kernels=['view_alphas', 'rows', 'column_hypers'])

    R1 = engine.row_similarity_pairwise(cols=[0, 1, 2])
    R2 = engine.row_similarity_pairwise(cols=[3, 4, 5])

    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)

    return engine
def test_two_views_column_partition_bernoulli__ci_(lovecat):
    D = retrieve_bernoulli_dataset()

    engine = Engine(
        D.T,
        cctypes=['categorical'] * len(D),
        distargs=[{'k': 2}] * len(D),
        rng=gu.gen_rng(12),
        num_states=64)

    if lovecat:
        engine.transition_lovecat(N=200)
    else:
        # Alternative construction using the native bernoulli cctype:
        # engine = Engine(
        #     D.T,
        #     cctypes=['bernoulli'] * len(D),
        #     rng=gu.gen_rng(12),
        #     num_states=64)
        engine.transition(N=200)

    P = engine.dependence_probability_pairwise()
    R1 = engine.row_similarity_pairwise(cols=[0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3])

    pu.plot_clustermap(P)
    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)

    # Expected block structure of the dependence probability matrix for this
    # four-column dataset: two independent views of two columns each.
    P_THEORY = [
        [1, 1, 0, 0],
        [1, 1, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 1, 1],
    ]

    return engine
def test_incorporate_engine():
    engine = Engine(
        T[:, :2],
        cctypes=CCTYPES[:2],
        distargs=DISTARGS[:2],
        num_states=4,
        rng=gu.gen_rng(0),
    )
    engine.transition(N=5)

    # Incorporate a new dim into the engine with a non-contiguous output.
    engine.incorporate_dim(
        T[:, 2],
        outputs=[10],
        cctype=CCTYPES[2],
        distargs=DISTARGS[2],
    )
    engine.transition(N=2)

    # Serialize the engine, and run a targeted transition on variable 10.
    m = engine.to_metadata()
    engine2 = Engine.from_metadata(m)
    engine2.transition(N=2, cols=[10], multiprocess=0)

    assert all(s.outputs == [0, 1, 10] for s in engine.states)
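# A natural companion check, assumed rather than taken from the original:
# the metadata round trip should also preserve the outputs on the
# deserialized engine.
# assert all(s.outputs == [0, 1, 10] for s in engine2.states)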
N_ROWS = 300
N_STATES = 12
N_ITERS = 100

cctypes = ['categorical(k={})'.format(N_ROWS)] + ['normal']*8
cctypes, distargs = cu.parse_distargs(cctypes)
column_names = ['id'] + ['one cluster']*4 + ['four cluster']*4

# id column.
X = np.zeros((N_ROWS, 9))
X[:, 0] = np.arange(N_ROWS)

# Four columns of one cluster from the standard normal.
X[:, 1:5] = np.random.randn(N_ROWS, 4)

# Four columns of four clusters with unit variance and means in
# {0, 4, 8, 12}, sharing one cluster label per row.
Z = np.random.randint(4, size=N_ROWS)
X[:, 5:] = 4*np.reshape(np.repeat(Z, 4), (len(Z), 4)) \
    + np.random.randn(N_ROWS, 4)

# Inference.
engine = Engine(X, cctypes=cctypes, distargs=distargs, num_states=N_STATES)
engine.transition(N=N_ITERS)

# Dependence probability.
D = engine.dependence_probability_pairwise()
zmat = sns.clustermap(D, yticklabels=column_names, xticklabels=column_names)
plt.setp(zmat.ax_heatmap.get_yticklabels(), rotation=0)
plt.setp(zmat.ax_heatmap.get_xticklabels(), rotation=90)
plt.show()
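# A hedged sanity check, not in the original script: the four 'four cluster'
# columns share the latent Z, so their pairwise dependence probabilities
# should be high. The 0.5 threshold and the positional indexing of D are
# assumptions.
# assert np.all(np.asarray(D)[5:9, 5:9] > .5)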