def test_incorporate_session():
    rng = gu.gen_rng(4)
    state = State(
        X, cctypes=['normal'] * 5, Zv={0: 0, 1: 0, 2: 1, 3: 1, 4: 2},
        rng=rng)
    # Incorporate row into a singleton cluster for all views.
    previous = [len(state.views[v].Nk()) for v in [0, 1, 2]]
    data = {i: rng.normal() for i in xrange(5)}
    clusters = {
        state.views[0].outputs[0]: previous[0],
        state.views[1].outputs[0]: previous[1],
        state.views[2].outputs[0]: previous[2],
    }
    state.incorporate(state.n_rows(), gu.merged(data, clusters))
    assert [len(state.views[v].Nk()) for v in [0, 1, 2]] == \
        [p+1 for p in previous]
    # Incorporate row without specifying clusters, and some missing values.
    data = {i: rng.normal() for i in xrange(2)}
    state.incorporate(state.n_rows(), data)
    state.transition(N=3)
    # Remove the incorporated rowid.
    state.unincorporate(state.n_rows() - 1)
    state.transition(N=3)
def test_categorical_forest_manual_inputs_errors():
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(1))
    state.transition(N=1, progress=False)
    cat_id = CCTYPES.index('categorical')
    # Put 1201 into the first view.
    view_idx = min(state.views)
    state.incorporate_dim(
        T[:,CCTYPES.index('categorical')], outputs=[1201],
        cctype='categorical', distargs=DISTARGS[cat_id], v=view_idx)
    # Updating cctype with completely invalid input should raise.
    with pytest.raises(Exception):
        distargs = DISTARGS[cat_id].copy()
        distargs['inputs'] = [10000]
        state.update_cctype(1201, 'random_forest', distargs=distargs)
    # Updating cctype with input dimensions outside the view should raise.
    cols_in_view = state.views[view_idx].dims.keys()
    cols_out_view = [c for c in state.outputs if c not in cols_in_view]
    assert len(cols_in_view) > 0 and len(cols_out_view) > 0
    with pytest.raises(Exception):
        distargs = DISTARGS[cat_id].copy()
        distargs['inputs'] = cols_out_view
        state.update_cctype(1201, 'random_forest', distargs=distargs)
    # Updating cctype with no input dimensions should raise.
    with pytest.raises(Exception):
        distargs = DISTARGS[cat_id].copy()
        distargs['inputs'] = []
        state.update_cctype(1201, 'random_forest', distargs=distargs)
def test_cmi_different_views__ci_():
    rng = gen_rng(0)
    T = np.zeros((50, 3))
    T[:,0] = rng.normal(loc=-5, scale=1, size=50)
    T[:,1] = rng.normal(loc=2, scale=2, size=50)
    T[:,2] = rng.normal(loc=12, scale=3, size=50)
    state = State(
        T,
        outputs=[0, 1, 2],
        cctypes=['normal', 'normal', 'normal'],
        Zv={0: 0, 1: 1, 2: 2},
        rng=rng,
    )
    state.transition(
        N=30,
        kernels=['alpha', 'view_alphas', 'column_params', 'column_hypers',
            'rows'])
    mi01 = state.mutual_information([0], [1])
    mi02 = state.mutual_information([0], [2])
    mi12 = state.mutual_information([1], [2])
    # Marginal MI is zero for every pair of variables.
    assert np.allclose(mi01, 0)
    assert np.allclose(mi02, 0)
    assert np.allclose(mi12, 0)
    # Conditioning on a variable in another view leaves the MI unchanged.
    assert np.allclose(state.mutual_information([0], [1], {2: 10}), mi01)
    assert np.allclose(state.mutual_information([0], [2], {1: 0}), mi02)
    assert np.allclose(state.mutual_information([1], [2], {0: -2}), mi12)
    assert np.allclose(
        state.mutual_information([1], [2], {0: None}, T=5), mi12)
def test_poisson_categorical():
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
    state.transition(N=1, progress=False)
    state.update_cctype(CCTYPES.index('categorical'), 'poisson')
    state.transition(N=1, progress=False)
    state.update_cctype(
        CCTYPES.index('categorical'), 'categorical', distargs={'k': 2})
def test_naive_bayes_independence():
    rng = gu.gen_rng(1)
    D = rng.normal(size=(10, 1))
    T = np.repeat(D, 10, axis=1)
    Ci = list(itertools.combinations(range(10), 2))
    state = State(T, cctypes=['normal'] * 10, Ci=Ci, rng=rng)
    state.transition(N=10, progress=0)
    vu.validate_crp_constrained_partition(state.Zv(), [], Ci, {}, {})
def test_complex_independent_relationships():
    rng = gu.gen_rng(1)
    D = rng.normal(size=(10, 1))
    T = np.repeat(D, 10, axis=1)
    Ci = [(2, 8), (0, 3)]
    state = State(T, cctypes=['normal'] * 10, Ci=Ci, rng=rng)
    state.transition(N=10, progress=0)
    vu.validate_crp_constrained_partition(state.Zv(), [], Ci, {}, {})
def test_geometric_exponential():
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
    state.transition(N=1, progress=False)
    state.update_cctype(CCTYPES.index('geometric'), 'exponential')
    state.transition(N=1, progress=False)
    # Incompatible numeric conversion.
    with pytest.raises(Exception):
        state.update_cctype(CCTYPES.index('exponential'), 'geometric')
def test_vonmises_normal():
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
    state.transition(N=1, progress=False)
    state.update_cctype(CCTYPES.index('vonmises'), 'normal')
    state.transition(N=1, progress=False)
    state.update_cctype(CCTYPES.index('vonmises'), 'vonmises')
    # Incompatible numeric conversion.
    with pytest.raises(Exception):
        state.update_cctype(CCTYPES.index('normal'), 'vonmises')
def generate_gaussian_samples():
    state = State(
        D, cctypes=['normal', 'normal'], Zv={0: 0, 1: 0}, rng=gu.gen_rng(0))
    view = state.view_for(1)
    state.transition(S=15, kernels=['rows', 'column_params', 'column_hypers'])
    samples = view.simulate(-1, [0, 1, view.outputs[0]], N=100)
    return [replace_key(s, view.outputs[0], -1) for s in samples]
def init_view_state(data, iters, cctypes):
    if isinstance(data, list):
        data = np.array(data)
    D = len(data[0])
    outputs = range(D)
    X = {c: data[:, i].tolist() for i, c in enumerate(outputs)}
    view = View(X, cctypes=cctypes, outputs=[1000] + outputs, rng=RNG)
    state = State(data[:, 0:D], outputs=outputs, cctypes=cctypes, rng=RNG)
    if iters > 0:
        view.transition(iters)
        state.transition(iters)
    return view, state
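# Illustrative usage of init_view_state (a sketch, not part of the original
# suite): the wrapper name `_example_init_view_state` and the literal data
# values below are arbitrary and chosen only for demonstration.
def _example_init_view_state():
    data = [[1.2, 0.5], [0.8, 1.1], [1.5, 0.2], [0.9, 0.7]]
    view, state = init_view_state(data, iters=2, cctypes=['normal', 'normal'])
    # Both representations should expose the same two columns.
    assert state.n_cols() == 2
    return view, state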
def state():
    rng = gu.gen_rng(5)
    rows = 120
    cctypes = ['normal', 'bernoulli', 'normal']
    G = generate_quadrants(rows, rng)
    B, Zv, Zrv = tu.gen_data_table(
        rows, [1], [[.5, .5]], ['bernoulli'], [None], [.95], rng=rng)
    T = np.column_stack((G, B.T))[:, [0, 2, 1]]
    state = State(T, outputs=[0, 1, 2], cctypes=cctypes, rng=rng)
    state.transition(N=20)
    return state
def test_simple_dependence_constraint(Ci):
    rng = gu.gen_rng(1)
    D = rng.normal(size=(10, 1))
    T = np.repeat(D, 10, axis=1)
    Cd = [(2, 0), (8, 3)]
    state = State(T, cctypes=['normal'] * 10, Ci=Ci, Cd=Cd, rng=rng)
    with pytest.raises(ValueError):
        # Cannot transition columns with dependencies.
        state.transition(N=10, kernels=['columns'], progress=0)
    state.transition(
        N=10,
        kernels=['rows', 'alpha', 'column_hypers', 'alpha', 'view_alphas'],
        progress=False)
    vu.validate_crp_constrained_partition(state.Zv(), Cd, Ci, {}, {})
def generate_regression_samples():
    state = State(
        D, cctypes=['normal', 'normal'], Zv={0: 0, 1: 0}, rng=gu.gen_rng(4))
    view = state.view_for(1)
    assert not state._composite
    state.update_cctype(1, 'linear_regression')
    assert state._composite
    state.transition(S=30, kernels=['rows', 'column_params', 'column_hypers'])
    samples = view.simulate(-1, [0, 1, view.outputs[0]], N=100)
    return [replace_key(s, view.outputs[0], -1) for s in samples]
def test_linreg_missing_data_ignore():
    dataset = [
        [1, 3, 1],
        [2, 4, 1.5],
        [float('nan'), 5, 1],
    ]
    state = State(
        dataset, cctypes=['normal'] * 3, Zv={0: 0, 1: 0, 2: 0},
        rng=gu.gen_rng(1))
    # Make sure that a missing covariate is handled as a missing cell.
    state.update_cctype(2, 'linear_regression', distargs={'inputs': [0, 1]})
    assert state.dim_for(2).inputs[1:] == [0, 1]
    state.transition(N=5, kernels=['rows', 'column_hypers', 'view_alphas'])
    state.update_cctype(2, 'normal', distargs={'inputs': [0, 1]})
    # Make sure that specified inputs are set correctly.
    state.update_cctype(2, 'linear_regression', distargs={'inputs': [1]})
    assert state.dim_for(2).inputs[1:] == [1]
def state():
    cctypes, distargs = cu.parse_distargs(
        ['categorical(k=5)', 'normal', 'poisson', 'bernoulli'])
    T, Zv, Zc = tu.gen_data_table(
        50, [1], [[.33, .33, .34]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(0))
    s = State(
        T.T, cctypes=cctypes, distargs=distargs,
        Zv={i: 0 for i in xrange(len(cctypes))}, rng=gu.gen_rng(0))
    s.update_cctype(0, 'random_forest', distargs={'k': 5})
    # XXX Uncomment me for a bug!
    # state.update_cctype(1, 'linear_regression')
    kernels = [
        'rows',
        'view_alphas',
        'alpha',
        'column_params',
        'column_hypers',
    ]
    s.transition(N=1, kernels=kernels)
    return s
def test_categorical_forest():
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(1))
    state.transition(N=1, progress=False)
    cat_id = CCTYPES.index('categorical')
    distargs = DISTARGS[cat_id].copy()
    # If cat_id is a singleton, migrate it into view 0 first.
    if len(state.view_for(cat_id).dims) == 1:
        state.unincorporate_dim(cat_id)
        state.incorporate_dim(
            T[:,cat_id], outputs=[cat_id], cctype='categorical',
            distargs=distargs, v=0)
    state.update_cctype(cat_id, 'random_forest', distargs=distargs)
    bernoulli_id = CCTYPES.index('bernoulli')
    state.incorporate_dim(
        T[:,bernoulli_id], outputs=[191], cctype='bernoulli',
        v=state.Zv(cat_id))
    state.update_cctype(191, 'random_forest', distargs={'k': 2})
    # Run valid transitions.
    state.transition(
        N=2, kernels=['rows', 'column_params', 'column_hypers'],
        views=[state.Zv(cat_id)], progress=False)
    # Running a column transition should raise.
    with pytest.raises(ValueError):
        state.transition(N=1, kernels=['columns'], progress=False)
    # Updating cctype in a singleton view should raise.
    distargs = DISTARGS[cat_id].copy()
    state.incorporate_dim(
        T[:,CCTYPES.index('categorical')], outputs=[98],
        cctype='categorical', distargs=distargs, v=max(state.views)+1)
    with pytest.raises(Exception):
        state.update_cctype(98, 'random_forest', distargs=distargs)
def test_incorporate_state():
    state = State(
        T[:,:2], cctypes=CCTYPES[:2], distargs=DISTARGS[:2],
        rng=gu.gen_rng(0))
    state.transition(N=5)
    target = state.views.keys()[0]
    # Incorporate a new dim into view[0].
    state.incorporate_dim(
        T[:,2], outputs=[2], cctype=CCTYPES[2], distargs=DISTARGS[2],
        v=target)
    assert state.Zv(2) == target
    state.transition(N=1)
    # Incorporate a new dim into view[0] with a non-contiguous output.
    state.incorporate_dim(
        T[:,2], outputs=[10], cctype=CCTYPES[2], distargs=DISTARGS[2],
        v=target)
    assert state.Zv(10) == target
    state.transition(N=1)
    # Some crash testing queries.
    state.logpdf(-1, {10: 1}, constraints={0: 2, 1: 1})
    state.simulate(-1, [10], constraints={0: 2})
    # Incorporating with a duplicated output should raise.
    with pytest.raises(ValueError):
        state.incorporate_dim(
            T[:,2], outputs=[10], cctype=CCTYPES[2], distargs=DISTARGS[2],
            v=target)
    # Multivariate incorporate should raise.
    with pytest.raises(ValueError):
        state.incorporate_dim(
            T[:,2], outputs=[10, 2], cctype=CCTYPES[2],
            distargs=DISTARGS[2], v=target)
    # Missing output should raise.
    with pytest.raises(ValueError):
        state.incorporate_dim(
            T[:,2], outputs=[], cctype=CCTYPES[2], distargs=DISTARGS[2],
            v=target)
    # Wrong number of rows should raise.
    with pytest.raises(ValueError):
        state.incorporate_dim(
            T[:,2][:-1], outputs=[11], cctype=CCTYPES[2],
            distargs=DISTARGS[2], v=target)
    # Inputs should raise.
    with pytest.raises(ValueError):
        state.incorporate_dim(
            T[:,2], outputs=[11], inputs=[2], cctype=CCTYPES[2],
            distargs=DISTARGS[2], v=target)
    # Incorporate dim into a newly created singleton view.
    target = max(state.views)+1
    state.incorporate_dim(
        T[:,3], outputs=[3], cctype=CCTYPES[3], distargs=DISTARGS[3],
        v=target)
    assert state.Zv(3) == target
    state.transition(N=1)
    # Incorporate dim without specifying a view.
    state.incorporate_dim(
        T[:,4], outputs=[4], cctype=CCTYPES[4], distargs=DISTARGS[4])
    state.transition(N=1)
    # Unincorporate the first dim.
    previous = state.n_cols()
    state.unincorporate_dim(0)
    assert state.n_cols() == previous-1
    state.transition(N=1)
    # Reincorporate dim without specifying a view.
    state.incorporate_dim(
        T[:,0], outputs=[0], cctype=CCTYPES[0], distargs=DISTARGS[0])
    state.transition(N=1)
    # Incorporate dim into a singleton view, remove it, assert destroyed.
    target = max(state.views)+1
    state.incorporate_dim(
        T[:,5], outputs=[5], cctype=CCTYPES[5], distargs=DISTARGS[5],
        v=target)
    previous = len(state.views)
    state.unincorporate_dim(5)
    assert len(state.views) == previous-1
    state.transition(N=1)
    # Reincorporate dim into a singleton view.
    target = max(state.views)+1
    state.incorporate_dim(
        T[:,5], outputs=[5], cctype=CCTYPES[5], distargs=DISTARGS[5],
        v=target)
    state.transition(N=1)
    # Incorporate the rest of the dims in the default way.
    for i in xrange(6, len(CCTYPES)):
        state.incorporate_dim(
            T[:,i], outputs=[max(state.outputs)+1], cctype=CCTYPES[i],
            distargs=DISTARGS[i])
    state.transition(N=1)
    # Unincorporating a non-existent dim should raise.
    with pytest.raises(ValueError):
        state.unincorporate_dim(9999)
    # Unincorporate all the dims, except the last one.
    for o in state.outputs[:-1]:
        state.unincorporate_dim(o)
    assert state.n_cols() == 1
    state.transition(N=1)
    # Unincorporating the last dim should raise.
    with pytest.raises(ValueError):
        state.unincorporate_dim(state.outputs[0])
from cgpm.crosscat.state import State
from cgpm.utils import config as cu
from cgpm.utils import general as gu
from cgpm.utils import test as tu

# Set up the data generation.
cctypes, distargs = cu.parse_distargs([
    'normal',
    'poisson',
    'bernoulli',
    'categorical(k=4)',
    'lognormal',
    'exponential',
    'beta',
    'geometric',
    'vonmises',
])

T, Zv, Zc = tu.gen_data_table(
    200, [1], [[.25, .25, .5]], cctypes, distargs,
    [.95] * len(cctypes), rng=gu.gen_rng(10))

state = State(T.T, cctypes=cctypes, distargs=distargs, rng=gu.gen_rng(312))
state.transition(N=10, progress=1)


def test_crash_simulate_joint(state):
    state.simulate(-1, [0, 1, 2, 3, 4, 5, 6, 7, 8], N=10)


def test_crash_logpdf_joint(state):
    state.logpdf(-1, {
        0: 1,
        1: 2,
        2: 1,
        3: 3,
        4: 1,
        5: 10,
        6: .4,
def test_serialize_composite_cgpm():
    rng = gu.gen_rng(2)

    # Generate the data.
    cctypes, distargs = cu.parse_distargs([
        'categorical(k=3)',     # RandomForest        0
        'normal',               # LinearRegression    1
        'categorical(k=3)',     # GPMCC               2
        'poisson',              # GPMCC               3
        'normal',               # GPMCC               4
        'lognormal'             # GPMCC               5
    ])
    T, Zv, Zc = tu.gen_data_table(
        35, [.4, .6], [[.33, .33, .34], [.5, .5]], cctypes, distargs,
        [.2] * len(cctypes), rng=rng)
    D = np.transpose(T)

    # Create GPMCC.
    state = State(
        D[:,2:], outputs=[2, 3, 4, 5], cctypes=cctypes[2:],
        distargs=distargs[2:], rng=rng)

    # Create a Forest.
    forest = RandomForest(
        outputs=[0],
        inputs=[1, 2, 3, 4],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [1, 2, 3, 4]],
                'statargs': [distargs[i] for i in [1, 2, 3, 4]]},
            'k': distargs[0]['k']},
        rng=rng)

    # Create a Regression.
    linreg = LinearRegression(
        outputs=[1],
        inputs=[3, 4, 5],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [3, 4, 5]],
                'statargs': [distargs[i] for i in [3, 4, 5]]}},
        rng=rng)

    # Incorporate the data.
    def incorporate_data(cgpm, rowid, row):
        cgpm.incorporate(
            rowid,
            {i: row[i] for i in cgpm.outputs},
            {i: row[i] for i in cgpm.inputs},
        )
    for rowid, row in enumerate(D):
        incorporate_data(forest, rowid, row)
        incorporate_data(linreg, rowid, row)

    # Run state transitions.
    state.transition(N=10, progress=False)

    # Compose the CGPMs, instructing State to run the foreign transitions.
    token_forest = state.compose_cgpm(forest)
    token_linreg = state.compose_cgpm(linreg)
    state.transition_foreign(N=10, cols=[forest.outputs[0], linreg.outputs[0]])

    # Now run the serialization.
    metadata = state.to_metadata()
    state2 = State.from_metadata(metadata)

    # Check that the tokens are in state2.
    assert token_forest in state2.hooked_cgpms
    assert token_linreg in state2.hooked_cgpms

    # The hooked cgpms must be distinct objects after serialize/deserialize.
    assert state.hooked_cgpms[token_forest] != state2.hooked_cgpms[token_forest]
    assert state.hooked_cgpms[token_linreg] != state2.hooked_cgpms[token_linreg]

    # Check that the log scores of the hooked cgpms agree.
    assert np.allclose(
        state.hooked_cgpms[token_forest].logpdf_score(),
        state2.hooked_cgpms[token_forest].logpdf_score())
    assert np.allclose(
        state.hooked_cgpms[token_linreg].logpdf_score(),
        state2.hooked_cgpms[token_linreg].logpdf_score())

    # Now run some tests for the engine.
    e = Engine(
        D[:,2:], outputs=[2, 3, 4, 5], cctypes=cctypes[2:],
        distargs=distargs[2:], num_states=2, rng=rng)
    e.compose_cgpm([forest, forest], multiprocess=1)
    e.compose_cgpm([linreg, linreg], multiprocess=1)
    e.transition_foreign(N=1, cols=[forest.outputs[0], linreg.outputs[0]])
    e.dependence_probability(0, 1)
    e.simulate(-1, [0, 1], {2: 1}, multiprocess=0)
    e.logpdf(-1, {1: 1}, {2: 1, 0: 0}, multiprocess=0)
    state3 = e.get_state(0)

    # There is no guarantee that the logpdf score improves with inference,
    # but it should not drop by more than a few nats.
    def check_logpdf_delta(before, after):
        return before < after or (before - after) < 5
    assert check_logpdf_delta(
        before=state.hooked_cgpms[token_forest].logpdf_score(),
        after=state3.hooked_cgpms[token_forest].logpdf_score())
    assert check_logpdf_delta(
        before=state.hooked_cgpms[token_linreg].logpdf_score(),
        after=state3.hooked_cgpms[token_linreg].logpdf_score())
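# A minimal sketch (not part of the original suite) isolating the bare
# to_metadata / from_metadata round trip exercised above, without any foreign
# CGPMs hooked in.  The helper name `_sketch_serialize_plain_state` and the
# toy data are assumptions for illustration only.
def _sketch_serialize_plain_state():
    rng = gu.gen_rng(0)
    X = rng.normal(size=(20, 2))
    s = State(X, cctypes=['normal', 'normal'], rng=rng)
    s.transition(N=2, progress=False)
    s2 = State.from_metadata(s.to_metadata())
    # The reconstructed state should report the same marginal likelihood.
    assert np.allclose(s.logpdf_score(), s2.logpdf_score())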