Пример #1
0
def test_incorporate_session():
    """Incorporate and unincorporate rows in a five-column, three-view State.

    A fully observed row is first added with an explicit cluster index for
    each view, then a partially observed row is added without any cluster
    assignment, and finally the row with the highest rowid is removed.
    Inference is run after the last two steps as a crash test.
    """
    rng = gu.gen_rng(4)
    view_ids = [0, 1, 2]
    column_partition = {0: 0, 1: 0, 2: 1, 3: 1, 4: 2}
    state = State(X, cctypes=['normal'] * 5, Zv=column_partition, rng=rng)

    def cluster_counts():
        # Length of Nk() for every view, in view order.
        return [len(state.views[v].Nk()) for v in view_ids]

    # Assigning the row to index len(Nk()) in each view is expected to open a
    # new singleton cluster there, so every count should grow by one.
    counts_before = cluster_counts()
    full_row = {col: rng.normal() for col in xrange(5)}
    assignments = {}
    for v, count in zip(view_ids, counts_before):
        assignments[state.views[v].outputs[0]] = count
    state.incorporate(state.n_rows(), gu.merged(full_row, assignments))
    assert cluster_counts() == [count + 1 for count in counts_before]

    # Row with only columns 0 and 1 observed and no cluster assignments.
    partial_row = {col: rng.normal() for col in xrange(2)}
    state.incorporate(state.n_rows(), partial_row)
    state.transition(N=3)

    # Remove the row with the highest rowid and run inference again.
    last_rowid = state.n_rows() - 1
    state.unincorporate(last_rowid)
    state.transition(N=3)
Пример #2
0
def test_categorical_forest_manual_inputs_errors():
    """Check that update_cctype rejects bad 'inputs' for a random_forest.

    A copy of the categorical column is incorporated as output 1201 into the
    lowest-numbered view, and converting it to 'random_forest' is then
    attempted with three kinds of invalid input lists: an unknown column,
    columns from outside the view, and no columns at all.

    The distargs for each attempt are prepared outside the pytest.raises
    block so that only the update_cctype call itself can satisfy the
    expectation; an error in the setup fails the test instead of passing it.
    """
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(1))
    state.transition(N=1, progress=False)
    cat_id = CCTYPES.index('categorical')

    # Put 1201 into the first view.
    view_idx = min(state.views)
    state.incorporate_dim(
        T[:,cat_id], outputs=[1201],
        cctype='categorical', distargs=DISTARGS[cat_id], v=view_idx)

    def forest_distargs(inputs):
        # Work on a copy so the module-level DISTARGS entry is not mutated.
        distargs = DISTARGS[cat_id].copy()
        distargs['inputs'] = inputs
        return distargs

    # Updating cctype with completely invalid input should raise.
    distargs = forest_distargs([10000])
    with pytest.raises(Exception):
        state.update_cctype(1201, 'random_forest', distargs=distargs)

    # Updating cctype with input dimensions outside the view should raise.
    cols_in_view = state.views[view_idx].dims.keys()
    cols_out_view = [c for c in state.outputs if c not in cols_in_view]
    assert len(cols_in_view) > 0 and len(cols_out_view) > 0
    distargs = forest_distargs(cols_out_view)
    with pytest.raises(Exception):
        state.update_cctype(1201, 'random_forest', distargs=distargs)

    # Updating cctype with no input dimensions should raise.
    distargs = forest_distargs([])
    with pytest.raises(Exception):
        state.update_cctype(1201, 'random_forest', distargs=distargs)
Пример #3
0
def test_cmi_different_views__ci_():
    """Mutual information between columns pinned to separate views.

    Three normal columns are each placed in their own view. The test expects
    every pairwise mutual information to be (numerically) zero, and expects
    conditioning on the third column, which lives in yet another view, to
    leave each value unchanged.
    """
    rng = gen_rng(0)
    data = np.zeros((50,3))
    # Columns are drawn in index order so the stream of random numbers
    # consumed from rng is the same on every run.
    for col, (loc, scale) in enumerate([(-5, 1), (2, 2), (12, 3)]):
        data[:,col] = rng.normal(loc=loc, scale=scale, size=50)

    state = State(
        data,
        outputs=[0, 1, 2],
        cctypes=['normal'] * 3,
        Zv={0:0, 1:1, 2:2},
        rng=rng
    )
    # The 'columns' kernel is left out of this list.
    kernels = ['alpha','view_alphas','column_params','column_hypers','rows']
    state.transition(N=30, kernels=kernels)

    pairs = [(0, 1), (0, 2), (1, 2)]
    marginals = [state.mutual_information([a], [b]) for a, b in pairs]

    # Marginal MI all zero.
    for value in marginals:
        assert np.allclose(value, 0)

    # CMI on variable in other view equal to MI.
    mi01, mi02, mi12 = marginals
    cmi01 = state.mutual_information([0], [1], {2:10})
    assert np.allclose(cmi01, mi01)
    cmi02 = state.mutual_information([0], [2], {1:0})
    assert np.allclose(cmi02, mi02)
    cmi12 = state.mutual_information([1], [2], {0:-2})
    assert np.allclose(cmi12, mi12)
    cmi12_unset = state.mutual_information([1], [2], {0:None}, T=5)
    assert np.allclose(cmi12_unset, mi12)
Пример #4
0
def test_poisson_categorical():
    """Convert the categorical column to poisson and back again.

    Crash test: one inference sweep is run before and after the first
    conversion, and the column is finally restored to 'categorical' with
    k=2.
    """
    one_sweep = dict(N=1, progress=False)
    state = State(T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
    state.transition(**one_sweep)

    column = CCTYPES.index('categorical')
    state.update_cctype(column, 'poisson')
    state.transition(**one_sweep)
    state.update_cctype(column, 'categorical', distargs={'k':2})
Пример #5
0
def test_naive_bayes_independence():
    """Constrain every pair of ten identical columns to be independent.

    After ten transitions the column partition returned by Zv() is validated
    against the full list of pairwise independence constraints.
    """
    rng = gu.gen_rng(1)
    base_column = rng.normal(size=(10, 1))
    # Ten side-by-side copies of the same column.
    table = np.repeat(base_column, 10, axis=1)
    all_pairs = list(itertools.combinations(range(10), 2))

    state = State(table, cctypes=['normal'] * 10, Ci=all_pairs, rng=rng)
    state.transition(N=10, progress=0)

    partition = state.Zv()
    vu.validate_crp_constrained_partition(partition, [], all_pairs, {}, {})
Пример #6
0
def test_complex_independent_relationships():
    """Constrain two column pairs of ten identical columns to be independent.

    After ten transitions the column partition returned by Zv() is validated
    against the independence constraints (2, 8) and (0, 3).
    """
    rng = gu.gen_rng(1)
    base_column = rng.normal(size=(10, 1))
    # Ten side-by-side copies of the same column.
    table = np.repeat(base_column, 10, axis=1)
    independent_pairs = [(2, 8), (0, 3)]

    state = State(
        table, cctypes=['normal'] * 10, Ci=independent_pairs, rng=rng)
    state.transition(N=10, progress=0)

    partition = state.Zv()
    vu.validate_crp_constrained_partition(
        partition, [], independent_pairs, {}, {})
Пример #7
0
def test_geometric_exponential():
    """Convert the geometric column to exponential; the reverse must fail.

    One inference sweep is run before and after the conversion. Converting
    the exponential column to 'geometric' is expected to raise.
    """
    one_sweep = dict(N=1, progress=False)
    state = State(T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
    state.transition(**one_sweep)

    geometric_col = CCTYPES.index('geometric')
    state.update_cctype(geometric_col, 'exponential')
    state.transition(**one_sweep)

    # Incompatible numeric conversion.
    with pytest.raises(Exception):
        state.update_cctype(CCTYPES.index('exponential'), 'geometric')
Пример #8
0
def test_vonmises_normal():
    """Convert the vonmises column to normal and back; normal must not convert.

    One inference sweep is run before and after the first conversion.
    Converting the normal column to 'vonmises' is expected to raise.
    """
    one_sweep = dict(N=1, progress=False)
    state = State(T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
    state.transition(**one_sweep)

    vonmises_col = CCTYPES.index('vonmises')
    state.update_cctype(vonmises_col, 'normal')
    state.transition(**one_sweep)
    state.update_cctype(vonmises_col, 'vonmises')

    # Incompatible numeric conversion.
    with pytest.raises(Exception):
        state.update_cctype(CCTYPES.index('normal'), 'vonmises')
Пример #9
0
def generate_gaussian_samples():
    """Draw 100 joint samples from a two-column normal State fit to D.

    Both columns are placed in the same view. Each sample covers columns 0
    and 1 plus the view's first output; that last key is renamed to -1 with
    replace_key before the samples are returned as a list.
    """
    model = State(
        D, cctypes=['normal'] * 2, Zv={0: 0, 1: 0}, rng=gu.gen_rng(0))
    # Grab the view before running inference, as the sampling below goes
    # through this view object rather than through the State.
    view = model.view_for(1)
    model.transition(S=15, kernels=['rows', 'column_params', 'column_hypers'])

    first_output = view.outputs[0]
    draws = view.simulate(-1, [0, 1, first_output], N=100)
    return [replace_key(draw, first_output, -1) for draw in draws]
Пример #10
0
def init_view_state(data, iters, cctypes):
    """Build a View and a State over the same data table.

    Parameters
    ----------
    data : list of rows, or a 2-D array
        A list is converted with np.array before use.
    iters : number
        Passed to transition() on both objects; inference is skipped unless
        it is greater than zero.
    cctypes : list
        Forwarded unchanged to both constructors.

    Returns
    -------
    (view, state) tuple.
    """
    if isinstance(data, list):
        data = np.array(data)
    n_cols = len(data[0])
    outputs = range(n_cols)
    # Outputs are 0..n_cols-1, so each output doubles as its column index.
    columns = dict((c, data[:, c].tolist()) for c in outputs)
    # The View gets one extra leading output, 1000, ahead of the columns.
    view = View(columns, cctypes=cctypes, outputs=[1000] + outputs, rng=RNG)
    state = State(
        data[:, 0:n_cols], outputs=outputs, cctypes=cctypes, rng=RNG)
    if not iters > 0:
        return view, state
    view.transition(iters)
    state.transition(iters)
    return view, state
Пример #11
0
def state():
    """Return a State fit to quadrant data plus one bernoulli column.

    The table stacks the output of generate_quadrants with a generated
    bernoulli column and reorders the columns to [0, 2, 1], matching the
    stattypes normal, bernoulli, normal. Twenty transitions are run before
    the State is returned.
    """
    rng = gu.gen_rng(5)
    n_rows = 120
    quadrants = generate_quadrants(n_rows, rng)
    # Only the data table is needed; the two other returned values are unused.
    bernoulli, _, _ = tu.gen_data_table(
        n_rows, [1], [[.5, .5]], ['bernoulli'], [None], [.95], rng=rng)
    stacked = np.column_stack((quadrants, bernoulli.T))
    table = stacked[:, [0, 2, 1]]

    model = State(
        table, outputs=[0, 1, 2],
        cctypes=['normal', 'bernoulli', 'normal'], rng=rng)
    model.transition(N=20)
    return model
Пример #12
0
def test_simple_dependence_constraint(Ci):
    """Check dependence constraints together with the supplied Ci.

    The State is built over ten identical normal columns with dependence
    constraints (2, 0) and (8, 3). Running the 'columns' kernel must raise
    ValueError; the remaining kernels run normally and the resulting column
    partition is validated against both constraint lists.
    """
    rng = gu.gen_rng(1)
    base_column = rng.normal(size=(10, 1))
    table = np.repeat(base_column, 10, axis=1)
    Cd = [(2, 0), (8, 3)]
    state = State(table, cctypes=['normal'] * 10, Ci=Ci, Cd=Cd, rng=rng)

    # Cannot transition columns with dependencies.
    with pytest.raises(ValueError):
        state.transition(N=10, kernels=['columns'], progress=0)

    other_kernels = ['rows', 'alpha', 'column_hypers', 'alpha', 'view_alphas']
    state.transition(N=10, kernels=other_kernels, progress=False)

    partition = state.Zv()
    vu.validate_crp_constrained_partition(partition, Cd, Ci, {}, {})
Пример #13
0
def generate_regression_samples():
    """Draw 100 joint samples after turning column 1 into a regression.

    A two-column normal State is built over D with both columns in one view.
    Column 1 is converted to 'linear_regression', which is expected to flip
    the State's _composite flag from falsy to truthy. Each sample covers
    columns 0 and 1 plus the view's first output; that last key is renamed
    to -1 with replace_key before the samples are returned as a list.
    """
    model = State(
        D, cctypes=['normal'] * 2, Zv={0: 0, 1: 0}, rng=gu.gen_rng(4))
    # Grab the view before the stattype change and inference, as the
    # sampling below goes through this view object.
    view = model.view_for(1)

    assert not model._composite
    model.update_cctype(1, 'linear_regression')
    assert model._composite

    model.transition(S=30, kernels=['rows', 'column_params', 'column_hypers'])

    first_output = view.outputs[0]
    draws = view.simulate(-1, [0, 1, first_output], N=100)
    return [replace_key(draw, first_output, -1) for draw in draws]
Пример #14
0
def test_linreg_missing_data_ignore():
    """Linear regression on column 2 with a NaN in one of its covariates.

    The inputs requested through distargs must show up, after the first
    entry, in the inputs of the dimension for column 2. This is checked both
    for inputs [0, 1] and, after a round trip through 'normal', for [1].
    """
    rows = [
        [1, 3, 1],
        [2, 4, 1.5],
        [float('nan'), 5, 1],
    ]
    state = State(
        rows, cctypes=['normal'] * 3, Zv={0:0, 1:0, 2:0}, rng=gu.gen_rng(1))

    # Make sure that missing covariates are handled as missing cells.
    state.update_cctype(2, 'linear_regression', distargs={'inputs': [0,1]})
    assert state.dim_for(2).inputs[1:] == [0,1]
    state.transition(N=5, kernels=['rows', 'column_hypers', 'view_alphas'])

    # Back to a plain normal column before requesting a new input list.
    state.update_cctype(2, 'normal', distargs={'inputs': [0,1]})

    # Make sure that specified inputs are set correctly.
    state.update_cctype(2, 'linear_regression', distargs={'inputs': [1]})
    assert state.dim_for(2).inputs[1:] == [1]
Пример #15
0
def state():
    """Return a single-view State whose column 0 is a random forest.

    Fifty rows of categorical(k=5), normal, poisson and bernoulli data are
    generated, all four columns are placed in view 0, column 0 is converted
    to 'random_forest', and one sweep of the non-column kernels is run.
    """
    cctypes, distargs = cu.parse_distargs(
        ['categorical(k=5)', 'normal', 'poisson', 'bernoulli'])
    n_cols = len(cctypes)
    # Only the data table is needed; the two other returned values are unused.
    T, _, _ = tu.gen_data_table(
        50, [1], [[.33, .33, .34]], cctypes, distargs, [.95] * n_cols,
        rng=gu.gen_rng(0))

    single_view = dict((i, 0) for i in xrange(n_cols))
    s = State(
        T.T, cctypes=cctypes, distargs=distargs, Zv=single_view,
        rng=gu.gen_rng(0))
    s.update_cctype(0, 'random_forest', distargs={'k': 5})
    # XXX Known bug: also converting column 1 with
    # update_cctype(1, 'linear_regression') at this point triggers a failure,
    # so that conversion is deliberately left out.
    s.transition(
        N=1,
        kernels=[
            'rows', 'view_alphas', 'alpha', 'column_params', 'column_hypers'
        ])
    return s
Пример #16
0
def test_categorical_forest():
    """Exercise random_forest conversion of categorical and bernoulli columns.

    The categorical column is converted to 'random_forest', moving it into
    view 0 first if it sits alone in its view. A bernoulli copy is then added
    to the same view as output 191 and converted as well. Afterwards:

    - row and column-parameter kernels restricted to that view must run;
    - the 'columns' kernel must raise ValueError;
    - converting a categorical column that is alone in a brand-new view
      (output 98) must raise.
    """
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(1))
    state.transition(N=1, progress=False)
    cat_id = CCTYPES.index('categorical')

    # Bind distargs unconditionally. It is used by update_cctype below whether
    # or not the migration branch runs; binding it only inside the branch
    # raised NameError whenever the column was not in a singleton view.
    distargs = DISTARGS[cat_id].copy()

    # If cat_id is singleton migrate first.
    if len(state.view_for(cat_id).dims) == 1:
        state.unincorporate_dim(cat_id)
        state.incorporate_dim(
            T[:,cat_id], outputs=[cat_id], cctype='categorical',
            distargs=distargs, v=0)
    state.update_cctype(cat_id, 'random_forest', distargs=distargs)

    bernoulli_id = CCTYPES.index('bernoulli')
    state.incorporate_dim(
        T[:,bernoulli_id], outputs=[191], cctype='bernoulli',
        v=state.Zv(cat_id))
    state.update_cctype(191, 'random_forest', distargs={'k':2})

    # Run valid transitions.
    state.transition(
        N=2, kernels=['rows','column_params','column_hypers'],
        views=[state.Zv(cat_id)], progress=False)

    # Running column transition should raise.
    with pytest.raises(ValueError):
        state.transition(N=1, kernels=['columns'], progress=False)

    # Updating cctype in singleton View should raise.
    distargs = DISTARGS[cat_id].copy()
    state.incorporate_dim(
        T[:,cat_id], outputs=[98],
        cctype='categorical', distargs=distargs, v=max(state.views)+1)
    with pytest.raises(Exception):
        state.update_cctype(98, 'random_forest', distargs=distargs)
Пример #17
0
def test_incorporate_state():
    """Walk a State through a long series of column (dim) insertions/removals.

    Starting from the first two columns of T, the test incorporates further
    columns into an existing view, into newly created views, and without
    naming a view; checks that malformed incorporate_dim calls raise
    ValueError; removes and re-adds columns; and finally strips the State
    down to a single column, which must not be removable. Inference is run
    between steps as a crash test.
    """
    state = State(
        T[:,:2], cctypes=CCTYPES[:2], distargs=DISTARGS[:2], rng=gu.gen_rng(0))
    state.transition(N=5)

    def add_dim(col, output, **extra):
        # Incorporate column `col` of T under `output`, using that column's
        # stattype and distargs. Extra keywords (such as v) are forwarded.
        state.incorporate_dim(
            T[:,col], outputs=[output], cctype=CCTYPES[col],
            distargs=DISTARGS[col], **extra)

    def sweep():
        state.transition(N=1)

    target = state.views.keys()[0]

    # Incorporate a new dim into view[0].
    add_dim(2, 2, v=target)
    assert state.Zv(2) == target
    sweep()

    # Incorporate a new dim into view[0] with a non-contiguous output.
    add_dim(2, 10, v=target)
    assert state.Zv(10) == target
    sweep()

    # Some crash testing queries.
    state.logpdf(-1, {10:1}, constraints={0:2, 1:1})
    state.simulate(-1, [10], constraints={0:2})

    # Each of these calls is malformed in one way and should raise.
    malformed = [
        # Duplicated output.
        (T[:,2], dict(outputs=[10])),
        # Multivariate incorporate.
        (T[:,2], dict(outputs=[10, 2])),
        # Missing output.
        (T[:,2], dict(outputs=[])),
        # Wrong number of rows.
        (T[:,2][:-1], dict(outputs=[11])),
        # Inputs are not accepted.
        (T[:,2], dict(outputs=[11], inputs=[2])),
    ]
    for column, kwargs in malformed:
        with pytest.raises(ValueError):
            state.incorporate_dim(
                column, cctype=CCTYPES[2], distargs=DISTARGS[2], v=target,
                **kwargs)

    # Incorporate dim into a newly created singleton view.
    target = max(state.views)+1
    add_dim(3, 3, v=target)
    assert state.Zv(3) == target
    sweep()

    # Incorporate dim without specifying a view.
    add_dim(4, 4)
    sweep()

    # Unincorporate first dim.
    cols_before = state.n_cols()
    state.unincorporate_dim(0)
    assert state.n_cols() == cols_before-1
    sweep()

    # Reincorporate dim without specifying a view.
    add_dim(0, 0)
    sweep()

    # Incorporate dim into singleton view, remove it, assert destroyed.
    target = max(state.views)+1
    add_dim(5, 5, v=target)
    views_before = len(state.views)
    state.unincorporate_dim(5)
    assert len(state.views) == views_before-1
    sweep()

    # Reincorporate dim into a singleton view.
    target = max(state.views)+1
    add_dim(5, 5, v=target)
    sweep()

    # Incorporate the rest of the dims in the default way, each under the
    # next unused output index.
    for col in xrange(6, len(CCTYPES)):
        add_dim(col, max(state.outputs)+1)
    sweep()

    # Unincorporating non-existent dim should raise.
    with pytest.raises(ValueError):
        state.unincorporate_dim(9999)

    # Unincorporate all the dims, except the last one. The slice is taken
    # once, up front, so removals do not disturb the iteration.
    all_but_last = state.outputs[:-1]
    for output in all_but_last:
        state.unincorporate_dim(output)
    assert state.n_cols() == 1
    sweep()

    # Unincorporating last dim should raise.
    with pytest.raises(ValueError):
        state.unincorporate_dim(state.outputs[0])
Пример #18
0
from cgpm.utils import general as gu
from cgpm.utils import test as tu

# Module-level setup, run at import time: generate a synthetic table with one
# column per statistical type below and fit a State to it.
# Parse the stattype specs into parallel lists of type names and distargs.
cctypes, distargs = cu.parse_distargs([
    'normal', 'poisson', 'bernoulli', 'categorical(k=4)', 'lognormal',
    'exponential', 'beta', 'geometric', 'vonmises'
])

# Generate 200 rows from a fixed seed. Zv and Zc are not used again in the
# visible part of this module.
T, Zv, Zc = tu.gen_data_table(200, [1], [[.25, .25, .5]],
                              cctypes,
                              distargs, [.95] * len(cctypes),
                              rng=gu.gen_rng(10))

# The table is transposed before being handed to State. The State uses its
# own seed, separate from the data-generation seed.
state = State(T.T, cctypes=cctypes, distargs=distargs, rng=gu.gen_rng(312))
state.transition(N=10, progress=1)


def test_crash_simulate_joint(state):
    """Crash test: jointly simulate columns 0 through 8, ten samples."""
    targets = list(range(9))
    state.simulate(-1, targets, N=10)


def test_crash_logpdf_joint(state):
    state.logpdf(-1, {
        0: 1,
        1: 2,
        2: 1,
        3: 3,
        4: 1,
        5: 10,
        6: .4,
Пример #19
0
def test_serialize_composite_cgpm():
    """Round-trip a State with two composed foreign CGPMs through metadata.

    A State models columns 2-5; a RandomForest (output 0) and a
    LinearRegression (output 1) are fed the same rows, composed into the
    State, and transitioned. The State is then serialized with to_metadata
    and rebuilt with from_metadata, and the test checks that:

    - both composition tokens exist in the rebuilt State;
    - the rebuilt hooked CGPMs are distinct objects from the originals;
    - the logpdf scores of original and rebuilt hooked CGPMs agree.

    The same two CGPMs are finally composed into a two-state Engine as a
    crash test of its composition, inference and query methods.
    """
    rng = gu.gen_rng(2)

    # Generate the data.
    cctypes, distargs = cu.parse_distargs([
        'categorical(k=3)',     # RandomForest          0
        'normal',               # LinearRegression      1
        'categorical(k=3)',     # GPMCC                 2
        'poisson',              # GPMCC                 3
        'normal',               # GPMCC                 4
        'lognormal'             # GPMCC                 5
        ])
    T, Zv, Zc = tu.gen_data_table(
        35, [.4, .6], [[.33, .33, .34], [.5, .5]],
        cctypes, distargs, [.2]*len(cctypes), rng=rng)
    D = np.transpose(T)

    # Create GPMCC.
    state = State(
        D[:,2:], outputs=[2,3,4,5], cctypes=cctypes[2:],
        distargs=distargs[2:], rng=rng)

    # Create a Forest.
    forest = RandomForest(
        outputs=[0],
        inputs=[1,2,3,4],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [1,2,3,4]],
                'statargs': [distargs[i] for i in [1,2,3,4]]},
            'k': distargs[0]['k']},
        rng=rng)

    # Create a Regression.
    linreg = LinearRegression(
        outputs=[1],
        inputs=[3,4,5],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [3,4,5]],
                'statargs': [distargs[i] for i in [3,4,5]]}},
        rng=rng)

    # Incorporate the data.
    def incorporate_data(cgpm, rowid, row):
        # Split the row into the cgpm's own output cells and input cells.
        cgpm.incorporate(
            rowid,
            {i: row[i] for i in cgpm.outputs},
            {i: row[i] for i in cgpm.inputs},
        )
    for rowid, row in enumerate(D):
        incorporate_data(forest, rowid, row)
        incorporate_data(linreg, rowid, row)

    # Run state transitions.
    state.transition(N=10, progress=False)
    # Compose CGPMs, instructing State to run the transitions.
    token_forest = state.compose_cgpm(forest)
    token_linreg = state.compose_cgpm(linreg)
    state.transition_foreign(N=10, cols=[forest.outputs[0], linreg.outputs[0]])

    # Now run the serialization.
    metadata = state.to_metadata()
    state2 = State.from_metadata(metadata)

    # Check that the tokens are in state2.
    assert token_forest in state2.hooked_cgpms
    assert token_linreg in state2.hooked_cgpms

    # The hooked cgpms must be unique objects after serialize/deserialize.
    # Identity, not equality, is the property under test here.
    assert (
        state.hooked_cgpms[token_forest]
        is not state2.hooked_cgpms[token_forest])
    assert (
        state.hooked_cgpms[token_linreg]
        is not state2.hooked_cgpms[token_linreg])

    # Check that the log scores of the hooked cgpms agree.
    assert np.allclose(
        state.hooked_cgpms[token_forest].logpdf_score(),
        state2.hooked_cgpms[token_forest].logpdf_score())
    assert np.allclose(
        state.hooked_cgpms[token_linreg].logpdf_score(),
        state2.hooked_cgpms[token_linreg].logpdf_score())

    # Now run some tests for the engine.
    e = Engine(
        D[:,2:], outputs=[2,3,4,5], cctypes=cctypes[2:],
        distargs=distargs[2:], num_states=2, rng=rng)
    # One cgpm per engine state; the same object is listed for both states.
    e.compose_cgpm([forest, forest], multiprocess=1)
    e.compose_cgpm([linreg, linreg], multiprocess=1)
    e.transition_foreign(N=1, cols=[forest.outputs[0], linreg.outputs[0]])
    e.dependence_probability(0,1)
    e.simulate(-1, [0,1], {2:1}, multiprocess=0)
    e.logpdf(-1, {1:1}, {2:1, 0:0}, multiprocess=0)

    state3 = e.get_state(0)

    # There is no guarantee that the logpdf score improves with inference, but
    # it should not reduce by more than a few nats.
    # NOTE(review): this check is currently vacuous. The return value is never
    # asserted, and the predicate is true for every input: when
    # before >= after, (after - before) is <= 0 and so always < 5. The
    # intended form is presumably `assert after > before or
    # (before - after) < 5`. It is left unasserted here because the 5-nat
    # tolerance has not been validated against these models -- confirm the
    # tolerance before enabling it.
    def check_logpdf_delta(before, after):
        return before < after or (after-before) < 5
    check_logpdf_delta(
        before=state.hooked_cgpms[token_forest].logpdf_score(),
        after=state3.hooked_cgpms[token_forest].logpdf_score())
    check_logpdf_delta(
        before=state.hooked_cgpms[token_linreg].logpdf_score(),
        after=state3.hooked_cgpms[token_linreg].logpdf_score())