Example #1
def test_alpha_numeric():
    docs = [list('abcd'), list('cdef')]
    defn = model_definition(len(docs), v=6)
    prng = rng()
    s = initialize(defn, docs, prng)
    assert_equals(s.nentities(), len(docs))
    assert_equals(s.nwords(), 6)
Example #2
File: test_runner.py Project: jzf2101/irm
def _test_runner_simple(defn, kc_fn):
    views = map(numpy_dataview, toy_dataset(defn))
    kc = kc_fn(defn)
    prng = rng()
    latent = model.initialize(defn, views, prng)
    r = runner.runner(defn, views, latent, kc)
    r.run(prng, 10)
Example #3
def test_slice_theta_mm():
    N = 100
    data = np.array(
        [(np.random.random() < 0.8,) for _ in xrange(N)],
        dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    r = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}
    view = numpy_dataview(data)
    s = initialize(
        defn,
        view,
        cluster_hp={'alpha': 1., 'beta': 9.},
        feature_hps=[prior],
        r=r,
        assignment=[0] * N)

    heads = len([1 for y in data if y[0]])
    tails = N - heads

    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails

    bs = bind(s, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, 0)['p']

    rv = beta(alpha1, beta1)
    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
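What the slice-sampler test above checks follows from Beta-Bernoulli conjugacy: with a Beta(alpha, beta) prior and `heads` successes out of N draws, the posterior over p is Beta(alpha + heads, beta + tails). A minimal standalone sketch of that update, independent of the microscopes API (values are illustrative):

# Illustrative sketch of the conjugate update verified above; not part of the test suite.
import numpy as np
from scipy.stats import beta as beta_dist

prior_alpha, prior_beta = 1.0, 9.0
draws = np.random.random(100) < 0.8       # synthetic Bernoulli(0.8) observations
heads = int(draws.sum())
tails = draws.size - heads

# conjugate update: posterior parameters are prior pseudo-counts plus observed counts
posterior = beta_dist(prior_alpha + heads, prior_beta + tails)
print(posterior.mean())                   # approaches the empirical heads fraction as N grows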
Example #4
def test_posterior_predictive_statistic():
    N, D = 10, 4  # D needs to be even
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]
    q = ma.masked_array(
        np.array([(False,) * D], dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)])

    statistic = query.posterior_predictive_statistic(q, latents, prng)
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    statistic = query.posterior_predictive_statistic(
        q, latents, prng, merge='mode')
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    statistic = query.posterior_predictive_statistic(
        q, latents, prng, merge=['mode', 'mode', 'avg', 'avg'])
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    q = ma.masked_array(
        np.array([(False,) * D] * 3, dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)] * 3)
    statistic = query.posterior_predictive_statistic(q, latents, prng)
    assert_equals(statistic.shape, (3,))
    assert_equals(len(statistic.dtype), D)
Example #5
def test_cant_serialize():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    prng = rng()
    s = initialize(defn, data, prng)
    # serialization is expected to fail for this state; check that it raises
    assert_raises(Exception, s.serialize)
Example #6
def test_runner_multiprocessing_convergence():
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng)
               for _ in xrange(mp.cpu_count())]
    runners = [runner.runner(defn, view, latent, ['assign'])
               for latent in latents]
    r = parallel.runner(runners)
    r.run(r=prng, niters=1000)  # burnin
    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_iter():
        r.run(r=prng, niters=10)
        for latent in r.get_latents():
            yield idmap[tuple(permutation_canonical(latent.assignments()))]

    ref = [None]

    def sample_fn():
        if ref[0] is None:
            ref[0] = sample_iter()
        try:
            return next(ref[0])
        except StopIteration:
            ref[0] = None
        return sample_fn()

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100, kl_places=2)
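The convergence tests above key each sampled clustering through `permutation_canonical` before looking it up in `idmap`. As used here, the helper appears to relabel an assignment vector by order of first appearance so that label-swapped clusterings collapse to the same key; a rough pure-Python sketch of that idea (an assumption about the helper's behavior, not the library's implementation):

# Hypothetical stand-in for permutation_canonical.
def canonical(assignment):
    # relabel cluster ids by order of first appearance, e.g. [2, 2, 0, 1] -> (0, 0, 1, 2)
    relabel = {}
    out = []
    for gid in assignment:
        if gid not in relabel:
            relabel[gid] = len(relabel)
        out.append(relabel[gid])
    return tuple(out)

assert canonical([2, 2, 0, 1]) == canonical([0, 0, 1, 2]) == (0, 0, 1, 2)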
Example #7
def test_slice_theta_irm():
    N = 10
    defn = model_definition([N], [((0, 0), bbnc)])
    data = np.random.random(size=(N, N)) < 0.8
    view = numpy_dataview(data)
    r = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}

    s = initialize(
        defn,
        [view],
        r=r,
        cluster_hps=[{'alpha': 2.0}],
        relation_hps=[prior],
        domain_assignments=[[0] * N])

    bs = bind(s, 0, [view])

    params = {0: {'p': 0.05}}

    heads = len([1 for y in data.flatten() if y])
    tails = len([1 for y in data.flatten() if not y])

    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails

    def sample_fn():
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, [0, 0])['p']

    rv = beta(alpha1, beta1)
    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
Example #8
File: test_runner.py Project: jzf2101/irm
def test_runner_multiprocessing_convergence():
    domains = [4]
    defn = model_definition(domains, [((0, 0), bb)])
    prng = rng()
    relations, posterior = data_with_posterior(defn, prng)
    views = map(numpy_dataview, relations)
    latents = [model.initialize(defn, views, prng)
               for _ in xrange(mp.cpu_count())]
    kc = [('assign', range(len(domains)))]
    runners = [runner.runner(defn, views, latent, kc) for latent in latents]
    r = parallel.runner(runners)
    r.run(r=prng, niters=10000)  # burnin
    product_assignments = tuple(map(list, map(permutation_iter, domains)))
    idmap = {C: i for i, C in enumerate(it.product(*product_assignments))}

    def sample_iter():
        r.run(r=prng, niters=10)
        for latent in r.get_latents():
            key = tuple(tuple(permutation_canonical(latent.assignments(i)))
                        for i in xrange(len(domains)))
            yield idmap[key]

    ref = [None]

    def sample_fn():
        if ref[0] is None:
            ref[0] = sample_iter()
        try:
            return next(ref[0])
        except StopIteration:
            ref[0] = None
        return sample_fn()

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100, kl_places=2)
Example #9
def test_cxx_sample_post_pred_given_data():
    assert D == 5
    y_new = ma.masked_array(
        np.array([(True, False, True, True, True)], dtype=[('', np.bool)] * 5),
        mask=[(False, False, True, True, True)])[0]
    _test_sample_post_pred(
        cxx_initialize, cxx_numpy_dataview, y_new, rng(543234))
Example #10
def data_with_posterior(defn,
                        cluster_hp=None,
                        feature_hps=None,
                        preprocess_data_fn=None,
                        r=None):
    # XXX(stephentu): should only accept conjugate models
    if r is None:
        r = rng()
    Y_clusters, _ = sample(defn, cluster_hp, feature_hps, r)
    Y = np.hstack(Y_clusters)
    if preprocess_data_fn:
        Y = preprocess_data_fn(Y)
    data = numpy_dataview(Y)

    def score_fn(assignment):
        s = initialize(defn,
                       data,
                       r,
                       cluster_hp=cluster_hp,
                       feature_hps=feature_hps,
                       assignment=assignment)
        return s.score_joint(r)

    posterior = dist_on_all_clusterings(score_fn, defn.n())
    return Y, posterior
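`dist_on_all_clusterings` turns the joint score of every candidate clustering of `defn.n()` entities into an exact posterior. A hedged sketch of that computation, assuming the candidate assignments are enumerated elsewhere (e.g. by `permutation_iter`, as the other examples here do):

# Illustrative only: exact posterior by brute-force enumeration of clusterings.
import numpy as np

def brute_force_posterior(score_fn, assignments):
    # assignments: iterable of candidate assignment vectors over the entities
    scores = np.array([score_fn(list(a)) for a in assignments])
    scores -= scores.max()        # subtract the max before exponentiating for stability
    probs = np.exp(scores)
    return probs / probs.sum()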
Example #11
def test_multivariate_models_cxx():
    _test_multivariate_models(
        initialize,
        numpy_dataview,
        bind,
        gibbs_assign,
        rng())
Example #12
def _test_convergence_bb_cxx(N,
                             D,
                             kernel,
                             preprocess_data_fn=None,
                             nonconj=False,
                             burnin_niters=10000,
                             skip=10,
                             ntries=50,
                             nsamples=1000,
                             kl_places=2):
    r = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D
    defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)
    Y, posterior = data_with_posterior(
        defn, cluster_hp, feature_hps, preprocess_data_fn)
    data = numpy_dataview(Y)
    s = initialize(nonconj_defn if nonconj else defn,
                   data,
                   cluster_hp=cluster_hp,
                   feature_hps=feature_hps,
                   r=r)
    bs = bind(s, data)
    wrapped_kernel = lambda s: kernel(s, r)
    _test_convergence(bs,
                      posterior,
                      wrapped_kernel,
                      burnin_niters,
                      skip,
                      ntries,
                      nsamples,
                      kl_places)
Example #13
File: test_state.py Project: mrG7/lda
def test_explicit_exceptions():
    """ValueError should be rasied for bad assignments
    """
    prng = rng()
    N, V = 3, 7
    defn = model_definition(N, V)
    data = [[0, 1, 2, 3], [0, 1, 4], [0, 1, 5, 6]]

    # We should get an error if we leave out a dish assignment for a given table
    table_assignments = [[1, 2, 1, 2], [1, 1, 1], [3, 3, 3, 1]]
    dish_assignments = [[0, 1, 2], [0, 3], [0, 1, 2]]

    assert_raises(ValueError,
                  initialize,
                  defn, data,
                  table_assignments=table_assignments,
                  dish_assignments=dish_assignments)

    # We should get an error if we leave out a table assignment for a given word
    table_assignments = [[1, 2, 1, 2], [1, 1, 1], [3, 3, 3]]
    dish_assignments = [[0, 1, 2], [0, 3], [0, 1, 2, 1]]

    assert_raises(ValueError,
                  initialize,
                  defn, data,
                  table_assignments=table_assignments,
                  dish_assignments=dish_assignments)
Example #14
def test_dense_vs_sparse():
    # XXX: really belongs in irm test cases, but kernels has a nice cluster
    # enumeration iterator

    r = rng()

    n = 5
    raw = ma.array(
        np.random.choice(np.arange(20), size=(n, n)),
        mask=np.random.choice([False, True], size=(n, n)))

    dense = [relation_numpy_dataview(raw)]
    sparse = [sparse_relation_dataview(_tocsr(raw))]

    domains = [n]
    relations = [((0, 0), gp)]
    defn = irm_definition(domains, relations)

    def score_fn(data):
        def f(assignments):
            s = irm_initialize(defn, data, r=r, domain_assignments=assignments)
            assign = sum(s.score_assignment(i)
                         for i in xrange(len(assignments)))
            likelihood = s.score_likelihood(r)
            return assign + likelihood
        return f

    product_assignments = tuple(map(list, map(permutation_iter, domains)))

    dense_posterior = scores_to_probs(
        np.array(map(score_fn(dense), it.product(*product_assignments))))
    sparse_posterior = scores_to_probs(
        np.array(map(score_fn(sparse), it.product(*product_assignments))))

    assert_1d_lists_almost_equals(dense_posterior, sparse_posterior, places=3)
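The `_tocsr` helper used above is not shown; Example #29 later builds an equivalent sparse view by hand from a masked relation. A hedged sketch of one way such a conversion could look, keeping only the unmasked cells of a 2-D masked array:

# Illustrative masked-array -> CSR conversion (one possible form of _tocsr).
import numpy as np
from scipy.sparse import coo_matrix

def to_csr(raw):
    # raw: 2-D numpy masked array; masked cells are treated as unobserved
    rows, cols = np.where(~raw.mask)
    values = raw.data[rows, cols]
    return coo_matrix((values, (rows, cols)), shape=raw.shape).tocsr()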
Example #15
def test_kernel_gibbs_hp():
    _test_kernel_gibbs_hp(initialize,
                          numpy_dataview,
                          bind,
                          gibbs_hp,
                          'grid_gibbs_hp_samples_pdf',
                          rng())
Example #16
File: test_state.py Project: jzf2101/lda
def test_simple():
    N, V = 10, 100
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = numpy_dataview(data)
    R = rng()
    s = initialize(defn, view, R)
    assert_equals(s.nentities(), len(data))
Example #17
def test_simple():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng)
    assert_equals(s.nentities(), len(data))
Example #18
def test_multi_dish_initialization():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng, initial_dishes=V)
    assert_true(s.ntopics() > 1)
Example #19
def test_single_dish_initialization():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng, initial_dishes=1)
    assert_equals(s.ntopics(), 0) # Only dummy topic
Example #20
def test_state_pickle():
    defn = model_definition([5], [((0, 0), bb)])
    r = rng()
    relations = toy_dataset(defn)
    views = map(numpy_dataview, relations)
    s1 = model.initialize(defn, views, r)
    s2 = pickle.loads(pickle.dumps(s1))
    _assert_structure_equals(defn, s1, s2, views, r)
Example #21
File: test_sampler.py Project: jzf2101/lda
def test_convergence_simple():
    N, V = 2, 10
    defn = model_definition(N, V)
    data = [
        np.array([5, 6]),
        np.array([0, 1, 2]),
    ]
    view = numpy_dataview(data)
    prng = rng()

    scores = []
    idmap = {}
    for i, (tables, dishes) in enumerate(permutations([2, 3])):
        latent = model.initialize(
            defn, view, prng,
            table_assignments=tables,
            dish_assignments=dishes)
        scores.append(
            latent.score_assignment() +
            latent.score_data(prng))
        idmap[(tables, dishes)] = i
    true_dist = scores_to_probs(scores)

    def kernel(latent):
        # mutates latent in place
        doc_model = model.bind(latent, data=view)
        kernels.assign2(doc_model, prng)
        for did in xrange(latent.nentities()):
            table_model = model.bind(latent, document=did)
            kernels.assign(table_model, prng)

    latent = model.initialize(defn, view, prng)

    skip = 10
    def sample_fn():
        for _ in xrange(skip):
            kernel(latent)
        table_assignments = latent.table_assignments()
        canon_table_assigments = tuple(
            map(tuple, map(permutation_canonical, table_assignments)))

        dish_maps = latent.dish_assignments()
        dish_assignments = []
        for dm, (ta, ca) in zip(dish_maps, zip(table_assignments, canon_table_assigments)):
            dish_assignment = []
            for t, c in zip(ta, ca):
                if c == len(dish_assignment):
                    dish_assignment.append(dm[t])
            dish_assignments.append(dish_assignment)

        canon_dish_assigments = tuple(
            map(tuple, map(permutation_canonical, dish_assignments)))

        return idmap[(canon_table_assigments, canon_dish_assigments)]

    assert_discrete_dist_approx(
        sample_fn, true_dist,
        ntries=100, nsamples=10000, kl_places=2)
Example #22
def test_runner_simple():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    latent = model.initialize(defn, view, prng)
    r = runner.runner(defn, view, latent)
    r.run(prng, 1)
Example #23
def test_zmatrix():
    N, D = 10, 4
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]
    zmat = query.zmatrix(latents)
    assert_equals(zmat.shape, (N, N))
Example #24
def _test_convergence(domains,
                      data,
                      reg_relations,
                      brute_relations,
                      kernel,
                      burnin_niters=10000,
                      skip=10,
                      ntries=50,
                      nsamples=1000,
                      places=2):
    r = rng()

    reg_defn = irm_definition(domains, reg_relations)
    brute_defn = irm_definition(domains, brute_relations)

    def score_fn(assignments):
        s = irm_initialize(
            brute_defn, data, r=r,
            domain_assignments=assignments)
        assign = sum(s.score_assignment(i) for i in xrange(len(assignments)))
        likelihood = s.score_likelihood(r)
        return assign + likelihood
    product_assignments = tuple(map(list, map(permutation_iter, domains)))
    posterior = scores_to_probs(
        np.array(map(score_fn, it.product(*product_assignments))))

    s = irm_initialize(reg_defn, data, r=r)
    bounded_states = [irm_bind(s, i, data) for i in xrange(len(domains))]

    # burnin
    start = time.time()
    last = start
    for i in xrange(burnin_niters):
        for bs in bounded_states:
            kernel(bs, r)
        if not ((i + 1) % 1000):
            print 'burnin finished iteration', (i + 1), \
                'in', (time.time() - last), 'seconds'
            last = time.time()
    print 'finished burnin of', burnin_niters, \
        'iters in', (time.time() - start), 'seconds'

    idmap = {C: i for i, C in enumerate(it.product(*product_assignments))}
    #print idmap

    def sample_fn():
        for _ in xrange(skip):
            for bs in bounded_states:
                kernel(bs, r)
        key = tuple(tuple(permutation_canonical(bs.assignments()))
                    for bs in bounded_states)
        return idmap[key]

    assert_discrete_dist_approx(
        sample_fn, posterior,
        ntries=ntries, nsamples=nsamples,
        kl_places=places)
Example #25
def test_runner_specify_basic_kernel():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    latent = model.initialize(defn, view, prng)
    r = runner.runner(defn, view, latent, ["crf"])
    r.run(prng, 1)
Example #26
File: test_state.py Project: mrG7/lda
def test_serialize_simple():
    N, V = 10, 20
    defn = model_definition(N, V)
    data = toy_dataset(defn)
    view = data
    prng = rng()
    s = initialize(defn, view, prng)
    m = s.serialize()
    s2 = deserialize(defn, m)
    assert s2.__class__ == s.__class__
Example #27
    def test_lda_zero_iter(self):
        # compare to model with 0 iterations
        prng2 = rng(seed=54321)
        latent2 = model.initialize(self.defn, self.docs, prng2)
        assert latent2 is not None
        r2 = runner.runner(self.defn, self.docs, latent2)
        assert r2 is not None
        doc_topic2 = latent2.topic_distribution_by_document()
        assert doc_topic2 is not None
        assert latent2.perplexity() > self.latent.perplexity()
Example #28
def test_operations():
    N = 10
    R = rng(12)

    def mkrow():
        return (np.random.choice([False, True]),
                np.random.choice([False, True]),
                np.random.random(),
                np.random.choice([False, True]))
    dtype = [('', bool), ('', bool), ('', float), ('', bool)]
    # non-masked data
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # *_initialize() randomly assigns all entities to a group, so we'll have to
    # unset this assignment for this test
    unset(cxx_s, data, R)

    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N

    cxx_s.dcheck_consistency()

    assert cxx_s.ngroups() == 3 and set(cxx_s.empty_groups()) == set([0, 1, 2])

    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    for i, yi in it.islice(enumerate(data), 2):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()

    newrow = mkrow()
    newdata = np.array([newrow], dtype=dtype)

    cxx_score = cxx_s.score_value(newdata[0], R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
Example #29
def test_simple():
    domains = [5, 6]

    relations = [((0, 1), bb)]

    relsize = (domains[0], domains[1])
    raw_data = [
        ma.array(np.random.choice([False, True], size=relsize), mask=np.random.choice([False, True], size=relsize))
    ]

    def csr(raw):
        n, m = raw.shape

        def indices():
            for i, j in it.product(range(n), range(m)):
                if not raw.mask[i, j]:
                    yield i, j

        data = [raw[i, j] for i, j in indices()]
        i = list(map(op.itemgetter(0), indices()))
        j = list(map(op.itemgetter(1), indices()))
        return coo_matrix((data, (i, j)), shape=raw.shape).tocsr()

    defn = model_definition(domains, relations)
    data = map(numpy_dataview, raw_data)
    sparse_data = map(sparse_2d_dataview, map(csr, raw_data))

    r = rng()

    s = initialize(defn, data, r=r)
    assert s and bind(s, 0, data) and bind(s, 1, data)

    s1 = initialize(defn, sparse_data, r=r)
    assert s1 and bind(s1, 0, sparse_data) and bind(s1, 1, sparse_data)

    def entity_data_positions(domain, eid):
        def f(domains, reln):
            for pos0 in xrange(reln.shape[0]):
                for pos1 in xrange(reln.shape[1]):
                    if reln.mask[pos0, pos1]:
                        continue
                    if (domains[0] == domain and pos0 == eid) or (domains[1] == domain and pos1 == eid):
                        yield [pos0, pos1]

        return list(it.chain.from_iterable(f(domains, reln) for (domains, _), reln in zip(relations, raw_data)))

    def test(s):
        for did, nentities in enumerate(domains):
            for eid in xrange(nentities):
                a = entity_data_positions(did, eid)
                b = s.entity_data_positions(did, eid, data)
                assert sorted(a) == sorted(b)

    test(s)
    test(s1)
Example #30
def test_sample_post_pred():
    N = 10
    R = rng(5483932)
    D = 4

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))
    dtype = [('', bool)] * D
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb] * D)
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [dist_bb.EXAMPLES[0]['shared']] * D,
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    G = 3
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    for i, yi in enumerate(data):
        egid = i % G
        cxx_s.add_value(egid, i, yi, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = tuple(randombool() for _ in xrange(D))
    y_new = ma.masked_array(
        np.array([y_new_data], dtype=dtype),
        mask=[y_new_mask])[0]

    n_samples = 1000

    cxx_samples = np.hstack(
        [cxx_s.sample_post_pred(y_new, R)[1] for _ in xrange(n_samples)])

    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        dist = np.zeros(len(idmap))
        for s in samples:
            dist[idmap[tuple(s)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
Example #31
def test_kernel_gibbs_hp():
    _test_kernel_gibbs_hp(initialize, numpy_dataview, bind, gibbs_hp,
                          'grid_gibbs_hp_samples_pdf', rng())
Example #32
File: bench.py Project: pschulam/kernels
def bench(args, latent_fn):
    parser = argparse.ArgumentParser()
    parser.add_argument('--groups', type=int, action='append')
    parser.add_argument('--entities-per-group', type=int, action='append')
    parser.add_argument('--features', type=int, action='append')
    parser.add_argument('--target-runtime', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    args = parser.parse_args(args)

    print args

    if not args.groups or not len(args.groups):
        raise ValueError("need to specify >= 1 --groups")
    if len(args.groups) == 1:
        print "WARNING: one group will make very uninteresting graphs"
    for groups in args.groups:
        if groups <= 0:
            raise ValueError('need positive groups')

    if not args.entities_per_group or not len(args.entities_per_group):
        raise ValueError("need to specify >= 1 --entities-per-group")
    for entities_per_group in args.entities_per_group:
        if entities_per_group <= 0:
            raise ValueError('need positive entities_per_group')

    if not args.features or not len(args.features):
        raise ValueError("need to specify >= 1 --features")
    for features in args.features:
        if features <= 0:
            raise ValueError('need positive features')

    if args.target_runtime <= 0:
        raise ValueError("--target-runtime needs to be >= 0")

    vs = versions()
    vstr = 'c{}-m{}-k{}'.format(vs['common'], vs['mixturemodel'],
                                vs['kernels'])
    print 'vstr:', vstr

    r = rng()

    target_runtime = args.target_runtime
    results = []
    grid = it.product(args.groups, args.entities_per_group, args.features)
    for groups, entities_per_group, features in grid:
        start = time.time()
        latent = latent_fn(groups, entities_per_group, features, r)
        results.append(measure(groups, target_runtime, latent, r))
        print 'finished ({}, {}, {}) in {} seconds'.format(
            groups, entities_per_group, features,
            time.time() - start)

    output = {
        'args': args.__dict__,
        'versions': vs,
        'cpuinfo': cpuinfo.get_cpu_info(),
        'results': results,
        'time': datetime.now().isoformat(),
    }

    with open(args.output, 'w') as fp:
        json.dump(output, fp)
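A hypothetical command line for `bench()`, using the flag names defined by the parser above (`--groups`, `--entities-per-group`, and `--features` use `action='append'`, so each may be repeated); `make_latent` stands in for whatever `latent_fn` the caller supplies:

# Hypothetical usage; the argument values are arbitrary examples.
bench(['--groups', '2', '--groups', '8',
       '--entities-per-group', '50',
       '--features', '4',
       '--target-runtime', '10',
       '--output', 'bench-results.json'],
      latent_fn=make_latent)  # make_latent(groups, entities_per_group, features, r) -> latent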
Example #33
def test_import_rng():
    from microscopes.common.rng import rng
    r = rng(12345)
    assert r
Example #34
def test_betabin_equiv():

    # https://github.com/pymc-devs/pymc/blob/
    # a7ab153f2b58d81824a56166747c678d7f421bde/pymc/distributions/discrete.py#L84
    def betabin_like(value, alpha, beta, n):
        return (gammaln(alpha + beta) - gammaln(alpha) - gammaln(beta) +
                gammaln(n + 1) - gammaln(value + 1) - gammaln(n - value + 1) +
                gammaln(alpha + value) + gammaln(n + beta - value) -
                gammaln(beta + alpha + n))

    # this N refers to the number of trials in the binomial distribution
    N = 10

    # this refers to the dataset size
    M = 100

    # hyperparams of the beta dist
    alpha, beta = 1., 2.

    heads = np.random.randint(low=0, high=N + 1, size=M)
    tails = N - heads

    data = np.vstack((heads, tails)).T

    Y = np.array([(y, ) for y in data], dtype=[('', np.int, (2, ))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(2)])
    prior = {'alphas': [alpha, beta]}
    s = cxx_initialize(defn,
                       view,
                       r,
                       feature_hps=[prior],
                       assignment=[0] * Y.shape[0])

    assert_equals(s.groups(), [0])

    def all_indices(N):
        for i, j in it.product(range(0, N + 1), repeat=2):
            if (i + j) == N:
                yield i, j

    all_data = [(list(ij), ) for ij in all_indices(N)]

    Y_test = np.array(all_data, dtype=[('', np.int, (2, ))])

    # the actual score is simply a betabin using the updated alpha, beta
    alpha1, beta1 = np.array([alpha, beta]) + data.sum(axis=0)

    def model_score(Y_value):
        _, (score, ) = s.score_value(Y_value, r)
        return score

    def test_score(Y_value):
        score = betabin_like(Y_value[0][0], alpha1, beta1, N)
        return score

    model_scores = np.array(map(model_score, Y_test))
    test_scores = np.array(map(test_score, Y_test))

    assert_almost_equals(np.exp(model_scores).sum(), 1., places=2)
    assert_almost_equals(np.exp(test_scores).sum(), 1., places=2)
    assert_almost_equals(np.abs(model_scores - test_scores).max(),
                         0.,
                         places=1)
Example #35
def test_stress_cxx():
    _test_stress(cxx_initialize, cxx_numpy_dataview, rng())
Example #36
from microscopes.common.rng import rng
from microscopes.common.relation.dataview import numpy_dataview
from microscopes.models import bb as beta_bernoulli
from microscopes.irm.definition import model_definition
from microscopes.irm import model, runner, query
from microscopes.kernels import parallel
from microscopes.common.query import groups, zmatrix_heuristic_block_ordering, zmatrix_reorder

# ##Let's start by defining the model and loading the data

# In[6]:

defn = model_definition([N], [((0, 0), beta_bernoulli)])
views = [numpy_dataview(communications_relation)]
prng = rng()

# ##Next, let's initialize the model and define the runners.
#
# ##These runners are our MCMC chains. We'll use `cpu_count` to define our number of chains.

# In[ ]:

nchains = cpu_count()
latents = [
    model.initialize(defn, views, r=prng, cluster_hps=[{
        'alpha': 1e-3
    }]) for _ in xrange(nchains)
]
kc = runner.default_assign_kernel_config(defn)
runners = [runner.runner(defn, views, latent, kc) for latent in latents]
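The notebook excerpt stops after constructing the per-chain runners; Example #37 below runs the same pipeline end to end. A hedged sketch of the next step, mirroring the calls used there (the iteration count is an arbitrary choice):

# Continuation sketch modeled on Example #37.
r = parallel.runner(runners)
r.run(r=prng, niters=1000)      # advance every chain in parallel (burn-in)

infers = r.get_latents()
clusters = groups(infers[0].assignments(0), sort=True)
print 'clusters found by the first chain:', len(clusters)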
Example #37
def infinite_relational_model(corr_matrix, lag_matrix, threshold, sampled_coords, window_size):
    import numpy as np
    import math
    import json
    import time
    import itertools as it
    from multiprocessing import cpu_count
    from microscopes.common.rng import rng
    from microscopes.common.relation.dataview import numpy_dataview
    from microscopes.models import bb as beta_bernoulli
    from microscopes.irm.definition import model_definition
    from microscopes.irm import model, runner, query
    from microscopes.kernels import parallel
    from microscopes.common.query import groups, zmatrix_heuristic_block_ordering, zmatrix_reorder

    cluster_matrix = []
    graph = []

    # calculate graph
    for row in corr_matrix:
        graph_row = []
        for corr in row:
            if corr < threshold:
                graph_row.append(False)
            else:
                graph_row.append(True)

        graph.append(graph_row)

    graph = np.array(graph, dtype=np.bool)

    graph_size = len(graph)

    # conduct Infinite Relational Model
    defn = model_definition([graph_size], [((0, 0), beta_bernoulli)])
    views = [numpy_dataview(graph)]
    prng = rng()

    nchains = cpu_count()
    latents = [model.initialize(defn, views, r=prng, cluster_hps=[{'alpha':1e-3}]) for _ in xrange(nchains)]
    kc = runner.default_assign_kernel_config(defn)
    runners = [runner.runner(defn, views, latent, kc) for latent in latents]
    r = parallel.runner(runners)

    start = time.time()
    # r.run(r=prng, niters=1000)
    # r.run(r=prng, niters=100)
    r.run(r=prng, niters=20)
    print ("inference took", time.time() - start, "seconds")

    infers = r.get_latents()
    clusters = groups(infers[0].assignments(0), sort=True)
    ordering = list(it.chain.from_iterable(clusters))

    z = graph.copy()
    z = z[ordering]
    z = z[:,ordering]

    corr_matrix = corr_matrix[ordering]
    corr_matrix = corr_matrix[:,ordering]

    lag_matrix = lag_matrix[ordering]
    lag_matrix = lag_matrix[:,ordering]

    cluster_sampled_coords = np.array(sampled_coords)
    cluster_sampled_coords = cluster_sampled_coords[ordering]

    response_msg = {
        'corrMatrix': corr_matrix.tolist(),
        'lagMatrix': lag_matrix.tolist(),
        'clusterMatrix': z.tolist(),
        'clusterSampledCoords': cluster_sampled_coords.tolist(),
        'nClusterList': [len(cluster) for cluster in clusters],
        'ordering': ordering,
    }
    f = open("./expdata/clustermatrix-" + str(window_size) + ".json", "w")
    json.dump(response_msg, f)
    f.close()

    return response_msg
Example #38
def test_mnist():
    import matplotlib.pylab as plt
    from PIL import Image, ImageOps
    mnist_dataset = _get_mnist_dataset()
    Y_2 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 2.)[0]]
    Y_3 = mnist_dataset['data'][np.where(mnist_dataset['target'] == 3.)[0]]
    print 'number of twos:', Y_2.shape[0]
    print 'number of threes:', Y_3.shape[0]
    _, D = Y_2.shape
    W = int(math.sqrt(D))
    assert W * W == D
    dtype = [('', bool)] * D
    Y = np.vstack([Y_2, Y_3])
    Y = np.array(
        [tuple(y) for y in Y[np.random.permutation(np.arange(Y.shape[0]))]],
        dtype=dtype)

    view = numpy_dataview(Y)
    defn = model_definition(Y.shape[0], [bb] * D)
    r = rng()
    s = initialize(defn,
                   view,
                   cluster_hp={'alpha': 0.2},
                   feature_hps=[{
                       'alpha': 1.,
                       'beta': 1.
                   }] * D,
                   r=r)
    bound_s = bind(s, view)

    indiv_prior_fn = log_exponential(1.2)
    hparams = {
        i: {
            'alpha': (indiv_prior_fn, 1.5),
            'beta': (indiv_prior_fn, 1.5),
        }
        for i in xrange(D)
    }

    def plot_clusters(s, fname, scalebysize=False):
        hps = [s.get_feature_hp(i) for i in xrange(D)]

        def prior_prob(hp):
            return hp['alpha'] / (hp['alpha'] + hp['beta'])

        def data_for_group(gid):
            suffstats = [s.get_suffstats(gid, i) for i in xrange(D)]

            def prob(hp, ss):
                top = hp['alpha'] + ss['heads']
                bot = top + hp['beta'] + ss['tails']
                return top / bot

            probs = [prob(hp, ss) for hp, ss in zip(hps, suffstats)]
            return np.array(probs)

        def scale(d, weight):
            im = d.reshape((W, W))
            newW = max(int(weight * W), 1)
            im = Image.fromarray(im)
            im = im.resize((newW, newW))
            im = ImageOps.expand(im, border=(W - newW) / 2)
            im = np.array(im)
            a, b = im.shape
            #print 'a,b:', a, b
            if a < W:
                im = np.append(im, np.zeros(b)[np.newaxis, :], axis=0)
            elif a > W:
                im = im[:W, :]
            assert im.shape[0] == W
            if b < W:
                #print 'current:', im.shape
                im = np.append(im, np.zeros(W)[:, np.newaxis], axis=1)
            elif b > W:
                im = im[:, :W]
            assert im.shape[1] == W
            return im.flatten()

        data = [(data_for_group(g), cnt) for g, cnt in groupsbysize(s)]
        largest = max(cnt for _, cnt in data)
        data = [
            scale(d, cnt / float(largest)) if scalebysize else d
            for d, cnt in data
        ]
        digits_per_row = 12
        rem = len(data) % digits_per_row
        if rem:
            fill = digits_per_row - rem
            for _ in xrange(fill):
                data.append(np.zeros(D))
        assert not (len(data) % digits_per_row)
        #rows = len(data) / digits_per_row
        data = np.vstack([
            np.hstack([d.reshape((W, W)) for d in data[i:i + digits_per_row]])
            for i in xrange(0, len(data), digits_per_row)
        ])
        #print 'saving figure', fname
        plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest')
        plt.savefig(fname)
        plt.close()

    def plot_hyperparams(s, fname):
        hps = [s.get_feature_hp(i) for i in xrange(D)]
        alphas = np.array([hp['alpha'] for hp in hps])
        betas = np.array([hp['beta'] for hp in hps])
        data = np.hstack([alphas.reshape((W, W)), betas.reshape((W, W))])
        plt.imshow(data, interpolation='nearest')
        plt.colorbar()
        plt.savefig(fname)
        plt.close()

    def kernel(rid):
        start0 = time.time()
        assign(bound_s, r)
        sec0 = time.time() - start0

        start1 = time.time()
        hp(bound_s, r, hparams=hparams)
        sec1 = time.time() - start1

        print 'rid=', rid, 'nclusters=', s.ngroups(), \
            'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec'

        sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups())))
        print '  time_per_post_pred=', sec_per_post_pred, 'sec'

        return s.score_joint(r)

    # burnin
    burnin = 20
    for rid in xrange(burnin):
        print 'score:', kernel(rid)
    print 'finished burnin'
    plot_clusters(s, 'mnist_clusters.pdf')
    plot_clusters(s, 'mnist_clusters_bysize.pdf', scalebysize=True)
    plot_hyperparams(s, 'mnist_hyperparams.pdf')
    print 'groupcounts:', groupcounts(s)

    # posterior predictions
    present = D / 2
    absent = D - present
    queries = [tuple(Y_2[i]) for i in np.random.permutation(Y_2.shape[0])[:4]] + \
              [tuple(Y_3[i]) for i in np.random.permutation(Y_3.shape[0])[:4]]

    queries_masked = ma.masked_array(np.array(queries, dtype=[('', bool)] * D),
                                     mask=[(False, ) * present +
                                           (True, ) * absent])

    def postpred_sample(y_new):
        Y_samples = [s.sample_post_pred(y_new, r)[1] for _ in xrange(1000)]
        Y_samples = np.array([list(y) for y in np.hstack(Y_samples)])
        Y_avg = Y_samples.mean(axis=0)
        return Y_avg

    queries_masked = [postpred_sample(y) for y in queries_masked]
    data0 = np.hstack([q.reshape((W, W)) for q in queries_masked])
    data1 = np.hstack([
        np.clip(np.array(q, dtype=np.float), 0., 1.).reshape((W, W))
        for q in queries
    ])
    data = np.vstack([data0, data1])
    plt.imshow(data, cmap=plt.cm.binary, interpolation='nearest')
    plt.savefig('mnist_predict.pdf')
    plt.close()
Example #39
def _test_convergence(domains,
                      data,
                      reg_relations,
                      brute_relations,
                      kernel,
                      burnin_niters=10000,
                      skip=10,
                      ntries=50,
                      nsamples=1000,
                      places=2):
    r = rng()

    reg_defn = irm_definition(domains, reg_relations)
    brute_defn = irm_definition(domains, brute_relations)

    def score_fn(assignments):
        s = irm_initialize(brute_defn,
                           data,
                           r=r,
                           domain_assignments=assignments)
        assign = sum(s.score_assignment(i) for i in xrange(len(assignments)))
        likelihood = s.score_likelihood(r)
        return assign + likelihood

    product_assignments = tuple(map(list, map(permutation_iter, domains)))
    posterior = scores_to_probs(
        np.array(map(score_fn, it.product(*product_assignments))))

    s = irm_initialize(reg_defn, data, r=r)
    bounded_states = [irm_bind(s, i, data) for i in xrange(len(domains))]

    # burnin
    start = time.time()
    last = start
    for i in xrange(burnin_niters):
        for bs in bounded_states:
            kernel(bs, r)
        if not ((i + 1) % 1000):
            print 'burnin finished iteration', (i + 1), \
                'in', (time.time() - last), 'seconds'
            last = time.time()
    print 'finished burnin of', burnin_niters, \
        'iters in', (time.time() - start), 'seconds'

    idmap = {C: i for i, C in enumerate(it.product(*product_assignments))}

    #print idmap

    def sample_fn():
        for _ in xrange(skip):
            for bs in bounded_states:
                kernel(bs, r)
        key = tuple(
            tuple(permutation_canonical(bs.assignments()))
            for bs in bounded_states)
        return idmap[key]

    assert_discrete_dist_approx(sample_fn,
                                posterior,
                                ntries=ntries,
                                nsamples=nsamples,
                                kl_places=places)
Example #40
def test_gauss_cxx():
    import time
    _test_gauss(cxx_slice_sample, rng(int(time.time())))
Example #41
def test_convergence_simple():
    N, V = 2, 10
    defn = model_definition(N, V)
    data = [
        np.array([5, 6]),
        np.array([0, 1, 2]),
    ]
    view = numpy_dataview(data)
    prng = rng()

    scores = []
    idmap = {}
    for i, (tables, dishes) in enumerate(permutations([2, 3])):
        latent = model.initialize(defn,
                                  view,
                                  prng,
                                  table_assignments=tables,
                                  dish_assignments=dishes)
        scores.append(latent.score_assignment() + latent.score_data(prng))
        idmap[(tables, dishes)] = i
    true_dist = scores_to_probs(scores)

    def kernel(latent):
        # mutates latent in place
        doc_model = model.bind(latent, data=view)
        kernels.assign2(doc_model, prng)
        for did in xrange(latent.nentities()):
            table_model = model.bind(latent, document=did)
            kernels.assign(table_model, prng)

    latent = model.initialize(defn, view, prng)

    skip = 10

    def sample_fn():
        for _ in xrange(skip):
            kernel(latent)
        table_assignments = latent.table_assignments()
        canon_table_assigments = tuple(
            map(tuple, map(permutation_canonical, table_assignments)))

        dish_maps = latent.dish_assignments()
        dish_assignments = []
        for dm, (ta, ca) in zip(dish_maps,
                                zip(table_assignments,
                                    canon_table_assigments)):
            dish_assignment = []
            for t, c in zip(ta, ca):
                if c == len(dish_assignment):
                    dish_assignment.append(dm[t])
            dish_assignments.append(dish_assignment)

        canon_dish_assigments = tuple(
            map(tuple, map(permutation_canonical, dish_assignments)))

        return idmap[(canon_table_assigments, canon_dish_assigments)]

    assert_discrete_dist_approx(sample_fn,
                                true_dist,
                                ntries=100,
                                nsamples=10000,
                                kl_places=2)
Example #42
def test_multivariate_models_cxx():
    _test_multivariate_models(initialize, numpy_dataview, bind, gibbs_assign,
                              rng())
Example #43
def test_mnist_supervised():
    mnist_dataset = _get_mnist_dataset()
    classes = range(10)
    classmap = {c: i for i, c in enumerate(classes)}
    train_data, test_data = [], []
    for c in classes:
        Y = mnist_dataset['data'][np.where(
            mnist_dataset['target'] == float(c))[0]]
        Y_train, Y_test = train_test_split(Y, test_size=0.01)
        train_data.append(Y_train)
        test_data.append(Y_test)

    sample_size_max = 10000

    def mk_class_data(c, Y):
        n, D = Y.shape
        print 'number of digit', c, 'in training is', n
        dtype = [('', bool)] * D + [('', int)]
        inds = np.random.permutation(Y.shape[0])[:sample_size_max]
        Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]],
                     dtype=dtype)
        return Y

    Y_train = np.hstack(
        [mk_class_data(c, y) for c, y in zip(classes, train_data)])
    Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))]

    n, = Y_train.shape
    D = len(Y_train.dtype)
    print 'training data is', n, 'examples'
    print 'image dimension is', (D - 1), 'pixels'

    view = numpy_dataview(Y_train)
    defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))])
    r = rng()
    s = initialize(defn,
                   view,
                   cluster_hp={'alpha': 0.2},
                   feature_hps=[{
                       'alpha': 1.,
                       'beta': 1.
                   }] * (D - 1) + [{
                       'alphas': [1. for _ in classes]
                   }],
                   r=r)

    bound_s = bind(s, view)

    indiv_prior_fn = log_exponential(1.2)
    hparams = {
        i: {
            'alpha': (indiv_prior_fn, 1.5),
            'beta': (indiv_prior_fn, 1.5),
        }
        for i in xrange(D - 1)
    }
    hparams[D - 1] = {
        'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5)
        for idx in xrange(len(classes))
    }

    def print_prediction_results():
        results = []
        for c, Y_test in zip(classes, test_data):
            for y in Y_test:
                query = ma.masked_array(
                    np.array([tuple(y) + (0, )],
                             dtype=[('', bool)] * (D - 1) + [('', int)]),
                    mask=[(False, ) * (D - 1) + (True, )])[0]
                samples = [
                    s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)
                ]
                samples = np.bincount(samples, minlength=len(classes))
                prediction = np.argmax(samples)
                results.append((classmap[c], prediction, samples))
            print 'finished predictions for class', c

        Y_actual = np.array([a for a, _, _ in results], dtype=np.int)
        Y_pred = np.array([b for _, b, _ in results], dtype=np.int)
        print 'accuracy:', accuracy_score(Y_actual, Y_pred)
        print 'confusion matrix:'
        print confusion_matrix(Y_actual, Y_pred)

        # AUROC for one vs all (each class)
        for i, clabel in enumerate(classes):
            Y_true = np.copy(Y_actual)

            # treat class c as the "positive" example
            positive_examples = Y_actual == i
            negative_examples = Y_actual != i
            Y_true[positive_examples] = 1
            Y_true[negative_examples] = 0
            Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results])
            cls_auc = roc_auc_score(Y_true, Y_prob)
            print 'class', clabel, 'auc=', cls_auc

        #import matplotlib.pylab as plt
        #Y_prob = np.array([c for _, _, c in results])
        #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0)
        #plt.plot(fpr, tpr)
        #plt.show()

    def kernel(rid):
        start0 = time.time()
        assign(bound_s, r)
        sec0 = time.time() - start0

        start1 = time.time()
        hp(bound_s, r, hparams=hparams)
        sec1 = time.time() - start1

        print 'rid=', rid, 'nclusters=', s.ngroups(), \
            'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec'

        sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups())))
        print '  time_per_post_pred=', sec_per_post_pred, 'sec'

        # print group size breakdown
        sizes = [(gid, s.groupsize(gid)) for gid in s.groups()]
        sizes = sorted(sizes, key=lambda x: x[1], reverse=True)
        print '  group_sizes=', sizes

        print_prediction_results()

        # save state
        mkdirp("mnist-states")
        fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid))
        with open(fname, "w") as fp:
            fp.write(s.serialize())

    # training
    iters = 30
    for rid in xrange(iters):
        kernel(rid)
Example #44
def test_compare_to_mixture_model():
    r = rng()

    N, D = 4, 5

    Y = np.random.uniform(size=(N, D)) > 0.8
    Y_rec = np.array([tuple(y) for y in Y], dtype=[('', bool)] * D)

    mm_view = rec_numpy_dataview(Y_rec)
    irm_view = relation_numpy_dataview(Y)

    mm_def = mm_definition(N, [bb] * D)
    irm_def = irm_definition([N, D], [((0, 1), bb)])

    perms = list(permutation_iter(N))
    assignment = perms[np.random.randint(0, len(perms))]

    mm_s = mm_initialize(mm_def, mm_view, r=r, assignment=assignment)
    irm_s = irm_initialize(irm_def, [irm_view],
                           r=r,
                           domain_assignments=[
                               assignment,
                               range(D),
                           ])

    def assert_suff_stats_equal():
        assert set(mm_s.groups()) == set(irm_s.groups(0))
        assert irm_s.groups(1) == range(D)
        groups = mm_s.groups()
        for g in groups:
            for i in xrange(D):
                a = mm_s.get_suffstats(g, i)
                b = irm_s.get_suffstats(0, [g, i])
                if b is None:
                    b = {'heads': 0L, 'tails': 0L}
                assert a['heads'] == b['heads'] and a['tails'] == b['tails']

    assert_suff_stats_equal()
    assert_almost_equals(mm_s.score_assignment(),
                         irm_s.score_assignment(0),
                         places=3)

    bound_mm_s = mm_bind(mm_s, mm_view)
    bound_irm_s = irm_bind(irm_s, 0, [irm_view])

    # XXX: doesn't really have to be true, just is true of impl
    assert not bound_mm_s.empty_groups()
    assert not bound_irm_s.empty_groups()

    bound_mm_s.create_group(r)
    bound_irm_s.create_group(r)

    gid_a = bound_mm_s.remove_value(0, r)
    gid_b = bound_irm_s.remove_value(0, r)

    assert gid_a == gid_b
    assert_suff_stats_equal()

    x0, y0 = bound_mm_s.score_value(0, r)
    x1, y1 = bound_irm_s.score_value(0, r)
    assert x0 == x1  # XXX: not really a requirement

    # XXX: should really normalize and then check
    for a, b in zip(y0, y1):
        assert_almost_equals(a, b, places=2)
Example #45
def test_crp():
    for alpha in (0.1, 1.0, 10.0):
        _test_crp(initialize, numpy_dataview, alpha=alpha, r=rng())