Example #1
import numpy as np

# `sample`, `initialize`, `numpy_dataview`, `rng`, and
# `dist_on_all_clusterings` are helpers defined elsewhere in the
# (data)microscopes test suite these examples are drawn from.
def data_with_posterior(defn,
                        cluster_hp=None,
                        feature_hps=None,
                        preprocess_data_fn=None,
                        r=None):
    # XXX(stephentu): should only accept conjugate models
    if r is None:
        r = rng()
    Y_clusters, _ = sample(defn, cluster_hp, feature_hps, r)
    Y = np.hstack(Y_clusters)
    if preprocess_data_fn:
        Y = preprocess_data_fn(Y)
    data = numpy_dataview(Y)

    def score_fn(assignment):
        s = initialize(defn,
                       data,
                       r,
                       cluster_hp=cluster_hp,
                       feature_hps=feature_hps,
                       assignment=assignment)
        return s.score_joint(r)

    posterior = dist_on_all_clusterings(score_fn, defn.n())
    return Y, posterior
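Conceptually, `dist_on_all_clusterings` scores every distinct clustering of the `defn.n()` points with `score_fn` and normalizes the exponentiated joint scores into a probability table. Below is a self-contained sketch of that idea, under the assumption that the helper enumerates set partitions; it is an illustrative reimplementation, not the library's code:

import numpy as np

def all_clusterings(n):
    # Enumerate assignments in "restricted growth" form so each set
    # partition of the n points appears exactly once: point 0 is always in
    # cluster 0, and a new cluster id is at most one larger than any id
    # already used.
    def rec(prefix, max_id):
        if len(prefix) == n:
            yield tuple(prefix)
            return
        for c in range(max_id + 2):
            yield from rec(prefix + [c], max(max_id, c))
    return rec([0], 0)

def brute_force_posterior(score_fn, n):
    assignments = list(all_clusterings(n))
    scores = np.array([score_fn(list(a)) for a in assignments])
    # Scores are joint log-probabilities; shift by the max before
    # exponentiating for numerical stability, then normalize.
    probs = np.exp(scores - scores.max())
    probs /= probs.sum()
    return dict(zip(assignments, probs))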
Example #2
def test_sample_sanity():
    # Sanity check: sample() should return one sampler per cluster, and
    # every sampled point should carry one value per feature model.
    defn = model_definition(10, [bb, bnb, gp, nich, dd(5), niw(4)])
    clusters, samplers = sample(defn)
    assert_equals(len(clusters), len(samplers))
    for cluster in clusters:
        assert_true(len(cluster) > 0)
        for v in cluster:
            assert_equals(len(v), len(defn.models()))
Example #3
def _test_nonconj_inference(initialize_fn,
                            dataview,
                            bind,
                            assign_nonconj_fn,
                            slice_theta_fn,
                            R,
                            ntries,
                            nsamples,
                            tol):
    N, D = 1000, 5
    defn = model_definition(N, [bbnc] * D)
    cluster_hp = {'alpha': 0.2}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D

    # Rejection-sample datasets until one comes back with exactly two clusters.
    while True:
        Y_clustered, cluster_samplers = sample(
            defn, cluster_hp, feature_hps, R)
        if len(Y_clustered) == 2:
            break
    dominant = np.argmax([len(c) for c in Y_clustered])
    truth = np.array([s.p for s in cluster_samplers[dominant]])
    print('truth:', truth)

    # See if we can learn the p-values for each of the two clusters. We
    # proceed by running gibbs_assign_nonconj, followed by slice sampling on
    # the posterior p(\theta | Y). We "cheat" a little by bootstrapping the
    # DP with the correct assignment (but not with the correct p-values).
    Y, assignment = data_with_assignment(Y_clustered)
    view = dataview(Y)
    s = initialize_fn(
        defn, view, cluster_hp=cluster_hp,
        feature_hps=feature_hps, assignment=assignment, r=R)
    bs = bind(s, view)

    def mkparam():
        return {'p': 0.1}
    thetaparams = {fi: mkparam() for fi in range(D)}

    def kernel():
        assign_nonconj_fn(bs, 10, R)
        slice_theta_fn(bs, R, tparams=thetaparams)

    def inference(niters):
        for _ in range(niters):
            kernel()
            groups = s.groups()
            inferred_dominant = groups[
                np.argmax([s.groupsize(gid) for gid in groups])]
            inferred = [s.get_suffstats(inferred_dominant, d)['p']
                        for d in range(D)]
            inferred = np.array(inferred)
            yield inferred

    posterior = []
    while ntries:
        samples = list(inference(nsamples))
        posterior.extend(samples)
        inferred = sum(posterior) / len(posterior)
        diff = np.linalg.norm(truth - inferred)
        print('inferred:', inferred)
        print('diff:', diff)
        if diff <= tol:
            return
        ntries -= 1
        print('tries left:', ntries)

    assert False, 'did not converge'
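The slice step in `kernel()` resamples each feature's `p` from the conditional posterior p(\theta | Y). As a standalone illustration of that kind of kernel, here is a minimal univariate slice sampler (step-out plus shrinkage, after Neal 2003) applied to a Beta-Bernoulli posterior. Everything below is a hypothetical sketch for intuition, not the library's `slice_theta_fn`:

import numpy as np

def slice_sample(logpdf, x0, w=0.25, rng=np.random):
    # One slice-sampling update: draw an auxiliary height under the density
    # at x0, step out an interval of width w until both ends fall outside
    # the slice, then shrink the interval until a draw lands inside it.
    logy = logpdf(x0) + np.log(rng.uniform())
    lo = x0 - w * rng.uniform()
    hi = lo + w
    while logpdf(lo) > logy:
        lo -= w
    while logpdf(hi) > logy:
        hi += w
    while True:
        x1 = rng.uniform(lo, hi)
        if logpdf(x1) > logy:
            return x1
        if x1 < x0:
            lo = x1
        else:
            hi = x1

# Unnormalized log-posterior of p for Bernoulli data with a flat Beta(1, 1)
# prior; the true p here is 0.7.
Y = np.random.rand(500) < 0.7
heads = Y.sum()

def logpost(p):
    if not 0.0 < p < 1.0:
        return -np.inf
    return heads * np.log(p) + (len(Y) - heads) * np.log(1.0 - p)

p, samples = 0.5, []
for _ in range(1000):
    p = slice_sample(logpost, p)
    samples.append(p)
print('posterior mean of p:', np.mean(samples))  # should be close to 0.7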
Example #5
def toy_dataset(defn):
    # Draw one synthetic dataset from the model definition and stack the
    # per-cluster samples into a single array.
    samples, _ = sample(defn)
    return np.hstack(samples)
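A minimal usage sketch, reusing the `model_definition` and `bb` names from Example #2 (the sizes here are arbitrary):

defn = model_definition(10, [bb] * 3)
Y = toy_dataset(defn)
print(len(Y))  # all 10 sampled rows, stacked back into a single array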