def data_with_posterior(defn,
                        cluster_hp=None,
                        feature_hps=None,
                        preprocess_data_fn=None,
                        r=None):
    """Sample a dataset from `defn` and compute the exact posterior over
    clusterings by scoring every possible partition of the n points
    (so `defn.n()` must be small)."""
    # XXX(stephentu): should only accept conjugate models
    if r is None:
        r = rng()
    Y_clusters, _ = sample(defn, cluster_hp, feature_hps, r)
    Y = np.hstack(Y_clusters)
    if preprocess_data_fn:
        Y = preprocess_data_fn(Y)
    data = numpy_dataview(Y)

    def score_fn(assignment):
        s = initialize(defn,
                       data,
                       r,
                       cluster_hp=cluster_hp,
                       feature_hps=feature_hps,
                       assignment=assignment)
        return s.score_joint(r)

    posterior = dist_on_all_clusterings(score_fn, defn.n())
    return Y, posterior
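# A minimal usage sketch (hypothetical; names mirror this module's own
# imports). The posterior is computed by exhaustive enumeration, so keep
# n small:
#
#   defn = model_definition(4, [bb] * 2)
#   Y, posterior = data_with_posterior(defn)
#   # run an inference kernel on Y, tally the clusterings it visits, and
#   # compare the empirical frequencies against `posterior`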
def test_sample_sanity():
    # just a sanity check
    defn = model_definition(10, [bb, bnb, gp, nich, dd(5), niw(4)])
    clusters, samplers = sample(defn)
    assert_equals(len(clusters), len(samplers))
    for cluster in clusters:
        assert_true(len(cluster) > 0)
        for v in cluster:
            assert_equals(len(v), len(defn.models()))
def _test_nonconj_inference(initialize_fn,
                            dataview,
                            bind,
                            assign_nonconj_fn,
                            slice_theta_fn,
                            R,
                            ntries,
                            nsamples,
                            tol):
    N, D = 1000, 5
    defn = model_definition(N, [bbnc] * D)
    cluster_hp = {'alpha': 0.2}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D

    # rejection sample until we draw a dataset with exactly two clusters
    while True:
        Y_clustered, cluster_samplers = sample(
            defn, cluster_hp, feature_hps, R)
        if len(Y_clustered) == 2:
            break
    dominant = np.argmax(map(len, Y_clustered))
    truth = np.array([s.p for s in cluster_samplers[dominant]])
    print 'truth:', truth

    # see if we can learn the p-values for each of the two clusters. we
    # proceed by running gibbs_assign_nonconj, followed by slice sampling
    # on the posterior p(\theta | Y). we'll "cheat" a little by
    # bootstrapping the DP with the correct assignment (but not with the
    # correct p-values)
    Y, assignment = data_with_assignment(Y_clustered)
    view = dataview(Y)
    s = initialize_fn(defn,
                      view,
                      cluster_hp=cluster_hp,
                      feature_hps=feature_hps,
                      assignment=assignment,
                      r=R)
    bs = bind(s, view)

    def mkparam():
        return {'p': 0.1}
    thetaparams = {fi: mkparam() for fi in xrange(D)}

    def kernel():
        assign_nonconj_fn(bs, 10, R)
        slice_theta_fn(bs, R, tparams=thetaparams)

    def inference(niters):
        # after each kernel sweep, read off the per-feature p estimates
        # from the currently dominant group
        for _ in xrange(niters):
            kernel()
            groups = s.groups()
            inferred_dominant = groups[
                np.argmax([s.groupsize(gid) for gid in groups])]
            inferred = [s.get_suffstats(inferred_dominant, d)['p']
                        for d in xrange(D)]
            yield np.array(inferred)

    posterior = []
    while ntries:
        samples = list(inference(nsamples))
        posterior.extend(samples)
        inferred = sum(posterior) / len(posterior)
        diff = np.linalg.norm(truth - inferred)
        print 'inferred:', inferred
        print 'diff:', diff
        if diff <= tol:
            return
        ntries -= 1
        print 'tries left:', ntries
    assert False, 'did not converge'
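# A sketch of how a backend-specific test might drive the helper above.
# The import paths are assumptions based on the usual datamicroscopes
# layout and may need adjusting:
#
#   from microscopes.common.recarray.dataview import numpy_dataview
#   from microscopes.common.rng import rng
#   from microscopes.mixture.model import initialize, bind
#   from microscopes.kernels.gibbs import assign_resample
#   from microscopes.kernels.slice import theta
#
#   def test_nonconj_inference():
#       _test_nonconj_inference(
#           initialize, numpy_dataview, bind, assign_resample, theta,
#           R=rng(), ntries=5, nsamples=100, tol=0.2)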
def toy_dataset(defn):
    samples, _ = sample(defn)
    return np.hstack(samples)
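# Example (hypothetical sizes): materialize a dense dataset for a
# mixed-type model; the per-cluster samples are concatenated into a
# single array with defn.n() rows.
#
#   defn = model_definition(10, [bb, niw(3)])
#   Y = toy_dataset(defn)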