def test_piecewise_logpdf():
    pw = PieceWise([0, 1], [2], sigma=1, flip=.8)
    # x,z
    pw.simulate(None, [0, 1], None, {2: 1})
    pw.logpdf(None, {0: 1.5, 1: 0}, None, {2: 1})
    # x
    pw.simulate(None, [0], None, {2: 1})
    pw.logpdf(None, {0: 1.5}, None, {2: 1})
    # z
    pw.simulate(None, [1], None, {2: 1})
    assert np.allclose(
        logsumexp([
            pw.logpdf(None, {1: 0}, None, {2: 1}),
            pw.logpdf(None, {1: 1}, None, {2: 1}),
        ]), 0)
    # z|x
    pw.simulate(None, [1], {0: 1.5}, {2: 1})
    assert np.allclose(
        logsumexp([
            pw.logpdf(None, {1: 0}, {0: 1.5}, {2: 1}),
            pw.logpdf(None, {1: 1}, {0: 1.5}, {2: 1}),
        ]), 0)
    # x|z
    pw.simulate(None, [0], {1: 0}, {2: 1})
    pw.logpdf(None, {0: 1.5}, {1: 0}, {2: 1})

def test_crp_posterior_logpdf():
    view = retrieve_view()
    fresh_row = {0: 2, 1: 3, 2: .5}
    logps = [
        view.logpdf(None, {view.outputs[0]: k}, fresh_row)
        for k in [0, 1, 2]
    ]
    assert np.allclose(gu.logsumexp(logps), 0)

def test_one(forest, c):
    D_sub = [(i, row) for (i, row) in enumerate(D) if row[0] not in c]
    for rowid, row in D_sub:
        inputs = {i: row[i] for i in forest.inputs}
        targets = [{0: x} for x in xrange(NUM_CLASSES)]
        lps = [forest.logpdf(rowid, q, None, inputs) for q in targets]
        assert np.allclose(gu.logsumexp(lps), 0)

def calc_predictive_logp(x, y, regressor, counts, alpha):
    logp_uniform = -np.log(len(counts))
    if not hasattr(regressor, 'classes_'):
        return logp_uniform
    elif x not in regressor.classes_:
        return np.log(alpha) + logp_uniform
    else:
        index = list(regressor.classes_).index(x)
        logp_rf = regressor.predict_log_proba([y])[0][index]
        return gu.logsumexp(
            [np.log(alpha) + logp_uniform, np.log(1 - alpha) + logp_rf])

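# A minimal standalone sketch (not part of the module above) of the final
# branch in calc_predictive_logp: mixing a uniform fallback with a classifier's
# log-probability in log space. All numbers below are illustrative assumptions.
import numpy as np
from scipy.special import logsumexp

alpha = 0.1                    # assumed weight on the uniform fallback
logp_uniform = -np.log(4)      # uniform over an assumed 4 classes
logp_rf = np.log(0.7)          # assumed classifier log-probability for the class

# log(alpha * p_uniform + (1 - alpha) * p_rf), computed stably in log space.
logp_mix = logsumexp([np.log(alpha) + logp_uniform,
                      np.log(1 - alpha) + logp_rf])
assert np.allclose(logp_mix, np.log(alpha * 0.25 + (1 - alpha) * 0.7))
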
def view_logpdf(view, rowid, targets, constraints):
    if not view.hypothetical(rowid):
        return _logpdf_row(view, targets, view.Zr(rowid))
    Nk = view.Nk()
    N_rows = len(view.Zr())
    K = view.crp.clusters[0].gibbs_tables(-1)
    lp_crp = [Crp.calc_predictive_logp(k, N_rows, Nk, view.alpha()) for k in K]
    lp_constraints = [_logpdf_row(view, constraints, k) for k in K]
    if all(np.isinf(lp_constraints)):
        raise ValueError('Zero density constraints: %s' % (constraints,))
    lp_cluster = log_normalize(np.add(lp_crp, lp_constraints))
    lp_targets = [_logpdf_row(view, targets, k) for k in K]
    return logsumexp(np.add(lp_cluster, lp_targets))

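# A hedged, self-contained sketch of the marginalization pattern above, with
# made-up numbers standing in for the CRP predictive and the per-cluster
# densities: form the cluster posterior from the prior and the constraint
# likelihoods, then average the target likelihoods under that posterior.
import numpy as np
from scipy.special import logsumexp

def log_normalize(logps):
    return np.subtract(logps, logsumexp(logps))

p_crp = np.array([0.5, 0.3, 0.2])            # assumed CRP predictive over 3 tables
p_constraints = np.array([0.10, 0.40, 0.05]) # assumed p(constraints | cluster k)
p_targets = np.array([0.20, 0.25, 0.60])     # assumed p(targets | cluster k)

lp_cluster = log_normalize(np.log(p_crp) + np.log(p_constraints))
lp = logsumexp(lp_cluster + np.log(p_targets))

# Cross-check against the direct-space computation.
posterior = (p_crp * p_constraints) / np.sum(p_crp * p_constraints)
assert np.allclose(np.exp(lp), np.sum(posterior * p_targets))
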
def logpdf(self, rowid, targets, constraints=None, inputs=None):
    constraints = self.populate_constraints(rowid, targets, constraints)
    # XXX Disable logpdf queries without constraints.
    if inputs:
        raise ValueError('Prohibited inputs: %s' % (inputs,))
    if not constraints:
        raise ValueError(
            'Provide at least one constraint: %s' % (constraints,))
    self._validate_simulate_logpdf(rowid, targets, constraints)
    # Retrieve the dataset and neighborhoods.
    dataset, neighborhoods = self._find_neighborhoods(targets, constraints)
    models = [
        self._create_local_model_joint(targets, dataset[n])
        for n in neighborhoods
    ]
    # Compute the logpdf in each neighborhood and take the simple average.
    lp = [m.logpdf(targets) for m in models]
    return gu.logsumexp(lp) - np.log(len(models))

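# A small standalone sketch of the averaging step above: the mean of M
# densities, computed from their log values as logsumexp(lp) - log(M). The
# per-neighborhood log densities below are illustrative.
import numpy as np
from scipy.special import logsumexp

lp = np.log([0.02, 0.05, 0.01])    # assumed per-neighborhood log densities
lp_avg = logsumexp(lp) - np.log(len(lp))
assert np.allclose(np.exp(lp_avg), np.mean([0.02, 0.05, 0.01]))
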
def test_bernoulli():
    # Switch for multiprocess (0 is faster).
    multiprocess = 0
    # Create categorical data of DATA_NUM_0 zeros and DATA_NUM_1 ones.
    data = np.transpose(np.array([[0] * DATA_NUM_0 + [1] * DATA_NUM_1]))
    # Run a single chain for a few iterations.
    engine = Engine(
        data,
        cctypes=['categorical'],
        distargs=[{'k': 2}],
        rng=gu.gen_rng(0),
        multiprocess=0)
    engine.transition(NUM_ITER, multiprocess=multiprocess)
    # Simulate from a hypothetical row and compute the proportion of ones.
    sample = engine.simulate(-1, [0], N=NUM_SIM, multiprocess=multiprocess)[0]
    sum_b = sum(s[0] for s in sample)
    observed_prob_of_1 = (float(sum_b) / float(NUM_SIM))
    true_prob_of_1 = float(DATA_NUM_1) / float(DATA_NUM_0 + DATA_NUM_1)
    # Check a 10% relative match.
    assert np.allclose(true_prob_of_1, observed_prob_of_1, rtol=.1)
    # Simulate from an observed row as a crash test.
    sample = engine.simulate(1, [0], N=1, multiprocess=multiprocess)
    # Ensure the unobserved-row probabilities are normalized.
    p0_uob = engine.logpdf(-1, {0: 0}, multiprocess=multiprocess)[0]
    p1_uob = engine.logpdf(-1, {0: 1}, multiprocess=multiprocess)[0]
    assert np.allclose(gu.logsumexp([p0_uob, p1_uob]), 0)
    # A logpdf query constraining an observed cell raises an error.
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 0}, multiprocess=multiprocess)
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 1}, multiprocess=multiprocess)

def logpdf(self, rowid, targets, constraints=None, inputs=None):
    assert targets
    assert inputs.keys() == self.inputs
    y = inputs[self.inputs[0]]
    # Case 1: No evidence on the outputs.
    if not constraints:
        # Case 1.1: Both x and z are in the targets.
        if self.outputs[0] in targets and self.outputs[1] in targets:
            x = targets[self.outputs[0]]
            z = targets[self.outputs[1]]
            # XXX Check if z in [0, 1].
            logp_z = np.log(self.flip) if z == 0 else np.log(1 - self.flip)
            logp_x = logpdf_normal(x, y + (2 * z - 1), self.sigma)
            logp = logp_x + logp_z
        # Case 1.2: Only z is in the targets.
        elif self.outputs[1] in targets:
            z = targets[self.outputs[1]]
            logp_z = np.log(self.flip) if z == 0 else np.log(1 - self.flip)
            logp = logp_z
        # Case 1.3: Only x is in the targets; marginalize over z.
        elif self.outputs[0] in targets:
            x = targets[self.outputs[0]]
            logp_xz0 = self.logpdf(
                rowid, {self.outputs[0]: x, self.outputs[1]: 0},
                constraints, inputs)
            logp_xz1 = self.logpdf(
                rowid, {self.outputs[0]: x, self.outputs[1]: 1},
                constraints, inputs)
            logp = gu.logsumexp([logp_xz0, logp_xz1])
        else:
            raise ValueError('Invalid query pattern: %s %s %s'
                % (targets, constraints, inputs))
    # Case 2: logpdf of x given z.
    elif constraints.keys() == [self.outputs[1]]:
        assert targets.keys() == [self.outputs[0]]
        z = constraints[self.outputs[1]]
        x = targets[self.outputs[0]]
        logp_xz = self.logpdf(
            rowid, {self.outputs[0]: x, self.outputs[1]: z},
            None, {self.inputs[0]: y})
        logp_z = self.logpdf(
            rowid, {self.outputs[1]: z}, None, {self.inputs[0]: y})
        logp = logp_xz - logp_z
    # Case 3: logpdf of z given x.
    elif constraints.keys() == [self.outputs[0]]:
        assert targets.keys() == [self.outputs[1]]
        z = targets[self.outputs[1]]
        x = constraints[self.outputs[0]]
        logp_xz = self.logpdf(
            rowid, {self.outputs[0]: x, self.outputs[1]: z},
            None, {self.inputs[0]: y})
        logp_x = self.logpdf(
            rowid, {self.outputs[0]: x}, None, {self.inputs[0]: y})
        logp = logp_xz - logp_x
    else:
        raise ValueError('Invalid query pattern: %s %s %s'
            % (targets, constraints, inputs))
    return logp

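# A standalone sanity sketch (independent of the class above) of Case 1.3:
# with z ~ Bernoulli where P(z=0) = flip, and x | z, y ~ Normal(y + 2z - 1,
# sigma), the marginal density of x given y is a two-component Gaussian
# mixture. The parameter values below are illustrative.
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

flip, sigma, y, x = 0.8, 1.0, 1.0, 1.5
logp_xz0 = np.log(flip) + norm.logpdf(x, loc=y - 1, scale=sigma)
logp_xz1 = np.log(1 - flip) + norm.logpdf(x, loc=y + 1, scale=sigma)
logp_x = logsumexp([logp_xz0, logp_xz1])
assert np.allclose(
    np.exp(logp_x),
    flip * norm.pdf(x, y - 1, sigma) + (1 - flip) * norm.pdf(x, y + 1, sigma))
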
def relevance_probability(view, rowid_target, rowid_query):
    """Compute the probability that the target and query rows share a table.

    Given a single target rowid T and a list of query rowids Q, compute the
    posterior probability that T and all rowids in Q are assigned to the same
    table, conditioned on all rowids in Q being assigned to the same table as
    well as the row data values xT and xQ.

    Let S be the event that all rowids in Q are assigned to the same table:

        S = [zQ[0] = zQ[1] = ... = zQ[-1]]

    The first quantity of interest is:

        Pr[zT = zQ | xT, xQ, S] = Pr[zT = zQ, xT, xQ, S] / Pr[xT, xQ, S]

    The numerator is:

        Pr[zT = zQ, xT, xQ, S]
            = \sum_k Pr[zT=k, zQ=k, xT, xQ]
            = \sum_k Pr[xT, xQ | zT=k, zQ=k] * Pr[zT=k, zQ=k]

    where k ranges over the tables in the CRP plus a fresh singleton.

    The second quantity of interest is:

        Pr[zT \ne zQ | xT, xQ, S] = Pr[zT \ne zQ, xT, xQ, S] / Pr[xT, xQ, S]

    The numerator is:

        Pr[zT \ne zQ, xT, xQ, S]
            = \sum_kT \sum_{kQ|kT} Pr[zT=kT, zQ=kQ, xT, xQ]
            = \sum_kT \sum_{kQ|kT} Pr[xT, xQ | zT=kT, zQ=kQ] * Pr[zT=kT, zQ=kQ]

    where kT ranges over the tables in the CRP plus a fresh singleton, and
    kQ|kT in the inner sum ranges over all tables in the CRP other than kT
    (plus a fresh singleton when kT is itself a singleton). For example, if
    the tables are [0, 1] then:

        kT    = [0, 1, 2]
        kQ|kT = [[1, 2], [0, 2], [0, 1, 3]]

    If the computation is correct then the two numerators above sum to the
    normalizer, which is given by:

        Pr[xT, xQ, S]
            = \sum_kQ Pr[zQ[0]=kQ, ..., zQ[-1]=kQ, xT, xQ]
            = \sum_kQ Pr[xT, xQ | zQ] * Pr[zQ[0]=kQ, ..., zQ[-1]=kQ]
            = \sum_kQ (\sum_kT Pr[xT, zT=kT]) * Pr[xQ | zQ]
                * Pr[zQ[0]=kQ, ..., zQ[-1]=kQ]
            = \sum_kQ (\sum_kT Pr[xT | zT=kT] * Pr[zT=kT | zQ=kQ]) * Pr[xQ | zQ]
                * Pr[zQ[0]=kQ, ..., zQ[-1]=kQ]

    where kQ ranges over the tables in the CRP plus a fresh singleton. The
    inner sum over kT computes the predictive density of xT when all the rows
    in Q are in table kQ, marginalizing over all assignments of zT.

    Parameters
    ----------
    view : cgpm.mixtures.View
        View CGPM representing the DP mixture.
    rowid_target : int
        The target rowid, must be incorporated in the view.
    rowid_query : list<int>
        The query rowids, must be incorporated in the view.

    Returns
    -------
    relevance_probability : float
        The posterior probability that the target is in the same cluster as
        the query.
    """
    if len(rowid_query) < 1:
        raise ValueError('No query rows: %s' % (rowid_query,))
    if rowid_target in rowid_query:
        return 1.

    # Retrieve the target's CRP assignment and data to restore later.
    assignments_target = view.Zr(rowid_target)
    values_target = row_values(view, rowid_target)

    # Retrieve the query's CRP assignments and data to restore later.
    values_query = [row_values(view, r) for r in rowid_query]
    assignments_query = [view.Zr(r) for r in rowid_query]

    # Retrieve the view's logpdf score to verify no mutation afterwards.
    if check_env_debug():
        logpdf_score_full = view.logpdf_score()

    # Unincorporate the target and query rows.
    view.unincorporate(rowid_target)
    for rowid_q in rowid_query:
        view.unincorporate(rowid_q)

    # Retrieve the current tables.
    tables_crp = sorted(view.crp.clusters[0].counts)

    # Retrieve the cluster-wise marginal likelihoods.
    tables_same = get_tables_same(tables_crp)
    logps_clusters = [get_view_logpdf_score(view, t, t) for t in tables_same]

    # Compute Pr[xT, xQ, S]
    #   = \sum_kT \sum_kQ Pr[zT=kT, zQ=kQ, xT, xQ]
    #   = \sum_kT \sum_kQ Pr[xT, xQ | zT=kT, zQ=kQ] * Pr[zT=kT, zQ=kQ]
    logps_condition = [
        logpdf_assignments_marginalize_target(
            view, rowid_target, rowid_query, values_target, values_query,
            table_query)
        for table_query in tables_same
    ]
    logp_condition = logsumexp(np.subtract(logps_condition, logps_clusters))

    # Compute Pr[zT = zQ, xT, xQ, S]
    #   = \sum_k Pr[zT=k, zQ=k, xT, xQ]
    #   = \sum_k Pr[xT, xQ | zT=k, zQ=k] * Pr[zT=k, zQ=k]
    logps_same_table = [
        logpdf_assignments(
            view, rowid_target, rowid_query, values_target, values_query,
            table, table)
        for table in tables_same
    ]
    logp_same_table = logsumexp(np.subtract(logps_same_table, logps_clusters))

    # ------------------------------------------------------------------------
    # The following computation is not necessary and introduces O(K^2)
    # overhead due to the nested sum, but serves as a vital check for a
    # correct implementation (as noted in the docstring).
    # ------------------------------------------------------------------------
    # Compute Pr[zT \ne zQ, xT, xQ, S]
    #   = \sum_kT \sum_{kQ|kT} Pr[zT=kT, zQ=kQ, xT, xQ]
    #   = \sum_kT \sum_{kQ|kT} Pr[xT, xQ | zT=kT, zQ=kQ] * Pr[zT=kT, zQ=kQ]
    if check_env_debug():
        tables_target, tables_query = get_tables_different(tables_crp)

        # Compute the base logps.
        logps_clusters_diff = [
            [get_view_logpdf_score(view, table_target, table_q)
                for table_q in table_query]
            for table_target, table_query
            in zip(tables_target, tables_query)
        ]

        # Compute the new logps.
        logps_diff_table = [
            [logpdf_assignments(
                view, rowid_target, rowid_query, values_target, values_query,
                table_target, table_q)
                for table_q in table_query]
            for table_target, table_query
            in zip(tables_target, tables_query)
        ]

        # Compute the deltas.
        logps_delta = [
            np.subtract(a, b)
            for (a, b) in zip(logps_diff_table, logps_clusters_diff)
        ]

        # Sum the deltas.
        logp_diff_table = logsumexp([logsumexp(l) for l in logps_delta])

        # Confirm logp_same_table and logp_diff_table sum to the normalizer.
        assert np.allclose(
            logsumexp([logp_same_table, logp_diff_table]),
            logp_condition)

    # Restore the target row.
    values_target[view.outputs[0]] = assignments_target
    view.incorporate(rowid_target, values_target)

    # Restore the query rows.
    for rowid, values, z in zip(rowid_query, values_query, assignments_query):
        values[view.outputs[0]] = z
        view.incorporate(rowid, values)

    # Confirm no mutation has occurred.
    if check_env_debug():
        assert np.allclose(view.logpdf_score(), logpdf_score_full)

    # Return the relevance probability.
    return np.exp(logp_same_table - logp_condition)

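# A tiny illustrative sketch (made-up log values, not taken from any view) of
# the final conversion out of log space: since the same-table and
# different-table masses sum to the normalizer, the direct-space probabilities
# sum to one.
import numpy as np
from scipy.special import logsumexp

logp_same_table = np.log(0.3)   # illustrative
logp_diff_table = np.log(0.1)   # illustrative
logp_condition = logsumexp([logp_same_table, logp_diff_table])

p_same = np.exp(logp_same_table - logp_condition)
p_diff = np.exp(logp_diff_table - logp_condition)
assert np.allclose(p_same + p_diff, 1.0)
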
def logpdf(self, rowid, targets, constraints=None, inputs=None):
    # As discussed in https://github.com/probcomp/cgpm/issues/116, for an
    # observed rowid we synthesize a new hypothetical row which is identical
    # (in terms of observed and latent values) to the observed rowid. In this
    # version of the implementation, the user may not override any non-null
    # values in the observed rowid (_populate_constraints raises an error in
    # this case). A user should either (i) use another rowid, since overriding
    # existing values in the observed rowid no longer specifies that rowid, or
    # (ii) use some sequence of incorporate/unincorporate depending on their
    # query.
    constraints = self._populate_constraints(rowid, targets, constraints)
    if not self.hypothetical(rowid):
        rowid = None
    # Prepare the importance network.
    network = self.build_network()
    if self.outputs[0] in constraints:
        # Condition on the cluster assignment.
        #   p(xT|xC,z=k) computed directly by the network.
        return network.logpdf(rowid, targets, constraints, inputs)
    elif self.outputs[0] in targets:
        # Query the cluster assignment.
        #   p(z=k,xT|xC)
        #   = p(z=k,xT,xC) / p(xC)          Bayes rule
        #   = p(z=k)p(xT,xC|z=k) / p(xC)    chain rule on numerator
        # The terms are then:
        #   p(z=k)          lp_cluster
        #   p(xT,xC|z=k)    lp_numer
        #   p(xC)           lp_denom
        k = targets[self.outputs[0]]
        constraints_z = {self.outputs[0]: k}
        targets_nz = {c: targets[c] for c in targets if c != self.outputs[0]}
        targets_numer = merged(targets_nz, constraints)
        lp_cluster = network.logpdf(rowid, constraints_z, None, inputs)
        lp_numer = (
            network.logpdf(rowid, targets_numer, constraints_z, inputs)
            if targets_numer else 0)
        lp_denom = self.logpdf(rowid, constraints) if constraints else 0
        return (lp_cluster + lp_numer) - lp_denom
    else:
        # Marginalize over the cluster assignment by enumeration.
        # Let K be a list of values for the support of z:
        #   P(xT|xC)
        #   = \sum_k p(xT|z=k,xC)p(z=k|xC)          marginalization
        # Now consider p(z=k|xC) \propto p(z=k,xC)  Bayes rule
        # The terms are then:
        #   p(z=K[i],xC)        lp_constraints_unorm[i]
        #   p(z=K[i]|xC)        lp_constraints[i]
        #   p(xT|z=K[i],xC)     lp_targets[i]
        K = self.crp.clusters[0].gibbs_tables(-1)
        constraints = [merged(constraints, {self.outputs[0]: k}) for k in K]
        lp_constraints_unorm = [
            network.logpdf(rowid, const, None, inputs)
            for const in constraints
        ]
        lp_constraints = gu.log_normalize(lp_constraints_unorm)
        lp_targets = [
            network.logpdf(rowid, targets, const, inputs)
            for const in constraints
        ]
        return gu.logsumexp(np.add(lp_constraints, lp_targets))

def test_logsumexp():
    inf = float('inf')
    nan = float('nan')
    with pytest.raises(OverflowError):
        math.log(sum(map(math.exp, range(1000))))
    assert relerr(999.4586751453871, gu.logsumexp(range(1000))) < 1e-15
    assert gu.logsumexp([]) == -inf
    assert gu.logsumexp([-1000.]) == -1000.
    assert gu.logsumexp([-1000., -1000.]) == -1000. + math.log(2.)
    assert relerr(math.log(2.), gu.logsumexp([0., 0.])) < 1e-15
    assert gu.logsumexp([-inf, 1]) == 1
    assert gu.logsumexp([-inf, -inf]) == -inf
    assert gu.logsumexp([+inf, +inf]) == +inf
    assert math.isnan(gu.logsumexp([-inf, +inf]))
    assert math.isnan(gu.logsumexp([nan, inf]))
    assert math.isnan(gu.logsumexp([nan, -3]))

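# A hedged reference sketch of the standard max-shift trick that makes the
# first assertion above possible: log(sum(exp(x_i))) overflows when computed
# naively for x = range(1000), but subtracting the maximum first keeps every
# exponent non-positive. This is a sketch of the usual technique, not
# necessarily the exact gu.logsumexp implementation (in particular, the
# inf/nan edge cases tested above need explicit handling in production code).
import math

def logsumexp_sketch(array):
    array = list(array)
    if not array:
        return -float('inf')
    m = max(array)
    if math.isinf(m) or math.isnan(m):
        return m
    return m + math.log(sum(math.exp(a - m) for a in array))

assert abs(logsumexp_sketch(range(1000)) - 999.4586751453871) < 1e-10
assert abs(logsumexp_sketch([-1000., -1000.]) - (-1000. + math.log(2.))) < 1e-12
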
def logpdf(self, rowid, targets, constraints=None, inputs=None):
    if rowid in self.rowid_to_component:
        # Condition on the cluster assignment directly.
        #   p(xT|xC,z=k)
        assert not constraints or self.indexer not in constraints
        z = self.rowid_to_component[rowid]
        return self._logpdf_one(rowid, targets, constraints, inputs, z)
    elif self.indexer in targets:
        # Query the cluster assignment.
        #   p(z=k,xT|xC)
        #   = p(z=k,xT,xC) / p(xC)          Bayes rule
        #   = p(z=k)p(xT,xC|z=k) / p(xC)    chain rule on numerator
        # The terms are then:
        #   p(z=k)                  lp_z
        #   p(xT,xC|z=k)            lp_x_joint
        #   p(xC) = \sum_z P(xC,z)  lp_x_constraints (recursively)
        z = targets[self.indexer]
        inputs_z = get_intersection(self.inputs_z, inputs)
        lp_z = self.cgpm_row_divide.logpdf(
            rowid=rowid,
            targets={self.indexer: z},
            constraints=None,
            inputs=inputs_z)
        targets_joint = merged(targets, constraints or {})
        lp_x_joint = self._logpdf_one(
            rowid=rowid,
            targets=targets_joint,
            constraints=None,
            inputs=inputs,
            component=z)
        lp_x_constraints = self.logpdf(
            rowid=rowid,
            targets=constraints,
            constraints=None,
            inputs=inputs) if constraints else 0
        return (lp_z + lp_x_joint) - lp_x_constraints
    elif constraints and self.indexer in constraints:
        # Condition on the cluster assignment.
        #   P(xT|xC,z=k)
        #   = P(xT,xC,z=k) / P(xC,z=k)
        #   = P(xT,xC|z=k)P(z=k) / P(xC|z=k)P(z=k)
        #   = P(xT,xC|z=k) / P(xC|z=k)
        # The terms are then:
        #   P(xT,xC|z=k)    lp_x_joint
        #   P(xC|z=k)       lp_x_constraints
        z = constraints[self.indexer]
        if z not in self.cgpm_row_divide.support():
            raise ValueError('Constrained cluster has 0 density: %s' % (z,))
        targets_joint = merged(targets, constraints)
        lp_x_joint = self._logpdf_one(
            rowid=rowid,
            targets=targets_joint,
            constraints=None,
            inputs=inputs,
            component=z)
        lp_x_constraints = self._logpdf_one(
            rowid=rowid,
            targets=constraints,
            constraints=None,
            inputs=inputs,
            component=z)
        return lp_x_joint - lp_x_constraints
    else:
        # Marginalize over the cluster assignment by enumeration.
        # Let K be a list of values for the support of z:
        #   P(xT|xC)
        #   = \sum_i P(xT,z=K[i]|xC)
        #   = \sum_i P(xT|xC,z=K[i])P(z=K[i]|xC)    chain rule
        #
        # The posterior is given by:
        #   P(z=K[i]|xC) = P(xC|z=K[i])P(z=K[i]) / \sum_i P(xC,z=K[i])
        #
        # The terms are therefore:
        #   P(z=K[i])                       lp_z_prior[i]
        #   P(xC|z=K[i])                    lp_constraints_likelihood[i]
        #   P(xC,z=K[i])                    lp_z_constraints[i]
        #   P(z=K[i]|xC)                    lp_z_posterior[i]
        #   P(xT|xC,z=K[i])                 lp_targets_likelihood[i]
        #   P(xT|xC,z=K[i])P(z=K[i]|xC)     lp_joint[i]
        inputs_z = get_intersection(self.inputs_z, inputs)
        z_support = self.cgpm_row_divide.support()
        lp_z_prior = [
            self.cgpm_row_divide.logpdf(
                rowid, {self.indexer: z}, None, inputs_z)
            for z in z_support
        ]
        lp_constraints_likelihood = [
            self._logpdf_one(rowid, constraints, None, inputs, z)
            for z in z_support
        ]
        lp_z_constraints = np.add(lp_z_prior, lp_constraints_likelihood)
        lp_z_posterior = log_normalize(lp_z_constraints)
        lp_targets_likelihood = [
            self._logpdf_one(rowid, targets, constraints, inputs, z)
            for z in z_support
        ]
        lp_joint = np.add(lp_targets_likelihood, lp_z_posterior)
        return logsumexp(lp_joint)

# Joint equals chain rule for state 1.
joint = state.logpdf(-1, {0: 1, 1: 2})
chain = state.logpdf(-1, {0: 1}, {1: 2}) + state.logpdf(-1, {1: 2})
assert np.allclose(joint, chain)

if False:
    state2 = State(
        T.T, cctypes=cctypes, distargs=distargs, rng=gu.gen_rng(12))
    state2.transition(N=10, progress=1)

    # Joint equals chain rule for state 2.
    state2.logpdf(-1, {0: 1, 1: 2})
    state2.logpdf(-1, {0: 1}, {1: 2}) + state2.logpdf(-1, {1: 2})

    # Take the Monte Carlo average of the conditional.
    mc_conditional = np.log(.5) + gu.logsumexp([
        state.logpdf(-1, {0: 1}, {1: 2}),
        state2.logpdf(-1, {0: 1}, {1: 2}),
    ])

    # Take the Monte Carlo average of the joint.
    mc_joint = np.log(.5) + gu.logsumexp([
        state.logpdf(-1, {0: 1, 1: 2}),
        state2.logpdf(-1, {0: 1, 1: 2}),
    ])

    # Take the Monte Carlo average of the marginal.
    mc_marginal = np.log(.5) + gu.logsumexp([
        state.logpdf(-1, {1: 2}),
        state2.logpdf(-1, {1: 2}),
    ])

def test_crp_same_table_probability():
    """Compute the probability that customers sit at the same table.

    Given a single target rowid T and a list of query rowids Q, compute the
    posterior probability that T and all rowids in Q are assigned to the same
    table, conditioned on all rowids in Q being assigned to the same table.

    Let S be the event that all rowids in Q are assigned to the same table:

        S = [zQ[0] = zQ[1] = ... = zQ[-1]]

    The first quantity of interest is:

        Pr[zT = zQ | S] = Pr[zT = zQ, S] / Pr[S]

    The numerator is:

        Pr[zT = zQ, S] = \sum_k Pr[zT=k, zQ=k]

    where k ranges over the tables in the CRP plus a fresh singleton.

    The second quantity of interest is:

        Pr[zT \ne zQ | S] = Pr[zT \ne zQ, S] / Pr[S]

    The numerator is:

        Pr[zT \ne zQ, S] = \sum_kT \sum_{kQ|kT} Pr[zT=kT, zQ=kQ]

    where kT ranges over the tables in the CRP plus a fresh singleton, and
    kQ|kT in the inner sum ranges over all tables in the CRP other than kT
    (plus a fresh singleton when kT is itself a singleton). For example, if
    the tables are [0, 1] then:

        kT    = [0, 1, 2]
        kQ|kT = [[1, 2], [0, 2], [0, 1, 3]]

    If the computation is correct then the two numerators above sum to the
    normalizer, which is given by:

        Pr[S] = \sum_k Pr[zQ[0]=k, ..., zQ[-1]=k]

    where k ranges over the tables in the CRP plus a fresh singleton.
    """
    crp = Crp(
        outputs=[0], inputs=None, hypers={'alpha': 1.5}, rng=gu.gen_rng(1))

    assignments = [
        (0, {0: 0}),
        (1, {0: 0}),
        (2, {0: 2}),
        (3, {0: 2}),
        (4, {0: 2}),
        (5, {0: 2}),
        (6, {0: 6}),
        (7, {0: 6}),
        (8, {0: 7}),
    ]
    for rowid, query in assignments:
        crp.incorporate(rowid, query)

    # Compute the probability that rowid=1 has the same assignment as
    # rowid=4,6,7, given that rowid=4,6,7 have the same assignment.
    rowid_target = [1]
    rowid_query = [4, 6, 7]

    # Retrieve the current assignments to restore later.
    assignment_target = [crp.data[r] for r in rowid_target]
    assignment_query = [crp.data[r] for r in rowid_query]

    # Retrieve CRP statistics to verify no mutation afterwards.
    logpdf_score_full = crp.logpdf_score()
    crp_data_full = crp.data

    # Unincorporate the target and query rows.
    for rowid in rowid_target + rowid_query:
        crp.unincorporate(rowid)

    # Marginal likelihood after unincorporating.
    logpdf_score_truncated = crp.logpdf_score()

    # Retrieve the current tables.
    tables_crp = sorted(crp.counts)

    # Exactly 1 target rowid is required.
    assert len(rowid_target) == 1

    def retrieve_logpdf_assignments(
            rowid_target, rowid_query, t_target, t_query):
        for rowid in rowid_target:
            crp.incorporate(rowid, {crp.outputs[0]: t_target})
        for rowid in rowid_query:
            crp.incorporate(rowid, {crp.outputs[0]: t_query})
        lp_predictive = crp.logpdf_score() - logpdf_score_truncated
        for rowid in rowid_target + rowid_query:
            crp.unincorporate(rowid)
        return lp_predictive

    # Return the list of tables to iterate over when query and target are in
    # the same table.
    def get_tables_same(tables):
        singleton = max(tables) + 1
        return tables + [singleton]

    # Return the list of tables to iterate over when query and target are in
    # different tables.
    def get_tables_different(tables):
        singleton = max(tables) + 1
        tables_query = tables + [singleton]
        auxiliary_table = lambda t: [] if t < singleton else [singleton + 1]
        tables_target = [
            filter(lambda x: x != t, tables_query) + auxiliary_table(t)
            for t in tables_query
        ]
        return tables_query, tables_target

    # Some quick tests for get_tables_different.
    assert get_tables_different([0, 1]) == \
        ([0, 1, 2], [[1, 2], [0, 2], [0, 1, 3]])
    assert get_tables_different([1, 2]) == \
        ([1, 2, 3], [[2, 3], [1, 3], [1, 2, 4]])

    # Compute Pr[zT = zQ, zQ[0] = ... = zQ[-1]].
    tables_same = get_tables_same(tables_crp)
    logp_same_table = gu.logsumexp([
        retrieve_logpdf_assignments(rowid_target, rowid_query, t, t)
        for t in tables_same
    ])

    # Compute Pr[zT \ne zQ, zQ[0] = ... = zQ[-1]].
    tables_target, tables_query = get_tables_different(tables_crp)
    logp_diff_table = gu.logsumexp([
        gu.logsumexp([
            retrieve_logpdf_assignments(
                rowid_target, rowid_query, t_target, t_q)
            for t_q in t_query
        ])
        for t_target, t_query in zip(tables_target, tables_query)
    ])

    # Compute Pr[zT \ne zQ, zQ[0] = ... = zQ[-1]] by switching the order of
    # the sum.
    tables_query, tables_target = get_tables_different(tables_crp)
    logp_diff_table2 = gu.logsumexp([
        gu.logsumexp([
            retrieve_logpdf_assignments(
                rowid_query, rowid_target, t_query, t_t)
            for t_t in t_target
        ])
        for t_query, t_target in zip(tables_query, tables_target)
    ])

    # Confirm logp_diff_table is the same regardless of the sum order.
    assert np.allclose(logp_diff_table, logp_diff_table2)

    # Compute Pr[zQ[0] = ... = zQ[-1]].
    tables_condition = get_tables_same(tables_crp)
    logp_condition = gu.logsumexp([
        retrieve_logpdf_assignments([], rowid_query, t, t)
        for t in tables_condition
    ])

    # Confirm logp_same_table and logp_diff_table sum to the normalizer.
    assert np.allclose(
        gu.logsumexp([logp_same_table, logp_diff_table]), logp_condition)

    # Confirm the direct-space probabilities sum to one.
    p_same_table = np.exp(logp_same_table - logp_condition)
    p_diff_table = np.exp(logp_diff_table - logp_condition)
    assert np.allclose(p_same_table + p_diff_table, 1.0)

    # Restore the assignments.
    for rowid, assignment in zip(rowid_target, assignment_target):
        crp.incorporate(rowid, {crp.outputs[0]: assignment})
    for rowid, assignment in zip(rowid_query, assignment_query):
        crp.incorporate(rowid, {crp.outputs[0]: assignment})

    # Confirm no mutation has occurred.
    assert crp.data == crp_data_full
    assert crp.logpdf_score() == logpdf_score_full

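# A hedged, self-contained sketch of the CRP predictive rule that underlies
# the enumeration in the test above: with concentration alpha and table counts
# n_k over N customers, a new customer joins existing table k with probability
# n_k / (N + alpha) and a fresh singleton with probability alpha / (N + alpha).
# The counts below are illustrative.
import numpy as np
from scipy.special import logsumexp

alpha = 1.5
counts = {0: 2, 2: 4, 6: 2, 7: 1}   # assumed table -> customer count
N = sum(counts.values())

singleton = max(counts) + 1
tables = sorted(counts) + [singleton]
logps = [
    np.log(counts.get(t, alpha)) - np.log(N + alpha)
    for t in tables
]
# The predictive distribution over the existing tables plus a singleton is
# normalized, so the log-probabilities logsumexp to 0.
assert np.allclose(logsumexp(logps), 0)
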
def logpdf_marginal(self, z):
    return gu.logsumexp([
        np.log(.5) + norm.logpdf(z, loc=mx, scale=self.noise)
        for mx in set(self.mx)
    ])

def logpdf_joint(self, x, y):
    return gu.logsumexp([
        np.log(.25)
        + norm.logpdf(x, loc=mx, scale=self.noise)
        + norm.logpdf(y, loc=my, scale=self.noise)
        for (mx, my) in zip(self.mx, self.my)
    ])

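# A hedged standalone sketch (with illustrative component means and noise, not
# the class above) showing how a conditional density falls out of the two
# methods above by subtraction in log space:
#   log p(x | y) = log p(x, y) - log p(y).
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

mx = [0., 0., 2., 2.]    # assumed x-means of 4 equally weighted components
my = [0., 2., 0., 2.]    # assumed y-means of the same components
noise = 0.5

def logpdf_joint(x, y):
    return logsumexp([
        np.log(.25)
        + norm.logpdf(x, loc=m1, scale=noise)
        + norm.logpdf(y, loc=m2, scale=noise)
        for (m1, m2) in zip(mx, my)
    ])

def logpdf_marginal_y(y):
    return logsumexp([
        np.log(.5) + norm.logpdf(y, loc=m2, scale=noise)
        for m2 in set(my)
    ])

def logpdf_conditional(x, y):
    return logpdf_joint(x, y) - logpdf_marginal_y(y)

# At y=1 the two y-means are equally likely by symmetry, so the conditional of
# x is an equal-weight mixture of Gaussians at the two x-means.
x = 0.7
assert np.allclose(
    np.exp(logpdf_conditional(x, 1.0)),
    .5 * norm.pdf(x, 0., noise) + .5 * norm.pdf(x, 2., noise))
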