def unobserve(self, rowid):
    """Remove `rowid` from the mixture and return what was stored for it.

    The row is unobserved from the row-divide cgpm first, then from the
    components array, and finally its entry in `rowid_to_component` is
    deleted.

    Returns a pair `(observation, inputs)`; each element is the merge of
    the corresponding values returned by the two underlying cgpms.
    """
    divide_obs, divide_inputs = self.cgpm_row_divide.unobserve(rowid)
    array_obs, array_inputs = self.cgpm_components_array.unobserve(rowid)
    del self.rowid_to_component[rowid]
    return merged(divide_obs, array_obs), merged(divide_inputs, array_inputs)
def _check_partitions(self):
    """Assert internal consistency of the row partition.

    Does nothing unless `cu.check_env_debug()` is truthy. Otherwise every
    check below is a bare `assert`, so a violated invariant surfaces as an
    AssertionError.
    """
    if not cu.check_env_debug():
        return
    # Everything from here on is for debugging only.
    assert self.alpha() > 0.
    # Check that the rows actually assigned to clusters match the counts
    # reported by Nk.
    row_to_cluster = self.Zr()
    cluster_sizes = self.Nk()
    rowids = range(self.n_rows())
    assert set(row_to_cluster.keys()) == set(rowids)
    assert set(row_to_cluster.values()) == set(cluster_sizes)
    for col, dim in self.dims.iteritems():
        # The view's first output must be the first input of the Dim.
        assert self.outputs[0] == dim.inputs[0]
        # The column must hold exactly one entry per rowid.
        assert len(self.X[col]) == len(rowids)
        # The Dim's own assignments (non-nan rows in Zr, nan rows in Zi)
        # must agree with the view's assignments and cluster ids.
        dim_assignments = merged(dim.Zr, dim.Zi)
        assert dim_assignments == row_to_cluster
        assert set(dim_assignments.values()) == set(cluster_sizes.keys())
        dim_cluster_ids = set(dim.clusters.keys()) | set(dim.Zi.values())
        assert dim_cluster_ids == set(cluster_sizes.keys())
        for k in dim.clusters:
            # Law of conservation of rowids: rows counted by the cluster
            # plus rows with a nan in the relevant columns equals Nk[k].
            members = [r for r in rowids if row_to_cluster[r] == k]
            columns = [dim.index]
            if dim.is_conditional():
                columns.extend(dim.inputs[1:])
            values = [[self.X[c][r] for c in columns] for r in members]
            if values:
                has_nan = np.any(np.isnan(values), axis=1)
            else:
                has_nan = []
            assert (dim.clusters[k].N + np.sum(has_nan) == cluster_sizes[k])
def observe_one(cgpm, rowid, observation, inputs):
    """Forward to `cgpm` the part of a row observation that concerns it.

    Nothing happens when `observation` shares nothing with `cgpm.outputs`.
    Otherwise `cgpm.observe` is called with the matching outputs, and with
    inputs assembled from two sources: entries of `observation` that match
    `cgpm.inputs` (parents) and entries of `inputs` that match
    `cgpm.inputs` (exogenous), merged in that order.
    """
    relevant_observation = get_intersection(cgpm.outputs, observation)
    if not relevant_observation:
        return
    parent_inputs = get_intersection(cgpm.inputs, observation)
    exogenous_inputs = get_intersection(cgpm.inputs, inputs)
    cgpm.observe(
        rowid,
        relevant_observation,
        merged(parent_inputs, exogenous_inputs),
    )
def test_incorporate_session():
    """Incorporate rows into a State, transition, then remove the last row."""
    rng = gu.gen_rng(4)
    view_ids = [0, 1, 2]
    state = State(
        X,
        cctypes=['normal'] * 5,
        Zv={0: 0, 1: 0, 2: 1, 3: 1, 4: 2},
        rng=rng,
    )
    # Incorporate a row into a brand new singleton cluster in every view.
    n_clusters_before = [len(state.views[v].Nk()) for v in view_ids]
    data = {i: rng.normal() for i in xrange(5)}
    clusters = {
        state.views[v].outputs[0]: n_clusters_before[v] for v in view_ids
    }
    state.incorporate(state.n_rows(), gu.merged(data, clusters))
    n_clusters_after = [len(state.views[v].Nk()) for v in view_ids]
    assert n_clusters_after == [p + 1 for p in n_clusters_before]
    # Incorporate a row without specifying clusters, leaving some of the
    # columns missing.
    data = {i: rng.normal() for i in xrange(2)}
    state.incorporate(state.n_rows(), data)
    state.transition(N=3)
    # Remove the most recently incorporated rowid.
    state.unincorporate(state.n_rows() - 1)
    state.transition(N=3)
def test_simulate(seed):
    """Check a random-forest Dim classifies iris better than chance.

    Note the naming used throughout: `X_*` holds the class labels (the
    Dim's output 5) and `Y_*` holds the four features (the Dim's inputs).
    """
    rng = gu.gen_rng(bytearray(seed))
    iris = load_iris()
    # Random split into train and test rows.
    indices = rng.uniform(0, 1, size=len(iris.data)) <= .75
    Y_train = iris.data[indices]
    X_train = iris.target[indices]
    Y_test = iris.data[~indices]
    X_test = iris.target[~indices]

    def as_inputs(features):
        # Every row goes to cluster 0 (input -1); features are inputs 0..3.
        return gu.merged({-1: 0}, dict(zip(range(4), features)))

    forest = Dim(
        outputs=[5],
        inputs=[-1] + list(range(4)),
        cctype='random_forest',
        distargs={
            'inputs': {'stattypes': ['normal'] * 4},
            'k': len(iris.target_names),
        },
        rng=rng,
    )
    forest.transition_hyper_grids(X_test)
    # Incorporate data into 1 cluster.
    for rowid, (label, features) in enumerate(zip(X_train, Y_train)):
        forest.incorporate(rowid, {5: label}, as_inputs(features))
    # Transitions.
    for _i in xrange(2):
        forest.transition_hypers()
        forest.transition_params()
    correct, total = 0, 0.
    for label, features in zip(X_test, Y_test):
        samples = forest.simulate(None, [5], None, as_inputs(features), 10)
        # Majority vote over the simulated labels.
        votes = np.bincount([s[5] for s in samples])
        prediction = np.argmax(votes)
        correct += (prediction == label)
        total += 1.
    # Classification should be better than random.
    assert correct / total > 1. / forest.distargs['k']
def state_simulate(state, rowid, targets, constraints=None, N=None):
    """Simulate `targets` from `state` by simulating each view separately.

    The query is split per view with `partition_query_evidence`, each view
    holding targets is simulated through `view_simulate`, and the i-th
    draws of all views are merged into the i-th joint sample.

    Returns a single sample when `N` is None, otherwise a list of samples.
    """
    targets_by_view, constraints_by_view = partition_query_evidence(
        state.Zv(), targets, constraints)
    n_draws = 1 if N is None else N
    per_view_draws = [
        view_simulate(
            view=state.views[v],
            rowid=rowid,
            targets=targets_by_view[v],
            constraints=constraints_by_view.get(v, {}),
            N=n_draws,
        )
        for v in targets_by_view
    ]
    samples = [merged(*parts) for parts in zip(*per_view_draws)]
    if N is None:
        return samples[0]
    return samples
def convert_view_to_rowmixture(view, rng):
    """Build a FlexibleRowMixture equivalent of `view` and replay its rows.

    Each dim of the view and the view's crp are converted with
    `convert_dim_to_base_cgpm`. The dims are wrapped in a Product that
    serves as the component base cgpm. Every (rowid, assignment) pair
    produced by `rebase_cgpm_row_assignments(view.Zr())` is then observed
    together with that row's values from `view.X`.
    """
    base_cgpms = [
        convert_dim_to_base_cgpm(dim, rng) for dim in view.dims.itervalues()
    ]
    components = Product(base_cgpms, rng=rng)
    row_divide = convert_dim_to_base_cgpm(view.crp, rng)
    mixture = FlexibleRowMixture(row_divide, components, rng=rng)
    assignment_var = row_divide.outputs[0]
    for rowid, assignment in rebase_cgpm_row_assignments(view.Zr()):
        row_values = {c: view.X[c][rowid] for c in components.outputs}
        mixture.observe(
            rowid, merged({assignment_var: assignment}, row_values))
    return mixture
def _bulk_incorporate(self, dim):
    """Reset `dim`'s cluster state and re-incorporate every row of the view.

    After all rows are incorporated, the dim's combined assignments must
    equal the view's assignments, and the dim's parameters are
    transitioned once.
    """
    # XXX Major hack! We should really be creating new Dim objects.
    dim.clusters = {}   # Mapping of cluster k to the object.
    dim.Zr = {}         # Mapping of non-nan rowids to cluster k.
    dim.Zi = {}         # Mapping of nan rowids to cluster k.
    dim.aux_model = dim.create_aux_model()
    column = dim.index
    for rowid, k in self.Zr().iteritems():
        dim.incorporate(
            rowid,
            {column: self.X[column][rowid]},
            self._get_input_values(rowid, dim, k),
        )
    assert merged(dim.Zr, dim.Zi) == self.Zr()
    dim.transition_params()
def simulate(self, rowid, targets, constraints=None, inputs=None, N=None):
    """Simulate `targets` given `constraints`, sampling the cluster if needed.

    Constraints are first completed by `_populate_constraints`; a rowid
    that is not hypothetical is then replaced by None before querying the
    network built by `build_network`.

    If the cluster variable (`self.outputs[0]`) is constrained, the
    network is simulated directly. Otherwise each candidate cluster is
    weighted by the log density of the constraints in that cluster,
    cluster assignments are drawn for the requested samples, and the
    remaining targets are simulated per selected cluster.

    Returns one sample when `N` is None, otherwise a list of samples
    grouped by cluster.
    """
    constraints = self._populate_constraints(rowid, targets, constraints)
    if not self.hypothetical(rowid):
        rowid = None
    network = self.build_network()
    cluster_var = self.outputs[0]
    # Condition on the cluster assignment.
    if cluster_var in constraints:
        return network.simulate(rowid, targets, constraints, inputs, N)
    # Determine how many samples to return.
    unwrap_result = N is None
    if unwrap_result:
        N = 1
    # Expose cluster assignments to the samples?
    exposed = cluster_var in targets
    if exposed:
        targets = [q for q in targets if q != cluster_var]
    # Weight clusters by probability of constraints in each cluster.
    # `inputs` is forwarded so the weights are conditioned on the same
    # inputs as the simulation below (previously it was dropped here).
    K = self.crp.clusters[0].gibbs_tables(-1)
    constr2 = [merged(constraints, {cluster_var: k}) for k in K]
    lp_constraints_unorm = [
        network.logpdf(rowid, ev, None, inputs) for ev in constr2
    ]
    # Find number of samples in each cluster.
    Ks = gu.log_pflip(lp_constraints_unorm, array=K, size=N, rng=self.rng)
    counts = {k: n for k, n in enumerate(np.bincount(Ks)) if n > 0}
    # Add the cluster assignment to the constraints and sample the rest.
    constr3 = {k: merged(constraints, {cluster_var: k}) for k in counts}
    samples = [
        network.simulate(rowid, targets, constr3[k], inputs, counts[k])
        for k in counts
    ]
    # If cluster assignments are exposed, append them to the samples.
    if exposed:
        samples = [
            [merged(l, {cluster_var: k}) for l in s]
            for s, k in zip(samples, counts)
        ]
    # Return 1 sample if N is None, otherwise a list.
    result = list(itertools.chain.from_iterable(samples))
    return result[0] if unwrap_result else result
def _logpdf_one(self, rowid, targets, constraints, inputs, component):
    """Assess logpdf in fixed mixture component.

    Only the entries of `targets` and `constraints` that belong to
    `self.outputs_x` are used; the result is 0 when no target remains.
    `component` is passed to the components array as the value of the
    `self.indexer` input.
    """
    targets_x = get_intersection(self.outputs_x, targets)
    if not targets_x:
        return 0
    constraints_x = get_intersection(self.outputs_x, constraints)
    # Inputs are selected by the component inputs (as `observe` does),
    # not by the component outputs.
    inputs_x = get_intersection(self.inputs_x, inputs)
    inputs_arr = merged(inputs_x, {self.indexer: component})
    return self.cgpm_components_array.logpdf(
        rowid=rowid,
        targets=targets_x,
        constraints=constraints_x,
        inputs=inputs_arr,
    )
def populate_constraints(self, rowid, targets, constraints):
    """Extend `constraints` with the stored values of an observed `rowid`.

    `None` constraints are treated as an empty dict. If `rowid` is not in
    `self.data` the constraints are returned as they are. Otherwise every
    stored value that is not nan, and whose output is in neither `targets`
    nor `constraints`, is merged into the returned constraints.
    """
    if constraints is None:
        constraints = {}
    if rowid not in self.data:
        return constraints
    values = self.data[rowid]
    assert len(values) == len(self.outputs)
    observations = {}
    for output, value in zip(self.outputs, values):
        if np.isnan(value) or output in targets or output in constraints:
            continue
        observations[output] = value
    return gu.merged(constraints, observations)
def test_transition_hypers():
    """Incorporate rows into a random-forest Dim split over two clusters."""
    forest = Dim(
        outputs=RF_OUTPUTS,
        inputs=[-1] + RF_INPUTS,
        cctype='random_forest',
        distargs=RF_DISTARGS,
        rng=gu.gen_rng(0),
    )
    forest.transition_hyper_grids(D[:, 0])
    # Create two clusters: first half of the rows in 0, second half in 1.
    Zr = np.zeros(len(D), dtype=int)
    # Floor division keeps the slice index an integer even under true
    # division (`from __future__ import division` or Python 3).
    Zr[len(D) // 2:] = 1
    for rowid, row in enumerate(D[:25]):
        observation = {0: row[0]}
        # The cluster assignment overrides whatever `row[-1]` contributed.
        inputs = gu.merged(
            {i: row[i] for i in forest.inputs}, {-1: Zr[rowid]})
        forest.incorporate(rowid, observation, inputs)
def mutual_information(cgpm, targets0, targets1, constraints=None,
        marginalize=None, T=None, N=None):
    """Estimate mutual information between `targets0` and `targets1`.

    `N` falls back to DEFAULT_SAMPLES_MONTE_CARLO and `T` to
    DEFAULT_SAMPLES_MARGINALIZE when falsy. Without `marginalize`, the
    estimator chosen by `_get_estimator` is run once with `constraints`.
    With `marginalize`, `T` samples of those variables are simulated from
    `cgpm`, each is merged into the constraints, the estimator is run once
    per merged constraint set, and all resulting samples are pooled.

    Returns `get_estimate` applied to the pooled samples.
    """
    _validate_query(cgpm.outputs, targets0, targets1, constraints, marginalize)
    N = N or DEFAULT_SAMPLES_MONTE_CARLO
    T = T or DEFAULT_SAMPLES_MARGINALIZE
    estimator = _get_estimator(targets0, targets1)
    if not marginalize:
        samples_mi = estimator(cgpm, targets0, targets1, constraints, N)
    else:
        samples_marginalize = cgpm.simulate(None, marginalize, N=T)
        # `constraints` defaults to None; merge against an empty dict in
        # that case instead of handing None to `merged`.
        base_constraints = constraints or {}
        constraints_cm = [
            merged(base_constraints, m) for m in samples_marginalize
        ]
        estimates = [
            estimator(cgpm, targets0, targets1, constraint_cm, N)
            for constraint_cm in constraints_cm
        ]
        samples_mi = itertools.chain.from_iterable(estimates)
    return get_estimate(samples_mi)
def observe(self, rowid, observation, inputs=None):
    """Record an observation for `rowid`, assigning a component if needed.

    If `rowid` already has a component, it is reused. Otherwise the
    component is taken from `observation[self.indexer]` when present, or
    simulated from the row-divide cgpm; the assignment is then observed
    by the row-divide cgpm and stored in `rowid_to_component`.

    Entries of `observation` in `self.outputs_x` are finally observed by
    the components array, with the component added to its inputs.
    """
    if rowid in self.rowid_to_component:
        component = {self.indexer: self.rowid_to_component[rowid]}
    else:
        # Computed once and shared by the simulate and observe calls.
        inputs_z = get_intersection(self.inputs_z, inputs)
        if self.indexer in observation:
            component = {self.indexer: observation[self.indexer]}
        else:
            # The third positional slot is `constraints` (see the logpdf
            # calls on this cgpm); inputs go in the fourth slot.
            component = self.cgpm_row_divide.simulate(
                rowid, [self.indexer], None, inputs_z)
        self.cgpm_row_divide.observe(rowid, component, inputs_z)
        self.rowid_to_component[rowid] = component[self.indexer]
    inputs_x = get_intersection(self.inputs_x, inputs)
    observation_x = get_intersection(self.outputs_x, observation)
    inputs_arr = merged(inputs_x, component)
    self.cgpm_components_array.observe(rowid, observation_x, inputs_arr)
def _simulate_fallback(self, rowid, targets, N):
    """Simulate `targets` by conditioning on resampled dataset values.

    Fallback: if there are no constraints to resample from, the variables
    not in `targets` are drawn from random dataset rows and used as
    constraints. When `targets` already covers every output, the first
    target is resampled from the dataset instead and added back to each
    result.

    Returns a list of `N` results, one `self.simulate` call each.
    """
    # Named to avoid shadowing the `merged` helper used across the module.
    resample_first_target = len(targets) == len(self.outputs)
    targets_dummy = [o for o in self.outputs if o not in targets]
    if resample_first_target:
        assert not targets_dummy
        targets_dummy = [targets[0]]
        targets = targets[1:]
    dataset = self._dataset(targets_dummy)
    indices = self.rng.choice(len(dataset), size=N)
    # Materialise each constraint set as a dict: it is read twice below,
    # and a bare zip() is single-pass under Python 3.
    constraints = [dict(zip(targets_dummy, dataset[i])) for i in indices]
    # Hand `simulate` its own copy so the stored constraints stay intact.
    results = [self.simulate(rowid, targets, dict(e)) for e in constraints]
    # Make sure to add back the resampled first target variable to results.
    if resample_first_target:
        results = [gu.merged(s, e) for s, e in zip(results, constraints)]
    return results
def logpdf(self, rowid, targets, constraints=None, inputs=None):
    """Estimate log p(targets | constraints) from weighted samples.

    `self.accuracy` weighted samples are drawn for the joint of targets
    and constraints, and (only when constraints are given) another
    `self.accuracy` for the constraints alone. The result is the
    difference of the log-mean-exp of the two sets of weights.

    Raises ValueError when every marginal weight is infinite.
    """
    constraints = constraints or {}
    inputs = inputs or {}
    # Compute joint probability.
    draws_joint = [
        self.weighted_sample(
            rowid, [], merged(targets, constraints), inputs)
        for _i in xrange(self.accuracy)
    ]
    _samples_joint, weights_joint = zip(*draws_joint)
    logp_joint = logmeanexp(weights_joint)
    # Compute marginal probability.
    if constraints:
        draws_marginal = [
            self.weighted_sample(rowid, [], constraints, inputs)
            for _i in xrange(self.accuracy)
        ]
        _samples_marginal, weights_marginal = zip(*draws_marginal)
    else:
        # No constraints: the marginal has log probability 0.
        weights_marginal = [0.]
    if all(isinf(l) for l in weights_marginal):
        raise ValueError('Zero density constraints: %s' % (constraints, ))
    logp_constraints = logmeanexp(weights_marginal)
    # Return log ratio.
    return logp_joint - logp_constraints
def _populate_constraints(self, rowid, targets, constraints):
    """Loads constraints from the dataset.

    The query is validated first. For a hypothetical rowid the (possibly
    empty) constraints are returned as they are. Otherwise every non-nan
    value of the row whose column is in neither `targets` nor
    `constraints` is added, along with the row's cluster assignment under
    `self.outputs[0]`.
    """
    if constraints is None:
        constraints = {}
    self._validate_cgpm_query(rowid, targets, constraints)
    # If the rowid is hypothetical, just return.
    if self.hypothetical(rowid):
        return constraints
    # Retrieve all values for this rowid not in targets or constraints.
    data = {}
    for c in self.outputs[1:]:
        if c in targets or c in constraints:
            continue
        value = self.X[c][rowid]
        if not isnan(value):
            data[c] = value
    # Add the cluster assignment.
    data[self.outputs[0]] = self.Zr(rowid)
    return merged(constraints, data)
def get_view_observes(view):
    """Return an OrderedDict of rowid -> merged row-wise observation.

    For each rowid, the re-indexed observation of the row-divide cgpm is
    merged with the observation of the components array. Rows appear in
    the order produced by `get_sorted_rowids`.
    """
    rowids = get_rowids(view)
    # Handle observe for component assignment cgpm.
    row_divide = view.cgpm_row_divide
    crp_observes = OrderedDict(
        (rowid, get_primitive_observes(row_divide, rowid))
        for rowid in rowids
    )
    crp_reindexed = reindex_crp_observes(crp_observes.values())
    sorted_rowids = get_sorted_rowids(rowids, crp_reindexed)
    position_of = {rowid: i for i, rowid in enumerate(sorted_rowids)}
    # NOTE(review): the lookup uses each rowid's position in the *sorted*
    # order to index `crp_reindexed`; confirm that is the intended order.
    crp_sorted = [crp_reindexed[position_of[rowid]] for rowid in sorted_rowids]
    # Handle observe for component data cgpm.
    components = view.cgpm_components_array
    components_sorted = [
        get_components_observes(components, rowid) for rowid in sorted_rowids
    ]
    # Return overall row-wise observation.
    return OrderedDict(
        (rowid, merged(crp_part, data_part))
        for rowid, crp_part, data_part in zip(
            sorted_rowids, crp_sorted, components_sorted)
    )
def _simulate_one(self, rowid, targets, constraints, inputs, N, component):
    """Simulate from a fixed mixture component.

    Targets in `self.outputs_x` are simulated by the components array with
    `component` supplied as the `self.indexer` input. If `self.indexer`
    is itself a target, `component` is written into every sample.

    Returns a single sample when `N` is None, otherwise a list of samples.
    """
    targets_x = get_intersection(self.outputs_x, targets)
    if targets_x:
        constraints_x = get_intersection(self.outputs_x, constraints)
        # Inputs are selected by the component inputs (as `observe` does),
        # not by the component outputs.
        inputs_x = get_intersection(self.inputs_x, inputs)
        inputs_arr = merged(inputs_x, {self.indexer: component})
        samples = self.cgpm_components_array.simulate(
            rowid=rowid,
            targets=targets_x,
            constraints=constraints_x,
            inputs=inputs_arr,
            N=N,
        )
    elif N is None:
        samples = {}
    else:
        # One distinct dict per sample; `[{}] * N` would return N
        # references to a single shared dict.
        samples = [{} for _ in range(N)]
    if self.indexer in targets:
        if N is None:
            samples[self.indexer] = component
        else:
            for sample in samples:
                sample[self.indexer] = component
    return samples
def logpdf(self, rowid, targets, constraints=None, inputs=None):
    """Return the log density of `targets` given `constraints`.

    Four cases are handled, checked in this order:
      1. `rowid` already has a component: condition on it directly.
      2. The cluster variable (`self.indexer`) is a target.
      3. The cluster variable is a constraint.
      4. Neither: marginalize over the support of the cluster variable.

    Raises ValueError in case 3 when the constrained cluster is outside
    the support of the row-divide cgpm.
    """
    if rowid in self.rowid_to_component:
        # Condition on the cluster assignment directly.
        # p(xT|xC,z=k)
        assert not constraints or self.indexer not in constraints
        z = self.rowid_to_component[rowid]
        return self._logpdf_one(rowid, targets, constraints, inputs, z)

    if self.indexer in targets:
        # Query the cluster assignment.
        # p(z=k,xT|xC)
        # = p(z=k,xT,xC) / p(xC)            Bayes rule
        # = p(z=k)p(xT,xC|z=k) / p(xC)      chain rule on numerator
        # The terms are then:
        # p(z=k)                            lp_z
        # p(xT,xC|z=k)                      lp_x_joint
        # p(xC) = \sum_z P(xC,z)            lp_x_constraints (recursively)
        z = targets[self.indexer]
        inputs_z = get_intersection(self.inputs_z, inputs)
        lp_z = self.cgpm_row_divide.logpdf(
            rowid=rowid,
            targets={self.indexer: z},
            constraints=None,
            inputs=inputs_z,
        )
        targets_joint = merged(targets, constraints or {})
        lp_x_joint = self._logpdf_one(
            rowid=rowid,
            targets=targets_joint,
            constraints=None,
            inputs=inputs,
            component=z,
        )
        if constraints:
            lp_x_constraints = self.logpdf(
                rowid=rowid,
                targets=constraints,
                constraints=None,
                inputs=inputs,
            )
        else:
            lp_x_constraints = 0
        return (lp_z + lp_x_joint) - lp_x_constraints

    if constraints and self.indexer in constraints:
        # Condition on the cluster assignment
        # P(xT|xC,z=k)
        # = P(xT,xC,z=k) / P(xC,z=k)
        # = P(xT,xC|z=k)P(z=k) / P(xC|z=k)
        # = P(xT,xC|z=k) / P(xC|z=k)
        # The terms are then:
        # P(xT,xC|z=k)                      lp_x_joint
        # P(xC|z=k)                         lp_x_constraints
        z = constraints[self.indexer]
        if z not in self.cgpm_row_divide.support():
            raise ValueError('Constrained cluster has 0 density: %s' % (z, ))
        targets_joint = merged(targets, constraints)
        lp_x_joint = self._logpdf_one(
            rowid=rowid,
            targets=targets_joint,
            constraints=None,
            inputs=inputs,
            component=z,
        )
        lp_x_constraints = self._logpdf_one(
            rowid=rowid,
            targets=constraints,
            constraints=None,
            inputs=inputs,
            component=z,
        )
        return lp_x_joint - lp_x_constraints

    # Marginalize over cluster assignment by enumeration.
    # Let K be a list of values for the support of z:
    # P(xT|xC)
    # = \sum_i P(xT,z=K[i]|xC)
    # = \sum_i P(xT|xC,z=K[i])P(z=K[i]|xC)  chain rule
    #
    # The posterior is given by:
    # P(z=K[i]|xC) = P(xC|z=K[i])P(z=K[i]) / \sum_i P(xC,z=K[i])
    #
    # The terms are therefore
    # P(z=K[i])                         lp_z_prior[i]
    # P(xC|z=K[i])                      lp_constraints_likelihood[i]
    # P(xC,z=K[i])                      lp_z_constraints[i]
    # P(z=K[i]|xC)                      lp_z_posterior[i]
    # P(xT|xC,z=K[i])                   lp_targets_likelihood[i]
    # P(xT|xC,z=K[i])P(z=K[i]|xC)       lp_joint[i]
    inputs_z = get_intersection(self.inputs_z, inputs)
    z_support = self.cgpm_row_divide.support()
    lp_z_prior = [
        self.cgpm_row_divide.logpdf(
            rowid, {self.indexer: z}, None, inputs_z)
        for z in z_support
    ]
    lp_constraints_likelihood = [
        self._logpdf_one(rowid, constraints, None, inputs, z)
        for z in z_support
    ]
    lp_z_constraints = np.add(lp_z_prior, lp_constraints_likelihood)
    lp_z_posterior = log_normalize(lp_z_constraints)
    lp_targets_likelihood = [
        self._logpdf_one(rowid, targets, constraints, inputs, z)
        for z in z_support
    ]
    lp_joint = np.add(lp_targets_likelihood, lp_z_posterior)
    return logsumexp(lp_joint)
def _get_input_values(self, rowid, dim, k):
    """Prepare the inputs for a Dim logpdf or simulate query.

    The result holds the row's values for every input of `dim` except the
    first, plus cluster `k` stored under `self.outputs[0]`.
    """
    row_inputs = {}
    for column in dim.inputs[1:]:
        row_inputs[column] = self.X[column][rowid]
    return merged(row_inputs, {self.outputs[0]: k})
def _migrate_row(self, rowid, k):
    """Move `rowid` to cluster `k`.

    The row is unincorporated and then incorporated again with the same
    values from `self.X` and the cluster assignment set to `k`.
    """
    self.unincorporate(rowid)
    # Values are read after the row has been unincorporated.
    row_values = {d: self.X[d][rowid] for d in self.dims}
    assignment = {self.outputs[0]: k}
    self.incorporate(rowid, merged(row_values, assignment))
def logpdf(self, rowid, targets, constraints=None, inputs=None):
    """Return the log density of `targets` given `constraints`.

    Three cases are handled, depending on where the cluster variable
    (`self.outputs[0]`) appears: in the constraints, in the targets, or
    in neither (in which case it is marginalized by enumeration).
    """
    # As discussed in https://github.com/probcomp/cgpm/issues/116 for an
    # observed rowid, we synthesize a new hypothetical row which is
    # identical (in terms of observed and latent values) to the observed
    # rowid. In this version of the implementation, the user may not
    # override any non-null values in the observed rowid
    # (_populate_constraints returns an error in this case). A user should
    # either (i) use another rowid, since overriding existing values in the
    # observed rowid no longer specifies that rowid, or (ii) use some
    # sequence of incorporate/unincorporate depending on their query.
    constraints = self._populate_constraints(rowid, targets, constraints)
    if not self.hypothetical(rowid):
        rowid = None
    # Prepare the importance network.
    network = self.build_network()
    cluster_var = self.outputs[0]
    if cluster_var in constraints:
        # Condition on the cluster assignment.
        # p(xT|xC,z=k)                      computed directly by network.
        return network.logpdf(rowid, targets, constraints, inputs)
    elif cluster_var in targets:
        # Query the cluster assignment.
        # p(z=k,xT|xC)
        # = p(z=k,xT,xC) / p(xC)            Bayes rule
        # = p(z=k)p(xT,xC|z=k) / p(xC)      chain rule on numerator
        # The terms are then:
        # p(z=k)                            lp_cluster
        # p(xT,xC|z=k)                      lp_numer
        # p(xC)                             lp_denom
        k = targets[cluster_var]
        constraints_z = {cluster_var: k}
        targets_nz = {c: targets[c] for c in targets if c != cluster_var}
        targets_numer = merged(targets_nz, constraints)
        # The third positional slot of network.logpdf is `constraints`;
        # `inputs` belongs in the fourth (it used to be passed third).
        lp_cluster = network.logpdf(rowid, constraints_z, None, inputs)
        if targets_numer:
            lp_numer = network.logpdf(
                rowid, targets_numer, constraints_z, inputs)
        else:
            lp_numer = 0
        # Forward `inputs` so the denominator is conditioned like the
        # numerator.
        if constraints:
            lp_denom = self.logpdf(rowid, constraints, None, inputs)
        else:
            lp_denom = 0
        return (lp_cluster + lp_numer) - lp_denom
    else:
        # Marginalize over cluster assignment by enumeration.
        # Let K be a list of values for the support of z:
        # P(xT|xC)
        # = \sum_k p(xT|z=k,xC)p(z=k|xC)    marginalization
        # Now consider p(z=k|xC) \propto p(z=k,xC)  Bayes rule
        # p(z=K[i],xC)                      lp_constraints_unorm[i]
        # p(z=K[i]|xC)                      lp_constraints[i]
        # p(xT|z=K[i],xC)                   lp_targets[i]
        K = self.crp.clusters[0].gibbs_tables(-1)
        constraints_by_cluster = [
            merged(constraints, {cluster_var: k}) for k in K
        ]
        lp_constraints_unorm = [
            network.logpdf(rowid, const, None, inputs)
            for const in constraints_by_cluster
        ]
        lp_constraints = gu.log_normalize(lp_constraints_unorm)
        lp_targets = [
            network.logpdf(rowid, targets, const, inputs)
            for const in constraints_by_cluster
        ]
        return gu.logsumexp(np.add(lp_constraints, lp_targets))
def _simulate_row(view, targets, cluster, N):
    """Return sample of the targets in a fixed cluster.

    Each target is simulated by its own dim with the cluster passed as
    the `view.outputs[0]` input; the i-th draws of all targets are merged
    into the i-th sample. The merged samples are produced lazily.
    """
    cluster_input = {view.outputs[0]: cluster}
    per_target = [
        view.dims[c].simulate(None, [c], None, cluster_input, N)
        for c in targets
    ]
    return (merged(*draws) for draws in zip(*per_target))