def sample(self, data, partition, alpha):
    '''
    Gibbs scan over all items: reseat each item into an existing cell or a new
    one, with the new-cell weight computed from the marginal (posterior
    predictive) density under the base measure.

    Args:
        data: Sequence of data points, indexable by item index.
        partition: Partition object tracking cell (cluster) assignments.
        alpha: DP concentration parameter.
    '''
    for item, data_point in enumerate(data):
        # Remove the item from its current cell; drop the cell if it emptied,
        # so cell indices below always refer to occupied cells.
        old_cell_index = partition.labels[item]
        partition.remove_item(item, old_cell_index)
        partition.remove_empty_cells()
        log_p = []
        # Existing cells: weight = (cell size) * likelihood under cell value.
        for cell in partition.cells:
            cluster_log_p = self.cluster_density.log_p(data_point, cell.value)
            counts = cell.size
            log_p.append(log(counts) + cluster_log_p)
        # New cell: weight = alpha * predictive density under the base measure
        # (self.posterior_density marginalises over the cell parameter).
        params = self.base_measure.params
        cluster_log_p = self.posterior_density.log_p(data_point, params)
        log_p.append(log(alpha) + cluster_log_p)
        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]
        new_cell_index = discrete_rvs(p)
        # Index number_of_cells is the "new cell" slot appended above; create
        # the cell (with a fresh draw from the base measure) before seating.
        if new_cell_index == partition.number_of_cells:
            partition.add_cell(self.base_measure.random())
        partition.add_item(item, new_cell_index)
def _merge(self, old_cell_i, old_cell_j, data, partition):
    '''
    Merge move for a split-merge sampler: combine two cells into one new cell
    and compute the forward/reverse proposal log-densities needed for the MH
    acceptance ratio. The reverse density is that of a hypothetical split that
    would have reproduced the current (i, j) assignment via sequential
    allocation.

    Returns:
        (new_cell, forward_log_q, reverse_log_q)
    '''
    # NOTE(review): s_i / s_j alias old_cell_*.items; the pop() calls below
    # mutate the cells' own item lists — verify Partition/Cell semantics allow
    # this (and that remove_item inside the loops does not mutate the list
    # being iterated, which would skip elements).
    s_i = old_cell_i.items
    s_j = old_cell_j.items
    param_i = old_cell_i.value
    param_j = old_cell_j.value
    # Merged cell inherits cell i's parameter.
    param_new = param_i
    # Forward: propose the merged parameter; reverse: propose both split
    # parameters back from the merged one.
    forward_log_q = self.proposal_func.log_p(param_new, param_i)
    reverse_log_q = self.proposal_func.log_p(param_i, param_new) + self.proposal_func.log_p(param_j, param_new)
    new_cell = partition.add_cell(param_new)
    # Move every item from both old cells into the merged cell.
    for k in s_i:
        old_cell_i.remove_item(k)
        new_cell.add_item(k)
    for k in s_j:
        old_cell_j.remove_item(k)
        new_cell.add_item(k)
    # Seed each side of the hypothetical split with one anchor item
    # (pop() also removes the anchors from s_i / s_j).
    temp_s_i = set([s_i.pop(), ])
    temp_s_j = set([s_j.pop(), ])
    items = s_i + s_j
    shuffle(items)
    # Replay a sequential allocation of the remaining items and accumulate the
    # log-probability of reproducing the original assignment.
    for k in items:
        n_i = len(temp_s_i)
        n_j = len(temp_s_j)
        log_p = [
            log(n_i) + self.cluster_density.log_p(data[k], param_i),
            log(n_j) + self.cluster_density.log_p(data[k], param_j)
        ]
        log_p = log_space_normalise(log_p)
        if k in s_i:
            temp_s_i.add(k)
            reverse_log_q += log_p[0]
        else:
            temp_s_j.add(k)
            reverse_log_q += log_p[1]
    partition.remove_empty_cells()
    return new_cell, forward_log_q, reverse_log_q
def _compute_posterior(data, density, mesh_size):
    '''
    Compute the normalised log posterior of the cellular prevalence over a
    regular grid of `mesh_size` points on [0, 1].

    Args:
        data: Iterable of data points accepted by density.log_p.
        density: Object exposing log_p(data_point, BetaData(value)).
        mesh_size: Number of grid points.

    Returns:
        Dict mapping each grid value to its log-space-normalised posterior.
    '''
    grid = np.linspace(0, 1, mesh_size)
    log_posterior = []
    for cellular_prevalence in grid:
        # Log-likelihood of the full data set at this prevalence value.
        log_p = 0
        for data_point in data:
            log_p += density.log_p(data_point, BetaData(cellular_prevalence))
        log_posterior.append(log_p)
    # Build the values as a real list before normalising: the previous code
    # passed dict.values() — a non-indexable view on Python 3 — to
    # log_space_normalise and relied on the implicit keys()/values() pairing.
    # Zipping the grid against the normalised list makes the pairing explicit.
    return dict(zip(grid, log_space_normalise(log_posterior)))
def _split(self, i, j, old_cell, data, partition):
    '''
    Split move for a split-merge sampler: divide old_cell into two cells
    anchored on items i and j, sequentially allocating the remaining items,
    and compute the forward/reverse proposal log-densities for the MH
    acceptance ratio.

    Returns:
        (new_cell_i, new_cell_j, forward_log_q, reverse_log_q)
    '''
    # Pull the two anchor items out of the old cell first.
    old_cell.remove_item(i)
    old_cell.remove_item(j)
    # Cell i keeps the old parameter; cell j's parameter is proposed from it.
    param_i = old_cell.value
    param_j = self.proposal_func.random(param_i)
    # Forward: density of proposing (param_i, param_j) from param_i;
    # reverse: density of proposing the merged parameter (= param_i) back.
    forward_log_q = self.proposal_func.log_p(param_i, param_i) + self.proposal_func.log_p(param_j, param_i)
    reverse_log_q = self.proposal_func.log_p(param_i, param_i)
    new_cell_i = partition.add_cell(param_i)
    new_cell_j = partition.add_cell(param_j)
    new_cell_i.add_item(i)
    new_cell_j.add_item(j)
    # NOTE(review): s aliases old_cell.items and remove_item inside the loop
    # may mutate the list being iterated — verify Cell semantics.
    s = old_cell.items
    shuffle(s)
    # Sequentially allocate the remaining items between the two new cells,
    # weighting by current cell size times likelihood (restricted Gibbs step),
    # and accumulate the forward proposal probability.
    for k in s:
        old_cell.remove_item(k)
        n_i = new_cell_i.size
        n_j = new_cell_j.size
        log_p = [
            log(n_i) + self.cluster_density.log_p(data[k], param_i),
            log(n_j) + self.cluster_density.log_p(data[k], param_j)
        ]
        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]
        c_k = discrete_rvs(p)
        if c_k == 0:
            new_cell_i.add_item(k)
        else:
            new_cell_j.add_item(k)
        forward_log_q += log_p[c_k]
    partition.remove_empty_cells()
    return new_cell_i, new_cell_j, forward_log_q, reverse_log_q
def sample(self, data, partition, alpha, m=2):
    '''
    Sample a new partition according to algorithm 8 of Neal "Sampling Methods
    For Dirichlet Process Mixture Models".

    Args:
        data: Sequence of data points, indexable by item index.
        partition: Partition object tracking cell (cluster) assignments.
        alpha: DP concentration parameter.
        m: Number of auxiliary tables used for the new-cluster proposal.
    '''
    # random.shuffle requires a mutable sequence; range() is immutable on
    # Python 3, so materialise the index list first.
    items = list(range(len(data)))
    shuffle(items)
    for item in items:
        data_point = data[item]
        old_cell_index = partition.get_cell_index(item)
        partition.remove_item(item, old_cell_index)
        # If the item's old cell is now empty, it is kept and reused as one of
        # the m auxiliary tables, so only m - 1 fresh ones are needed.
        if partition.counts[old_cell_index] == 0:
            num_new_tables = m - 1
        else:
            num_new_tables = m
        for _ in range(num_new_tables):
            partition.add_cell(self.base_measure.random())
        log_p = []
        for cell in partition.cells:
            cluster_log_p = self.cluster_density.log_p(data_point, cell.value)
            counts = cell.size
            # Per algorithm 8, each empty (auxiliary) table gets weight alpha/m.
            if counts == 0:
                counts = alpha / m
            log_p.append(log(counts) + cluster_log_p)
        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]
        new_cell_index = discrete_rvs(p)
        partition.add_item(item, new_cell_index)
        # Discard whichever auxiliary tables were not selected.
        partition.remove_empty_cells()
def sample(self, data, partition, alpha, m=2):
    '''
    Sample a new partition according to algorithm 8 of Neal "Sampling Methods
    For Dirichlet Process Mixture Models".

    Args:
        data: Sequence of data points, indexable by item index.
        partition: Partition object tracking cell (cluster) assignments.
        alpha: DP concentration parameter.
        m: Number of auxiliary tables used for the new-cluster proposal.
    '''
    # random.shuffle requires a mutable sequence; range() is immutable on
    # Python 3, so materialise the index list first.
    items = list(range(len(data)))
    shuffle(items)
    for item in items:
        data_point = data[item]
        old_cell_index = partition.labels[item]
        partition.remove_item(item, old_cell_index)
        # If the item's old cell is now empty, it is kept and reused as one of
        # the m auxiliary tables, so only m - 1 fresh ones are needed.
        if partition.counts[old_cell_index] == 0:
            num_new_tables = m - 1
        else:
            num_new_tables = m
        for _ in range(num_new_tables):
            partition.add_cell(self.base_measure.random())
        log_p = []
        for cell in partition.cells:
            cluster_log_p = self.cluster_density.log_p(data_point, cell.value)
            counts = cell.size
            # Per algorithm 8, each empty (auxiliary) table gets weight alpha/m.
            if counts == 0:
                counts = alpha / m
            log_p.append(log(counts) + cluster_log_p)
        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]
        new_cell_index = discrete_rvs(p)
        partition.add_item(item, new_cell_index)
        # Discard whichever auxiliary tables were not selected.
        partition.remove_empty_cells()
def sample(self, data, partition, alpha):
    '''
    Two-phase scan: a Metropolis-Hastings move per item (propose merging a
    singleton into an existing cluster, or splitting an item off into a new
    cluster), followed by a restricted Gibbs pass that reseats non-singleton
    items among the existing cells only.

    Args:
        data: Sequence of data points, indexable by item index.
        partition: Partition object tracking cluster assignments and values.
        alpha: DP concentration parameter.
    '''
    n = partition.number_of_items
    # Phase 1: MH proposals that can create or destroy clusters.
    for item, data_point in enumerate(data):
        old_cluster_label = partition.labels[item]
        old_value = partition.item_values[item]
        partition.remove_item(item, old_cluster_label)
        if partition.counts[old_cluster_label] == 0:
            # Item was a singleton: propose absorbing it into an existing
            # cluster chosen proportionally to cluster size.
            p = [x / (n - 1) for x in partition.counts]
            new_cluster_label = discrete_rvs(p)
            new_value = partition.cell_values[new_cluster_label]
            old_ll = self.cluster_density.log_p(data_point, old_value)
            new_ll = self.cluster_density.log_p(data_point, new_value)
            # MH log acceptance ratio for destroying a singleton cluster.
            log_ratio = log(n - 1) - log(alpha) + new_ll - old_ll
            u = uniform_rvs(0, 1)
            if log_ratio >= log(u):
                partition.add_item(item, new_cluster_label)
            else:
                partition.add_item(item, old_cluster_label)
        else:
            # Item shared a cluster: propose splitting it off into a brand-new
            # cluster with a fresh parameter from the base measure.
            new_value = self.base_measure.random()
            old_ll = self.cluster_density.log_p(data_point, old_value)
            new_ll = self.cluster_density.log_p(data_point, new_value)
            # MH log acceptance ratio for creating a singleton cluster.
            log_ratio = log(alpha) - log(n - 1) + new_ll - old_ll
            u = uniform_rvs(0, 1)
            if log_ratio >= log(u):
                partition.add_cell(new_value)
                cell = partition.get_cell_by_value(new_value)
                cell.add_item(item)
            else:
                partition.add_item(item, old_cluster_label)
        partition.remove_empty_cells()
    # Phase 2: restricted Gibbs pass over existing clusters only
    # (singletons are skipped so the number of clusters is unchanged).
    for item, data_point in enumerate(data):
        old_cluster_label = partition.labels[item]
        if partition.cells[old_cluster_label].size == 1:
            continue
        partition.remove_item(item, old_cluster_label)
        log_p = []
        for cell in partition.cells:
            cluster_log_p = self.cluster_density.log_p(data_point, cell.value)
            counts = cell.size
            log_p.append(log(counts) + cluster_log_p)
        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]
        new_cluster_label = discrete_rvs(p)
        partition.add_item(item, new_cluster_label)
        partition.remove_empty_cells()