Example #1
    def _split(self, data, constrained_clustering=None):
        # Deterministically seed two blocks with the anchor points; log_q = 0
        # so these forced allocations contribute nothing to the proposal
        # density.
        particle = self.kernel.create_particle(0, data[0], None, log_q={0: 0})

        particle = self.kernel.create_particle(1,
                                               data[1],
                                               particle,
                                               log_q={1: 0})

        if constrained_clustering is None:
            # Unconstrained split: let the kernel propose an allocation for
            # each remaining point.
            for data_point in data[2:]:
                particle = self.kernel.propose(data_point, particle)

        else:
            # Constrained split: replay an existing clustering (relabelled so
            # the anchors occupy blocks 0 and 1) to score the reverse move.
            constrained_clustering = relabel_clustering(constrained_clustering)

            for block_idx, data_point in zip(constrained_clustering[2:],
                                             data[2:]):
                particle = self.kernel.create_particle(block_idx, data_point,
                                                       particle)

        clustering = get_cluster_labels(particle)

        init_params = [
            self.dist.create_params_from_data(data[0]),
            self.dist.create_params_from_data(data[1]),
        ]

        # MH factor: the log normalisation accumulated along the particle path
        # plus the target density of the two singleton anchor blocks.
        log_mh_factor = get_log_normalisation(
            particle) + self.kernel.log_target_density(init_params)

        return clustering, log_mh_factor
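
relabel_clustering appears throughout these examples but is not defined here. A
minimal sketch of the assumed behaviour, mapping labels to 0..K-1 in order of
first appearance so that equivalent partitions share one canonical
representation:

import numpy as np


def relabel_clustering(clustering):
    # Assumed behaviour: relabel clusters to consecutive integers in order of
    # first appearance.
    clustering = np.asarray(clustering)
    relabeled = np.zeros(len(clustering), dtype=int)
    label_map = {}
    for i, c in enumerate(clustering):
        if c not in label_map:
            label_map[c] = len(label_map)
        relabeled[i] = label_map[c]
    return relabeled
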
Example #2
    def _get_updated_clustering(self, clustering, particle, sigma):
        restricted_clustering = get_cluster_labels(particle)

        max_idx = clustering.max()

        # Offset the restricted labels past the current maximum so they cannot
        # collide with labels outside sigma, then relabel to canonical form.
        clustering[sigma] = restricted_clustering + max_idx + 1

        return relabel_clustering(clustering)
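
The `+ max_idx + 1` offset guarantees the restricted labels cannot collide with
labels outside sigma before relabelling. A small worked example (values are
illustrative), using the relabel_clustering sketch above:

import numpy as np

clustering = np.array([0, 0, 1, 1, 2])
sigma = np.array([2, 3])                   # indices touched by the move
restricted_clustering = np.array([0, 1])   # the two points were split apart

clustering[sigma] = restricted_clustering + clustering.max() + 1
print(clustering)                          # [0 0 3 4 2]
print(relabel_clustering(clustering))      # [0 0 1 2 3]
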
Example #3
    def update(self, clustering):
        # Canonical labels 0..K-1 double as indices into cluster_probs below.
        clustering = relabel_clustering(clustering)

        clusters = np.unique(clustering)

        num_clusters = len(clusters)

        self.cluster_probs = np.zeros((num_clusters, num_clusters))

        self.clusters_to_data = {}

        self.data_to_clusters = {}

        # Log marginal likelihood of each cluster on its own.
        margs = {}

        for c in clusters:
            cluster_data = self.data[clustering == c]

            cluster_params = self.dist.create_params_from_data(cluster_data)

            margs[c] = self.dist.log_marginal_likelihood(cluster_params)

            if self.use_prior_weight:
                margs[c] += self.partition_prior.log_tau_2(cluster_params.N)

            self.clusters_to_data[c] = np.where(clustering == c)[0].flatten()

            for i in self.clusters_to_data[c]:
                self.data_to_clusters[i] = c

        for c_i in clusters:
            # Score merging c_i with every other cluster.
            log_p = np.ones(num_clusters) * float('-inf')

            for c_j in clusters:
                if c_i == c_j:
                    continue

                merged_data = self.data[(clustering == c_i) |
                                        (clustering == c_j)]

                merged_params = self.dist.create_params_from_data(merged_data)

                merge_marg = self.dist.log_marginal_likelihood(merged_params)

                if self.use_prior_weight:
                    merge_marg += self.partition_prior.log_tau_2(
                        merged_params.N)

                # Log Bayes factor for merging c_i and c_j versus keeping them
                # separate.
                log_p[c_j] = merge_marg - (margs[c_i] + margs[c_j])

            if num_clusters == 1:
                # A lone cluster can only pair with itself.
                log_p[c_i] = 0

            else:
                # Self-pairing gets the mean weight of the other clusters.
                log_p[c_i] = -np.log(num_clusters - 1) + log_sum_exp(log_p)

            self.cluster_probs[c_i], _ = exp_normalize(log_p)
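
exp_normalize and log_sum_exp are used above but not shown. Minimal sketches
under the standard definitions; exp_normalize is assumed to return the
normalised probabilities together with the log normalisation constant, matching
how its second return value is discarded above:

import numpy as np


def log_sum_exp(log_x):
    # Numerically stable log(sum(exp(log_x))).
    m = np.max(log_x)
    if np.isinf(m):
        return m
    return m + np.log(np.sum(np.exp(log_x - m)))


def exp_normalize(log_p):
    # Normalise unnormalised log probabilities, returning the probabilities
    # and the log normalisation constant.
    log_norm = log_sum_exp(log_p)
    return np.exp(log_p - log_norm), log_norm
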
Example #4
    def _sample(self, clustering, data):
        # Choose two anchor points and the index set sigma covering their
        # clusters (anchors first).
        anchors, sigma = self.split_merge_setup_kernel.setup_split_merge(
            clustering, 2)

        self.kernel.setup(anchors,
                          clustering,
                          data,
                          sigma,
                          set_constrained_path=False)

        clustering_sigma = clustering[sigma]

        data_sigma = data[sigma]

        # Anchors in different clusters: propose a merge; otherwise a split.
        propose_merge = (clustering_sigma[0] != clustering_sigma[1])

        if propose_merge:
            # Forward move is the (deterministic) merge; the reverse move is a
            # split constrained to reproduce the current clustering.
            merge_clustering, merge_mh_factor = self._merge(data_sigma)

            split_clustering, split_mh_factor = self._split(
                data_sigma, constrained_clustering=clustering_sigma)

            forward_factor = merge_mh_factor

            reverse_factor = split_mh_factor

            restricted_clustering = merge_clustering

        else:
            # Forward move is an unconstrained split; the reverse move is the
            # deterministic merge.
            merge_clustering, merge_mh_factor = self._merge(data_sigma)

            split_clustering, split_mh_factor = self._split(data_sigma)

            forward_factor = split_mh_factor

            reverse_factor = merge_mh_factor

            restricted_clustering = split_clustering

        log_ratio = forward_factor - reverse_factor

        u = np.random.random()

        # Metropolis-Hastings acceptance: accept with probability
        # min(1, exp(log_ratio)).
        if log_ratio >= np.log(u):
            max_idx = clustering.max()

            # Write back the restricted labels, offset past the current
            # maximum to avoid collisions, then relabel.
            clustering[sigma] = restricted_clustering + max_idx + 1

            clustering = relabel_clustering(clustering)

        return clustering
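
setup_split_merge is not shown in these examples. Based on how its outputs are
used (anchors are two data indices, and sigma lists the indices of their
clusters with the anchors first), a plausible reconstruction, not the library's
actual implementation:

import numpy as np


def setup_split_merge(clustering, num_anchors):
    # Hypothetical sketch: pick distinct anchors uniformly at random, then
    # collect the remaining members of the anchors' clusters in random order.
    clustering = np.asarray(clustering)
    anchors = np.random.choice(len(clustering), num_anchors, replace=False)
    anchor_clusters = set(clustering[anchors])
    rest = [i for i in range(len(clustering))
            if clustering[i] in anchor_clusters and i not in anchors]
    np.random.shuffle(rest)
    return anchors, np.array(list(anchors) + rest, dtype=int)
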
Example #5
def get_constrained_path(clustering, data, kernel):
    # Build the particle sequence that reproduces an existing clustering one
    # data point at a time (used to score the reverse move of a proposal).
    constrained_path = []

    clustering = relabel_clustering(clustering)

    particle = None

    for c, x in zip(clustering, data):
        particle = kernel.create_particle(c, x, particle)

        constrained_path.append(particle)

    return constrained_path
Example #6
    def sample(self, clustering, data, num_iters=1):
        for _ in range(num_iters):
            # Choose anchors and the restricted index set sigma, then run SMC
            # over the affected data points only.
            anchors, sigma = self._setup_split_merge(clustering)

            self.smc_kernel.setup(anchors, clustering, data, sigma)

            particles_weights = self.smc_sampler.sample(
                data[sigma], self.smc_kernel)

            sampled_particle = self._sample_particle(particles_weights)

            # _get_updated_clustering writes the restricted labels back into
            # clustering and returns the relabelled result.
            clustering = self._get_updated_clustering(
                clustering, sampled_particle, sigma)

        return clustering
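
A minimal driver sketch for an object exposing this sample method; run_chain
and its defaults are ours, for illustration only:

import numpy as np


def run_chain(sampler, data, num_sweeps=1000):
    # Start fully merged and apply the split-merge kernel repeatedly.
    clustering = np.zeros(data.shape[0], dtype=int)
    for _ in range(num_sweeps):
        clustering = sampler.sample(clustering, data, num_iters=1)
    return clustering
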
Example #7
def get_exact_posterior(data, dist, partition_prior):
    '''
    Compute the exact posterior of the clustering model by enumerating every
    partition of the data; only feasible for small data sets, since the number
    of partitions grows as the Bell number.

    Returns a dictionary mapping clusterings to posterior probabilities.
    '''
    log_p = []

    clusterings = []

    for c in get_all_clusterings(data.shape[0]):
        clusterings.append(tuple(relabel_clustering(c).astype(int)))

        log_p.append(log_joint_probability(c, data, dist, partition_prior))

    p, _ = exp_normalize(np.array(log_p))

    return dict(zip(clusterings, p))
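
get_all_clusterings enumerates every partition of the data indices, of which
there are Bell(n). A compact sketch, assuming labels are assigned in order of
first appearance:

def get_all_clusterings(num_data_points):
    # Yield every set partition of range(num_data_points) as a label list,
    # assigning each new cluster the smallest unused label.
    def extend(labels):
        if len(labels) == num_data_points:
            yield list(labels)
            return
        num_used = max(labels) + 1 if labels else 0
        for c in range(num_used + 1):
            yield from extend(labels + [c])

    yield from extend([])
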
Example #8
    def update(self, clustering):
        self.cluster_params = {}

        self.clusters_to_data = {}

        self.data_to_clusters = {}

        # Relabel first so the keys below match the labels in self.clustering.
        clustering = relabel_clustering(clustering)

        self.clustering = clustering

        for c in np.unique(clustering):
            cluster_data = self.data[clustering == c]

            # Absorb the cluster's points into fresh sufficient statistics.
            self.cluster_params[c] = self.dist.create_params()

            for data_point in cluster_data:
                self.cluster_params[c].increment(data_point)

            self.clusters_to_data[c] = np.where(clustering == c)[0].flatten()
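
The `dist` object is assumed to expose create_params, create_params_from_data,
increment, and log_marginal_likelihood, per the calls in these examples. A toy
Beta-Bernoulli instance of that interface for one-dimensional binary data,
purely for illustration:

import numpy as np
from scipy.special import betaln


class BetaBernoulliParams(object):

    def __init__(self, a=1.0, b=1.0):
        # Beta pseudo-counts, starting at the Beta(1, 1) prior.
        self.a = a
        self.b = b

    @property
    def N(self):
        # Number of observed data points.
        return int(self.a + self.b - 2)

    def increment(self, data_point):
        # Absorb one binary observation into the sufficient statistics.
        self.a += data_point
        self.b += 1 - data_point


class BetaBernoulli(object):

    def create_params(self):
        return BetaBernoulliParams()

    def create_params_from_data(self, data):
        params = self.create_params()
        for x in np.atleast_1d(np.squeeze(data)):
            params.increment(x)
        return params

    def log_marginal_likelihood(self, params):
        # log p(data) with the Bernoulli parameter marginalised out.
        return betaln(params.a, params.b) - betaln(1.0, 1.0)
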
Example #9
    def _run_sampler_posterior(self,
                               data,
                               sampler,
                               burnin=int(1e2),
                               num_iters=int(1e4)):
        # Start from the all-in-one-cluster state and tally the canonical
        # (relabelled) clusterings visited after burn-in.
        clustering = np.zeros(data.shape[0], dtype=int)

        test_counts = Counter()

        for i in range(num_iters):
            clustering = sampler.sample(clustering, data)

            if i >= burnin:
                test_counts[tuple(relabel_clustering(clustering))] += 1

        # Normalise visit counts into an empirical posterior over clusterings.
        posterior_probs = defaultdict(float)

        norm_const = sum(test_counts.values())

        for key in test_counts:
            posterior_probs[key] = test_counts[key] / norm_const

        return posterior_probs
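
Combined with get_exact_posterior from Example #7, the estimate returned here
can be checked directly. A sketch of one such comparison via total variation
distance (this helper is ours, not part of the source):

def total_variation_distance(exact_posterior, sampled_posterior):
    # 0.5 * sum of absolute probability differences over the joint support.
    support = set(exact_posterior) | set(sampled_posterior)
    return 0.5 * sum(
        abs(exact_posterior.get(c, 0.0) - sampled_posterior.get(c, 0.0))
        for c in support)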