Example #1
    def k_means(self, threshold=1e-30, n_iters=20):
        # step 2
        m = len(self.data)
        self.pi_cluster = [None] * self.clusters.shape[0]
        prev_assn = np.zeros((1, m))
        for iter_ in range(n_iters):
            # For each non-empty cluster W_j, compute its "centroid" p(C|W_j)
            # (notation from the paper), then reassign every point to the
            # cluster whose centroid is closest under cal_distance.
            for idx, cluster in enumerate(self.clusters):
                if len(cluster) != 0:
                    p_c_given_cluster_j = self.cal_cluster_centroid(
                        np.array(cluster))
                    self.pi_cluster[idx] = p_c_given_cluster_j
            self.pi_cluster = np.asarray(self.pi_cluster)
            self.pi_cluster = self.pi_cluster.reshape(
                (self.pi_cluster.shape[0], -1))

            # maximization step
            prev_assn = np.copy(self.assignment)
            for idx, d in enumerate(self.data):
                gini_distance = self.cal_distance(d, self.pi_cluster)
                new_assn = np.argmin(gini_distance)
                self.assignment[idx] = new_assn
            self.clusters = convert_assignment_to_clusters(
                self.assignment, self.data)
            self.gini, _, _ = gini(self.assignment, self.data, self.k)
            # stop early once the assignments no longer change
            if np.array_equal(prev_assn, self.assignment):
                break
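All of the examples call a convert_assignment_to_clusters helper that is not part of this listing. A minimal sketch of what it presumably does, grouping the rows of data by their assigned cluster index (the real helper may differ in details such as the container type):

import numpy as np

def convert_assignment_to_clusters(assignment, data):
    # Group the rows of `data` into per-cluster lists according to `assignment`.
    # Sketch only; assumes cluster indices run from 0 to max(assignment).
    n_clusters = int(np.max(assignment)) + 1
    clusters = [[] for _ in range(n_clusters)]
    for row, cluster_idx in zip(data, assignment):
        clusters[int(cluster_idx)].append(row)
    return clusters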
Example #2
    def __init__(self, k=0, data=None, assignment=None, seed=None):
        # `data` is the joint distribution P(C, W): rows are words, columns are classes.
        # Every data point in the paper is a conditional distribution p(C|w_t);
        # the paper uses l for the number of document classes, denoted n here.
        # marginalize P(C)
        if seed is not None:
            np.random.seed(seed)

        self.data = data
        self.k = k
        self.gini = 0
        """Initialize DC following the original paper"""
        # initial assignment: w_t goes to cluster j where p(c_j|w_t) = max_i p(c_i|w_t); k = n initially
        data = normalize(data)
        self.assignment = self.argmax_randtie_masking_generic(data, axis=1)

        clusters = convert_assignment_to_clusters(self.assignment, self.data)
        self.clusters = clusters

        _, n = data.shape
        if k > n:
            # split each cluster arbitrarily into at least floor(k/l) clusters
            n_to_split = k // n
            new_clusters = []
            for cluster in clusters:
                split_arrs = np.array_split(np.array(cluster), n_to_split)
                new_clusters += split_arrs
            self.clusters = new_clusters
        elif k < n:
            for i in range(k, len(clusters)):
                clusters[k - 1] += clusters[i]
            self.clusters = clusters[:k]
        self.clusters = np.asarray(self.clusters)
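The initialization relies on two helpers not shown in the listing: normalize, which presumably turns the joint P(C, W) into row conditionals p(C|w_t), and argmax_randtie_masking_generic, presumably a row-wise argmax that breaks ties at random. Hedged sketches of both (written as free functions; in the class the latter is a method):

import numpy as np

def normalize(data):
    # Turn each row of the joint P(C, W) into the conditional p(C|w_t).
    return data / data.sum(axis=1, keepdims=True)

def argmax_randtie_masking_generic(data, axis=1):
    # Argmax along `axis` that picks uniformly at random among tied maxima.
    max_mask = data == data.max(axis=axis, keepdims=True)
    random_scores = np.random.rand(*data.shape) * max_mask
    return np.argmax(random_scores, axis=axis)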
Example #3
def partition_entropy_rg(assignment, data, k):
    # weighted entropy in Cicalese et al. 2019:
    # sum_z P(z) * H(C|z), accumulated from the joint masses of each cluster
    m, n = data.shape
    clusters = convert_assignment_to_clusters(assignment, data)
    ans = 0
    for cluster in clusters:
        if len(cluster) != 0:
            sum_cluster = np.sum(cluster)          # P(z): total mass of the cluster
            cluster = np.sum(cluster, axis=0)      # joint P(C, z) for this cluster
            # -sum_c P(c, z) * log2 p(c|z)  ==  P(z) * H(C|z)
            ans -= np.sum(cluster * np.log2(cluster / sum_cluster))
    return ans
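A toy usage sketch, assuming convert_assignment_to_clusters groups rows by cluster index as sketched above:

import numpy as np

# Joint distribution P(C, W): 4 words (rows) x 2 classes (columns), total mass 1.
toy = np.array([[0.20, 0.05],
                [0.15, 0.10],
                [0.05, 0.20],
                [0.05, 0.20]])
toy_assignment = np.array([0, 0, 1, 1])   # two clusters of two words each
print(partition_entropy_rg(toy_assignment, toy, k=2))  # weighted entropy of the partition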
Example #4
    def k_means(self, threshold=1e-30, n_iters=50):
        # step 2
        m = len(self.data)
        self.pi_cluster = [None] * self.clusters.shape[0]
        prev_q = 0
        for iter_ in range(n_iters):
            # expectation: evaluate the current value of the objective q
            q = self.cal_q(self.clusters, self.data)
            diff = q - prev_q
            prev_q = q
            if np.abs(diff) > threshold:
                # For each non-empty cluster W_j, compute its "centroid" p(C|W_j)
                # (notation from the paper), then reassign every point to the
                # centroid with the smallest KL divergence.
                for idx, cluster in enumerate(self.clusters):
                    if len(cluster) != 0:
                        p_c_given_cluster_j = self.cal_cluster(
                            np.array(cluster))
                        self.pi_cluster[idx] = p_c_given_cluster_j
                self.pi_cluster = np.asarray(self.pi_cluster)
                self.pi_cluster = self.pi_cluster.reshape(
                    (self.pi_cluster.shape[0], -1))

                # maximization step: reassign each point to its closest centroid
                for idx, d in enumerate(self.data):
                    d = d.T
                    kl_div = self.cal_kl_div_from_pt_to_centroids(
                        d, np.array(self.pi_cluster))
                    new_assn = np.argmin(kl_div)
                    self.assignment[idx] = new_assn
                self.clusters = convert_assignment_to_clusters(
                    self.assignment, self.data)

                self.impurity, e, r_max = partition_entropy(self.assignment,
                                                            self.data,
                                                            converted=False)
            else:
                break
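The maximization step uses cal_kl_div_from_pt_to_centroids, which is not shown here; presumably it returns the KL divergence from the point's class distribution to each centroid. A hypothetical sketch (the eps guard and the base-2 logarithm are assumptions):

import numpy as np

def cal_kl_div_from_pt_to_centroids(point, centroids, eps=1e-15):
    # KL(p(C|w_t) || p(C|W_j)) for every centroid row j.
    p = point / np.sum(point)
    q = centroids / np.sum(centroids, axis=1, keepdims=True)
    return np.sum(p * np.log2((p + eps) / (q + eps)), axis=1)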
Example #5
def partition_entropy(assignment, data, k):
    m, n = data.shape

    clusters = convert_assignment_to_clusters(assignment, data)
    ans = 0   # weighted conditional entropy  sum_z P(z) * H(C|z)
    e = 0     # expected maximum  sum_z P(z) * max_c p(c|z)

    for cluster in clusters:
        if len(cluster) != 0:
            p_z = np.sum(cluster)                    # P(z): total mass of the cluster
            p_x_z_joint = np.sum(cluster, axis=0)    # joint P(C, z)
            p_x_given_z = p_x_z_joint / p_z          # conditional p(C|z)
            e += p_z * np.max(p_x_given_z)
            ans += p_z * entropy(p_x_given_z, base=2)   # P(z) * H(C|z) in bits

    # f and g are the bound functions defined elsewhere in the module
    U = f(e) + (n - 1) * f((1 - e) / (n - 1))  # upper bound as a function of e
    L = g(e)                                   # lower bound as a function of e

    return ans, e, U / L
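For reference, the per-cluster term accumulated into ans is the same quantity that partition_entropy_rg computes directly from the joint masses. A quick standalone check of that equivalence (assuming entropy is scipy.stats.entropy, as the base=2 keyword suggests):

import numpy as np
from scipy.stats import entropy

# One toy cluster: two word rows of the joint P(C, W) restricted to this cluster.
cluster = np.array([[0.20, 0.05],
                    [0.15, 0.10]])
p_z = np.sum(cluster)                          # P(z)
p_x_given_z = np.sum(cluster, axis=0) / p_z    # p(C|z)

term_entropy = p_z * entropy(p_x_given_z, base=2)                   # as in partition_entropy
term_rg = -np.sum(np.sum(cluster, axis=0) * np.log2(p_x_given_z))   # as in partition_entropy_rg
print(np.isclose(term_entropy, term_rg))  # True: both equal P(z) * H(C|z)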
Example #6
    def __init__(self,
                 init_type="nguyen",
                 k=0,
                 data=None,
                 assignment=None,
                 seed=None):
        # `data` is the joint distribution P(C, W): rows are words, columns are classes.
        # Every data point in the paper is a conditional distribution p(C|w_t);
        # the paper uses l for the number of document classes, denoted n here.
        # marginalize P(C)
        if seed is not None:
            np.random.seed(seed)

        self.data = data

        self.init_type = init_type
        self.k = k

        self.entropy = 0
        self.impurity = 0

        if init_type == "nguyen":
            """Initialize DC with the result of Nguyen 2020"""
            if assignment is not None:
                self.assignment = assignment
                self.clusters = np.asarray(
                    convert_assignment_to_clusters(self.assignment, data))
                self.k_means(threshold=1e-30, n_iters=1)
                # self.entropy = partition_entropy_rg(self.assignment, self.data, self.k)
                self.impurity, _, _ = partition_entropy(self.assignment,
                                                        self.data,
                                                        self.k,
                                                        converted=False)
            else:
                raise ValueError("init_type 'nguyen' requires an initial assignment")
        else:
            """Initialize DC following the original paper"""
            # initial assignment: w_t goes to cluster j where p(c_j|w_t) = max_i p(c_i|w_t); k = n initially
            self.assignment = self.argmax_randtie_masking_generic(data, axis=1)
            clusters = convert_assignment_to_clusters(self.assignment,
                                                      self.data)
            self.clusters = clusters

            _, n = data.shape
            if k > n:
                # split each cluster arbitrarily into at least floor(k/n) clusters
                # (the paper writes l for the number of classes, denoted n here)
                n_to_split = k // n
                new_clusters = []
                for cluster in clusters:
                    if len(cluster) > n_to_split:
                        split_arrs = np.array_split(np.array(cluster),
                                                    n_to_split)
                        new_clusters += split_arrs
                    else:
                        # keep small clusters whole rather than dropping their points
                        new_clusters.append(np.array(cluster))
                while len(new_clusters) < k:
                    # split the currently largest cluster in two until there are k
                    len_list = [len(c) for c in new_clusters]
                    max_idx = np.argmax(len_list)
                    largest = new_clusters.pop(max_idx)
                    new_clusters += np.array_split(np.array(largest), 2)
                self.clusters = new_clusters
            elif k < n:
                for i in range(k, len(clusters)):
                    clusters[k - 1] += clusters[i]
                self.clusters = clusters[:k]
            self.clusters = np.asarray(self.clusters)
            self.impurity = partition_entropy_rg(self.assignment, self.data, self.k)
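The k < n branch merges every cluster past index k-1 into cluster k-1. A tiny standalone illustration of that merging rule, with plain Python lists of word indices standing in for clusters of data rows:

# four initial clusters, target k = 2
clusters = [[0], [1, 2], [3], [4, 5]]
k = 2
for i in range(k, len(clusters)):
    clusters[k - 1] += clusters[i]
print(clusters[:k])  # [[0], [1, 2, 3, 4, 5]]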