def k_means(self, threshold=1e-30, n_iters=20):
    # step 2
    m = len(self.data)
    self.pi_cluster = [None] * self.clusters.shape[0]
    prev_assn = np.zeros((1, m))
    for iter_ in range(n_iters):
        # For each cluster, calculate its "centroid", denoted p(C|W_j) in the paper,
        # then calculate the distance from each point to every cluster centroid.
        # If a point is closer to another cluster than to its currently assigned one,
        # reassign that point to the closer cluster.
        for idx, cluster in enumerate(self.clusters):
            if len(cluster) != 0:
                p_c_given_cluster_j = self.cal_cluster_centroid(np.array(cluster))
                self.pi_cluster[idx] = p_c_given_cluster_j
        self.pi_cluster = np.asarray(self.pi_cluster)
        self.pi_cluster = self.pi_cluster.reshape((self.pi_cluster.shape[0], -1))
        # maximization step
        prev_assn = self.assignment
        for idx, d in enumerate(self.data):
            gini_distance = self.cal_distance(d, self.pi_cluster)
            new_assn = np.argmin(gini_distance)
            self.assignment[idx] = new_assn
        self.clusters = convert_assignment_to_clusters(self.assignment, self.data)
        self.gini, _, _ = gini(self.assignment, self.data, self.k)
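# The loop above relies on helpers that are not shown in this excerpt. Below is a
# minimal sketch of two of them, inferred from how they are called; the names are
# kept but the bodies are assumptions, not the original implementation (in the
# source, cal_cluster_centroid is a method of the class). The "centroid" p(C|W_j)
# is taken as the renormalised sum of the cluster's joint rows, and
# convert_assignment_to_clusters simply groups data rows by their cluster label.

import numpy as np


def convert_assignment_to_clusters(assignment, data):
    # Group the rows of `data` by cluster label; cluster j holds the joint
    # rows p(w_t, C) of every word currently assigned to it.
    n_clusters = int(np.max(assignment)) + 1
    clusters = [[] for _ in range(n_clusters)]
    for row, label in zip(data, assignment):
        clusters[int(label)].append(row)
    return clusters


def cal_cluster_centroid(cluster):
    # "Centroid" p(C | W_j): sum the cluster's joint rows and renormalise.
    joint = np.sum(cluster, axis=0)   # p(W_j, C)
    return joint / np.sum(joint)      # p(C | W_j)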
def __init__(self, k=0, data=None, assignment=None, seed=None):
    # data is always the joint distribution with rows being words and columns being classes.
    # However, in this paper, all the data points are conditional probability distributions.
    # In the paper l is the number of document classes; here we use n to denote it.
    # marginalize P(C)
    if seed is not None:
        np.random.seed(seed)
    self.data = data
    self.k = k
    self.gini = 0

    # Initialize DC following the original paper:
    # initial assignment p(c_j|w_t) = max_i p(c_i|w_t), k = n
    data = normalize(data)
    self.assignment = self.argmax_randtie_masking_generic(data, axis=1)
    clusters = convert_assignment_to_clusters(self.assignment, self.data)
    self.clusters = clusters
    _, n = data.shape
    if k > n:
        # split each cluster arbitrarily into at least floor(k/l) clusters
        n_to_split = k // n
        new_clusters = []
        for cluster in clusters:
            splited_arrs = np.array_split(np.array(cluster), n_to_split)
            new_clusters += splited_arrs
        self.clusters = new_clusters
    elif k < n:
        # merge the surplus clusters into cluster k - 1
        for i in range(k, len(clusters)):
            clusters[k - 1] += clusters[i]
        self.clusters = clusters[:k]
    self.clusters = np.asarray(self.clusters)
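# Two more helpers the constructor relies on, sketched here under stated
# assumptions (in the source, argmax_randtie_masking_generic is a method of the
# class; the exact implementations may differ): normalize turns the joint
# p(w_t, C) into row-wise conditionals, and the argmax breaks ties uniformly at
# random so that flat rows are not all assigned to class 0.

import numpy as np


def normalize(data):
    # Row-normalise the joint distribution p(w_t, C) into p(C | w_t).
    return data / np.sum(data, axis=1, keepdims=True)


def argmax_randtie_masking_generic(data, axis=1):
    # Argmax along `axis` with ties broken at random: keep only the maximal
    # entries of each row, weight them by random noise, then take the argmax.
    max_mask = data == np.max(data, axis=axis, keepdims=True)
    return np.argmax(max_mask * np.random.rand(*data.shape), axis=axis)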
def partition_entropy_rg(assignment, data, k):
    # weighted entropy in Cicalese et al. 2019
    m, n = data.shape
    clusters = convert_assignment_to_clusters(assignment, data)
    ans = 0
    for cluster in clusters:
        if len(cluster) != 0:
            sum_cluster = np.sum(cluster)        # p(W_j)
            cluster = np.sum(cluster, axis=0)    # joint p(W_j, C)
            ans -= np.sum(cluster * np.log2(cluster / sum_cluster))
    return ans
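# Toy check of partition_entropy_rg on a 4-word / 2-class joint distribution:
# the returned value is the weighted conditional entropy sum_j p(W_j) * H(C | W_j).

import numpy as np

joint = np.array([[0.20, 0.05],
                  [0.15, 0.10],
                  [0.05, 0.20],
                  [0.10, 0.15]])
assignment = np.array([0, 0, 1, 1])                    # two clusters of two words each
print(partition_entropy_rg(assignment, joint, k=2))    # ~0.881 bits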
def k_means(self, threshold=1e-30, n_iters=50):
    # step 2
    m = len(self.data)
    self.pi_cluster = [None] * self.clusters.shape[0]
    prev_q = 0
    # prev_assn = np.zeros((1, m))
    for iter_ in range(n_iters):
        # print(iter_)
        # expectation
        q = self.cal_q(self.clusters, self.data)
        # print(q)
        diff = q - prev_q
        prev_q = q
        if np.abs(diff) > threshold:
            # converge = np.array_equal(prev_assn, self.assignment)
            # if not converge:
            # For each cluster, calculate its "centroid", denoted p(C|W_j) in the paper,
            # then calculate the KL divergence from each point to every cluster centroid.
            # If a point is closer to another cluster than to its currently assigned one,
            # reassign that point to the closer cluster.
            for idx, cluster in enumerate(self.clusters):
                if len(cluster) != 0:
                    p_c_given_cluster_j = self.cal_cluster(np.array(cluster))
                    self.pi_cluster[idx] = p_c_given_cluster_j
            self.pi_cluster = np.asarray(self.pi_cluster)
            self.pi_cluster = self.pi_cluster.reshape((self.pi_cluster.shape[0], -1))
            prev_assn = np.copy(self.assignment)
            # maximization step
            for idx, d in enumerate(self.data):
                d = d.T
                kl_div = self.cal_kl_div_from_pt_to_centroids(d, np.array(self.pi_cluster))
                new_assn = np.argmin(kl_div)
                self.assignment[idx] = new_assn
            self.clusters = convert_assignment_to_clusters(self.assignment, self.data)
            self.impurity, e, r_max = partition_entropy(self.assignment, self.data, converted=False)
        else:
            break
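# The KL-based variant above assumes two more helpers. These sketches show one
# plausible reading (names kept, bodies are assumptions; in the source they are
# methods of the class): the point-to-centroid divergences KL(p(C|w_t) || p(C|W_j)),
# and an objective q, used only to test convergence, here taken as the
# prior-weighted within-cluster KL divergence.

import numpy as np


def cal_kl_div_from_pt_to_centroids(point, centroids, eps=1e-12):
    # One KL divergence per centroid; the point is reassigned to the closest one.
    p = point / np.sum(point)                                   # p(C | w_t)
    q = centroids / np.sum(centroids, axis=1, keepdims=True)    # p(C | W_j)
    return np.sum(p * np.log2((p + eps) / (q + eps)), axis=1)


def cal_q(clusters, data, eps=1e-12):
    # Total divergence of each word from its cluster centroid, weighted by p(w_t).
    # (`data` is unused in this sketch; it is kept only to match the call site.)
    q = 0.0
    for cluster in clusters:
        if len(cluster) == 0:
            continue
        cluster = np.array(cluster)
        centroid = np.sum(cluster, axis=0) / np.sum(cluster)    # p(C | W_j)
        for row in cluster:
            p_w = np.sum(row)                                   # p(w_t)
            p = row / p_w                                       # p(C | w_t)
            q += p_w * np.sum(p * np.log2((p + eps) / (centroid + eps)))
    return q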
def partition_entropy(assignment, data, k):
    m, n = data.shape
    clusters = convert_assignment_to_clusters(assignment, data)
    eps = 1e-15
    ans = 0
    e = 0
    for cluster in clusters:
        if len(cluster) != 0:
            p_z = np.sum(cluster)
            p_x_z_joint = np.sum(cluster, axis=0)
            p_x_given_z = p_x_z_joint / p_z
            e += p_z * np.max(p_x_given_z)
            ans += p_z * entropy(p_x_given_z, base=2)
    U = f(e) + (n - 1) * f((1 - e) / (n - 1))  # upperbound for e
    L = g(e)  # lowerbound for e
    return ans, e, U / L
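# partition_entropy uses two bound helpers, f and g, from the impurity bounds of
# Cicalese et al. Only f is sketched here, under the assumption that it is the
# self-information term -x * log2(x), which would make U the Fano-style upper
# bound on the conditional entropy achievable at guessing probability e; g, the
# matching lower bound, is left to the paper. `entropy` is assumed to be
# scipy.stats.entropy (its base= keyword matches the call above).

import numpy as np
from scipy.stats import entropy


def f(x):
    # -x * log2(x), with the convention f(0) = 0.
    return 0.0 if x <= 0 else -x * np.log2(x)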
def __init__(self, init_type="nguyen", k=0, data=None, assignment=None, seed=None):
    # data is always the joint distribution with rows being words and columns being classes.
    # However, in this paper, all the data points are conditional probability distributions.
    # In the paper l is the number of document classes; here we use n to denote it.
    # marginalize P(C)
    if seed is not None:
        np.random.seed(seed)
    self.data = data
    self.init_type = init_type
    self.k = k
    self.entropy = 0
    self.impurity = 0
    if init_type == "nguyen":
        # Initialize DC with the result of Nguyen 2020
        if assignment is not None:
            self.assignment = assignment
            self.clusters = np.asarray(convert_assignment_to_clusters(self.assignment, data))
            self.k_means(threshold=1e-30, n_iters=1)
            # self.entropy = partition_entropy_rg(self.assignment, self.data, self.k)
            self.impurity, _, _ = partition_entropy(self.assignment, self.data, self.k, converted=False)
        else:
            assert assignment is not None
    else:
        # Initialize DC following the original paper:
        # initial assignment p(c_j|w_t) = max_i p(c_i|w_t), k = n
        # self.assignment = np.argmax(data, axis=1)
        self.assignment = self.argmax_randtie_masking_generic(data, axis=1)
        clusters = convert_assignment_to_clusters(self.assignment, self.data)
        self.clusters = clusters
        _, n = data.shape
        if k > n:
            # split each cluster arbitrarily into at least floor(k/l) clusters
            n_to_split = k // n
            new_clusters = []
            for cluster in clusters:
                if len(cluster) > n_to_split:
                    splited_arrs = np.array_split(np.array(cluster), n_to_split)
                    new_clusters += splited_arrs
            while len(new_clusters) < k:
                len_list = [len(new_clusters[i]) for i in range(len(new_clusters))]
                max_idx = np.argmax(len_list)
                splited_arrs = np.array_split(np.array(cluster), n_to_split)
                new_clusters += splited_arrs
            self.clusters = new_clusters
        elif k < n:
            for i in range(k, len(clusters)):
                clusters[k - 1] += clusters[i]
            self.clusters = clusters[:k]
        self.clusters = np.asarray(self.clusters)
        impurity = partition_entropy_rg(self.assignment, self.data, self.k)
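# Hypothetical end-to-end usage, assuming the constructor and the KL-based
# k_means above belong to a class named DC (the class statement is not shown in
# this excerpt) and that `joint` is a words-by-classes joint distribution whose
# entries sum to 1.

import numpy as np

rng = np.random.default_rng(0)
counts = rng.integers(1, 50, size=(100, 4)).astype(float)   # 100 words, 4 classes
joint = counts / counts.sum()

dc = DC(init_type="paper", k=10, data=joint, seed=0)   # anything but "nguyen" uses the original-paper init
dc.k_means(threshold=1e-30, n_iters=50)
print(dc.impurity)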