def compute_gradient(local_X, local_cluster_labels, local_centroids, lr):
    """
    Compute local gradient

    Input: local_X, local_cluster_labels, local_centroids as above
           lr - the learning rate (float)

    Output: local_grad - local gradients as a list of k many gradients
    """
    m, n = get_data_dims(local_X)
    local_grad = [np.zeros([m, n]) for _ in local_centroids]
    for x, i in zip(local_X, local_cluster_labels):
        local_grad[i] += lr * (x - local_centroids[i])
    return local_grad
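# local.gradient_step(...) is called by the drivers further down but is not part of
# this listing. A minimal sketch of one possible implementation, assuming the
# aggregated gradients already carry the learning rate (compute_gradient above
# multiplies by lr) and that numpy is imported as np, as in the surrounding module:
def gradient_step(local_grad, local_centroids):
    """Hypothetical helper: take one update step per centroid and return
    [updated_centroids, previous_centroids]."""
    previous = [np.copy(w) for w in local_centroids]
    # Each gradient already points from the centroid toward its assigned points
    updated = [w + g for w, g in zip(local_centroids, local_grad)]
    return [updated, previous]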
def pp_init(local_X, k):
    """Do a version of KM++ initialization"""
    ind = np.random.choice(len(local_X), 1)[0]
    m, n = get_data_dims(local_X)
    X_flat = [np.matrix(x.reshape(1, m*n)) for x in local_X]
    first = X_flat[ind]
    xcopy = copy.deepcopy(local_X)
    del X_flat[ind]
    del xcopy[ind]
    # Squared correlation distances to the first centroid, normalized into a
    # probability distribution over the remaining instances
    D = [cdist(x, first, metric='correlation')**2 for x in X_flat]
    D = np.array(D).flatten()
    norm = np.sum(D)
    D = np.array([d/norm for d in D])
    # Note: this draw is with replacement, so duplicate centroids are possible
    remain = [xcopy[i] for i in np.random.choice(len(xcopy), k-1, p=D)]
    return [local_X[ind]] + remain
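# Both drivers call local.initialize_own_centroids(node, k), which is not shown in
# this listing. A minimal sketch, assuming it simply samples k distinct local data
# points as starting centroids (pp_init above is the k-means++-style alternative):
def initialize_own_centroids(local_X, k):
    """Hypothetical helper: pick k distinct local instances as initial centroids."""
    idx = np.random.choice(len(local_X), k, replace=False)
    return [local_X[i] for i in idx]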
def check_stopping(local_centroids, previous_centroids, epsilon):
    """
    Check if centroids have changed beyond some epsilon tolerance

    Input: local_centroids as above
           previous_centroids - the centroids from the prior iteration
           epsilon - the tolerance threshold (float)

    Output: (not_converged, delta) - not_converged is True if delta is above
            the threshold, else False; delta is the total change in centroids
    """
    m, n = get_data_dims(local_centroids)
    flat_centroids = [np.matrix(w.reshape(1, m*n)) for w in local_centroids]
    flat_previous = [np.matrix(w.reshape(1, m*n)) for w in previous_centroids]
    # delta is the change in centroids, computed by the correlation distance metric
    delta = np.sum([cdist(w, flat_previous[k], metric='correlation')
                    for k, w in enumerate(flat_centroids)])
    return delta > epsilon, delta
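# get_data_dims is used by every function in this listing but is not defined here.
# A minimal sketch, assuming each instance (and each centroid) is an m-by-n numpy
# array and that numpy is imported as np elsewhere in the module:
def get_data_dims(local_X):
    """Hypothetical helper: return the (m, n) shape shared by all instances."""
    m, n = np.shape(local_X[0])
    return m, n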
def compute_mean(local_X, local_cluster_labels, k):
    """
    Compute the local mean, which is broadcast back to the aggregator

    Input: local_X, local_cluster_labels, k as above

    Output: list of k many local mean matrices, shape m x n
    """
    m, n = get_data_dims(local_X)
    npinf = np.zeros([m, n])
    local_means = [[] for i in range(k)]
    for i in range(len(local_cluster_labels)):
        local_means[local_cluster_labels[i]] += [local_X[i]]
    # Return the origin if no instances have been assigned to a given cluster
    # !!! is this the way to handle this?
    return [np.mean(lmean, 0) if lmean else npinf for lmean in local_means]
def compute_clustering(local_X, local_centroids):
    """
    Compute local clustering by associating each data instance with the
    nearest centroid

    Input: local_X, centroids as above

    Output: cluster_labels - a list of N many integers, the labels for each instance
    """
    cluster_labels = []
    m, n = get_data_dims(local_X)
    X_flat = [np.matrix(x.reshape(1, m*n)) for x in local_X]
    w_flat = [np.matrix(w.reshape(1, m*n)) for w in local_centroids]
    for x in X_flat:
        distances = [cdist(x, w, metric='correlation') for w in w_flat]
        cluster_labels.append(int(np.argmin(distances)))
    return cluster_labels
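# A quick hypothetical smoke test for compute_clustering on synthetic data,
# assuming numpy, scipy's cdist, and the get_data_dims sketch above are available:
# X_demo = [np.random.rand(4, 5) for _ in range(10)]
# demo_labels = compute_clustering(X_demo, X_demo[:3])  # ten labels in {0, 1, 2}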
def compute_mean(local_X, local_cluster_labels, k):
    """
    Compute the local mean, which is broadcast back to the aggregator

    Input: local_X, local_cluster_labels, k as above

    Output: list of k many local mean matrices, shape m x n
    """
    m, n = get_data_dims(local_X)
    npinf = np.zeros([m, n])
    local_means = [np.zeros([m, n]) for i in range(k)]
    local_counts = [0]*k
    for i, label in enumerate(local_cluster_labels):
        local_means[label] += local_X[i]
        local_counts[label] += 1
    # Return the origin if no instances have been assigned to a given cluster
    # !!! is this the way to handle this?
    return [lmean/lcount if lcount > 0 else npinf
            for lmean, lcount in zip(local_means, local_counts)]
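# Both drivers also call local.mean_step(...), which is not included in this
# listing. A minimal sketch, assuming Lloyd's update simply replaces each centroid
# with the (aggregated) cluster mean and also reports the previous centroids:
def mean_step(local_means, local_centroids):
    """Hypothetical helper: return [updated_centroids, previous_centroids]."""
    previous = [np.copy(w) for w in local_centroids]
    updated = [np.copy(mu) for mu in local_means]
    return [updated, previous]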
def main(X, k, optimization='lloyd', s=2, epsilon=0.00001, shuffle=True,
         lr=0.001, verbose=True):
    m, n = get_data_dims(X)
    nodes, inds = split_over_nodes(X, s, shuffle=shuffle)
    X = [X[i] for i in inds]  # Reshuffle X to match the random split ordering
    tracked_delta = []
    num_iter = 0
    not_converged = True

    # Have each site compute k initial clusters locally
    local_centroids = [cent for node in nodes
                       for cent in local.initialize_own_centroids(node, k)]
    # and select k random clusters from the s*k pool
    np.random.shuffle(local_centroids)
    remote_centroids = local_centroids[:k]

    # Remote Optimization Loop
    while not_converged:
        cluster_labels = [None for j in range(s)]   # the clusterings
        local_optimizer = [None for j in range(s)]  # the optimization entity

        # Local computation loop
        for i, node in enumerate(nodes):
            # Each site computes its local clusters
            cluster_labels[i] = \
                local.compute_clustering(node, remote_centroids)
            if optimization == 'lloyd':
                # Lloyd has sites compute means locally
                local_optimizer[i] = local.compute_mean(node,
                                                        cluster_labels[i], k)
            elif optimization == 'gradient':
                # Gradient descent has sites compute gradients locally
                local_optimizer[i] = \
                    local.compute_gradient(node, cluster_labels[i],
                                           remote_centroids, lr)
        # End of Local Computations

        # Both objects can be aggregated by taking a sum
        remote_optimizer = remote.aggregate_sum(local_optimizer)
        if optimization == 'lloyd':
            # and for the mean, we further divide by the number of sites
            remote_optimizer = [r / s for r in remote_optimizer]

            # Then, update centroids to the corresponding aggregated mean
            [remote_centroids, previous] = \
                local.mean_step(remote_optimizer, remote_centroids)
        elif optimization == 'gradient':
            # Then, update centroids according to one step of gradient descent
            [remote_centroids, previous] = \
                local.gradient_step(remote_optimizer, remote_centroids)

        # Check the stopping condition "locally" at the aggregator
        # - returns False if converged
        remote_check, delta = local.check_stopping(remote_centroids,
                                                   previous, epsilon)
        if verbose:
            print("Multi-Shot %s ; iter : %d delta : %f"
                  % (optimization, num_iter, delta))
        not_converged = remote_check
        tracked_delta.append(delta)
        num_iter += 1

    # Compute the final clustering "locally" at the aggregator
    cluster_labels = [clusters for node in nodes
                      for clusters in local.compute_clustering(node,
                                                               remote_centroids)]
    return {'centroids': remote_centroids, 'cluster_labels': cluster_labels,
            'X': X, 'delta': tracked_delta, 'num_iter': num_iter,
            'name': 'multishot_%s' % optimization}
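# remote.aggregate_sum(...) is referenced above but not included in this listing.
# A minimal sketch, assuming it sums the per-site lists element-wise so that each
# of the k entries becomes the sum of that entry over all s sites:
def aggregate_sum(local_optimizer):
    """Hypothetical helper: element-wise sum of per-site means or gradients."""
    return [np.sum(per_centroid, axis=0)
            for per_centroid in zip(*local_optimizer)]

# Hypothetical usage of the multi-shot driver, assuming X is a list of N
# m-by-n numpy arrays:
# result = main(X, k=3, optimization='lloyd', s=4)
# print(result['num_iter'], len(result['centroids']))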
def main(X, k, optimization='lloyd', s=2, epsilon=0.00001, shuffle=True,
         lr=0.01, verbose=True):
    """
    Local Variables - X: a list of N many m x n matrices storing data
                      k: number of clusters (int)
                      local_centroids: an s x k 2-d list of m x n matrices
                          storing cluster centroids
    """
    m, n = get_data_dims(X)
    nodes, inds = split_over_nodes(X, s, shuffle=shuffle)
    X = [X[i] for i in inds]  # Reshuffle X to match the random split ordering
    tracked_delta = []
    num_iter = 0
    not_converged = True

    # Have each site compute k initial clusters locally
    local_centroids = [local.initialize_own_centroids(node, k) for node in nodes]

    # Local Optimization Loop
    while not_converged:
        cluster_labels = [None for j in range(s)]   # the clusterings
        local_delta = [None for j in range(s)]      # Track all local deltas
        local_stop = [False for j in range(s)]      # And all local stopping conditions
        for i, node in enumerate(nodes):
            # Each local site computes its clustering
            cluster_labels[i] = \
                local.compute_clustering(node, local_centroids[i])
            if optimization == 'lloyd':
                # Computes its local mean if doing Lloyd, and updates centroids
                local_means = local.compute_mean(node, cluster_labels[i], k)
                [local_centroids[i], previous_centroids] = \
                    local.mean_step(local_means, local_centroids[i])
            elif optimization == 'gradient':
                # Computes the local gradient if doing GD, and takes a GD step
                local_grad = local.compute_gradient(node, cluster_labels[i],
                                                    local_centroids[i], lr)
                [local_centroids[i], previous_centroids] = \
                    local.gradient_step(local_grad, local_centroids[i])
            # Check local stopping conditions
            local_stop[i], local_delta[i] = \
                local.check_stopping(local_centroids[i],
                                     previous_centroids, epsilon)
        num_iter += 1
        tracked_delta.append(local_delta)
        if verbose:
            print("Single-Shot %s ; iter : %d delta : %f"
                  % (optimization, num_iter, max(local_delta)))

        # If any of the sites are still iterating, keep the global loop running
        # TODO: we can save computations by locally waiting if local
        # conditions are met
        not_converged = any(local_stop)

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    # And compute the final global clustering
    cluster_labels = [clusters for node in nodes
                      for clusters in local.compute_clustering(node,
                                                               remote_centroids)]
    return {'centroids': remote_centroids, 'cluster_labels': cluster_labels,
            'X': X, 'delta': tracked_delta, 'iter': num_iter,
            'name': 'singleshot_%s' % optimization}
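# split_over_nodes(X, s, shuffle=...) is used by both drivers but not shown here.
# A minimal sketch, assuming it deals the N instances into s roughly equal-sized
# "sites" and also returns the (possibly shuffled) index order it used, so the
# caller can re-order X to match:
def split_over_nodes(X, s, shuffle=True):
    """Hypothetical helper: partition X into s sites; return (nodes, inds)."""
    inds = list(range(len(X)))
    if shuffle:
        np.random.shuffle(inds)
    chunks = np.array_split(inds, s)
    nodes = [[X[i] for i in chunk] for chunk in chunks]
    return nodes, [i for chunk in chunks for i in chunk]

# Hypothetical end-to-end usage of the single-shot driver:
# X = [np.random.rand(4, 5) for _ in range(200)]
# out = main(X, k=3, optimization='gradient', s=4, verbose=False)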