def energy1D(X, z):
    """Cluster one-dimensional data into two groups and score the result.

    The split comes from ``energy1d.two_clusters1D``; the original author
    notes that this procedure is exact, so no restarts are needed.

    Parameters
    ----------
    X : data handed unchanged to ``energy1d.two_clusters1D``.
    z : reference labels the estimated labels are compared against.

    Returns
    -------
    The value of ``metric.accuracy(z, zh)`` for the estimated labels ``zh``.
    """
    # The second returned value (the cost) is not needed for scoring.
    estimated, _cost = energy1d.two_clusters1D(X)
    return metric.accuracy(z, estimated)
def kmeans(k, X, z, run_times=10, init='k-means++'):
    """Cluster ``X`` with ``KMeans`` and score the labels against ``z``.

    Parameters
    ----------
    k : number of clusters, passed as the first argument to ``KMeans``.
    X : data the model is fitted on.
    z : reference labels used for scoring.
    run_times : number of times the algorithm is run (forwarded as
        ``n_init``).
    init : initialization scheme, one of {'k-means++', 'random'}.

    Returns
    -------
    (accuracy, variation_information) as computed by ``metric``.
    """
    model = KMeans(k, n_init=run_times, init=init)
    model.fit(X)
    predicted = model.labels_
    return (metric.accuracy(z, predicted),
            metric.variation_information(z, predicted))
def spectral(k, X, G, z, run_times=10):
    """Spectral clustering (sklearn) on a precomputed affinity matrix.

    Parameters
    ----------
    k : number of clusters.
    X : unused here; kept so the signature matches the other runners.
    G : matrix handed to ``fit_predict`` with ``affinity='precomputed'``.
    z : reference labels used for scoring.
    run_times : number of runs with different initializations (forwarded
        as ``n_init``).

    Returns
    -------
    (accuracy, variation_information) as computed by ``metric``.
    """
    predicted = SpectralClustering(
        k, affinity='precomputed', n_init=run_times
    ).fit_predict(G)
    return (metric.accuracy(z, predicted),
            metric.variation_information(z, predicted))
def gmm(k, X, z, run_times=10, init='kmeans'):
    """Gaussian mixture clustering (sklearn ``GMM``) scored against ``z``.

    Parameters
    ----------
    k : number of mixture components.
    X : data the mixture is fitted on and then predicted for.
    z : reference labels used for scoring.
    run_times : number of runs with different initializations (forwarded
        as ``n_init``).
    init : initialization scheme, one of {'kmeans', 'random'} (forwarded
        as ``init_params``).

    Returns
    -------
    (accuracy, variation_information) as computed by ``metric``.
    """
    mixture = GMM(k, n_init=run_times, init_params=init)
    mixture.fit(X)
    predicted = mixture.predict(X)
    return (metric.accuracy(z, predicted),
            metric.variation_information(z, predicted))
def energy_hartigan(k, X, G, z, run_times=10, init="spectral"):
    """Run ``eclust.energy_hartigan`` several times and score the best run.

    Each restart builds an initial assignment with
    ``initialize(init, k, G, X)``, runs ``eclust.energy_hartigan`` with
    ``max_iter=300`` and evaluates ``eclust.objective`` on the result.  The
    labeling with the largest objective value is the one that gets scored.

    Parameters
    ----------
    k : number of clusters.
    X : data, forwarded to ``initialize``.
    G : kernel matrix, forwarded to ``initialize``, the clustering routine
        and the objective.
    z : reference labels used for scoring.
    run_times : number of restarts; must be at least 1.
    init : name of the initialization scheme, forwarded to ``initialize``.

    Returns
    -------
    (accuracy, variation_information) of the best labeling, as computed by
    ``metric``.

    Raises
    ------
    ValueError : if ``run_times`` is smaller than 1.
    RuntimeError : if no restart produced an objective value greater than
        ``-inf`` (for example, every score was NaN).
    """
    if run_times < 1:
        raise ValueError("run_times must be at least 1, got %r" % (run_times,))

    best_score = -np.inf
    best_z = None
    for _ in range(run_times):
        Z0 = initialize(init, k, G, X)
        zh = eclust.energy_hartigan(k, G, Z0, max_iter=300)
        score = eclust.objective(eclust.ztoZ(zh), G)
        if score > best_score:
            best_score = score
            best_z = zh

    # Without this guard an all-NaN run would surface as an opaque
    # UnboundLocalError further down.
    if best_z is None:
        raise RuntimeError(
            "no restart produced a comparable objective value")

    a = metric.accuracy(z, best_z)
    v = metric.variation_information(z, best_z)
    return a, v
def energy_spectral(k, X, G, z, run_times=10, init="random"):
    """Run ``initialization.topeigen`` several times and score the best run.

    Each restart obtains a labeling from ``initialization.topeigen`` and
    evaluates ``eclust.objective`` on it.  The labeling with the largest
    objective value is the one that gets scored.

    Parameters
    ----------
    k : number of clusters.
    X : unused here; kept so the signature matches the other runners.
    G : kernel matrix, forwarded to ``topeigen`` and the objective.
    z : reference labels used for scoring.
    run_times : number of restarts; must be at least 1.  The same value is
        also forwarded to ``topeigen``.
    init : initialization for the k-means step, which can be 'k-means++'
        or 'random'; forwarded to ``topeigen``.

    Returns
    -------
    (accuracy, variation_information) of the best labeling, as computed by
    ``metric``.

    Raises
    ------
    ValueError : if ``run_times`` is smaller than 1.
    RuntimeError : if no restart produced an objective value greater than
        ``-inf`` (for example, every score was NaN).
    """
    if run_times < 1:
        raise ValueError("run_times must be at least 1, got %r" % (run_times,))

    best_score = -np.inf
    best_z = None
    for _ in range(run_times):
        # Honor the caller's ``init`` choice instead of a fixed "random".
        # NOTE(review): run_times is used for this outer loop and is also
        # passed to topeigen, so restarts presumably multiply -- confirm
        # this is intended.
        zh = initialization.topeigen(k, G, run_times=run_times, init=init)
        score = eclust.objective(eclust.ztoZ(zh), G)
        if score > best_score:
            best_score = score
            best_z = zh

    # Without this guard an all-NaN run would surface as an opaque
    # UnboundLocalError further down.
    if best_z is None:
        raise RuntimeError(
            "no restart produced a comparable objective value")

    a = metric.accuracy(z, best_z)
    v = metric.variation_information(z, best_z)
    return a, v
# Draw the same number of points from each of the two classes (sampling
# with replacement), then compare k-means, GMM and energy-Hartigan on them.
n0 = 500
n1 = 500
data_class0 = data[np.where(labels == 0)]
data_class1 = data[np.where(labels == 1)]
idx0 = np.random.choice(range(len(data_class0)), n0, replace=True)
idx1 = np.random.choice(range(len(data_class1)), n1, replace=True)
data, labels = shuffle_data([data_class0[idx0], data_class1[idx1]])

# Optional per-feature standardization (currently disabled):
#data = (data - data.mean(axis=0))/data.std(axis=0)

# Function used to build the kernel matrix: Euclidean norm raised to the
# power 1.  An alternative Gaussian-type form is kept below for reference.
rho = lambda x, y: np.power(np.linalg.norm(x - y), 1)
#rho = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),1)/(2*1**2))
G = eclust.kernel_matrix(data, rho)

# print(expr) with a single argument and print("") produce the same output
# under Python 2's print statement and Python 3's print function.
labels_hat = run_clustering.kmeans(2, data)
print(accuracy(labels, labels_hat))
print(type_errors(labels, labels_hat))
print("")

labels_hat = run_clustering.gmm(2, data)
print(accuracy(labels, labels_hat))
print(type_errors(labels, labels_hat))
print("")

labels_hat = run_clustering.energy_hartigan(2, data, G, run_times=5,
                                            init="gmm")
print(accuracy(labels, labels_hat))
print(type_errors(labels, labels_hat))
# Draw the same number of points from each of the two classes (sampling
# with replacement), then compare k-means, GMM and energy-Hartigan on them.
n0 = 500
n1 = 500
data_class0 = data[np.where(labels == 0)]
data_class1 = data[np.where(labels == 1)]
idx0 = np.random.choice(range(len(data_class0)), n0, replace=True)
idx1 = np.random.choice(range(len(data_class1)), n1, replace=True)
data, labels = shuffle_data([data_class0[idx0], data_class1[idx1]])

# Optional per-feature standardization (currently disabled):
#data = (data - data.mean(axis=0))/data.std(axis=0)

# Function used to build the kernel matrix: Euclidean norm raised to the
# power 1.  An alternative Gaussian-type form is kept below for reference.
rho = lambda x, y: np.power(np.linalg.norm(x - y), 1)
#rho = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),1)/(2*1**2))
G = eclust.kernel_matrix(data, rho)

# print(expr) with a single argument and print("") produce the same output
# under Python 2's print statement and Python 3's print function.
labels_hat = run_clustering.kmeans(2, data)
print(accuracy(labels, labels_hat))
print(type_errors(labels, labels_hat))
print("")

labels_hat = run_clustering.gmm(2, data)
print(accuracy(labels, labels_hat))
print(type_errors(labels, labels_hat))
print("")

labels_hat = run_clustering.energy_hartigan(2, data, G, run_times=5,
                                            init="gmm")
print(accuracy(labels, labels_hat))
print(type_errors(labels, labels_hat))
print("")