import numpy as np

# Repository-local modules used throughout these experiments.
import data
import eclust
import run_clustering


def gauss_dimensions_mean(dimensions=range(2, 100, 20), total_points=200,
                          num_experiments=100, d=None):
    """Two Gaussians in D dimensions whose means differ by delta in the
    first d coordinates; cluster sizes are drawn from a fair multinomial."""
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for i in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            # flip Bernoulli coins to get the number of points in each cluster
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            # get data, construct the Gram matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            # cluster with different algorithms; the number of runs per
            # experiment and the initialization can be changed here
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)
            count += 1
    return table
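# Usage sketch (not in the original script): summarize a results table by
# averaging each algorithm's score over the experiments for every value in
# column 0 (here the dimension D). The default algorithm order matches the
# column assignments in gauss_dimensions_mean above; pass matching names for
# the energy_spectral variants further below.
def summarize_table(table, algos=("energy_hartigan", "energy_lloyd",
                                  "spectral", "kmeans", "gmm")):
    for v in np.unique(table[:, 0]):
        rows = table[table[:, 0] == v]
        print(int(v), dict(zip(algos, np.round(rows[:, 1:].mean(axis=0), 3))))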
def gauss_dimensions_cov(dimensions=range(2, 100, 20), total_points=200,
                         num_experiments=100, d=10):
    """High dimensions but with nontrivial covariance.

    Note: every D in dimensions must be at least d, since the first d
    coordinates carry the mean shift and the inflated variances."""
    k = 2
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for l in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            # diagonal entries drawn once from Uniform(1, 5)
            s2_1 = np.array([1.367, 3.175, 3.247, 4.403, 1.249,
                             1.969, 4.035, 4.237, 2.813, 3.637])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            # the number of runs per experiment and the initialization
            # method can be changed here
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)
            count += 1
    return table
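# Sketch (not in the original script): the hard-coded diagonal s2_1 in
# gauss_dimensions_cov is described as drawn from Uniform(1, 5); a fresh
# diagonal of the same kind could be generated with:
def random_cov_diagonal(d=10, low=1.0, high=5.0):
    """Return d variances drawn uniformly from [low, high)."""
    return np.random.uniform(low, high, size=d)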
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters: cluster sizes are multinomial with
    proportions pi1 = (N - p)/(2N) and pi2 = (N + p)/(2N)."""
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0
    for p in num_points:
        for i in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            pi1 = (N - p) / (2.0 * N)
            pi2 = (N + p) / (2.0 * N)
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)
            count += 1
    return table
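# Worked example (not in the original script) of the mixing proportions in
# gauss_dimensions_pi: for N = 300 and p = 100,
#   pi1 = (300 - 100) / (2 * 300) = 1/3,  pi2 = (300 + 100) / (2 * 300) = 2/3,
# so the second cluster receives roughly twice as many points on average.
N_demo, p_demo = 300, 100
pi1_demo = (N_demo - p_demo) / (2.0 * N_demo)
pi2_demo = (N_demo + p_demo) / (2.0 * N_demo)
assert abs(pi1_demo + pi2_demo - 1.0) < 1e-12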
def normal_or_lognormal_difference(numpoints=range(10, 100, 10),
                                   num_experiments=100, kind='normal'):
    """Compare energy_hartigan against energy_lloyd and spectral clustering
    on normal or lognormal data, storing per-run score differences."""
    k = 2
    table = []
    for n in numpoints:
        for i in range(num_experiments):
            this_res = [n]
            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5 * np.eye(D)
            m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])
            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2],
                                                   [n1, n2])
            rho = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
            G = eclust.kernel_matrix(X, rho)
            hart = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            lloyd = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            spectral = run_clustering.spectral(k, X, G, z, run_times=5)
            this_res.append(hart - lloyd)
            this_res.append(hart - spectral)
            table.append(this_res)
    return np.array(table)
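# Sketch (not in the original script): normal_or_lognormal_difference stores
# hart - lloyd and hart - spectral per run, so the fraction of runs in which
# energy_hartigan wins can be tabulated per sample size n as follows.
def win_fractions(table):
    """Print, for each n, how often hart beats lloyd and spectral."""
    for n in np.unique(table[:, 0]):
        rows = table[table[:, 0] == n]
        print(int(n),
              "hart > lloyd:", float(np.mean(rows[:, 1] > 0)),
              "hart > spectral:", float(np.mean(rows[:, 2] > 0)))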
def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          num_points=(100, 100), num_experiments=100, d=None):
    """Variant of gauss_dimensions_mean above: cluster sizes n1, n2 are
    fixed rather than multinomial, and energy_spectral replaces spectral.
    If both definitions live in one module, this one shadows the first."""
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for i in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            # the number of runs per experiment and the initialization
            # method can be changed here
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)
            count += 1
    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters. Variant of gauss_dimensions_pi above with
    deterministic cluster sizes n1 = N - p, n2 = N + p (2N = 500 points in
    total) and energy_spectral in place of spectral."""
    k = 2
    D = 4
    d = 2
    N = 250
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0
    for p in num_points:
        for i in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            n1 = N - p
            n2 = N + p
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)
            count += 1
    return table
# Fragment from a separate real-data evaluation script: here `data` is a
# feature matrix (not the data module above), and `true_labels`,
# `delete_missing`, `rho`, and the `energy` and `cluster` modules are
# defined or imported earlier in that script.
import sklearn.metrics
from prettytable import PrettyTable

data = np.delete(data, delete_missing, axis=0)
data = np.array(data, dtype=float)
true_labels = np.delete(true_labels, delete_missing, axis=0)

# normalize data
data = (data - data.mean(axis=0)) / data.std(axis=0)

G = energy.eclust.kernel_matrix(data, rho)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

kmeans_labels = cluster.kmeans(6, data, run_times=10, init="k-means++")
gmm_labels = cluster.gmm(6, data, run_times=10, init="kmeans")
spectral_labels = cluster.spectral(6, data, G, run_times=10)
energy_spectral_labels = cluster.energy_spectral(6, data, G, run_times=10)
lloyd_labels = cluster.energy_lloyd(6, data, G, run_times=10,
                                    init="spectral")
hart_labels = cluster.energy_hartigan(6, data, G, run_times=10,
                                      init="spectral")

t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand', 'Mutual Info',
                 'V-Measure', 'Fowlkes-Mallows'])
algos = ['kmeans', 'GMM', 'spectral', 'energy_spectral', 'energy_lloyd',
         'energy_hartigan']
pred_labels = [kmeans_labels, gmm_labels, spectral_labels,
               energy_spectral_labels, lloyd_labels, hart_labels]
for algo, pred_label in zip(algos, pred_labels):
    t.add_row([algo,
               energy.metric.accuracy(true_labels, pred_label),
               sklearn.metrics.adjusted_rand_score(true_labels, pred_label),
               sklearn.metrics.adjusted_mutual_info_score(true_labels,
                                                          pred_label),
               # last two columns completed to match the header above
               sklearn.metrics.v_measure_score(true_labels, pred_label),
               sklearn.metrics.fowlkes_mallows_score(true_labels,
                                                     pred_label)])
print(t)
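# The commented-out kernel lines above reference rho_gauss and rho_exp,
# which this fragment does not define. Hypothetical definitions (by analogy
# with the semimetric 2 - 2*exp(-||x - y||/2) used in
# normal_or_lognormal_difference; the original forms may differ):
rho_exp = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
rho_gauss = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) ** 2 / 2)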