def gauss_dimensions_mean(dimensions=range(2, 100, 20), total_points=200,
                          num_experiments=100, d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for i in range(num_experiments):
            ### generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            # flip a fair coin per point to pick its cluster
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            # sample the data and construct the Gram matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            ### cluster with different algorithms
            # the number of runs per experiment and the initialization
            # can be changed here
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                            init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                            init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z,
                                            run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                            init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                            init="kmeans", run_times=5)
            count += 1
    return table
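# `data.multivariate_normal` lives in the repository's data module, not in
# this file. A minimal sketch consistent with how it is called above
# (lists of means, covariances and cluster sizes in; stacked data X and
# true labels z out) -- an assumption, not the actual implementation:
import numpy as np

def multivariate_normal(means, covs, sizes):
    X = np.concatenate([np.random.multivariate_normal(m, s, n)
                        for m, s, n in zip(means, covs, sizes)])
    z = np.concatenate([j * np.ones(n, dtype=int)
                        for j, n in enumerate(sizes)])
    return X, z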
def cigars_circles(num_experiments=10, run_times=5, kind='cigars'):
    table = []
    for i in range(num_experiments):
        this_experiment = []
        if kind == 'cigars':
            m1 = [0, 0]
            m2 = [6.5, 0]
            s1 = np.array([[1, 0], [0, 20]])
            s2 = np.array([[1, 0], [0, 20]])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [200, 200])
            k = 2
            init = 'k-means++'
        elif kind == '2circles':
            X, z = data.circles([1, 3], [0.2, 0.2], [400, 400])
            k = 2
            init = 'random'
        elif kind == '3circles':
            X, z = data.circles([1, 3, 5], [0.2, 0.2, 0.2], [400, 400, 400])
            init = 'random'
            k = 3
        else:
            raise ValueError("Don't know which example to sample.")

        #sigma = 2
        sigma = 1
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x, y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x, y: rho_rbf(x, y, sigma))

        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G, z, init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_half, z, init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_exp, z, init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_rbf, z, init=init,
                                           run_times=run_times))
        #this_experiment.append(
        #    run_clustering.spectral(k, X, G_exp, z, run_times=run_times))
        this_experiment.append(
            run_clustering.spectral(k, X, G_rbf, z, run_times=run_times))
        this_experiment.append(
            run_clustering.kmeans(k, X, z, init=init, run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k, X, z, init="kmeans", run_times=run_times))
        this_experiment.append(energy.metric.accuracy(z, energy.gmm.gmm(k, X)))
        table.append(this_experiment)

    table = np.array(table)
    for i in range(table.shape[1]):
        print table[:, i].mean(), scipy.stats.sem(table[:, i])
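# rho_standard, rho_half, rho_exp and rho_rbf are defined elsewhere in the
# repository. Plausible definitions, consistent with the kernel-induced
# semimetric used in normal_or_lognormal_difference below -- the exact
# forms here are an assumption:
import numpy as np

def rho_standard(x, y):
    # Euclidean distance (standard energy distance)
    return np.linalg.norm(x - y)

def rho_half(x, y):
    # Euclidean distance raised to the power 1/2
    return np.linalg.norm(x - y) ** 0.5

def rho_exp(x, y, sigma):
    # semimetric induced by an exponential (Laplacian-type) kernel
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) / (2 * sigma))

def rho_rbf(x, y, sigma):
    # semimetric induced by a Gaussian (RBF) kernel
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) ** 2 / (2 * sigma ** 2))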
def mnist(num_experiments=10, digits=[0, 1, 2], num_points=100, run_times=5):
    k = len(digits)

    f = gzip.open('experiments_data/mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    images_test, labels_test = train_set   # training set, despite the name
    images, labels = valid_set

    # use the training set to compute the kernel bandwidth sigma
    X_train, z_train = sample_digits(digits, images_test, labels_test,
                                     num_points)
    n, _ = X_train.shape
    sigma = np.sqrt(sum([np.linalg.norm(X_train[i] - X_train[j])**2
                         for i in range(n) for j in range(n)]) / (n**2))
    print sigma
    print

    table = []
    init = 'k-means++'
    for i in range(num_experiments):
        this_experiment = []

        # now cluster on the validation set
        X, z = sample_digits(digits, images, labels, num_points)
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x, y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x, y: rho_rbf(x, y, sigma))

        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G, z, init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_half, z, init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_exp, z, init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_rbf, z, init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.spectral(k, X, G_rbf, z, run_times=run_times))
        this_experiment.append(
            run_clustering.kmeans(k, X, z, init="k-means++",
                                  run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k, X, z, init="kmeans", run_times=run_times))
        # my gmm was breaking for some unknown reason
        #this_experiment.append(energy.metric.accuracy(z, energy.gmm.gmm(k, X)))
        table.append(this_experiment)

    table = np.array(table)
    for i in range(table.shape[1]):
        print table[:, i].mean(), scipy.stats.sem(table[:, i])
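# sample_digits is defined elsewhere in the repository. A minimal sketch
# matching the call above -- assuming num_points means points per digit
# class, which is a guess:
import numpy as np

def sample_digits(digits, images, labels, num_points):
    X, z = [], []
    for digit in digits:
        idx = np.random.choice(np.where(labels == digit)[0], num_points,
                               replace=False)
        X.append(images[idx])
        z.append(labels[idx])
    return np.concatenate(X), np.concatenate(z)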
def gauss_dimensions_cov(dimensions=range(2, 100, 20), total_points=200,
                         num_experiments=100, d=10):
    """High dimensions but with nontrivial covariance."""
    k = 2
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for l in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            # fixed diagonal entries, drawn once from Uniform(1, 5)
            s2_1 = np.array([1.367, 3.175, 3.247, 4.403, 1.249,
                             1.969, 4.035, 4.237, 2.813, 3.637])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # the number of runs per experiment and the initialization
            # can be changed here
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                            init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                            init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z,
                                            run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                            init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                            init="kmeans", run_times=5)
            count += 1
    return table
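# Hypothetical driver (not part of the original file): summarize the table
# returned by gauss_dimensions_mean or gauss_dimensions_cov. Column 0
# holds the ambient dimension D; columns 1-5 hold the accuracies of
# energy_hartigan, energy_lloyd, spectral, k-means and GMM.
def summarize_by_dimension(table):
    for D in np.unique(table[:, 0]):
        rows = table[table[:, 0] == D]
        print int(D), [round(rows[:, j].mean(), 3) for j in range(1, 6)]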
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0
    for p in num_points:
        for i in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            # mixture weights; written with float division so they stay
            # correct under Python 2 integer division
            pi1 = (N - p) / (2. * N)
            pi2 = (N + p) / (2. * N)
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                            init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                            init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z,
                                            run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                            init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                            init="kmeans", run_times=5)
            count += 1
    return table
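# Sanity check (hypothetical helper, not in the original file): the
# weights above satisfy pi1 + pi2 = 1 and pi2 - pi1 = p/N, so p sweeps the
# expected split from 150/150 at p=0 to 65/235 at p=170.
def expected_cluster_sizes(N=300, num_points=range(0, 180, 10)):
    for p in num_points:
        pi1 = (N - p) / (2. * N)
        pi2 = (N + p) / (2. * N)
        print p, pi1 * N, pi2 * N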
def normal_or_lognormal_difference(numpoints=range(10, 100, 10),
                                   num_experiments=100, kind='normal'):
    k = 2
    table = []
    for n in numpoints:
        for i in range(num_experiments):
            this_res = [n]

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5 * np.eye(D)
            m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])
            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2],
                                                [n1, n2])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2],
                                                   [n1, n2])

            # bounded semimetric induced by an exponential kernel
            rho = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
            G = eclust.kernel_matrix(X, rho)

            hart = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            lloyd = run_clustering.energy_lloyd(k, X, G, z,
                                        init="k-means++", run_times=5)
            spectral = run_clustering.spectral(k, X, G, z, run_times=5)

            # store accuracy differences relative to Hartigan's method
            this_res.append(hart - lloyd)
            this_res.append(hart - spectral)
            table.append(this_res)
    table = np.array(table)
    return table
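# data.multivariate_lognormal is not shown in this file. A sketch under
# the usual construction (elementwise exponential of multivariate normal
# draws) -- an assumption about the repository's implementation:
import numpy as np

def multivariate_lognormal(means, covs, sizes):
    X = np.concatenate([np.exp(np.random.multivariate_normal(m, s, n))
                        for m, s, n in zip(means, covs, sizes)])
    z = np.concatenate([j * np.ones(n, dtype=int)
                        for j, n in enumerate(sizes)])
    return X, z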
# drop observations with missing entries (marked '?')
delete_missing = np.where(data == '?')[0]
data = np.delete(data, delete_missing, axis=0)
data = np.array(data, dtype=float)
true_labels = np.delete(true_labels, delete_missing, axis=0)

# normalize data
data = (data - data.mean(axis=0)) / data.std(axis=0)

G = energy.eclust.kernel_matrix(data, rho)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

kmeans_labels = cluster.kmeans(6, data, run_times=10, init="k-means++")
gmm_labels = cluster.gmm(6, data, run_times=10, init="kmeans")
spectral_labels = cluster.spectral(6, data, G, run_times=10)
energy_spectral_labels = cluster.energy_spectral(6, data, G, run_times=10)
lloyd_labels = cluster.energy_lloyd(6, data, G, run_times=10,
                                    init="spectral")
hart_labels = cluster.energy_hartigan(6, data, G, run_times=10,
                                      init="spectral")

t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand', 'Mutual Info',
                 'V-Measure', 'Fowlkes-Mallows'])
algos = ['kmeans', 'GMM', 'spectral', 'energy_spectral', 'energy_lloyd',
         'energy_hartigan']
pred_labels = [kmeans_labels, gmm_labels, spectral_labels,
               energy_spectral_labels, lloyd_labels, hart_labels]
# the original snippet is cut off inside t.add_row; the remaining columns
# are inferred from the PrettyTable headers, and computing them with
# sklearn.metrics is an assumption
from sklearn import metrics
for algo, pred_label in zip(algos, pred_labels):
    t.add_row([algo, energy.metric.accuracy(true_labels, pred_label),
               metrics.adjusted_rand_score(true_labels, pred_label),
               metrics.normalized_mutual_info_score(true_labels, pred_label),
               metrics.v_measure_score(true_labels, pred_label),
               metrics.fowlkes_mallows_score(true_labels, pred_label)])
print t