# Shift every label down by one (presumably 1-based -> 0-based class ids;
# confirm against the loader, which is outside this excerpt).
z = z - 1

# Shuffle samples and labels with one shared permutation.  The permutation is
# drawn with np.random.permutation because np.random.shuffle mutates its
# argument in place, and a Python 3 `range` object does not support item
# assignment (np.random.shuffle(range(n)) raises TypeError there).
idx = np.random.permutation(len(data))
data = data[idx]
z = z[idx]

# Standardize along axis 0: subtract the mean and divide by the standard
# deviation.
data = (data - data.mean(axis=0))/data.std(axis=0)

# Bandwidth: sigma^2 is the mean squared Euclidean distance over all ordered
# pairs of rows (the x-with-itself pairs are included and contribute zero).
sigma2 = sum([np.linalg.norm(x-y)**2 for x in data for y in data])/(len(data)**2)
sigma = np.sqrt(sigma2)

# Two candidate functions for the kernel matrix, both scaled by sigma.
rho_exp = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/(2*sigma))
rho_gauss = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)**2/(2*(sigma)**2))

# NOTE(review): `rho` is not assigned anywhere in this excerpt; it has to come
# from earlier in the file, otherwise this line raises NameError.  The
# commented-out alternatives use the two functions defined just above.
G = eclust.kernel_matrix(data, rho)
#G = eclust.kernel_matrix(data, rho_gauss)
#G = eclust.kernel_matrix(data, rho_exp)

# Run every algorithm with k = 3 and 5 runs each; results are collected in r
# in call order.
k = 3
r = []
r.append(wrapper.kmeans(k, data, run_times=5))
r.append(wrapper.gmm(k, data, run_times=5))
r.append(wrapper.spectral_clustering(k, data, G, run_times=5))
r.append(wrapper.spectral(k, data, G, run_times=5))
r.append(wrapper.kernel_kmeans(k, data, G, run_times=5, ini='random'))
#r.append(wrapper.kernel_kmeans(k, data, G, run_times=5, ini='k-means++'))
#r.append(wrapper.kernel_kmeans(k, data, G, run_times=5, ini='spectral'))
r.append(wrapper.kernel_kgroups(k,data,G,run_times=5, ini='random'))
#r.append(wrapper.kernel_kgroups(k,data,G,run_times=5, ini='k-means++'))
if __name__ == "__main__":
    # Demo run: compare the kernel-based methods on a two-class Gaussian
    # sample and collect their accuracies in a PrettyTable.
    import data
    import metric
    from prettytable import PrettyTable
    import sys

    # n points in d dimensions, split between the two classes by a
    # multinomial draw with equal probabilities.
    n, d = 400, 10
    n1, n2 = np.random.multinomial(n, [1/2, 1/2])

    # Means 0 and 0.7 in every coordinate; both covariances are the identity
    # (s1 and s2 are the same array object, as in a chained assignment).
    m1, m2 = np.zeros(d), 0.7*np.ones(d)
    s1 = s2 = np.eye(d)
    X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

    # Matrix built from the plain Euclidean distance; W is the identity.
    G = eclust.kernel_matrix(X, lambda x, y: np.linalg.norm(x-y))
    W = np.eye(n)
    k = 2

    t = PrettyTable(["Method", "Accuracy"])

    # The two kernel methods take identical arguments, so run them in turn.
    for label, method in (("Kernel k-means", kernel_kmeans),
                          ("Kernel k-groups", kernel_kgroups)):
        zh = method(k, X, G, W, run_times=5, ini="k-means++")
        a = metric.accuracy(z, zh)
        t.add_row([label, a])

    # Spectral method: no `ini` argument.
    zh = spectral(k, X, G, W, run_times=5)
    a = metric.accuracy(z, zh)
def generate_data(D):
    """Draw a two-class Gaussian sample in D dimensions.

    The first class has zero mean; the second has mean 0.7 in its first 10
    coordinates and 0 in the remaining D - 10.  Both covariances are the
    D x D identity.  Class sizes come from a multinomial draw of the
    module-level `total_points` with equal probabilities.

    Returns the pair (X, z) produced by data.multivariate_normal.
    """
    d = 10
    zero_mean = np.zeros(D)
    cov_first = np.eye(D)
    shifted_mean = np.concatenate((0.7 * np.ones(d), np.zeros(D - d)))
    cov_second = np.eye(D)
    n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
    X, z = data.multivariate_normal(
        [zero_mean, shifted_mean], [cov_first, cov_second], [n1, n2])
    return X, z


# Each entry of r is [method name, dimension, accuracy].
r = []
for _ in range(num_experiments):
    for dim in dimensions:
        X, z = generate_data(dim)
        G = eclust.kernel_matrix(X, lambda x, y: np.linalg.norm(x - y))

        zh = wrapper.kmeans(k, X)
        r.append(['k-means', dim, metric.accuracy(z, zh)])

        zh = wrapper.gmm(k, X)
        r.append(['gmm', dim, metric.accuracy(z, zh)])

        zh = wrapper.spectral_clustering(k, X, G)
        r.append(['spectral clustering', dim, metric.accuracy(z, zh)])

        zh = wrapper.kernel_kmeans(k, X, G)
        a = metric.accuracy(z, zh)
# Covariances for the Gaussian alternative; within this excerpt only the
# commented-out multivariate_normal call refers to them.
s1 = np.array([[1, 0], [0, 20]])
s2 = np.array([[1, 0], [0, 20]])

# Arguments for data.circles: the two values r1, r2 and a shared eps.
r1, r2 = 1, 3
eps = 0.2

r = []
for _ in range(num_experiments):
    n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])

    #X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
    X, z = data.circles([r1, r2], [eps, eps], [n1, n2])

    # Active choice: 2 - 2*exp(-||x - y||^2 / 2).  The commented variant
    # uses the unsquared norm divided by 4.
    #G = eclust.kernel_matrix(X,
    #    lambda x, y: 2 - 2*np.exp(-np.linalg.norm(x-y)/2/2))
    G = eclust.kernel_matrix(
        X, lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y)**2 / 2 / 1))

    # Baselines, run in this order; each accuracy is appended to `row`.
    baselines = (
        (wrapper.kmeans, (k, X)),
        (wrapper.gmm, (k, X)),
        (wrapper.spectral_clustering, (k, X, G)),
    )
    row = []
    for method, args in baselines:
        zh = method(*args)
        a = metric.accuracy(z, zh)
        row.append(a)

    zh = wrapper.kernel_kmeans(k, X, G, ini='random')
m1 = np.zeros(D) s1 = 0.5*np.eye(D) m2 = 0.5*np.concatenate((np.ones(d), np.zeros(D-d))) s2 = np.eye(D) n1, n2 = np.random.multinomial(n, [0.5, 0.5]) if distr_type == 'normal': X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2]) elif distr_type == 'lognormal': X, z = data.multivariate_lognormal([m1, m2], [s1, s2], [n1, n2]) return X, z r = [] for _ in range(num_experiments): for n in num_points: X, z = generate_data(n) G1 = eclust.kernel_matrix(X, lambda x, y: np.linalg.norm(x-y)) G2 = eclust.kernel_matrix(X, lambda x, y: np.power(np.linalg.norm(x-y), 0.5)) G3 = eclust.kernel_matrix(X, lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/2)) zh = wrapper.kmeans(k, X) a = metric.accuracy(z, zh) r.append(['k-means', n, a]) zh = wrapper.gmm(k, X) a = metric.accuracy(z, zh) r.append(['gmm', n, a]) zh = wrapper.spectral_clustering(k, X, G3) a = metric.accuracy(z, zh)
def rho_gauss(x, y):
    """Return 2 - 2*exp(-||x - y||^2 / 2), i.e. the Gaussian form with sigma = 1."""
    return 2-2*np.exp(-np.linalg.norm(x-y)**2/(2*(1)**2))


# Alternative inputs, currently disabled:
#df = pd.read_csv('data/wdbc.data', sep=',', header=None)
#df = pd.read_csv('data/iris.data', sep=',', header=None)
#classes = {
#    'Iris-setosa': 0,
#    'Iris-versicolor': 1,
#    'Iris-virginica': 2
#}

# NOTE(review): `df` and `classes` are only assigned in the commented-out
# lines above; within this excerpt nothing defines them, so they must come
# from earlier in the file -- confirm.
# Column 4 holds the labels: map them through `classes`, then drop the column
# so that only the remaining columns are kept as the data matrix.
z = np.array([classes[v] for v in df[4].values])
df = df.drop(4, axis=1)
data = df.values

# Standardize along axis 0.
col_mean = data.mean(axis=0)
col_std = data.std(axis=0)
data = (data - col_mean)/col_std

G = eclust.kernel_matrix(data, rho_gauss)

# k clusters, nt runs per algorithm; r holds the results in call order.
k = 3
nt = 5
r = [
    wrapper.kmeans(k, data, run_times=nt),
    wrapper.gmm(k, data, run_times=nt),
    wrapper.spectral_clustering(k, data, G, run_times=nt),
    wrapper.spectral(k, data, G, run_times=nt),
    wrapper.kernel_kmeans(k, data, G, run_times=nt, ini='spectral'),
    wrapper.kernel_kgroups(k, data, G, run_times=nt, ini='spectral'),
]

# Report table and the display names, listed in the same order as r.
t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand'])
algos = [
    'kmeans',
    'GMM',
    'spectral clustering',
    'spectral',
    'kernel k-means',
    'kernel k-groups',
]
s1 = 0.5 * np.eye(D) m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d))) s2 = np.eye(D) n1, n2 = np.random.multinomial(n, [0.5, 0.5]) if distr_type == 'normal': X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2]) elif distr_type == 'lognormal': X, z = data.multivariate_lognormal([m1, m2], [s1, s2], [n1, n2]) return X, z r = [] for _ in range(num_experiments): for n in num_points: X, z = generate_data(n) G1 = eclust.kernel_matrix(X, lambda x, y: np.linalg.norm(x - y)) G2 = eclust.kernel_matrix( X, lambda x, y: np.power(np.linalg.norm(x - y), 0.5)) G3 = eclust.kernel_matrix( X, lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)) zh = wrapper.kmeans(k, X) a = metric.accuracy(z, zh) r.append(['k-means', n, a]) zh = wrapper.gmm(k, X) a = metric.accuracy(z, zh) r.append(['gmm', n, a]) zh = wrapper.spectral_clustering(k, X, G3) a = metric.accuracy(z, zh)
# One row per experiment, one column per method.  Columns 0-3 are filled in
# this excerpt: 1D energy clustering, Hartigan from the k-means++ start,
# Hartigan from the spectral start, and standard k-means.
table = np.zeros((num_experiments, 5))
for i in range(num_experiments):
    X, z = data.univariate_lognormal([0, -1.5], [0.3, 1.5], [100, 100])
    #X, z = data.univariate_normal([0, 5], [1, 22], [15, 15])

    # Wrap every scalar sample in its own row so the multivariate routines
    # receive a two-dimensional array.
    Y = np.array([[x] for x in X])
    k = 2

    # 1D energy clustering works on the raw samples directly.
    zh, cost = two_clusters1D(X)
    table[i, 0] = accuracy(z, zh)

    # Two initializations: k-means++ labels, and spectral labels computed
    # from the Euclidean-distance matrix G.
    z0 = initialization.kmeanspp(k, Y, ret='labels')
    Z0 = eclust.ztoZ(z0)
    rho = lambda x, y: np.linalg.norm(x-y)
    G = eclust.kernel_matrix(Y, rho)
    z1 = initialization.spectral(k, G)
    Z1 = eclust.ztoZ(z1)

    # Hartigan's method, once from each initialization.
    for column, Z_init in ((1, Z0), (2, Z1)):
        zh = eclust.energy_hartigan(k, G, Z_init)
        table[i, column] = accuracy(z, zh)

    # Standard k-means on the same samples.
    km = KMeans(2)
    zh = km.fit_predict(Y)
    table[i, 3] = accuracy(z, zh)
# Draw 2000 row indices and keep the matching samples and labels.
# NOTE(review): np.random.choice samples with replacement by default, so the
# same row can be picked more than once -- confirm this is intended.
idx = np.random.choice(range(len(data)), 2000)
data = data[idx]
z = z[idx]

# normalize data: subtract the mean and divide by the standard deviation
# along axis 0.
col_mean = data.mean(axis=0)
col_std = data.std(axis=0)
data = (data - col_mean) / col_std

# Bandwidth: sigma^2 is the mean squared Euclidean distance over all ordered
# pairs of rows (the x-with-itself pairs are included and contribute zero).
sigma2 = sum([np.linalg.norm(x - y)**2
              for x in data for y in data]) / (len(data)**2)
sigma = np.sqrt(sigma2)


def rho_exp(x, y):
    """Return 2 - 2*exp(-||x - y|| / (2*sigma))."""
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) / (2 * sigma))


def rho_gauss(x, y):
    """Return 2 - 2*exp(-||x - y||^2 / (2*sigma^2))."""
    return 2 - 2 * np.exp(-np.linalg.norm(x - y)**2 / (2 * (sigma)**2))


G = eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

# Three clusters, five runs per algorithm; the disabled entries are the
# other initializations of the two kernel methods.
r = [
    wrapper.kmeans(3, data, run_times=5),
    wrapper.gmm(3, data, run_times=5),
    wrapper.spectral_clustering(3, data, G, run_times=5),
    wrapper.spectral(3, data, G, run_times=5),
    #wrapper.kernel_kmeans(3, data, G, run_times=5, ini='random'),
    #wrapper.kernel_kmeans(3, data, G, run_times=5, ini='k-means++'),
    wrapper.kernel_kmeans(3, data, G, run_times=5, ini='spectral'),
    #wrapper.kernel_kgroups(3, data, G, run_times=5, ini='random'),
    #wrapper.kernel_kgroups(3, data, G, run_times=5, ini='k-means++'),
    wrapper.kernel_kgroups(3, data, G, run_times=5, ini='spectral'),
]
# Covariances for the Gaussian alternative; within this excerpt only the
# commented-out multivariate_normal call refers to them.
s1 = np.array([[1,0], [0,20]])
s2 = np.array([[1,0], [0,20]])

# Arguments for data.circles: the two values r1, r2 and a shared eps.
r1, r2, eps = 1, 3, 0.2


def _gauss_rho(x, y):
    """Return 2 - 2*exp(-||x - y||^2 / 2)."""
    return 2 - 2*np.exp(-np.linalg.norm(x-y)**2/2/1)


r = []
for _ in range(num_experiments):
    n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])

    #X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
    X, z = data.circles([r1, r2], [eps, eps], [n1, n2])

    # Disabled alternative: unsquared norm divided by 4.
    #G = eclust.kernel_matrix(X,
    #    lambda x, y: 2 - 2*np.exp(-np.linalg.norm(x-y)/2/2))
    G = eclust.kernel_matrix(X, _gauss_rho)

    # Accuracies are gathered in `row`, one per method, in call order.
    zh = wrapper.kmeans(k, X)
    a = metric.accuracy(z, zh)
    row = [a]

    zh = wrapper.gmm(k, X)
    a = metric.accuracy(z, zh)
    row += [a]

    zh = wrapper.spectral_clustering(k, X, G)
    a = metric.accuracy(z, zh)
    row += [a]

    zh = wrapper.kernel_kmeans(k, X, G, ini='random')