def test_initialise():
    """Check the attributes KMeans.initialise sets up for a fixed seed."""
    data = [[1, 2, 3], [4, 5, 6]]
    observed = [[0, 1, 1], [1, 0, 1]]
    model = KMeans(data, observed, 2)
    model.initialise(0)

    # Expected bounds equal the min/max of the entries flagged 1 in each column.
    assert numpy.array_equal([4.0, 2.0, 3.0], model.mins)
    assert numpy.array_equal([4.0, 2.0, 6.0], model.maxs)

    assert numpy.array_equal([[1, 1, 1], [1, 1, 1]], model.mask_centroids)
    assert numpy.array_equal([-1, -1], model.cluster_assignments)

    # Centroid values expected when initialising with seed 0.
    expected_centroids = [
        [4.0, 2.0, 4.2617147424925346],
        [4.0, 2.0, 4.2148024123512426],
    ]
    assert numpy.array_equal(expected_centroids, model.centroids)
    assert numpy.array_equal([0, 0], model.distances)
def test_assignment():
    """Check that KMeans.assignment reports whether any assignment changed."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    # Each scenario is (starting assignments, expected return value).
    # The new closest clusters are [0,0,1] - see test_closest_cluster - so only
    # the first scenario is expected to report a change.
    scenarios = [
        ([0, 1, 1], True),
        ([0, 0, 1], False),
    ]
    for start_assignments, expected_change in scenarios:
        model.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
        model.mask_centroids = [[0, 1, 1], [1, 1, 0]]
        model.cluster_assignments = start_assignments

        changed = model.assignment()

        assert changed == expected_change
        assert numpy.array_equal([0, 0, 1], model.cluster_assignments)
        assert numpy.array_equal([[0, 1], [2]], model.data_point_assignments)
def test_initialise():
    """Check the attributes KMeans.initialise sets up for a fixed seed."""
    data = [[1, 2, 3], [4, 5, 6]]
    observed = [[0, 1, 1], [1, 0, 1]]
    model = KMeans(data, observed, 2)
    model.initialise(0)

    # Expected bounds equal the min/max of the entries flagged 1 in each column.
    assert numpy.array_equal([4.0, 2.0, 3.0], model.mins)
    assert numpy.array_equal([4.0, 2.0, 6.0], model.maxs)

    assert numpy.array_equal([[1, 1, 1], [1, 1, 1]], model.mask_centroids)
    assert numpy.array_equal([-1, -1], model.cluster_assignments)

    # Centroid values expected when initialising with seed 0.
    expected_centroids = [
        [4.0, 2.0, 4.2617147424925346],
        [4.0, 2.0, 4.2148024123512426],
    ]
    assert numpy.array_equal(expected_centroids, model.centroids)
    assert numpy.array_equal([0, 0], model.distances)
def initialise(self, init_S='random', init_FG='random'):
    """Set the starting values of S, F, G and tau.

    Parameters
    ----------
    init_S : {'random', 'exp'}
        'exp' sets S to the elementwise reciprocal of self.lambdaS;
        'random' additionally overwrites every entry with
        exponential_draw(self.lambdaS[k, l]).
    init_FG : {'random', 'exp', 'kmeans'}
        'exp' sets F and G to the reciprocals of self.lambdaF / self.lambdaG;
        'random' overwrites every entry with an exponential_draw of the
        matching lambda; 'kmeans' runs KMeans on self.R (for F, K clusters)
        and on its transpose (for G, L clusters) and uses the resulting
        clustering_results plus 0.2.

    Raises
    ------
    AssertionError
        If either option is not one of the values listed above.
    """
    assert init_S in ['random', 'exp'], \
        "Unknown initialisation option for S: %s. Should be 'random' or 'exp'." % init_S
    assert init_FG in ['random', 'exp', 'kmeans'], \
        "Unknown initialisation option for F,G: %s. Should be 'random', 'exp', or 'kmeans'." % init_FG

    self.S = 1. / self.lambdaS
    if init_S == 'random':
        for k, l in itertools.product(range(0, self.K), range(0, self.L)):
            self.S[k, l] = exponential_draw(self.lambdaS[k, l])

    self.F, self.G = 1. / self.lambdaF, 1. / self.lambdaG
    if init_FG == 'random':
        # F is filled completely before G so the sequence of draws is fixed.
        for i, k in itertools.product(range(0, self.I), range(0, self.K)):
            self.F[i, k] = exponential_draw(self.lambdaF[i, k])
        for j, l in itertools.product(range(0, self.J), range(0, self.L)):
            self.G[j, l] = exponential_draw(self.lambdaG[j, l])
    elif init_FG == 'kmeans':
        # print(...) with a single string prints the same text on Python 2 and 3.
        print("Initialising F using KMeans.")
        kmeans_F = KMeans(self.R, self.M, self.K)
        kmeans_F.initialise()
        kmeans_F.cluster()
        self.F = kmeans_F.clustering_results + 0.2

        print("Initialising G using KMeans.")
        kmeans_G = KMeans(self.R.T, self.M.T, self.L)
        kmeans_G.initialise()
        kmeans_G.cluster()
        self.G = kmeans_G.clustering_results + 0.2

    # tau depends on the F, S, G values set above.
    self.tau = gamma_mode(self.alpha_s(), self.beta_s())
def test_assignment():
    """Check that KMeans.assignment reports whether any assignment changed."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    # Each scenario is (starting assignments, expected return value).
    # The new closest clusters are [0,0,1] - see test_closest_cluster - so only
    # the first scenario is expected to report a change.
    scenarios = [
        ([0, 1, 1], True),
        ([0, 0, 1], False),
    ]
    for start_assignments, expected_change in scenarios:
        model.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
        model.mask_centroids = [[0, 1, 1], [1, 1, 0]]
        model.cluster_assignments = start_assignments

        changed = model.assignment()

        assert changed == expected_change
        assert numpy.array_equal([0, 0, 1], model.cluster_assignments)
        assert numpy.array_equal([[0, 1], [2]], model.data_point_assignments)
def test_create_matrix():
    """Check create_matrix builds the points-by-clusters 0/1 matrix."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    # Points 0 and 2 belong to cluster 1, point 1 to cluster 0.
    model.cluster_assignments = numpy.array([1, 0, 1])
    model.create_matrix()

    expected = [[0, 1], [1, 0], [0, 1]]
    actual = model.clustering_results
    assert numpy.array_equal(expected, actual)
def test_create_matrix():
    """Check create_matrix builds the points-by-clusters 0/1 matrix."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    # Points 0 and 2 belong to cluster 1, point 1 to cluster 0.
    model.cluster_assignments = numpy.array([1, 0, 1])
    model.create_matrix()

    expected = [[0, 1], [1, 0], [0, 1]]
    actual = model.clustering_results
    assert numpy.array_equal(expected, actual)
def test_random_cluster_centroid():
    """Check random_cluster_centroid against the value expected for seed 0."""
    data = [[1, 2, 3], [4, 5, 6]]
    observed = [[0, 1, 1], [1, 0, 1]]
    model = KMeans(data, observed, 2)

    # Bounds are set by hand; only the last coordinate has min != max.
    model.mins = [4.0, 2.0, 3.0]
    model.maxs = [4.0, 2.0, 6.0]

    random.seed(0)
    drawn = model.random_cluster_centroid()

    assert numpy.array_equal([4.0, 2.0, 4.2617147424925346], drawn)
def test_random_cluster_centroid():
    """Check random_cluster_centroid against the value expected for seed 0."""
    data = [[1, 2, 3], [4, 5, 6]]
    observed = [[0, 1, 1], [1, 0, 1]]
    model = KMeans(data, observed, 2)

    # Bounds are set by hand; only the last coordinate has min != max.
    model.mins = [4.0, 2.0, 3.0]
    model.maxs = [4.0, 2.0, 6.0]

    random.seed(0)
    drawn = model.random_cluster_centroid()

    assert numpy.array_equal([4.0, 2.0, 4.2617147424925346], drawn)
def initialise(self, init_S='random', init_FG='random', tauFSG=None):
    """Set the starting parameter values and derived expectations.

    Parameters
    ----------
    init_S : {'exp', 'random'}
        'exp' sets muS to the elementwise reciprocal of self.lambdaS;
        'random' additionally overwrites every entry with
        exponential_draw(self.lambdaS[k, l]).
    init_FG : {'exp', 'random', 'kmeans'}
        'exp' sets muF and muG to the reciprocals of self.lambdaF /
        self.lambdaG; 'random' overwrites every entry with an
        exponential_draw of the matching lambda; 'kmeans' runs KMeans on
        self.R (for muF, K clusters) and on its transpose (for muG, L
        clusters) and uses the resulting clustering_results.
    tauFSG : dict, optional
        May contain 'tauF', 'tauS' and/or 'tauG'; any key that is absent
        falls back to a matrix of ones of shape (I, K), (K, L) and (J, L)
        respectively. Omitting the argument is the same as passing {}.

    Raises
    ------
    AssertionError
        If init_S or init_FG is not one of the values listed above.
    """
    # A None sentinel avoids sharing one dict object between calls.
    if tauFSG is None:
        tauFSG = {}

    self.tauF = tauFSG['tauF'] if 'tauF' in tauFSG else numpy.ones((self.I, self.K))
    self.tauS = tauFSG['tauS'] if 'tauS' in tauFSG else numpy.ones((self.K, self.L))
    self.tauG = tauFSG['tauG'] if 'tauG' in tauFSG else numpy.ones((self.J, self.L))

    assert init_S in ['exp', 'random'], "Unrecognised init option for S: %s." % init_S
    self.muS = 1. / self.lambdaS
    if init_S == 'random':
        for k, l in itertools.product(range(0, self.K), range(0, self.L)):
            self.muS[k, l] = exponential_draw(self.lambdaS[k, l])

    assert init_FG in ['exp', 'random', 'kmeans'], "Unrecognised init option for F,G: %s." % init_FG
    self.muF, self.muG = 1. / self.lambdaF, 1. / self.lambdaG
    if init_FG == 'random':
        # muF is filled completely before muG so the sequence of draws is fixed.
        for i, k in itertools.product(range(0, self.I), range(0, self.K)):
            self.muF[i, k] = exponential_draw(self.lambdaF[i, k])
        for j, l in itertools.product(range(0, self.J), range(0, self.L)):
            self.muG[j, l] = exponential_draw(self.lambdaG[j, l])
    elif init_FG == 'kmeans':
        # print(...) with a single string prints the same text on Python 2 and 3.
        # The clustering results are used as-is (an additive 0.2 offset was
        # disabled in the original code).
        print("Initialising F using KMeans.")
        kmeans_F = KMeans(self.R, self.M, self.K)
        kmeans_F.initialise()
        kmeans_F.cluster()
        self.muF = kmeans_F.clustering_results

        print("Initialising G using KMeans.")
        kmeans_G = KMeans(self.R.T, self.M.T, self.L)
        kmeans_G.initialise()
        kmeans_G.cluster()
        self.muG = kmeans_G.clustering_results

    # Initialise the expectations and variances
    self.expF, self.varF = numpy.zeros((self.I, self.K)), numpy.zeros((self.I, self.K))
    self.expS, self.varS = numpy.zeros((self.K, self.L)), numpy.zeros((self.K, self.L))
    self.expG, self.varG = numpy.zeros((self.J, self.L)), numpy.zeros((self.J, self.L))

    for k in range(0, self.K):
        self.update_exp_F(k)
    for k, l in itertools.product(range(0, self.K), range(0, self.L)):
        self.update_exp_S(k, l)
    for l in range(0, self.L):
        self.update_exp_G(l)

    # Initialise tau using the updates
    self.update_tau()
    self.update_exp_tau()
def test_find_known_coordinate_values():
    """Check find_known_coordinate_values for populated and empty clusters."""

    def build_model(assignments):
        # Fresh data and model per scenario, as in a standalone test case.
        data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
        observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
        model = KMeans(data, observed, 2)
        model.data_point_assignments = numpy.array(assignments)
        return model

    # Each scenario: (points per cluster, expected for cluster 0, expected for cluster 1).
    scenarios = [
        # Normal test case: points 0,1 to cluster 0, point 2 to cluster 1
        ([[0, 1], [2]],
         [[1.0], [2.0, 5.0], []],
         [[7.0], [8.0], [9.0]]),
        # Cluster without any points: points 0,1,2 to cluster 0, none to cluster 1
        ([[0, 1, 2], []],
         [[1.0, 7.0], [2.0, 5.0, 8.0], [9.0]],
         None),
    ]
    for assignments, expected_0, expected_1 in scenarios:
        model = build_model(assignments)
        found_0 = model.find_known_coordinate_values(0)
        found_1 = model.find_known_coordinate_values(1)
        assert numpy.array_equal(expected_0, found_0)
        assert numpy.array_equal(expected_1, found_1)
def test_compute_MSE():
    """Check compute_MSE with non-overlapping and overlapping masks."""
    model = KMeans(numpy.ones((1, 5)), numpy.ones((1, 5)), 1)
    first = [1.0, 2.0, 3.0, 4.0, 5.0]
    second = [5.0, 4.5, 3.0, 2.5, 1.0]

    # Test case: no overlap - no coordinate is flagged in both masks.
    result = model.compute_MSE(first, second, [0, 1, 1, 0, 0], [1, 0, 0, 0, 1])
    assert result == None

    # Overlap on coordinates 1, 2 and 4: differences are 2.5, 0.0 and 4.0.
    expected = (2.5**2 + 4.0**2) / 3.0
    result = model.compute_MSE(first, second, [1, 1, 1, 0, 1], [0, 1, 1, 1, 1])
    assert expected == result
def test_compute_MSE():
    """Check compute_MSE with non-overlapping and overlapping masks."""
    model = KMeans(numpy.ones((1, 5)), numpy.ones((1, 5)), 1)
    first = [1.0, 2.0, 3.0, 4.0, 5.0]
    second = [5.0, 4.5, 3.0, 2.5, 1.0]

    # Test case: no overlap - no coordinate is flagged in both masks.
    result = model.compute_MSE(first, second, [0, 1, 1, 0, 0], [1, 0, 0, 0, 1])
    assert result == None

    # Overlap on coordinates 1, 2 and 4: differences are 2.5, 0.0 and 4.0.
    expected = (2.5**2 + 4.0**2) / 3.0
    result = model.compute_MSE(first, second, [1, 1, 1, 0, 1], [0, 1, 1, 1, 1])
    assert expected == result
def test_find_point_furthest_away():
    """Check find_point_furthest_away after closest_cluster ran on every point."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    model.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    model.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # MSE to centroid 0 vs centroid 1:
    #   point 0: 1.0 vs 1.0 (equal distance)
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    for index in range(3):
        model.closest_cluster(data[index], index, observed[index])

    furthest = model.find_point_furthest_away()
    assert 2 == furthest
def initialise(self, init_S='random', init_FG='random', expo_prior=1.):
    """Set the starting values of the S, F and G matrices.

    Parameters
    ----------
    init_S : {'ones', 'random', 'exponential'}
        How to fill the (K, L) matrix S: all ones, numpy.random.rand
        values, or one exponential_draw(expo_prior) per entry.
    init_FG : {'ones', 'random', 'exponential', 'kmeans'}
        How to fill the (I, K) matrix F and the (J, L) matrix G. The first
        three options work as for S; 'kmeans' runs KMeans on self.R (for
        F, K clusters) and on its transpose (for G, L clusters) and uses
        the resulting clustering_results plus 0.2.
    expo_prior : float
        Argument passed to exponential_draw for the 'exponential' options.

    Raises
    ------
    AssertionError
        If init_S or init_FG is not one of the values listed above.
    """
    assert init_S in ['ones', 'random', 'exponential'], "Unrecognised init option for S: %s." % init_S
    assert init_FG in ['ones', 'random', 'exponential', 'kmeans'], "Unrecognised init option for F,G: %s." % init_FG

    def exponential_matrix(n_rows, n_cols):
        # Filled entry by entry in row-major order so the draw sequence is fixed.
        matrix = numpy.empty((n_rows, n_cols))
        for row, col in itertools.product(xrange(0, n_rows), xrange(0, n_cols)):
            matrix[row, col] = exponential_draw(expo_prior)
        return matrix

    # S is always initialised before F and G, which fixes the order of random draws.
    if init_S == 'exponential':
        self.S = exponential_matrix(self.K, self.L)
    elif init_S == 'random':
        self.S = numpy.random.rand(self.K, self.L)
    elif init_S == 'ones':
        self.S = numpy.ones((self.K, self.L))

    if init_FG == 'kmeans':
        print("Initialising F using KMeans.")
        row_clusterer = KMeans(self.R, self.M, self.K)
        row_clusterer.initialise()
        row_clusterer.cluster()
        self.F = row_clusterer.clustering_results + 0.2

        print("Initialising G using KMeans.")
        column_clusterer = KMeans(self.R.T, self.M.T, self.L)
        column_clusterer.initialise()
        column_clusterer.cluster()
        self.G = column_clusterer.clustering_results + 0.2
    elif init_FG == 'exponential':
        self.F = exponential_matrix(self.I, self.K)
        self.G = exponential_matrix(self.J, self.L)
    elif init_FG == 'random':
        self.F = numpy.random.rand(self.I, self.K)
        self.G = numpy.random.rand(self.J, self.L)
    elif init_FG == 'ones':
        self.F = numpy.ones((self.I, self.K))
        self.G = numpy.ones((self.J, self.L))
def test_find_point_furthest_away():
    """Check find_point_furthest_away after closest_cluster ran on every point."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    model.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    model.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # MSE to centroid 0 vs centroid 1:
    #   point 0: 1.0 vs 1.0 (equal distance)
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    for index in range(3):
        model.closest_cluster(data[index], index, observed[index])

    furthest = model.find_point_furthest_away()
    assert 2 == furthest
def test_closest_cluster():
    """Check closest_cluster's return value and the distances it records."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    model.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    model.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # MSE to centroid 0 vs centroid 1:
    #   point 0: 1.0 vs 1.0 (equal distance, cluster 0 expected)
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    expected_closest = [0, 0, 1]
    found_closest = [
        model.closest_cluster(data[index], index, observed[index])
        for index in range(3)
    ]
    for wanted, found in zip(expected_closest, found_closest):
        assert wanted == found

    # Also test whether the distances are set correctly
    recorded = model.distances
    assert numpy.array_equal([1.0, 4.0, 37.0], recorded)

    # Test when all MSEs return None (impossible but still testing behaviour)
    model.centroids = numpy.ones((2, 3))
    model.mask_centroids = [[0, 0, 1], [0, 0, 0]]
    fallback = model.closest_cluster(data[0], 0, observed[0])
    assert 1 == fallback
def test_closest_cluster():
    """Check closest_cluster's return value and the distances it records."""
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    model = KMeans(data, observed, 2)

    model.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    model.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # MSE to centroid 0 vs centroid 1:
    #   point 0: 1.0 vs 1.0 (equal distance, cluster 0 expected)
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    expected_closest = [0, 0, 1]
    found_closest = [
        model.closest_cluster(data[index], index, observed[index])
        for index in range(3)
    ]
    for wanted, found in zip(expected_closest, found_closest):
        assert wanted == found

    # Also test whether the distances are set correctly
    recorded = model.distances
    assert numpy.array_equal([1.0, 4.0, 37.0], recorded)

    # Test when all MSEs return None (impossible but still testing behaviour)
    model.centroids = numpy.ones((2, 3))
    model.mask_centroids = [[0, 0, 1], [0, 0, 0]]
    fallback = model.closest_cluster(data[0], 0, observed[0])
    assert 1 == fallback
def test_find_known_coordinate_values():
    """Check find_known_coordinate_values for populated and empty clusters."""

    def build_model(assignments):
        # Fresh data and model per scenario, as in a standalone test case.
        data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
        observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
        model = KMeans(data, observed, 2)
        model.data_point_assignments = numpy.array(assignments)
        return model

    # Each scenario: (points per cluster, expected for cluster 0, expected for cluster 1).
    scenarios = [
        # Normal test case: points 0,1 to cluster 0, point 2 to cluster 1
        ([[0, 1], [2]],
         [[1.0], [2.0, 5.0], []],
         [[7.0], [8.0], [9.0]]),
        # Cluster without any points: points 0,1,2 to cluster 0, none to cluster 1
        ([[0, 1, 2], []],
         [[1.0, 7.0], [2.0, 5.0, 8.0], [9.0]],
         None),
    ]
    for assignments, expected_0, expected_1 in scenarios:
        model = build_model(assignments)
        found_0 = model.find_known_coordinate_values(0)
        found_1 = model.find_known_coordinate_values(1)
        assert numpy.array_equal(expected_0, found_0)
        assert numpy.array_equal(expected_1, found_1)
def initialise(self, init_S='random', init_FG='random', tauFSG=None):
    """Set the starting parameter values and derived expectations.

    Parameters
    ----------
    init_S : {'exp', 'random'}
        'exp' sets muS to the elementwise reciprocal of self.lambdaS;
        'random' additionally overwrites every entry with
        exponential_draw(self.lambdaS[k, l]).
    init_FG : {'exp', 'random', 'kmeans'}
        'exp' sets muF and muG to the reciprocals of self.lambdaF /
        self.lambdaG; 'random' overwrites every entry with an
        exponential_draw of the matching lambda; 'kmeans' runs KMeans on
        self.R (for muF, K clusters) and on its transpose (for muG, L
        clusters) and uses the resulting clustering_results.
    tauFSG : dict, optional
        May contain 'tauF', 'tauS' and/or 'tauG'; any key that is absent
        falls back to a matrix of ones of shape (I, K), (K, L) and (J, L)
        respectively. Omitting the argument is the same as passing {}.

    Raises
    ------
    AssertionError
        If init_S or init_FG is not one of the values listed above.
    """
    # A None sentinel avoids sharing one dict object between calls.
    if tauFSG is None:
        tauFSG = {}

    self.tauF = tauFSG['tauF'] if 'tauF' in tauFSG else numpy.ones((self.I, self.K))
    self.tauS = tauFSG['tauS'] if 'tauS' in tauFSG else numpy.ones((self.K, self.L))
    self.tauG = tauFSG['tauG'] if 'tauG' in tauFSG else numpy.ones((self.J, self.L))

    assert init_S in ['exp', 'random'], "Unrecognised init option for S: %s." % init_S
    self.muS = 1. / self.lambdaS
    if init_S == 'random':
        for k, l in itertools.product(range(0, self.K), range(0, self.L)):
            self.muS[k, l] = exponential_draw(self.lambdaS[k, l])

    assert init_FG in ['exp', 'random', 'kmeans'], "Unrecognised init option for F,G: %s." % init_FG
    self.muF, self.muG = 1. / self.lambdaF, 1. / self.lambdaG
    if init_FG == 'random':
        # muF is filled completely before muG so the sequence of draws is fixed.
        for i, k in itertools.product(range(0, self.I), range(0, self.K)):
            self.muF[i, k] = exponential_draw(self.lambdaF[i, k])
        for j, l in itertools.product(range(0, self.J), range(0, self.L)):
            self.muG[j, l] = exponential_draw(self.lambdaG[j, l])
    elif init_FG == 'kmeans':
        # print(...) with a single string prints the same text on Python 2 and 3.
        # The clustering results are used as-is (an additive 0.2 offset was
        # disabled in the original code).
        print("Initialising F using KMeans.")
        kmeans_F = KMeans(self.R, self.M, self.K)
        kmeans_F.initialise()
        kmeans_F.cluster()
        self.muF = kmeans_F.clustering_results

        print("Initialising G using KMeans.")
        kmeans_G = KMeans(self.R.T, self.M.T, self.L)
        kmeans_G.initialise()
        kmeans_G.cluster()
        self.muG = kmeans_G.clustering_results

    # Initialise the expectations and variances
    self.expF, self.varF = numpy.zeros((self.I, self.K)), numpy.zeros((self.I, self.K))
    self.expS, self.varS = numpy.zeros((self.K, self.L)), numpy.zeros((self.K, self.L))
    self.expG, self.varG = numpy.zeros((self.J, self.L)), numpy.zeros((self.J, self.L))

    for k in range(0, self.K):
        self.update_exp_F(k)
    for k, l in itertools.product(range(0, self.K), range(0, self.L)):
        self.update_exp_S(k, l)
    for l in range(0, self.L):
        self.update_exp_G(l)

    # Initialise tau using the updates
    self.update_tau()
    self.update_exp_tau()
def initialise(self, init_S='random', init_FG='random'):
    """Set the starting values of S, F, G and tau.

    Parameters
    ----------
    init_S : {'random', 'exp'}
        'exp' sets S to the elementwise reciprocal of self.lambdaS;
        'random' additionally overwrites every entry with
        exponential_draw(self.lambdaS[k, l]).
    init_FG : {'random', 'exp', 'kmeans'}
        'exp' sets F and G to the reciprocals of self.lambdaF / self.lambdaG;
        'random' overwrites every entry with an exponential_draw of the
        matching lambda; 'kmeans' runs KMeans on self.R (for F, K clusters)
        and on its transpose (for G, L clusters) and uses the resulting
        clustering_results plus 0.2.

    Raises
    ------
    AssertionError
        If either option is not one of the values listed above.
    """
    assert init_S in ['random', 'exp'], \
        "Unknown initialisation option for S: %s. Should be 'random' or 'exp'." % init_S
    assert init_FG in ['random', 'exp', 'kmeans'], \
        "Unknown initialisation option for F,G: %s. Should be 'random', 'exp', or 'kmeans'." % init_FG

    self.S = 1. / self.lambdaS
    if init_S == 'random':
        for k, l in itertools.product(range(0, self.K), range(0, self.L)):
            self.S[k, l] = exponential_draw(self.lambdaS[k, l])

    self.F, self.G = 1. / self.lambdaF, 1. / self.lambdaG
    if init_FG == 'random':
        # F is filled completely before G so the sequence of draws is fixed.
        for i, k in itertools.product(range(0, self.I), range(0, self.K)):
            self.F[i, k] = exponential_draw(self.lambdaF[i, k])
        for j, l in itertools.product(range(0, self.J), range(0, self.L)):
            self.G[j, l] = exponential_draw(self.lambdaG[j, l])
    elif init_FG == 'kmeans':
        # print(...) with a single string prints the same text on Python 2 and 3.
        print("Initialising F using KMeans.")
        kmeans_F = KMeans(self.R, self.M, self.K)
        kmeans_F.initialise()
        kmeans_F.cluster()
        self.F = kmeans_F.clustering_results + 0.2

        print("Initialising G using KMeans.")
        kmeans_G = KMeans(self.R.T, self.M.T, self.L)
        kmeans_G.initialise()
        kmeans_G.cluster()
        self.G = kmeans_G.clustering_results + 0.2

    # tau depends on the F, S, G values set above.
    self.tau = gamma_mode(self.alpha_s(), self.beta_s())
def cluster(R, M, K):
    """Run KMeans on R with mask M and K clusters.

    Builds a KMeans instance, calls initialise() and cluster() on it, and
    returns its clustering_results attribute.
    """
    model = KMeans(R, M, K)
    model.initialise()
    model.cluster()
    results = model.clustering_results
    return results
def test_init():
    """Check KMeans constructor validation and the index sets it builds."""
    full_mask = numpy.ones((2, 3))

    # Each case is (X, M, K, expected assertion message). The first four
    # cover a non-2D X, mismatched sizes and K <= 0; the last two cover a
    # row or column that is entirely unknown.
    invalid_cases = [
        (numpy.ones(3), full_mask, 0,
         "Input matrix X is not a two-dimensional array, but instead 1-dimensional."),
        (numpy.ones((4, 3, 2)), full_mask, 0,
         "Input matrix X is not a two-dimensional array, but instead 3-dimensional."),
        (numpy.ones((3, 2)), full_mask, 0,
         "Input matrix X is not of the same size as the indicator matrix M: (3, 2) and (2, 3) respectively."),
        (numpy.ones((2, 3)), full_mask, 0,
         "K should be greater than 0."),
        (numpy.ones((2, 3)), [[1, 1, 1], [0, 0, 0]], 1,
         "Fully unobserved row in X, row 1."),
        (numpy.ones((2, 3)), [[1, 1, 0], [1, 0, 0]], 1,
         "Fully unobserved column in X, column 2."),
    ]
    for bad_X, bad_M, bad_K, message in invalid_cases:
        with pytest.raises(AssertionError) as error:
            KMeans(bad_X, bad_M, bad_K)
        assert str(error.value) == message

    # Test completely observed case
    data = numpy.ones((2, 3))
    model = KMeans(data, numpy.ones((2, 3)), 1)
    assert numpy.array_equal([[0, 1, 2], [0, 1, 2]], model.omega_rows)
    assert numpy.array_equal([[0, 1], [0, 1], [0, 1]], model.omega_columns)
    assert model.no_points == 2
    assert model.no_coordinates == 3

    # Test partially observed case
    model = KMeans(data, [[1, 0, 1], [0, 1, 1]], 1)
    assert numpy.array_equal([[0, 2], [1, 2]], model.omega_rows)
    assert numpy.array_equal([[0], [1], [0, 1]], model.omega_columns)
def test_cluster():
    """Run KMeans.cluster from fixed starting centroids and check the result."""
    # Each scenario is (X, M, starting centroids, expected centroids,
    # expected cluster assignments, expected data point assignments,
    # expected clustering results). K is 2 throughout.
    scenarios = [
        # No missing values case.
        # Points 1,2 will first go to cluster 2, and point 3 to cluster 1.
        # Then point 1 will switch to cluster 1.
        ([[2, 5], [7, 5], [2, 3]],
         numpy.ones((3, 2)),
         [[2.0, 2.0], [4.0, 5.0]],
         [[2.0, 4.0], [7.0, 5.0]],
         [0, 1, 0],
         [[0, 2], [1]],
         [[1, 0], [0, 1], [1, 0]]),
        # Missing values case.
        # Points 2,3,4 will first go to cluster 2, and point 1 to cluster 1.
        # Then point 2 will switch to cluster 1.
        ([[2, 5], [3, -1], [10, 1], [-1, 2]],
         [[1, 1], [1, 0], [1, 1], [0, 1]],
         [[2.0, 7.0], [3.0, 2.0]],
         [[2.5, 5.0], [10.0, 1.5]],
         [0, 0, 1, 1],
         [[0, 1], [2, 3]],
         [[1, 0], [1, 0], [0, 1], [0, 1]]),
        # Cluster with 0 coordinate.
        # Cluster 1 gets points 1 and 2, cluster 2 gets 3 and 4.
        ([[2, 5], [3, -1], [-1, 1], [-1, 2]],
         [[1, 1], [1, 0], [0, 1], [0, 1]],
         [[2.0, 7.0], [4.0, 4.0]],
         [[2.5, 5.0], [0, 1.5]],
         [0, 0, 1, 1],
         [[0, 1], [2, 3]],
         [[1, 0], [1, 0], [0, 1], [0, 1]]),
    ]
    for (data, observed, start_centroids, want_centroids, want_assignments,
         want_points, want_results) in scenarios:
        model = KMeans(data, observed, 2)
        model.centroids = start_centroids
        model.mask_centroids = numpy.ones((2, 2))
        # Every point starts out unassigned (-1).
        model.cluster_assignments = [-1] * len(data)

        model.cluster()

        assert numpy.array_equal(want_centroids, model.centroids)
        assert numpy.array_equal(want_assignments, model.cluster_assignments)
        assert numpy.array_equal(want_points, model.data_point_assignments)
        assert numpy.array_equal(want_results, model.clustering_results)
def test_update():
    """Check KMeans.update for the normal, 'random' and 'singleton' cases."""

    def fresh_inputs():
        # New arrays per scenario so no state leaks between them.
        data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
        observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
        return data, observed

    # Normal case: points 0,1 to cluster 0, point 2 to cluster 1
    data, observed = fresh_inputs()
    model = KMeans(data, observed, 2)
    model.data_point_assignments = numpy.array([[0, 1], [2]])
    model.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.update()
    assert numpy.array_equal([[1.0, 3.5, 0], [7.0, 8.0, 9.0]], model.centroids)
    assert numpy.array_equal([[1, 1, 0], [1, 1, 1]], model.mask_centroids)

    # Case when one cluster has no points assigned to it - we then randomly
    # re-initialise that cluster.
    # Points 0,1,2 to cluster 0, none to cluster 1.
    data, observed = fresh_inputs()
    model = KMeans(data, observed, 2, 'random')
    model.data_point_assignments = numpy.array([[0, 1, 2], []])
    model.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.mins = [1.0, 2.0, 9.0]
    model.maxs = [7.0, 8.0, 9.0]
    want_centroids = [
        [4.0, 5.0, 9.0],
        [6.066531109150288, 6.547726417641815, 9.0],
    ]
    random.seed(0)
    model.update()
    assert numpy.array_equal(want_centroids, model.centroids)
    assert numpy.array_equal([[1, 1, 1], [1, 1, 1]], model.mask_centroids)

    # Case when we use the 'singleton' option for empty clusters - reassign
    # the point furthest away to the cluster.
    # Points 0 and 1 go to cluster 0, 2 to cluster 1 and none to cluster 2.
    # Point 2 is furthest away, so gets reassigned to cluster 2 - making
    # cluster 1 empty. Then point 1 is furthest away and gets reassigned to
    # cluster 1.
    data, observed = fresh_inputs()
    model = KMeans(data, observed, 3, resolve_empty='singleton')
    model.data_point_assignments = numpy.array([[0, 1], [2], []])
    model.cluster_assignments = [0, 0, 1]
    model.centroids = [[1.0, 2.0, 3.0], [15.0, 16.0, 17.0], [500.0, 500.0, 500.0]]
    model.mask_centroids = [[1, 1, 0], [1, 1, 1], [1, 1, 1]]
    # Distance of every point to the centroid of the cluster it starts in.
    model.distances = numpy.array([
        model.compute_MSE(model.X[point], model.centroids[owner],
                          observed[point], model.mask_centroids[owner])
        for point, owner in [(0, 0), (1, 0), (2, 1)]
    ])
    model.mins = [1.0, 2.0, 9.0]
    model.maxs = [7.0, 8.0, 9.0]
    model.update()
    assert [[0], [1], [2]] == list(model.data_point_assignments)
    assert numpy.array_equal([0, 0, 0], model.distances)
    assert numpy.array_equal([[1.0, 2.0, 0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], model.centroids)
def test_update():
    """Check KMeans.update for the normal, 'random' and 'singleton' cases."""

    def fresh_inputs():
        # New arrays per scenario so no state leaks between them.
        data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
        observed = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
        return data, observed

    # Normal case: points 0,1 to cluster 0, point 2 to cluster 1
    data, observed = fresh_inputs()
    model = KMeans(data, observed, 2)
    model.data_point_assignments = numpy.array([[0, 1], [2]])
    model.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.update()
    assert numpy.array_equal([[1.0, 3.5, 0], [7.0, 8.0, 9.0]], model.centroids)
    assert numpy.array_equal([[1, 1, 0], [1, 1, 1]], model.mask_centroids)

    # Case when one cluster has no points assigned to it - we then randomly
    # re-initialise that cluster.
    # Points 0,1,2 to cluster 0, none to cluster 1.
    data, observed = fresh_inputs()
    model = KMeans(data, observed, 2, 'random')
    model.data_point_assignments = numpy.array([[0, 1, 2], []])
    model.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    model.mins = [1.0, 2.0, 9.0]
    model.maxs = [7.0, 8.0, 9.0]
    want_centroids = [
        [4.0, 5.0, 9.0],
        [6.066531109150288, 6.547726417641815, 9.0],
    ]
    random.seed(0)
    model.update()
    assert numpy.array_equal(want_centroids, model.centroids)
    assert numpy.array_equal([[1, 1, 1], [1, 1, 1]], model.mask_centroids)

    # Case when we use the 'singleton' option for empty clusters - reassign
    # the point furthest away to the cluster.
    # Points 0 and 1 go to cluster 0, 2 to cluster 1 and none to cluster 2.
    # Point 2 is furthest away, so gets reassigned to cluster 2 - making
    # cluster 1 empty. Then point 1 is furthest away and gets reassigned to
    # cluster 1.
    data, observed = fresh_inputs()
    model = KMeans(data, observed, 3, resolve_empty='singleton')
    model.data_point_assignments = numpy.array([[0, 1], [2], []])
    model.cluster_assignments = [0, 0, 1]
    model.centroids = [[1.0, 2.0, 3.0], [15.0, 16.0, 17.0], [500.0, 500.0, 500.0]]
    model.mask_centroids = [[1, 1, 0], [1, 1, 1], [1, 1, 1]]
    # Distance of every point to the centroid of the cluster it starts in.
    model.distances = numpy.array([
        model.compute_MSE(model.X[point], model.centroids[owner],
                          observed[point], model.mask_centroids[owner])
        for point, owner in [(0, 0), (1, 0), (2, 1)]
    ])
    model.mins = [1.0, 2.0, 9.0]
    model.maxs = [7.0, 8.0, 9.0]
    model.update()
    assert [[0], [1], [2]] == list(model.data_point_assignments)
    assert numpy.array_equal([0, 0, 0], model.distances)
    assert numpy.array_equal([[1.0, 2.0, 0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], model.centroids)
def test_cluster():
    """Run KMeans.cluster from fixed starting centroids and check the result."""
    # Each scenario is (X, M, starting centroids, expected centroids,
    # expected cluster assignments, expected data point assignments,
    # expected clustering results). K is 2 throughout.
    scenarios = [
        # No missing values case.
        # Points 1,2 will first go to cluster 2, and point 3 to cluster 1.
        # Then point 1 will switch to cluster 1.
        ([[2, 5], [7, 5], [2, 3]],
         numpy.ones((3, 2)),
         [[2.0, 2.0], [4.0, 5.0]],
         [[2.0, 4.0], [7.0, 5.0]],
         [0, 1, 0],
         [[0, 2], [1]],
         [[1, 0], [0, 1], [1, 0]]),
        # Missing values case.
        # Points 2,3,4 will first go to cluster 2, and point 1 to cluster 1.
        # Then point 2 will switch to cluster 1.
        ([[2, 5], [3, -1], [10, 1], [-1, 2]],
         [[1, 1], [1, 0], [1, 1], [0, 1]],
         [[2.0, 7.0], [3.0, 2.0]],
         [[2.5, 5.0], [10.0, 1.5]],
         [0, 0, 1, 1],
         [[0, 1], [2, 3]],
         [[1, 0], [1, 0], [0, 1], [0, 1]]),
        # Cluster with 0 coordinate.
        # Cluster 1 gets points 1 and 2, cluster 2 gets 3 and 4.
        ([[2, 5], [3, -1], [-1, 1], [-1, 2]],
         [[1, 1], [1, 0], [0, 1], [0, 1]],
         [[2.0, 7.0], [4.0, 4.0]],
         [[2.5, 5.0], [0, 1.5]],
         [0, 0, 1, 1],
         [[0, 1], [2, 3]],
         [[1, 0], [1, 0], [0, 1], [0, 1]]),
    ]
    for (data, observed, start_centroids, want_centroids, want_assignments,
         want_points, want_results) in scenarios:
        model = KMeans(data, observed, 2)
        model.centroids = start_centroids
        model.mask_centroids = numpy.ones((2, 2))
        # Every point starts out unassigned (-1).
        model.cluster_assignments = [-1] * len(data)

        model.cluster()

        assert numpy.array_equal(want_centroids, model.centroids)
        assert numpy.array_equal(want_assignments, model.cluster_assignments)
        assert numpy.array_equal(want_points, model.data_point_assignments)
        assert numpy.array_equal(want_results, model.clustering_results)
def initialise(self, init_S='random', init_FG='random', expo_prior=1.):
    """Set the starting values of the S, F and G matrices.

    Parameters
    ----------
    init_S : {'ones', 'random', 'exponential'}
        How to fill the (K, L) matrix S: all ones, numpy.random.rand
        values, or one exponential_draw(expo_prior) per entry.
    init_FG : {'ones', 'random', 'exponential', 'kmeans'}
        How to fill the (I, K) matrix F and the (J, L) matrix G. The first
        three options work as for S; 'kmeans' runs KMeans on self.R (for
        F, K clusters) and on its transpose (for G, L clusters) and uses
        the resulting clustering_results plus 0.2.
    expo_prior : float
        Argument passed to exponential_draw for the 'exponential' options.

    Raises
    ------
    AssertionError
        If init_S or init_FG is not one of the values listed above.
    """
    assert init_S in ['ones', 'random', 'exponential'], "Unrecognised init option for S: %s." % init_S
    assert init_FG in ['ones', 'random', 'exponential', 'kmeans'], "Unrecognised init option for F,G: %s." % init_FG

    def exponential_matrix(n_rows, n_cols):
        # Filled entry by entry in row-major order so the draw sequence is fixed.
        matrix = numpy.empty((n_rows, n_cols))
        for row, col in itertools.product(xrange(0, n_rows), xrange(0, n_cols)):
            matrix[row, col] = exponential_draw(expo_prior)
        return matrix

    # S is always initialised before F and G, which fixes the order of random draws.
    if init_S == 'exponential':
        self.S = exponential_matrix(self.K, self.L)
    elif init_S == 'random':
        self.S = numpy.random.rand(self.K, self.L)
    elif init_S == 'ones':
        self.S = numpy.ones((self.K, self.L))

    if init_FG == 'kmeans':
        print("Initialising F using KMeans.")
        row_clusterer = KMeans(self.R, self.M, self.K)
        row_clusterer.initialise()
        row_clusterer.cluster()
        self.F = row_clusterer.clustering_results + 0.2

        print("Initialising G using KMeans.")
        column_clusterer = KMeans(self.R.T, self.M.T, self.L)
        column_clusterer.initialise()
        column_clusterer.cluster()
        self.G = column_clusterer.clustering_results + 0.2
    elif init_FG == 'exponential':
        self.F = exponential_matrix(self.I, self.K)
        self.G = exponential_matrix(self.J, self.L)
    elif init_FG == 'random':
        self.F = numpy.random.rand(self.I, self.K)
        self.G = numpy.random.rand(self.J, self.L)
    elif init_FG == 'ones':
        self.F = numpy.ones((self.I, self.K))
        self.G = numpy.ones((self.J, self.L))