def test_seeds(X, K):
    """Compare K-means cost and naive-EM likelihood across seeds 0..4.

    For the given ``K`` every seed is initialised first, then K-means is run
    on each initialisation and its cost printed, and finally naive EM is run
    on the same initialisations and the third element of its result printed
    as the likelihood.

    Args:
        X: data passed through to ``common.init``, ``kmeans.run`` and
            ``naive_em.run``.
        K: number of clusters.
    """
    seed_ids = range(5)
    prefix = "K=" + str(K) + " seed="

    print("\n############## KMEAN K=" + str(K) + " ###############")

    # All initialisations are created before any algorithm runs.
    starts = [common.init(X, K, seed) for seed in seed_ids]

    # K-means: only the cost (third element of the result) is kept.
    costs = [kmeans.run(X, mix, post)[2] for mix, post in starts]
    for seed, cost in zip(seed_ids, costs):
        print(prefix + str(seed) + " : cost=" + str(cost))

    # Naive EM on the very same initial mixtures / posteriors.
    estimates = [naive_em.run(X, mix, post) for mix, post in starts]
    for seed, estimate in zip(seed_ids, estimates):
        print(prefix + str(seed) + " : likelihood=" + str(estimate[2]))
def run_kmeans(X, plot=False):
    """Run K-means for K = 1..4 and report the best of seeds 0..4 for each K.

    For every K the seed with the lowest K-means cost is found, the model is
    re-fitted from that seed, and the result is printed and plotted with a
    title carrying K, the best seed and its cost.

    Args:
        X: data passed through to ``common.init`` and ``kmeans.run``.
        plot: accepted for backward compatibility; currently not consulted
            (the best result is always plotted).
    """
    # Instructor's solution:
    for K in range(1, 5):
        min_cost = None
        best_seed = None
        for seed in range(0, 5):
            mixture, post = common.init(X, K, seed)
            mixture, post, cost = kmeans.run(X, mixture, post)
            if min_cost is None or cost < min_cost:
                min_cost = cost
                best_seed = seed

        # Re-run from the winning seed to recover its mixture and posteriors.
        mixture, post = common.init(X, K, best_seed)
        mixture, post, cost = kmeans.run(X, mixture, post)

        # The original format string had no placeholders, so the values
        # passed to .format() were silently dropped from the title.
        title = "K-means for K={}, seed={}, cost={}".format(
            K, best_seed, min_cost)
        print(title)
        common.plot(X, mixture, post, title)
def run_kmean(X):
    """Fit K-means for K = 1..4 over seeds 0..4 and plot the best seed per K.

    For each K the minimum cost over the five seeds is printed, then the
    model is fitted again from the lowest-cost seed and plotted.

    Args:
        X: data passed through to ``common.init`` and ``kmeans.run``.

    Returns:
        The string ``"Done"``.
    """
    for n_clusters in (1, 2, 3, 4):
        # Cost of the converged K-means run for every seed.
        seed_costs = []
        for s in range(5):
            start_mixture, start_post = common.init(X, n_clusters, s)
            seed_costs.append(kmeans.run(X, start_mixture, start_post)[2])

        print("The cost of {} cluster is".format(n_clusters), min(seed_costs))

        # Refit from the winning seed so its mixture/posteriors can be drawn.
        winner = np.argmin(seed_costs)
        start_mixture, start_post = common.init(X, n_clusters, int(winner))
        fitted_mixture, fitted_post, _ = kmeans.run(
            X, start_mixture, start_post)
        common.plot(X, fitted_mixture, fitted_post,
                    "{} means with seed{}".format(n_clusters, winner))

    return "Done"
def run_kmeans():
    """Plot the lowest-cost K-means fit for each K in 1..4.

    Uses the module-level data ``X``. For every K, seeds 0..4 are tried, the
    seed with the smallest cost is selected (the first one on ties), and the
    model re-fitted from that seed is plotted.
    """
    candidate_seeds = range(0, 5)
    for K in range(1, 5):
        # Cost of a full K-means run from each seed.
        costs = []
        for seed in candidate_seeds:
            start_mixture, start_post = common.init(X, K, seed)
            costs.append(kmeans.run(X, start_mixture, start_post)[2])

        # min() keeps the first element unless a later one is strictly
        # smaller, which is the same rule as a running-minimum loop.
        best_seed = min(candidate_seeds, key=lambda s: costs[s])
        min_cost = costs[best_seed]

        start_mixture, start_post = common.init(X, K, best_seed)
        fitted_mixture, fitted_post, _ = kmeans.run(
            X, start_mixture, start_post)
        common.plot(
            X, fitted_mixture, fitted_post,
            "K-means for K={}, seed={} , cost= {}".format(
                K, best_seed, min_cost))
def run(self):
    """
    Main method that drives Spectral Co-Clustering.

    Steps, in order: record the dimensions of ``self.A``, build ``self.An``
    via ``gen_An()``, take the SVD of the dense form of ``self.An``, build
    ``self.Z`` via ``get_Z()``, and run k-means on the transpose of
    ``self.Z`` (converted to CSC).

    Returns:
        dict with keys ``'centroids'``, ``'centroiddict'``, ``'clusters'``
        and ``'clusterdict'``; the same values are stored on the instance.
    """
    self.nfeatures = self.A.shape[0]
    self.ndocs = self.A.shape[1]
    self.logger.debug("Word By Documentmatrix A has dim:(%d,%d)",
                      self.nfeatures, self.ndocs)

    self.logger.debug("Generating normalized Adjacency Matrix, A_n")
    self.gen_An()

    self.logger.debug("Finding SVD of An")
    un, s, vnt = spla.svd(self.An.todense())
    self.logger.debug('Shape of un (%d,%d)', un.shape[0], un.shape[1])
    vn = vnt.T
    # Log both dimensions (the second one used to be printed twice).
    self.logger.debug('Shape of vn (%d,%d)', vn.shape[0], vn.shape[1])

    self.logger.debug("Generating Z matrix")
    self.get_Z(un, vn)
    data = (self.Z.T).tocsc()

    # The clusterer must not be bound to the name ``kmeans``: assigning to
    # that name inside this method makes it local, and the lookup of the
    # ``kmeans`` module on the right-hand side then raises UnboundLocalError.
    clusterer = kmeans.KMeans(data, self.k, self.n, self.delta, self.rc,
                              self.cl, self.verbose)
    result = clusterer.run()

    self.centroids = result['centroids']
    self.centroid_dict = result['centroiddict']
    self.clusters = result['clusters']
    # Called after self.clusters is set, as in the original statement order.
    self.cluster_dict = self._get_cluster_dict()
    self.logger.debug('Number of co-clusters produced: %d',
                      len(self.clusters))

    return {'centroids': self.centroids,
            'centroiddict': self.centroid_dict,
            'clusters': self.clusters,
            'clusterdict': self.cluster_dict}
def flores_clustering_data_set_run():
    """Cluster the flores ARFF data set with k = 3 and print the outcome.

    Prints the first three elements of the ``kmeans.run`` result, labelled
    as centroids, clusters and iteration count respectively.
    """
    # Close the data file deterministically instead of leaking the handle.
    with open('./data/flores_clustering.arff') as arff_file:
        data_set = arff.load(arff_file)

    k = 3
    results = kmeans.run(data_set['data'], k)

    # Single-argument print(...) yields identical output on Python 2 and 3.
    print("Centroids")
    print(results[0])
    print("Clusters")
    print(results[1])
    print("Resolved in " + str(results[2]) + " iterations")
def clusteredThreeD():
    """Run k-means on synthetic 3-feature data drawn around 4 random centres.

    Centres are sampled uniformly from [0, 100); around each one, 50 points
    are drawn from a normal distribution with standard deviation 12. The
    stacked samples are handed to ``kmeans.run`` together with the number of
    clusters.
    """
    sample_size = 200
    n_features = 3
    n_clusters = 4
    spread = 12
    points_per_cluster = int(sample_size / n_clusters)

    centers = np.random.uniform(0, 100, (n_clusters, n_features))

    # One block of samples per centre, drawn in centre order.
    blobs = [
        np.random.normal(center, spread, (points_per_cluster, n_features))
        for center in centers
    ]

    # Stack onto an empty (0, n_features) array, row-wise.
    X = np.concatenate([np.zeros((0, n_features))] + blobs, axis=0)

    kmeans.run(X, n_clusters)
def test_kmeans():
    """Fit K-means for k = 1..4 over seeds 0..4 and plot the best fit per k.

    Uses the module-level data ``X``. For every k the run with the LOWEST
    cost is selected and plotted (the previous code selected the highest
    cost, i.e. the worst clustering).

    Returns:
        (mixture, post) of the best run for the last k tried.
    """
    best_para = None
    for k in [1, 2, 3, 4]:
        para_list = []
        for seed in [0, 1, 2, 3, 4]:
            gm, post = common.init(X, k, seed)
            mixture, p, cost = kmeans.run(X, gm, post)
            para_list.append((mixture, p, cost))

        # K-means cost is minimised, so the best seed has the smallest cost.
        best_para = min(para_list, key=lambda x: x[2])
        common.plot(X, best_para[0], best_para[1],
                    'Kmeans on toy data with {k}'.format(k=k))

    return best_para[0], best_para[1]
def execute(trial = False):
    """Cluster accident coordinates with k-means and store the result.

    Reads every document of ``accidents_collection``, seeds k-means with the
    two opposite corners of the coordinates' bounding box, runs
    ``kmeans.run`` and replaces the contents of ``clusters_collection`` with
    the returned points.

    Args:
        trial: when true, work on a random sample of at most 200 records.

    Returns:
        dict with the ``"start"`` and ``"end"`` timestamps of the run.

    Raises:
        ValueError: if there are no accident records (no bounding box can
            be computed).
    """
    startTime = datetime.datetime.now()

    # Setup and connect to mongo
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate(team_name, team_name)

    try:
        # Get all data needed
        accidents_data = list(repo[accidents_collection].find())

        if trial:
            # Take up to 200 random records in trial mode; random.sample
            # raises when asked for more items than are available.
            sample_size = min(200, len(accidents_data))
            accidents_data = random.sample(accidents_data, sample_size)

        # list of all coordinate tuples
        P = [(doc['location']['coordinates'][0],
              doc['location']['coordinates'][1]) for doc in accidents_data]

        # Compute min and max coordinates
        xs = [x for x, _ in P]
        ys = [y for _, y in P]

        # starting point: the two opposite corners of the bounding box
        M = [(min(xs), min(ys)), (max(xs), max(ys))]

        # Run the algorithm
        clusters = kmeans.run(M, P)
        print("Final clusters:", clusters)

        # Save results to DB
        repo.dropPermanent(clusters_collection)
        repo.createPermanent(clusters_collection)
        doc = [{"loc": [x, y]} for (x, y) in clusters]
        repo[clusters_collection].insert_many(doc)
    finally:
        # Wrap up: log out even when a step above fails.
        repo.logout()

    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
# Instantiate list to hold evaluation metrics over different values of k precision = [] max_precision = [] min_precision = [] std_precision = [] recall = [] max_recall = [] min_recall = [] std_recall = [] fscore = [] max_fscore = [] min_fscore = [] std_fscore = [] RI = [] max_RI = [] min_RI = [] std_RI = [] epoch = [] max_epoch = [] min_epoch = [] # Train our classifier for all values of k print("Running algorithm with k = " + str(k) + "\n") # Run k-Means algorithm precisions, recalls, fscores, ris, epochs = kmeans.run(train_data, classes, 4, n_runs, distance_measure)
#%% import numpy as np import kmeans import common import naive_em import em #%% X = np.loadtxt("toy_data.txt") for K in range(1,5): for seed in range(0,5): title = "K=" + str(K) + ", seed=" + str(seed) M, P = common.init(X, K, seed) cost = kmeans.run(X, M, P) print(title, cost[2]) #common.plot(X, M, P, title) # %%
#23 : 101928.961581
#24 : 105803.434798
#25 : 106071.097392
#26 : 108282.084023
#27 : 105008.134663
#28 : 105096.342568
#29 : 102076.680087
#30 : 106594.176483
#From this data it is clear to see that after k=5 there is no significant gain
#for increasing values of k. Thus k=5 is probably the best fit for our data.

import kmeans
import sys


def test(iter = 20, k = 30):
    """Print, for each k, the smallest value kmeans.run returns over repeats.

    The repeat count and the largest k are read from ``sys.argv[1]`` and
    ``sys.argv[2]``; when an argument is absent the matching default
    (``iter`` / ``k``) is used instead of failing with IndexError.

    Args:
        iter: default number of runs per k.
        k: default largest k to try.

    Exits with status 1 (after a usage message on stderr) when an argument
    is not an integer.
    """
    try:
        iterations = int(sys.argv[1]) if len(sys.argv) > 1 else iter
        numk = (int(sys.argv[2]) if len(sys.argv) > 2 else k) + 1
    except ValueError:
        sys.stderr.write(
            "Invalid input, usage python testkmeans.py [#iters] [#ks]\n")
        sys.exit(1)

    # sys.maxint exists only on Python 2; sys.maxsize is the Python 3 name.
    worst = getattr(sys, "maxint", sys.maxsize)

    # Index 0 is unused so that ks[n] is the best result found for k == n.
    ks = [worst] * (numk)
    for num_clusters in range(1, numk):
        for _ in range(iterations):
            newk = kmeans.run(num_clusters, True)
            if newk < ks[num_clusters]:
                ks[num_clusters] = newk

    for i, best in enumerate(ks[1:], 1):
        # Same text as the Python 2 statement `print i, ":", best`.
        print("%s : %s" % (i, best))
import kmeans

# Collect the numeric vectors from the model file: the first four lines and
# every odd-indexed line are skipped, the rest are parsed as floats.
X = []
# `with` closes the file once parsing is done (it used to stay open).
with open('models/starspace.txt') as model_file:
    for i, line in enumerate(model_file):
        should_continue = i < 4 or i % 2 != 0
        if should_continue:
            continue
        vector = [float(chunk) for chunk in line.split()]
        X.append(vector)

kmeans.run(X)
c='#00CED1') plt.scatter(y2[:, 0], y2[:, 1], c='#00CED1', linewidths=line3) plt.legend(('points', 'centers', 'membership grade')) plt.title('u of No.2 center') plt.subplot(133) plt.plot(x, fcm_distance[1:], c='black') plt.title('Cumulative distance') name = 'fig' + str(m) plt.savefig(name) plt.show() n_samples = 50 #centerbox= [(-5,0),(5,0)] #point,_ = make_blobs(n_samples=100, n_features=2, cluster_std=1.6,center_box=centerbox, shuffle=False, random_state=42) point = np.zeros((n_samples, 2)) for i in range(25): point[i][0] = random.randint(0, 45) point[-i][0] = random.randint(55, 100) run(2, point) run(3, point) run(4, point) run(5, point) run(10, point) run(100, point) kmeans.run(2, point)
import json
from kmeans import Point, run

if __name__ == "__main__":

    def sortedPoints(ps):
        """Return the points of `ps` ordered by (x, y)."""
        return sorted(ps, key=lambda p: (p.x, p.y))

    with open("../points.json") as f:
        # A real list (not a lazy Python 3 `map` object) is handed to run().
        points = [Point(x[0], x[1]) for x in json.load(f)]

    result = run(points, 10)

    # Print every key of the result followed by its points, both sorted.
    for k in sortedPoints(result.keys()):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("==\n# %s #" % k)
        print('\n'.join(" " + str(p) for p in sortedPoints(result[k])))
import common import naive_em import em X = np.loadtxt("toy_data.txt") ######### Section 2: K-means ############ print("******* Section 2 *******\n ") K = [1, 2, 3, 4] seeds = [0, 1, 2, 3, 4] costs_kMeans = [0, 0, 0, 0, 0] for k in range(len(K)): for i in range(len(seeds)): _, _, costs_kMeans[i] = kmeans.run(X, *common.init(X, K[k], seeds[i])) print("----- Clusters", k + 1, " -----") print("Lowest cost: ", np.min(costs_kMeans)) print("Best seed: ", np.argmin(costs_kMeans)) print("******* End of section 2 *******\n ") ######### Section 4: Comparing K-means and EM ############ print("******* Section 4 *******\n ") costs_EM = [0, 0, 0, 0, 0] mixtures_EM = [0, 0, 0, 0, 0] # Mixtures for best seed bic = [0., 0., 0., 0.] # BIC for best cluster for k in range(len(K)): for i in range(len(seeds)):
def uniformTwoD():
    """Run k-means with 3 clusters on 100 uniform random 2-feature points.

    The points are sampled uniformly from [0, 100) in each feature and passed
    to ``kmeans.run`` together with the number of clusters.
    """
    n_points, n_dims, n_clusters = 100, 2, 3
    X = np.random.uniform(0, 100, (n_points, n_dims))
    kmeans.run(X, n_clusters)
plt.show() X = np.loadtxt("toy_data.txt") K = [1, 2, 3, 4] # TODO: Your code here costs = [] loglikelihoods = [] bics = [] for k in K: cost_seeds_ = [] log_likelihood_ = [] bic_ = [] for seed in range(4): gauss_mixture, post = common.init(X=X, K=k, seed=seed) gauss_mixture_kmeans, post_kmeans, cost = kmeans.run( X=X, mixture=gauss_mixture, post=post) #print('for k =',k, "and seed=",seed, end=" ") #print("cost=",cost) gauss_mixture_em, post_em, loglikelihood = naive_em.run( X, gauss_mixture, post) bic_.append(common.bic(X, gauss_mixture_em, loglikelihood)) log_likelihood_.append(loglikelihood) cost_seeds_.append(cost) # plot_points(X,post_kmeans, # title="kmeans with k:"+str(k)+" seed:"+str(seed)) # plot_points(X,post_em, # title="em with k:"+str(k)+" seed:"+str(seed)) bics.append(bic_) costs.append(cost_seeds_) loglikelihoods.append(log_likelihood_)
try: import kmeans import common import naive_em import em except ModuleNotFoundError: import FromLinearModelsToDeepLearning.unit_4.netflix.kmeans as kmeans import FromLinearModelsToDeepLearning.unit_4.netflix.common as common import FromLinearModelsToDeepLearning.unit_4.netflix.naive_em as naive_em import FromLinearModelsToDeepLearning.unit_4.netflix.em as em X = np.loadtxt(r'C:\Users\sam\Documents\Trainings\FromLinearModelsToDeepLearning\FromLinearModelsToDeepLearning\unit_4\netflix\toy_data.txt') seeds = [0,1,2,3,4] mixture, post = common.init(X, 4, 0) mixture, post, cost = kmeans.run(X,mixture, post ) ks = [1,2,3,4] from collections import namedtuple results = namedtuple('results', 'k seed cost') costs =[] for k in ks: for seed in seeds: mixture, post = common.init(X, k, seed) mixture, post, cost = kmeans.run(X, mixture, post) r = results(k,seed,cost) costs.append(r) print(r) def get_best_cost_for_k(costs,k): best_cost = np.float('inf')
bestseed_EM = [0, 0, 0, 0]

# Mixtures per seed for each algorithm (placeholders, overwritten per seed)
mixture_kmeans = [0, 0, 0, 0, 0]
mixture_EM = [0, 0, 0, 0, 0]

# Posterior probs. per seed for each algorithm
post_kmeans = [0, 0, 0, 0, 0]
post_EM = [0, 0, 0, 0, 0]

# BIC score of cluster
bic = [0., 0., 0., 0.]

for k in range(len(K)):
    for i in range(len(seeds)):
        # Each algorithm gets its own fresh initialisation for this K/seed.
        mixture_kmeans[i], post_kmeans[i], cost_kmeans[i] = kmeans.run(
            X, *common.init(X, K[k], seeds[i]))
        mixture_EM[i], post_EM[i], cost_EM[i] = naive_em.run(
            X, *common.init(X, K[k], seeds[i]))

    print("=============== Clusters:", k + 1, "======================")
    print("Lowest cost using kMeans is:", np.min(cost_kmeans))
    # The EM figure reported is the maximum over seeds, so it is labelled as
    # such (the old message called it the "lowest cost").
    print("Highest log likelihood using EM is:", np.max(cost_EM))

    # Save best seed for plotting
    bestseed_kmeans[k] = np.argmin(cost_kmeans)
    bestseed_EM[k] = np.argmax(cost_EM)

    common.plot(X, mixture_kmeans[bestseed_kmeans[k]],
                post_kmeans[bestseed_kmeans[k]], title="kmeans")
def k_means_function(X, K, seed):
    """Initialise a model for (K, seed) and fit it with K-means.

    Args:
        X: data passed through to ``common.init`` and ``kmeans.run``.
        K: number of clusters.
        seed: seed handed to ``common.init``.

    Returns:
        Tuple ``(mixture, post, cost)`` as produced by ``kmeans.run``.
    """
    seeded = common.init(X, K, seed)
    # The first two entries of the initial model feed the K-means run.
    fitted = kmeans.run(X, seeded[0], seeded[1])
    mixture, post, cost = fitted
    return mixture, post, cost
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")

Ks = [1, 2, 3, 4]
seeds = [0, 1, 2, 3, 4]
# One BIC value per entry of Ks; every slot is filled in the loop below.
BICs = np.empty(len(Ks))

for i, K in enumerate(Ks):
    # Running best over seeds: lowest K-means cost, highest EM log-likelihood.
    k_best_mix, k_best_post, k_best_cost = None, None, np.inf
    em_best_mix, em_best_post, em_best_ll = None, None, -np.inf
    for seed in seeds:
        # Both algorithms start from the same initialisation.
        init_mix, init_post = common.init(X, K, seed)
        k_mix, k_post, k_cost = kmeans.run(X, init_mix, init_post)
        em_mix, em_post, em_ll = naive_em.run(X, init_mix, init_post)
        if k_cost < k_best_cost:
            k_best_mix, k_best_post, k_best_cost = k_mix, k_post, k_cost
        if em_ll > em_best_ll:
            em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll
    # BIC of the best EM fit for this K.
    BICs[i] = common.bic(X, em_best_mix, em_best_ll)
    common.plot(X, k_best_mix, k_best_post, "K-means K={}".format(K))
    common.plot(X, em_best_mix, em_best_post, "EM K={}".format(K))

# The K reported as best is the one with the largest BIC value.
print("BICs: ", BICs)
print("Best BIC: ", np.max(BICs))
print("Best K: ", Ks[np.argmax(BICs)])

# From here on X refers to the incomplete Netflix data.
X = np.loadtxt("netflix_incomplete.txt")
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")

# TODO: Your code here
# For K = 1..4, fit K-means from seeds 0..4, plot every fitted model and
# print the lowest cost reached for that K.
for i in range(1, 5):
    costs = []
    for j in range(5):
        mixture, post = common.init(X, i, j)
        # Pass the initial posteriors (not a literal 0) and keep the fitted
        # mixture/posteriors, so the plot shows the K-means result instead
        # of the random initialisation.
        mixture, post, cost = kmeans.run(X, mixture, post)
        costs.append(cost)
        common.plot(X, mixture, post, 'test')
    print(min(costs))
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("datas/toy_data.txt")

K = [1, 2, 3, 4]
seeds = [0, 1, 2, 3, 4]

# For every k, keep the K-means run with the lowest cost and the EM run with
# the highest third return value over all seeds, then plot both.
for k in K:
    KM_best_mixture, KM_best_post, KM_best_cost = None, None, np.inf
    # NOTE(review): "logvrais" presumably abbreviates the French for
    # log-likelihood -- confirm against naive_em.run.
    EM_best_mixture, EM_best_post, EM_best_logvrais = None, None, -np.inf
    for seed in seeds:
        # Both models start from the same initialisation.
        init_mixture, init_post = common.init(X, k, seed)

        # K-means model
        KM_mixture, KM_post, KM_cost = kmeans.run(X, init_mixture, init_post)
        if KM_cost < KM_best_cost:
            KM_best_mixture, KM_best_post, KM_best_cost = KM_mixture, KM_post, KM_cost

        # EM model
        EM_mixture, EM_post, EM_logvrais = naive_em.run(X, init_mixture, init_post)
        if EM_logvrais > EM_best_logvrais:
            EM_best_mixture, EM_best_post, EM_best_logvrais = EM_mixture, EM_post, EM_logvrais

    common.plot(X, KM_best_mixture, KM_best_post, f"K-means K={k}")
    common.plot(X, EM_best_mixture, EM_best_post, f"EM K={k}")
import clean
import kmeans
import merge
import zoning

# NOTE(review): exec() runs the helper file's source in this module's
# namespace; `pymongo` used below is not imported in the visible lines and
# presumably comes from it -- confirm. Only ever point this at a trusted file.
# The `with` block closes the file handle, which was previously leaked.
with open('../pymongo_dm.py') as helper_source:
    exec(helper_source.read())

# connect to DBMS
print("Connecting to the DBMS...")
client = pymongo.MongoClient()
repo = client.repo
repo.authenticate('djmcc_jasper', 'djmcc_jasper')

# execute scripts
# NOTE(review): `reset` and `get` are not imported in the visible lines;
# presumably they are imported above this chunk.
reset.run(repo)
get.run(repo)
clean.run(repo)
merge.run(repo)
kmeans.run(repo)
zoning.run(repo)

# disconnect from the DBMS
print("Disconnecting from the DBMS...")
repo.logout()

# EOF
# Per-seed results; the zeros are placeholders overwritten in the loop below.
mixtures_kMeans = [0, 0, 0, 0, 0]
mixtures_EM = [0, 0, 0, 0, 0]

# Posterior probs. for best seeds
posts_kMeans = [0, 0, 0, 0, 0]
posts_EM = [0, 0, 0, 0, 0]

# BIC score of cluster
bic = [0., 0., 0., 0.]

for k in range(len(K)):
    for i in range(len(seeds)):

        # Run kMeans
        mixtures_kMeans[i], posts_kMeans[i], costs_kMeans[i] = \
            kmeans.run(X, *common.init(X, K[k], seeds[i]))

        # Run Naive EM (from its own, separately created initialisation)
        mixtures_EM[i], posts_EM[i], costs_EM[i] = \
            naive_em.run(X, *common.init(X, K[k], seeds[i]))

    # Print lowest cost
    print("=============== Clusters:", k + 1, "======================")
    print("Lowest cost using kMeans is:", np.min(costs_kMeans))
    print("Highest log likelihood using EM is:", np.max(costs_EM))

    # Save best seed for plotting: index of the minimum K-means cost and of
    # the maximum EM value over the seeds tried for this K.
    best_seed_kMeans[k] = np.argmin(costs_kMeans)
    best_seed_EM[k] = np.argmax(costs_EM)

    # Plot kMeans and EM results
import naive_em
import em
from scipy.stats import multivariate_normal

X = np.loadtxt("toy_data.txt")
Ks = [1, 2, 3, 4]
seeds = [0, 1, 2, 3, 4]

# =============================================================================
# 2. K-means
# =============================================================================

for K in Ks:
    for seed in seeds:
        mixture, post = common.init(X, K, seed=seed)  # Initialize K-means
        mixture, post, cost = kmeans.run(X, mixture, post)  # K-means
        common.plot(X, mixture, post, [K, seed])  # Plot the fitted model
        print(cost)

# =============================================================================
# 3. Expectation-maximization algorithm
# =============================================================================


def test_2dgaussian_pdf(X, mu, var):
    """Check naive_em.pdf_2dgaussian against scipy's multivariate normal pdf.

    Args:
        X: points at which both densities are evaluated.
        mu: mean; reshaped to ``(2,)`` for scipy.
        var: variance container; ``var[0]`` is used as scipy's ``cov``.

    Returns:
        True when every pair of densities differs by less than 1e-6.
    """
    y1 = naive_em.pdf_2dgaussian(X, mu, var)
    y2 = multivariate_normal.pdf(X, mean=mu.reshape(2, ), cov=var[0])
    # Compare element-wise on the absolute difference. The old expression
    # `all(y1 - y2) < 1e-6` compared a single boolean with the tolerance and
    # therefore never tested how close the two results are.
    return bool(np.all(np.abs(y1 - y2) < 1e-6))

# 2dgaussian
print('\n----- K-Means Algorithm -----\n')

seeds = [0, 1, 2, 3, 4]
K = [1, 2, 3, 4]

# For every k, fit K-means from each seed, keep all fitted models, and report
# the seed whose run reached the lowest cost.
for k in K:
    mixtures = []
    posts = []
    # One cost per seed; every slot is assigned in the loop below.
    costs = np.empty(len(seeds))

    for i, seed in enumerate(seeds):
        # initialize mixture model with random points
        mixture, post = common.init(X, K=k, seed=seed)

        # run k-means
        mixture, post, cost = kmeans.run(X, mixture=mixture, post=post)

        mixtures.append(mixture)
        posts.append(post)
        costs[i] = cost

    # argmin yields a position in `seeds`; with seeds == [0..4] the position
    # and the seed value coincide.
    best_seed = np.argmin(costs)
    cost = costs[best_seed]
    mixture = mixtures[best_seed]
    post = posts[best_seed]

    print(f'K={k}', f'Best seed: {best_seed}', f'Cost: {cost}')
    #common.plot(X, mixture, post, title=f"K-Means, K={k}")

# -----------------------------------
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")

# TODO: Your code here
K = np.array([1, 2, 3, 4])
seeds = np.array([0, 1, 2, 3, 4])

# Fit K-means with K[3] clusters from every seed.
for i in seeds:
    # One init call yields both the mixture and the posteriors; it used to
    # be invoked twice per seed, once for each half of the result.
    mixture, post = common.init(X, K[3], i)
    mixture, post, cost = kmeans.run(X, mixture, post)
    # common.plot(X, mixture, post)