def test_seeds(X, K):
    """Compare K-means cost and naive EM log-likelihood for seeds 0..4."""
    print("\n############## KMEANS K=" + str(K) + " ###############")
    # Deterministic initialisations, one per seed
    inits = [common.init(X, K, seed) for seed in range(5)]

    # K-means: run() returns (mixture, post, cost)
    for seed, (mixture, post) in enumerate(inits):
        cost = kmeans.run(X, mixture, post)[2]
        print("K=" + str(K) + " seed=" + str(seed) + " : cost=" + str(cost))

    # Naive EM: run() returns (mixture, post, log-likelihood)
    for seed, (mixture, post) in enumerate(inits):
        ll = naive_em.run(X, mixture, post)[2]
        print("K=" + str(K) + " seed=" + str(seed) + " : likelihood=" + str(ll))
def run_naive_em(X):
    for K in [1, 2, 3, 4]:
        likelihood_ls = []
        for seed in range(5):
            mixture, post = common.init(X, K, seed)
            mixture, post, LL = naive_em.run(X, mixture, post)
            likelihood_ls.append(LL)
        print("The best likelihood for {} clusters is".format(K), max(likelihood_ls))
        # Re-run with the best seed and plot the resulting mixture
        best_seed = np.argmax(likelihood_ls)
        mixture, post = common.init(X, K, int(best_seed))
        mixture, post, LL = naive_em.run(X, mixture, post)
        common.plot(X, mixture, post, "{} mixtures with seed {}".format(K, best_seed))
    return "Done"
def run_naive_em():
    for K in range(1, 5):
        max_ll = None
        best_seed = None
        for seed in range(0, 5):
            mixture, post = common.init(X, K, seed)
            mixture, post, ll = naive_em.run(X, mixture, post)
            if max_ll is None or ll > max_ll:
                max_ll = ll
                best_seed = seed
        mixture, post = common.init(X, K, best_seed)
        mixture, post, ll = naive_em.run(X, mixture, post)
        title = "EM for K={}, seed={}, ll={}".format(K, best_seed, ll)
        print(title)
        common.plot(X, mixture, post, title)
def select_best_bic(X):
    bic_ls = []
    for K in [1, 2, 3, 4]:
        likelihood_ls = []
        bic_ls_seed = []
        for seed in range(5):
            mixture, post = common.init(X, K, seed)
            mixture, post, LL = naive_em.run(X, mixture, post)
            likelihood_ls.append(LL)
            bic_ls_seed.append(common.bic(X, mixture, LL))
        best_seed = np.argmax(bic_ls_seed)
        mixture, post = common.init(X, K, int(best_seed))
        mixture, post, LL = naive_em.run(X, mixture, post)
        bic_ls.append(common.bic(X, mixture, LL))
    print("The best K is {} with bic {}".format(np.argmax(bic_ls) + 1, max(bic_ls)))
    return "Done"
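# The selection functions in this file pick the K (and seed) with the *largest*
# value returned by common.bic. A minimal sketch of the convention assumed here,
# not necessarily the exact common.bic implementation: BIC = log-likelihood
# - (p/2) * log(n), where p counts the free parameters of a spherical Gaussian
# mixture ((K-1) mixing weights, K*d means, K variances), so higher is better.
import numpy as np

def bic_sketch(X, mixture, log_likelihood):
    """Hypothetical stand-in for common.bic, for illustration only."""
    n, d = X.shape
    K = mixture.mu.shape[0]        # assumes mixture.mu is a (K, d) array of means
    p = (K - 1) + K * d + K        # mixing weights + means + variances
    return log_likelihood - 0.5 * p * np.log(n)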
def run_naive_em_with_bic():
    max_bic = None
    for K in range(1, 5):
        max_ll = None
        best_seed = None
        for seed in range(0, 5):
            mixture, post = common.init(X, K, seed)
            mixture, post, ll = naive_em.run(X, mixture, post)
            if max_ll is None or ll > max_ll:
                max_ll = ll
                best_seed = seed
        mixture, post = common.init(X, K, best_seed)
        mixture, post, ll = naive_em.run(X, mixture, post)
        bic = common.bic(X, mixture, ll)
        if max_bic is None or bic > max_bic:
            max_bic = bic
        title = "EM for K={}, seed={}, ll={}, bic={}".format(K, best_seed, ll, bic)
        print(title)
        common.plot(X, mixture, post, title)
def test_naive_em():
    for k in [1, 2, 3, 4]:
        para_list = []
        for seed in [0, 1, 2, 3, 4]:
            gm, post = common.init(X, k, seed)
            mixture, p, cost = naive_em.run(X, gm, post)
            para_list.append((mixture, p, cost))
        max_para = max(para_list, key=lambda x: x[2])
        common.plot(X, max_para[0], max_para[1],
                    'EM on toy data with {k}'.format(k=k))
    return max_para[0], max_para[1]
def run_naive_em(X, plot=False):
    max_bic = None
    for i in range(len(K)):
        max_ln_like = None
        best_seed = None
        for j in range(len(seed)):
            mixture, post = common.init(X, K[i], seed[j])
            mixture, post, ln_like = naive_em.run(X, mixture, post)
            if max_ln_like is None or ln_like > max_ln_like:
                max_ln_like = ln_like
                best_seed = seed[j]
            if plot:
                common.plot(X, mixture, post, "K={}, seed={}".format(K[i], seed[j]))
        mixture, post = common.init(X, K[i], best_seed)
        mixture, post, ln_like = naive_em.run(X, mixture, post)
        bic = common.bic(X, mixture, ln_like)
        if max_bic is None or bic > max_bic:
            max_bic = bic
        print("K = {}, Max ln(likelihood) = {}, Best seed = {}, Max BIC = {}".
              format(K[i], max_ln_like, best_seed, max_bic))
def select_k_em():
    """
    Select the best K based on BIC
    :return:
    """
    for k in [1, 2, 3, 4]:
        para_list = []
        for seed in [0, 1, 2, 3, 4]:
            gm, post = common.init(X, k, seed)
            mixture, p, cost = naive_em.run(X, gm, post)
            para_list.append((mixture, p, cost))
        max_para = max(para_list, key=lambda x: x[2])
        print(common.bic(X, max_para[0], max_para[2]))
# 2-D Gaussian
mixture, post = common.init(X, 1)
mu, var, p = mixture
test_2dgaussian_pdf(X, mu, var)

# E-step
mixture, post = common.init(X, 3, seed=0)
mu, var, p = mixture
post, log_likelihood = naive_em.estep(X, mixture)

# M-step
mixture = naive_em.mstep(X, post)

# Run
mixture, post = common.init(X, 3, seed=0)
mixture, post, log_likelihood = naive_em.run(X, mixture, post)

# =============================================================================
# 4. Comparing K-means and EM
# =============================================================================
for K in Ks:
    for seed in seeds:
        mixture, post = common.init(X, K=K, seed=seed)  # Initialize the mixture
        mixture, post, log_likelihood = naive_em.run(X, mixture, post)
        common.plot(X, mixture, post, [K, seed])
        print(K, seed, log_likelihood)

# =============================================================================
# 5. Bayesian Information Criterion
# Picking the best K
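# test_2dgaussian_pdf above is not defined in this snippet. A minimal sketch of
# what such a check could look like, assuming the spherical Gaussian density used
# by naive_em: N(x; mu, var*I) = exp(-||x - mu||^2 / (2*var)) / (2*pi*var)^(d/2).
# The function name and the choice to evaluate component 0 are assumptions.
import numpy as np

def test_2dgaussian_pdf(X, mu, var):
    """Print the spherical Gaussian density of each point under component 0."""
    d = X.shape[1]
    diff = X - mu[0]                      # offsets from the first component mean
    sq_dist = np.sum(diff ** 2, axis=1)
    pdf = np.exp(-sq_dist / (2 * var[0])) / ((2 * np.pi * var[0]) ** (d / 2))
    print(pdf[:5])                        # show the first few densities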
def run_full_em(self, X, K, seed, expected_cost):
    mixture, post = common.init(X, K, seed)
    new_mixture, soft_counts, cost = naive_em.run(X, mixture, post)
    self.assertEqual(np.isclose(cost, expected_cost), True,
                     f'Cost: got {cost}, expected {expected_cost}')
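# run_full_em above takes self, so it is presumably a unittest helper. A hedged
# sketch of one way it might be wired up; the class name and the expected
# log-likelihood value are placeholders, not values from the original tests.
import unittest
import numpy as np

class TestNaiveEM(unittest.TestCase):
    def test_toy_k3_seed0(self):
        X = np.loadtxt("toy_data.txt")
        expected_ll = -1388.0  # placeholder, replace with the known reference value
        run_full_em(self, X, K=3, seed=0, expected_cost=expected_ll)

if __name__ == "__main__":
    unittest.main()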
print("----- Clusters", k + 1, " -----")
print("Lowest cost: ", np.min(costs_kMeans))
print("Best seed: ", np.argmin(costs_kMeans))

print("******* End of section 2 *******\n ")

######### Section 4: Comparing K-means and EM ############
print("******* Section 4 *******\n ")

costs_EM = [0, 0, 0, 0, 0]
mixtures_EM = [0, 0, 0, 0, 0]  # Mixtures for best seed
bic = [0., 0., 0., 0.]         # BIC for best cluster

for k in range(len(K)):
    for i in range(len(seeds)):
        mixtures_EM[i], _, costs_EM[i] = naive_em.run(
            X, *common.init(X, K[k], seeds[i]))
    bic[k] = common.bic(X, mixtures_EM[np.argmax(costs_EM)], np.max(costs_EM))
    print("----- Mixture ", k + 1, " -----")
    print("Highest log-likelihood: ", np.max(costs_EM))
    print("Best seed: ", np.argmax(costs_EM))

print("******* End of section 4 *******\n ")

######### Section 5: Bayesian Information Criterion ############
print("******* Section 5 *******\n ")
print("Best K: ", np.argmax(bic) + 1)
print("BIC for the best K: ", np.max(bic))
print("******* End of section 5 *******\n ")
X = np.loadtxt("toy_data.txt")
K = [1, 2, 3, 4]

# TODO: Your code here
costs = []
loglikelihoods = []
bics = []
for k in K:
    cost_seeds_ = []
    log_likelihood_ = []
    bic_ = []
    for seed in range(4):
        gauss_mixture, post = common.init(X=X, K=k, seed=seed)
        gauss_mixture_kmeans, post_kmeans, cost = kmeans.run(
            X=X, mixture=gauss_mixture, post=post)
        # print('for k =', k, "and seed=", seed, end=" ")
        # print("cost=", cost)
        gauss_mixture_em, post_em, loglikelihood = naive_em.run(
            X, gauss_mixture, post)
        bic_.append(common.bic(X, gauss_mixture_em, loglikelihood))
        log_likelihood_.append(loglikelihood)
        cost_seeds_.append(cost)
        # plot_points(X, post_kmeans,
        #             title="kmeans with k:" + str(k) + " seed:" + str(seed))
        # plot_points(X, post_em,
        #             title="em with k:" + str(k) + " seed:" + str(seed))
    bics.append(bic_)
    costs.append(cost_seeds_)
    loglikelihoods.append(log_likelihood_)
# Mixture for best seed for each algorithm
mixture_kmeans = [0, 0, 0, 0, 0]
mixture_EM = [0, 0, 0, 0, 0]

# Posterior probs. for best seeds
post_kmeans = [0, 0, 0, 0, 0]
post_EM = [0, 0, 0, 0, 0]

# BIC score of cluster
bic = [0., 0., 0., 0.]

for k in range(len(K)):
    for i in range(len(seeds)):
        mixture_kmeans[i], post_kmeans[i], cost_kmeans[i] = kmeans.run(
            X, *common.init(X, K[k], seeds[i]))
        mixture_EM[i], post_EM[i], cost_EM[i] = naive_em.run(
            X, *common.init(X, K[k], seeds[i]))

    print("=============== Clusters:", k + 1, "======================")
    print("Lowest cost using kMeans is:", np.min(cost_kmeans))
    print("Highest log-likelihood using EM is:", np.max(cost_EM))

    # Save best seed for plotting
    bestseed_kmeans[k] = np.argmin(cost_kmeans)
    bestseed_EM[k] = np.argmax(cost_EM)

    common.plot(X, mixture_kmeans[bestseed_kmeans[k]],
                post_kmeans[bestseed_kmeans[k]], title="kmeans")
    common.plot(X,
mixtures, posts, costs = [], [], []
for seed_i in range(seeds.shape[0]):
    mixture, post = common.init(X, k, seeds[seed_i])
    mixture, post, cost = kmeans.run(X, mixture, post)
    mixtures.append(mixture)
    posts.append(post)
    costs.append(cost)
    # Track the seed with the lowest k-means cost seen so far
    if cost < costs[min_cost_seed_i]:
        min_cost_seed_i = seed_i
common.plot(X, mixtures[min_cost_seed_i], posts[min_cost_seed_i],
            "k-means k:" + str(k) + " seed:" + str(min_cost_seed_i))
print(k, cost, min_cost_seed_i)

for k in K:
    seeds = np.array([0, 1, 2, 3, 4])
    # k_cost = np.zeros((seeds.shape[0], 2))
    min_cost_seed_i = 0
    mixtures, posts, costs = [], [], []
    for seed_i in range(seeds.shape[0]):
        mixture, post = common.init(X, k, seeds[seed_i])
        mixture, post, cost = naive_em.run(X, mixture, post)
        mixtures.append(mixture)
        posts.append(post)
        costs.append(cost)
        # Track the seed with the highest EM log-likelihood seen so far
        if cost > costs[min_cost_seed_i]:
            min_cost_seed_i = seed_i
    common.plot(X, mixtures[min_cost_seed_i], posts[min_cost_seed_i],
                "EM k:" + str(k) + " seed:" + str(min_cost_seed_i))
    print(k, cost, min_cost_seed_i)
print("K=" + str(K) + " seed=2 : likelihood=" + str(cost2))
print("K=" + str(K) + " seed=3 : likelihood=" + str(cost3))
print("K=" + str(K) + " seed=4 : likelihood=" + str(cost4))

# K-means initialization
test_seeds(toy_X, 1)
test_seeds(toy_X, 2)
test_seeds(toy_X, 3)
test_seeds(toy_X, 4)

# EM algorithm
print("############## EM Algorithm implemented ###############")
mixture, post = common.init(toy_X, 3, 0)
naive_em_estimate = naive_em.run(toy_X, mixture, post)[2]
print("naive EM log likelihood : " + str(naive_em_estimate))

print("############## Some Tests ######################")
initialMixture, initialPost = common.init(toy_X, 1, 0)
mixtureEM1, postEM1, ll1 = naive_em.run(toy_X, initialMixture, initialPost)
initialMixture, initialPost = common.init(toy_X, 2, 0)
mixtureEM2, postEM2, ll2 = naive_em.run(toy_X, initialMixture, initialPost)
initialMixture, initialPost = common.init(toy_X, 3, 0)
mixtureEM3, postEM3, ll3 = naive_em.run(toy_X, initialMixture, initialPost)
initialMixture, initialPost = common.init(toy_X, 4, 0)
mixtureEM4, postEM4, ll4 = naive_em.run(toy_X, initialMixture, initialPost)
posts_kMeans = [0, 0, 0, 0, 0]
posts_EM = [0, 0, 0, 0, 0]

# BIC score of cluster
bic = [0., 0., 0., 0.]

for k in range(len(K)):
    for i in range(len(seeds)):
        # Run kMeans
        mixtures_kMeans[i], posts_kMeans[i], costs_kMeans[i] = \
            kmeans.run(X, *common.init(X, K[k], seeds[i]))
        # Run Naive EM
        mixtures_EM[i], posts_EM[i], costs_EM[i] = \
            naive_em.run(X, *common.init(X, K[k], seeds[i]))

    # Print lowest cost
    print("=============== Clusters:", k + 1, "======================")
    print("Lowest cost using kMeans is:", np.min(costs_kMeans))
    print("Highest log likelihood using EM is:", np.max(costs_EM))

    # Save best seed for plotting
    best_seed_kMeans[k] = np.argmin(costs_kMeans)
    best_seed_EM[k] = np.argmax(costs_EM)

    # Plot kMeans and EM results
    common.plot(X, mixtures_kMeans[best_seed_kMeans[k]],
                posts_kMeans[best_seed_kMeans[k]], title="kMeans")
def test_step():
    mixture, post = common.init(X, 3, 0)
    mixture, soft_counts, ll = naive_em.run(X, mixture, post)
    print("Log-likelihood: {}".format(ll))
import numpy as np
import em
import naive_em
import common

# X = np.loadtxt("test_incomplete.txt")
# X_gold = np.loadtxt("test_complete.txt")

testcase = 2

if testcase == 1:
    # for naive_em
    X = np.loadtxt("toy_data.txt")
    K = 3
    seed = 0
    n, d = X.shape
    mixture, post = common.init(X, K, seed)
    mixture, post, ll = naive_em.run(X, mixture, post)
    result = "with naive_em, ll = {}".format(ll)
    print(result)

if testcase == 2:
    X = np.loadtxt("netflix_incomplete.txt")
    # X = np.loadtxt("toy_data.txt")
    n, d = X.shape
    for K in [1, 12]:
        max_ll = None
        for seed in range(0, 5):
            ll = None
            mixture, post = common.init(X, K, seed)
            mixture, post, ll = em.run(X, mixture, post)
            if max_ll is None or ll > max_ll:
                max_ll = ll
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("netflix_complete.txt")
K = 12

# TODO: Your code here
for seed in range(5):
    mixtures, post = common.init(X, K, seed)
    # m, p, cost = kmeans.run(X, mixtures, post)
    # print(cost)
    # common.plot(X, mixtures, post, "Title")
    m, p, cost = naive_em.run(X, mixtures, post)
    print(common.bic(X, m, cost))
    # common.plot(X, mixtures, post, "Title")
import naive_em
import em

X = np.loadtxt("toy_data.txt")
Ks = [1, 2, 3, 4]
seeds = [0, 1, 2, 3, 4]
BICs = np.empty(len(Ks))

for i, K in enumerate(Ks):
    k_best_mix, k_best_post, k_best_cost = None, None, np.inf
    em_best_mix, em_best_post, em_best_ll = None, None, -np.inf
    for seed in seeds:
        init_mix, init_post = common.init(X, K, seed)
        k_mix, k_post, k_cost = kmeans.run(X, init_mix, init_post)
        em_mix, em_post, em_ll = naive_em.run(X, init_mix, init_post)
        if k_cost < k_best_cost:
            k_best_mix, k_best_post, k_best_cost = k_mix, k_post, k_cost
        if em_ll > em_best_ll:
            em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll
    BICs[i] = common.bic(X, em_best_mix, em_best_ll)
    common.plot(X, k_best_mix, k_best_post, "K-means K={}".format(K))
    common.plot(X, em_best_mix, em_best_post, "EM K={}".format(K))

print("BICs: ", BICs)
print("Best BIC: ", np.max(BICs))
print("Best K: ", Ks[np.argmax(BICs)])

X = np.loadtxt("netflix_incomplete.txt")
K = 12
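# The snippet above stops right after loading the Netflix matrix. One possible
# continuation (a sketch, not the original code) would reuse the same
# best-of-five-seeds pattern with the matrix-completion EM from em.py:
for K in [1, 12]:
    best_ll = -np.inf
    for seed in seeds:
        mix, post = common.init(X, K, seed)
        mix, post, ll = em.run(X, mix, post)
        if ll > best_ll:
            best_ll = ll
    print("K={}: best log-likelihood over seeds = {}".format(K, best_ll))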
def naive_em_function(X, K, seed):
    init_model = common.init(X, K, seed)
    mixture, post, cost = naive_em.run(X, init_model[0], init_model[1])
    return mixture, post, cost
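# A hedged usage sketch for naive_em_function above (not from the original):
# sweep the five seeds for a given K and keep the run with the highest
# log-likelihood, mirroring the seed selection done elsewhere in these snippets.
import numpy as np
import common
import naive_em

X = np.loadtxt("toy_data.txt")
K = 3
results = [naive_em_function(X, K, seed) for seed in range(5)]
best_mixture, best_post, best_ll = max(results, key=lambda r: r[2])
print("K={}: best log-likelihood {}".format(K, best_ll))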
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("datas/toy_data.txt")
K = [1, 2, 3, 4]
seeds = [0, 1, 2, 3, 4]

for k in K:
    KM_best_mixture, KM_best_post, KM_best_cost = None, None, np.inf
    EM_best_mixture, EM_best_post, EM_best_logvrais = None, None, -np.inf
    for seed in seeds:
        init_mixture, init_post = common.init(X, k, seed)
        # K-means model
        KM_mixture, KM_post, KM_cost = kmeans.run(X, init_mixture, init_post)
        if KM_cost < KM_best_cost:
            KM_best_mixture, KM_best_post, KM_best_cost = KM_mixture, KM_post, KM_cost
        # EM model
        EM_mixture, EM_post, EM_logvrais = naive_em.run(X, init_mixture, init_post)
        if EM_logvrais > EM_best_logvrais:
            EM_best_mixture, EM_best_post, EM_best_logvrais = EM_mixture, EM_post, EM_logvrais
    common.plot(X, KM_best_mixture, KM_best_post, f"K-means K={k}")
    common.plot(X, EM_best_mixture, EM_best_post, f"EM K={k}")
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")
seeds = [0, 1, 2, 3, 4]
K = [1, 2, 3, 4]

kbest = 1
bestbic = -100000000
for k in K:
    # Pick the seed with the highest log-likelihood (EM maximizes it)
    best = -100000000
    seed_best = 0
    for seed in seeds:
        mixtures, post = common.init(X, k, seed)
        tupl = naive_em.run(X, mixtures, post)
        if tupl[2] > best:
            best = tupl[2]
            seed_best = seed
    mixtures, post = common.init(X, k, seed_best)
    tupl = naive_em.run(X, mixtures, post)
    bi = common.bic(X, mixtures, tupl[2])
    if bi > bestbic:
        bestbic = bi
        kbest = k

print(kbest)
print(bestbic)
mixtures_em = []
posts_em = []
costs_em = np.empty(len(seeds))
logloss = np.empty(len(seeds))

for i, seed in enumerate(seeds):
    # initialize mixture model with random points
    # init(X, K) returns a K-component mixture model with means, variances
    # and mixing proportions.
    mixture, post = common.init(X, K=k, seed=seed)
    mixture_em, post_em = common.init(X, K=k, seed=seed)  # For EM algorithm initialisation

    # run k-means function
    mixture, post, cost = kmeans.run(X, mixture=mixture, post=post)
    # run EM Algo function
    mixture_em, post_em, ll = naive_em.run(X, mixture=mixture_em, post=post_em)

    # Update k-means values
    mixtures.append(mixture)
    posts.append(post)
    costs[i] = cost
    # print(k, seed, costs)

    # Update EM values
    mixtures_em.append(mixture_em)
    posts_em.append(post_em)
    logloss[i] = ll
    # print(k, seed, costs_em)

# Finding the best/min cost of k-means
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")
K = 4
seeds = [0, 1, 2, 3, 4]

for seed in seeds:
    mixture, post = common.init(X, K, seed)
    # kmixture, kpost, kcost = kmeans.run(X, mixture, post)
    # title = f"K is {K}, seed is {seed}, cost is {kcost}"
    em_mixture, em_post, em_cost = naive_em.run(X, mixture, post)
    with_bic = common.bic(X, em_mixture, em_cost)
    title = f"K is {K}, seed is {seed}, em_cost is {em_cost}, with_bic is {with_bic}"
    print(title)
    common.plot(X, em_mixture, em_post, title)

# TODO: Your code here
seeds = [0, 1, 2, 3, 4]
K = [1, 2, 3, 4]
bic = np.zeros(len(K))

for j, k in enumerate(K):
    mixtures = []
    posts = []
    logloss = np.empty(len(seeds))
    for i, seed in enumerate(seeds):
        # initialize mixture model with random points
        mixture, post = common.init(X, K=k, seed=seed)
        # run EM-algorithm
        mixture, post, LL = naive_em.run(X, mixture=mixture, post=post)
        mixtures.append(mixture)
        posts.append(post)
        logloss[i] = LL
        # print('K=', k, 'seed=', seed, 'logloss=', LL)
    best_seed = np.argmax(logloss)
    logloss = logloss[best_seed]
    mixture = mixtures[best_seed]
    post = posts[best_seed]
    current_bic = common.bic(X, mixture, logloss)
    bic[j] = current_bic
    print(f'K={k}', f'Best seed={best_seed}', f'logloss={logloss}', f'BIC={current_bic}')
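# Follow-up sketch (assumes the bic array filled by the loop above): report the K
# with the highest BIC, which is the model-selection rule used in these scripts.
best_K = K[int(np.argmax(bic))]
print(f'Best K by BIC: {best_K}', f'BIC={np.max(bic)}')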
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")

for i in range(4):
    for j in range(5):
        initial_mixture, post = common.init(X, i + 1, j)
        # M, L, cost_final = kmeans.run(X, initial_mixture, post)
        # title = "K means for K " + str(i + 1) + " seed " + str(j)
        # common.plot(X, M, L, title)
        # print("For K " + str(i + 1) + " seed " + str(j) + " cost is " + str(cost_final))
        M, L, likelihood = naive_em.run(X, initial_mixture, post)
        bic = common.bic(X, M, likelihood)
        title = "EM for K " + str(i + 1) + " seed " + str(j)
        common.plot(X, M, L, title)
        print("For K " + str(i + 1) + " seed " + str(j) + " likelihood is " +
              str(likelihood) + " bic is " + str(bic))