def test_em():
    """Run EM once on the module-level data and print the outcome.

    Uses the module-level ``X``, ``K``, ``seed`` and ``X_gold``.  Prints the
    third value returned by ``em.run`` and then the RMSE between the filled
    matrix and ``X_gold``.
    """
    start_mixture, start_post = common.init(X, K, seed)
    fitted_mixture, _fitted_post, run_score = em.run(X, start_mixture, start_post)
    completed = em.fill_matrix(X, fitted_mixture)
    print(run_score)
    print(common.rmse(completed, X_gold))
def run_matrix_completion():
    """Fit a 12-component mixture with seed 1 on ``X`` and print the RMSE.

    The matrix completed by ``em.fill_matrix`` is compared against the
    contents of ``netflix_complete.txt``.
    """
    n_components, init_seed = 12, 1
    start_mixture, start_post = common.init(X, n_components, init_seed)
    fitted_mixture, _post, _ll = em.run(X, start_mixture, start_post)
    completed = em.fill_matrix(X, fitted_mixture)
    reference = np.loadtxt('netflix_complete.txt')
    print("RMSE:", common.rmse(reference, completed))
def test_run_test_solution(self):
    """Check that ``em.run`` on the stored first-run fixtures yields the stored cost."""
    data = ts.X
    start_mixture = ts.mixture_first_run
    start_post = ts.post_first_run
    expected_cost = ts.ll_first_run

    # NOTE(review): two values are unpacked from em.run here; confirm that this
    # matches the return arity of the em module under test.
    new_mixture, cost = em.run(data, start_mixture, start_post)

    costs_match = np.isclose(cost, expected_cost)
    self.assertEqual(costs_match, True,
                     f'Cost: got {cost}, expected {expected_cost}')
def best_run_em(X):
    """Return the mixture of the best EM run over seeds 0-4 with K = 12.

    "Best" means the run with the highest log-likelihood as returned by
    ``em.run``.  The previous implementation selected the *minimum*
    log-likelihood, i.e. the worst run.

    Args:
        X: data matrix passed unchanged to ``common.init`` and ``em.run``.

    Returns:
        The mixture produced by the highest-likelihood run.
    """
    K = 12
    best_mixture = None
    best_ll = None
    for seed in range(5):
        mixture, post = common.init(X, K, seed)
        mixture, post, ll = em.run(X, mixture, post)
        # Track the winner directly rather than keying a dict by a float,
        # which also silently dropped runs with identical likelihoods.
        if best_ll is None or ll > best_ll:
            best_ll = ll
            best_mixture = mixture
    return best_mixture
def best_run_em(X):
    """Return ``(mixture, seed)`` of the best EM run over seeds 0-4 with K = 12.

    "Best" means the run with the highest log-likelihood as returned by
    ``em.run``.  The previous implementation selected the *minimum*
    log-likelihood, i.e. the worst run.

    Args:
        X: data matrix passed unchanged to ``common.init`` and ``em.run``.

    Returns:
        A ``(mixture, seed)`` tuple for the highest-likelihood run.
    """
    K = 12
    best_result = None
    best_ll = None
    for seed in range(5):
        # Kept from the original: the global NumPy RNG is seeded before init.
        np.random.seed(seed)
        mixture, post = common.init(X, K, seed)
        mixture, post, ll = em.run(X, mixture, post)
        if best_ll is None or ll > best_ll:
            best_ll = ll
            best_result = (mixture, seed)
    return best_result
def test_incomplete_em():
    """For K in {1, 12}, run EM with seeds 0-4 and print the largest final value.

    Each ``em.run`` result is kept whole; runs are ranked by their last
    element and that element of the top-ranked run is printed.
    """
    for n_clusters in (1, 12):
        runs = []
        for seed in range(5):
            print(n_clusters, seed)
            start_mixture, start_post = common.init(X, n_clusters, seed)
            runs.append(em.run(X, start_mixture, start_post))
        top_run = max(runs, key=lambda run: run[-1])
        print(top_run[-1])
def test_k12():
    """Run EM with K = 12 for seeds 0-4 and return the RMSE of the best run.

    Runs are ranked by the last element of the ``em.run`` result; the mixture
    of the top-ranked run fills ``X`` and is scored against ``X_gold``.
    """
    runs = []
    for seed in range(5):
        print(seed)
        start_mixture, start_post = common.init(X, 12, seed)
        runs.append(em.run(X, start_mixture, start_post))
    best_mixture, _best_post, _best_ll = max(runs, key=lambda run: run[-1])
    completed = em.fill_matrix(X, best_mixture)
    return common.rmse(completed, X_gold)
def run_em(X):
    """Print, for K in {1, 12}, the highest log-likelihood over seeds 0-4.

    Args:
        X: data matrix passed to ``common.init`` and ``em.run``.

    Returns:
        The string ``"Done"``.
    """
    for K in [1, 12]:
        scores = []
        for seed in range(5):
            start_mixture, start_post = common.init(X, K, seed)
            _mixture, _post, score = em.run(X, start_mixture, start_post)
            scores.append(score)
        best_score = max(scores)
        print("The likelihood of {} cluster is".format(K), best_score)
    return "Done"
def test_em_seeds(X, K):
    """Run EM for seeds 0-4 with ``K`` components and print each likelihood.

    Args:
        X: data matrix passed to ``common.init`` and ``em.run``.
        K: number of mixture components.

    The five copy-pasted init/run/print stanzas are replaced by loops.  The
    call order is unchanged (all initialisations, then all EM runs, then all
    prints) and the printed text is identical.
    """
    print("\n############## EM K=" + str(K) + " ###############")
    seeds = range(5)
    starts = [common.init(X, K, seed) for seed in seeds]
    # Index 2 of the em.run result is the value reported as "likelihood".
    costs = [em.run(X, mixture, post)[2] for mixture, post in starts]
    for seed, cost in zip(seeds, costs):
        print("K=" + str(K) + " seed=" + str(seed)
              + " : likelihood=" + str(cost))
def run_em_netflix():
    """For K in {1, 12}, run EM with seeds 0-4 on ``X`` and print the best run.

    One line per K reports the seed with the highest log-likelihood and that
    log-likelihood.
    """
    for K in [1, 12]:
        top_ll, top_seed = None, None
        for seed in range(0, 5):
            start_mixture, start_post = common.init(X, K, seed)
            _mixture, _post, ll = em.run(X, start_mixture, start_post)
            is_better = top_ll is None or ll > top_ll
            if is_better:
                top_ll, top_seed = ll, seed
        print("EM for K={}, seed={}, ll={}".format(K, top_seed, top_ll))
def run_matrix_completion():
    """Fit a 12-component mixture with seed 1 on ``X`` and print the MAE.

    The fitted mixture is unpacked into its three fields and rebuilt as a
    ``common.GaussianMixture`` before filling ``X``; the result is scored
    against ``netflix_complete.txt`` with ``common.mae``.
    """
    n_components, init_seed = 12, 1
    start_mixture, start_post = common.init(X, n_components, init_seed)
    fitted, _post, _ll = em.run(X, start_mixture, start_post)
    mu, var, p = fitted
    rebuilt = common.GaussianMixture(mu, var, p)
    completed = em.fill_matrix(X, rebuilt)
    reference = np.loadtxt('netflix_complete.txt')
    print("MAE:", common.mae(reference, completed))
def run_em(X, plot=False):
    """Sweep the module-level ``K`` and ``seed`` lists with EM and report BIC.

    For every K, EM is run once per seed and the seed with the highest
    log-likelihood is remembered.  EM is then re-run from that seed to obtain
    the mixture used for ``common.bic``.  One summary line is printed per K;
    the "Max BIC" it shows is the running maximum over the K values so far.

    Args:
        X: data matrix passed to ``common.init`` and ``em.run``.
        plot: when true, every individual run is plotted with ``common.plot``.
    """
    top_bic = None
    for n_clusters in K:
        top_ll = None
        top_seed = None
        for current_seed in seed:
            mixture, post = common.init(X, n_clusters, current_seed)
            mixture, post, ll = em.run(X, mixture, post)
            if top_ll is None or ll > top_ll:
                top_ll, top_seed = ll, current_seed
            if plot:
                common.plot(X, mixture, post,
                            "K={}, seed={}".format(n_clusters, current_seed))

        # Re-fit from the winning seed to recover its mixture for the BIC.
        mixture, post = common.init(X, n_clusters, top_seed)
        mixture, post, ll = em.run(X, mixture, post)
        current_bic = common.bic(X, mixture, ll)
        if top_bic is None or current_bic > top_bic:
            top_bic = current_bic

        print("K = {}, Max ln(likelihood) = {}, Best seed = {}, Max BIC = {}".
              format(n_clusters, top_ll, top_seed, top_bic))
def train(self):
    """Train the model based on the provided data.

    Runs ``EM.run`` on ``self.trainingData`` starting from the current
    ``self.means``, ``self.cov`` and ``self.weights`` with ``self.K``
    components, then stores the first three returned values back into
    ``self.means``, ``self.cov`` and ``self.weights``.

    Raises:
        Exception: if ``self.means`` or ``self.cov`` is ``None``.
    """
    if self.verbose:
        # A single pre-formatted argument prints the same text under both
        # Python 2 and Python 3; the old `print a, b` statement was a
        # SyntaxError on Python 3.
        print("# TRAINING model %s" % (self.label,))

    if self.means is None or self.cov is None:
        err = "Gaussian Mixture Model should be init before trained"
        raise Exception(err)

    params = EM.run(self.trainingData, self.means, self.cov,
                    self.weights, self.K)
    self.means = params[0]
    self.cov = params[1]
    self.weights = params[2]
def run_EM_Netflix():
    """Run EM on the incomplete Netflix ratings matrix for K in {1, 12}.

    Seeds 0-4 are tried for each K; the seed with the highest log-likelihood
    and that log-likelihood are printed, one line per K.
    """
    for K in [1, 12]:
        top_ll, top_seed = None, None
        for seed in range(5):
            start_mixture, start_post = common.init(X, K, seed)
            _mixture, _post, ll = em.run(X, start_mixture, start_post)
            if top_ll is None or ll > top_ll:
                top_ll, top_seed = ll, seed
        summary = "EM for K = {}, seed = {}, ll = {}".format(
            K, top_seed, top_ll)
        print(summary)
import numpy as np
import em
import common

# Incomplete ratings matrix and the complete matrix it is scored against.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt("netflix_complete.txt")

K = 12

# One slot per seed (0-4); filled in by the loop below.
log_lh = [0] * 5
best_seed = 0
mixtures = [0] * 5
posts = [0] * 5
rmse = 0.

# Test all seeds
for i in range(5):
    fitted = em.run(X, *common.init(X, K, i))
    mixtures[i], posts[i], log_lh[i] = fitted

# The seed with the highest log-likelihood supplies the mixture for completion.
best_seed = np.argmax(log_lh)
Y = em.fill_matrix(X, mixtures[best_seed])
rmse = common.rmse(X_gold, Y)
print("RMSE for K = 12: {:.4f}".format(rmse))
em_mix, em_post, em_ll = naive_em.run(X, init_mix, init_post) if k_cost < k_best_cost: k_best_mix, k_best_post, k_best_cost = k_mix, k_post, k_cost if em_ll > em_best_ll: em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll BICs[i] = common.bic(X, em_best_mix, em_best_ll) common.plot(X, k_best_mix, k_best_post, "K-means K={}".format(K)) common.plot(X, em_best_mix, em_best_post, "EM K={}".format(K)) print("BICs: ", BICs) print("Best BIC: ", np.max(BICs)) print("Best K: ", Ks[np.argmax(BICs)]) X = np.loadtxt("netflix_incomplete.txt") K = 12 seeds = [0, 1, 2, 3, 4] em_best_mix, em_best_post, em_best_ll = None, None, -np.inf for seed in seeds: init_mix, init_post = common.init(X, K, seed) em_mix, em_post, em_ll = em.run(X, init_mix, init_post) if em_ll > em_best_ll: em_best_mix, em_best_post, em_best_ll = em_mix, em_post, em_ll print("K = {}, LL = {}".format(K, em_best_ll)) X_fill_pred = em.fill_matrix(X, em_best_mix) X_fill = np.load("netflix_complete") print("X_filled Error:", common.rmse(X_fill_pred, X_fill))
# Fitted mixtures, one slot per seed
mixtures = [0] * 5
# Posterior probabilities, one slot per seed
posts = [0] * 5
# RMS error, one slot per entry of K
rmse = [0., 0.]

start_time = time.perf_counter()

for k, n_clusters in enumerate(K):
    for i in range(5):
        # Run EM from seed i
        fitted = em.run(X, *common.init(X, n_clusters, i))
        mixtures[i], posts[i], log_lh[i] = fitted

    # Report the highest log-likelihood over the seeds
    print("=============== Clusters:", n_clusters, "======================")
    print("Highest log likelihood using EM is:", np.max(log_lh))

    # Remember which seed achieved it
    best_seed[k] = np.argmax(log_lh)

    # Use that seed's mixture to fill the prediction matrix
    X_pred = em.fill_matrix(X, mixtures[best_seed[k]])
    rmse[k] = common.rmse(X_gold, X_pred)

print("===================================================")
print("RMS Error for K = 12 is: {:.4f}".format(rmse[1]))

end_time = time.perf_counter()
import numpy as np
import em
import common

# Small test matrices: incomplete input and the complete reference.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")

K = 4
n, d = X.shape
seed = 0

# TODO: Your code here

# Single EM run on the test data, then fill and score the matrix.
initial_state = common.init(X, K, seed)
mix_conv, post_conv, log_lh_conv = em.run(X, *initial_state)
X_predict = em.fill_matrix(X, mix_conv)
rmse = common.rmse(X_gold, X_predict)

#%% Begin: Comparison of EM for matrix completion with K = 1 and 12
import time

X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt("netflix_complete.txt")

# Clusters to try
K = [1, 12]

# Log likelihoods for different seeds
log_lh = [0] * 5

# Best seed for cluster based on highest log likelihoods
best_seed = [0] * 2
# # mixture, post, cost = naive_em.run(X, mixture, post)
# # common.plot(X, mixture, post, f"EM for K={k}")

# EM for collaborative filtering (disabled seed sweep, kept for reference)
# X = np.loadtxt("netflix_incomplete.txt")
# k = [1, 12]
# best_seed = np.zeros(2, dtype=np.int)
# for j in range(2):
#     best_cost = -np.inf
#     for seed in range(0, 5):
#         mixture, post = common.init(X, k[j], seed)
#         mixture, post, cost = em.run(X, mixture, post)
#         # cost = common.bic(X, mixture, cost)
#         if cost > best_cost:
#             best_cost = cost
#             best_seed[j] = seed
#         # import pdb; pdb.set_trace()
#         print(f'Cost at k = {k[j]} with seed = {best_seed} is {best_cost}')
#     print(f'Best Cost at k = {k[j]} with seed = {best_seed} is {best_cost}')

# RMSE against the complete data for a single run (k = 12, seed = 1)
seed = 1
k = 12

X = np.loadtxt("netflix_incomplete.txt")

mixture, post = common.init(X, k, seed)
mixture, post, cost = em.run(X, mixture, post)

# Fill the missing entries with the fitted mixture, then score the result.
X_pred = em.fill_matrix(X, mixture)
X_gold = np.loadtxt("netflix_complete.txt")
rmse = common.rmse(X_pred, X_gold)

print(f'RMSE = {rmse}')
import numpy as np
import common
import em
from scipy.special import logsumexp

### Collaborative filtering with EM

# Input matrices loaded from disk.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")
X_test = np.loadtxt("test_incomplete.txt")
X_experiment = np.loadtxt("toy_data.txt")

# One EM run with K = 12 and seed = 1, then fill X and score it.
mixture, post = common.init(X, K=12, seed=1)
mixture, post, loglike = em.run(X, mixture, post)

X_pred = em.fill_matrix(X, mixture)

print(common.rmse(X_gold, X_pred))
print(mixture)

#print(em.fill_matrix(X_test

### get the best seed and the best k size that minimizes the cost
## Best seed
# Get the lowest cost
#optimal_seed_cost = em_total_likelihood_dict[0]
#for k, v in em_total_likelihood_dict.items():
#    if v > optimal_seed_cost:
#        optimal_seed_cost = v
#    else:
def run_matrix_completion():
    """Fit a 12-component mixture with seed 1 on ``X`` and print the RMSE.

    The matrix completed by ``em.fill_matrix`` is compared against the
    contents of ``netflix_complete.txt``.
    """
    start_mixture, start_post = common.init(X, 12, 1)
    fitted_mixture, _post, _ll = em.run(X, start_mixture, start_post)
    completed = em.fill_matrix(X, fitted_mixture)
    reference = np.loadtxt('netflix_complete.txt')
    error = common.rmse(reference, completed)
    print("root mean squared error:", error)
import numpy as np
import em
import common

# Small test matrices: incomplete input and the complete reference.
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")

K = 4
n, d = X.shape
seed = 0

# Initialise, run EM, and show the fitted mixture.
mixture, post = common.init(X, K, seed)
mixture, post, ln_like = em.run(X, mixture, post)

print(mixture)
# =============================================================================
# Reporting log likelihood values on Netflix data
# =============================================================================

X = np.loadtxt("netflix_incomplete.txt")

# One manual E-step followed by one M-step from the K=1, seed=0 start.
mixture, post = common.init(X, K=1, seed=0)
post, log_likelihood = em.estep(X, mixture)
mixtured = em.mstep(X, post, mixture)

Ks = [1, 12]
seeds = [0, 1, 2, 3, 4]

# Full EM run for every (K, seed) pair; one printed line per run.
for K in Ks:
    for seed in seeds:
        mixture, post = common.init(X, K=K, seed=seed)
        mixture, post, log_likelihood = em.run(X, mixture, post)
        print(K, seed, log_likelihood)

# =============================================================================
# Completing missing entries
# =============================================================================

X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")

mixture, post = common.init(X, K=4, seed=0)
mixture, post, log_likelihood = em.run(X, mixture, post)

# Fill the test matrix with the fitted mixture and score it.
X_pred = em.fill_matrix(X, mixture)
RMSE = common.rmse(X_gold, X_pred)
print(X_pred, RMSE)
# print("After first E-step:") post, ll = em.estep(X, mixture) # print('post:\n' + str(post)) # print('LL:' + str(ll)) # print() # print("After first M-step:") mu, var, p = em.mstep(X, post, mixture) # print('Mu:\n' + str(mu)) # print('Var: ' + str(var)) # print('P: ' + str(p)) # print() # print("After a run") (mu, var, p), post, ll = em.run(X, mixture, post) # print('Mu:\n' + str(mu)) # print('Var: ' + str(var)) # print('P: ' + str(p)) # print('post:\n' + str(post)) # print('LL: ' + str(ll)) X_pred = em.fill_matrix(X, common.GaussianMixture(mu, var, p)) # error = common.rmse(X_gold, X_pred) # print("X_gold:\n" + str(X_gold)) # X_pred = np.round(X_pred) fil = open( '/home/animesh/WTA/movie_recommendation/recommender/trainer/test_file.txt', 'w') fil.write(str(n) + ' ' + str(d) + '\n') for i in X_pred: for j in i:
cost_min = np.min([cost_min, cost]) print("K =", K+1, " cost =", cost_min) print() print("E-M") """ best_K = None best_bic = float('-inf') for K in [0, 11]: ll_max = float('-inf') best_seed = None best_mixture = None for seed in range(5): mixture, post = common.init(X, K + 1, seed) mixture, post, ll = em.run(X, mixture, post) full_matrix = em.fill_matrix(X, mixture) #common.plot(X, mixture, post, "E-M, K="+str(K)+" seed="+str(seed)) if ll > ll_max: best_seed = seed ll_max = ll best_mixture = mixture """ bic = common.bic(X, best_mixture, ll_max) if bic > best_bic: best_K = K+1 best_bic = bic """ print("K =", K + 1, " LL =", ll) #print("full_matrix =") #print(full_matrix[4,:])
import em
import common

X = np.loadtxt("netflix_incomplete.txt")
# NOTE(review): X is the Netflix matrix but the reference loaded here is the
# test-set file; X_gold is not used in the code below - confirm the intended file.
X_gold = np.loadtxt("test_complete.txt")

K = 12
n, d = X.shape
seed = 0

# TODO: Your code here

# loglikelihoods[a][b] holds the em.run log-likelihood for the a-th entry of
# [1, 12] and seed b.
loglikelihoods = []
for k in [1, 12]:
    log_likelihood_ = []
    for seed in range(5):
        gauss_mixture, post = common.init(X=X, K=k, seed=seed)
        gauss_mixture_em, post_em, loglikelihood = em.run(
            X, gauss_mixture, post)
        log_likelihood_.append(loglikelihood)
    loglikelihoods.append(log_likelihood_)
import numpy as np
import em
import common

# X = np.loadtxt("test_incomplete.txt")
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt("netflix_complete.txt")

K = 12
n, d = X.shape

# seeds = [0,1,2,3,4]
seeds = [1]

for seed in seeds:
    mixture, post = common.init(X, K, seed)

    # Fit with EM, complete the matrix, and score it against the reference.
    em_mixture, em_post, em_cost = em.run(X, mixture, post)
    X_pred = em.fill_matrix(X, em_mixture)
    rmse = common.rmse(X_gold, X_pred)
    print(f'RMSE is {rmse}')

    title = f"K is {K}, seed is {seed}, em_cost is {em_cost}"
    print(title)

# TODO: Your code here
import em
import common

# Test matrices and Netflix matrices (incomplete input + complete reference).
X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")
X_gold_netflix = np.loadtxt("netflix_complete.txt")
X_netflix = np.loadtxt("netflix_incomplete.txt")

K = 12
n, d = X.shape
seed = [0, 1, 2, 3, 4]

# TODO: Your code here

# One EM run per seed on the Netflix data; print seed, cost and RMSE.
for current_seed in seed:
    print(current_seed)
    init_model = common.init(X_netflix, K, current_seed)
    start_mixture, start_post = init_model[0], init_model[1]
    mixture, post, cost = em.run(X_netflix, start_mixture, start_post)
    X_pred = em.fill_matrix(X_netflix, mixture)
    rmse = common.rmse(X_gold_netflix, X_pred)
    print(cost)
    print(rmse)

# K= 4
# n,d = X.shape
# seed =0
# init_model = common.init(X, K, seed)
# mixture, post, cost = em.run(X, init_model[0], init_model[1])
# # print(mixture)
# X_pred = em.fill_matrix(X,mixture)
# print(X_pred)
seeds = [0, 1, 2, 3, 4]
K = [1, 12]

# One BIC value per entry of K.
bic = np.zeros(len(K))

for j, k in enumerate(K):
    mixtures, posts = [], []
    seed_logloss = np.empty(len(seeds))

    for i, seed in enumerate(seeds):
        # initialize mixture model with random points
        mixture, post = common.init(X, K=k, seed=seed)

        # run EM-algorithm
        mixture, post, LL = em.run(X, mixture=mixture, post=post)

        mixtures.append(mixture)
        posts.append(post)
        seed_logloss[i] = LL
        print('K=', k, 'seed=', seed, 'logloss=', LL)

    # best_seed is the position of the highest value, not the seed value itself.
    best_seed = np.argmax(seed_logloss)
    logloss = seed_logloss[best_seed]
    mixture = mixtures[best_seed]
    post = posts[best_seed]

    current_bic = common.bic(X, mixture, logloss)
    bic[j] = current_bic

    print(f'K={k}', f'Best seed={best_seed}', f'logloss={logloss}',
          f'BIC={current_bic}')
#         gaussian, post, new_ll = kmeans.run(X, gaussian, post)
#         common.plot(X, gaussian, post, "K-means: number of classes{}, random seed {}".format(k, i))
#
# for k in range(1, 5, 1):
#     for i in range(1):
#         gaussian, post = common.init(X, k, seed=i)
#         gaussian, post, new_ll = naive_em.run(X, gaussian, post)
#         common.plot(X, gaussian, post, "EM: number of classes{}, random seed {}".format(k, i))

# Incomplete ratings matrix and the complete matrix it is scored against.
X = np.loadtxt("netflix_incomplete.txt")
X_gold = np.loadtxt('netflix_complete.txt')

# for k in [1, 12]:
#     for i in range(5):
#         gaussian, post = common.init(X, k, seed=i)
#         gaussian, post, new_ll = em.run(X, gaussian, post)
#         print("EM: number of classes {}, random seed {}:".format(k, i))
#         print(new_ll)

# Single EM run with 12 components and seed 1; print the completion RMSE.
gaussian, post = common.init(X, 12, seed=1)
gaussian, post, new_ll = em.run(X, gaussian, post)

X_pred = em.fill_matrix(X, gaussian)
print(common.rmse(X_gold, X_pred))

# for k in range(1, 5, 1):
#     for i in range(5):
#         gaussian, post = common.init(X, k, seed=i)
#         gaussian, post, new_ll = naive_em.run(X, gaussian, post)
#         print("BIC = {} for K = {} and seed = {}".format(common.bic(X, gaussian, new_ll), k, i))
#
#