def main():
    """Sweep the latent dimension K and plot in/out-of-sample error.

    Loads the train/test rating triples from ``data/``, trains one model per
    value of K with regularization disabled, and saves the resulting
    error-vs-K curves to ``2d.png``.
    """
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    # Column 0 holds user ids and column 1 movie ids; the largest id seen in
    # either split is used as the matrix dimension.
    M = max(Y_train[:, 0].max(), Y_test[:, 0].max()).astype(int)  # users
    N = max(Y_train[:, 1].max(), Y_test[:, 1].max()).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    Ks = [10, 20, 30, 50, 100]
    reg = 0.0
    eta = 0.03  # learning rate

    # One training error and one test error per K, in the order of Ks.
    E_in, E_out = [], []
    for K in Ks:
        U, V, err = train_model(M, N, K, eta, reg, Y_train)
        E_in.append(err)
        E_out.append(get_err(U, V, Y_test))

    for errors, label in ((E_in, '$E_{in}$'), (E_out, '$E_{out}$')):
        plt.plot(Ks, errors, label=label)
    plt.title('Error vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2d.png')
def cross_validate(Y_train, Y_test, regs, etas):
    '''
    Grid-search the model over regularization strengths and step sizes.

    For every (reg, eta) pair a model is trained on Y_train, then its
    unregularized error is measured on both Y_train and Y_test and printed
    as one comma-separated line.
    '''
    print('training size =', len(Y_train))
    print('testing size =', len(Y_test))
    print()

    for reg in regs:
        for eta in etas:
            U, V, a, b, _ = train(Y_train, reg, eta, Y_test=Y_test,
                                  zero_mean=False, save=False)
            # reg=0 so the reported numbers exclude the penalty term.
            errIn = get_err(U, V, a, b, Y_train, reg=0)
            errOut = get_err(U, V, a, b, Y_test, reg=0)

            fields = (
                'errOut = {:.6f}'.format(errOut),
                'reg = {:.5f}'.format(reg),
                'eta = {:.4f}'.format(eta),
                'errIn = {:.6f}'.format(errIn),
            )
            print(', '.join(fields))
def main():
    """Sweep regularization and epsilon, plotting E_in and E_out for each.

    Loads the train/test rating triples from ``data/``, trains one model for
    every (reg, epsilon) combination at fixed k and learning rate, and saves
    one curve per regularization value to ``E_in.png`` and ``E_out.png``
    (log-scaled epsilon axis).
    """
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    # Column 0 holds user ids and column 1 movie ids; the largest id seen in
    # either split is used as the matrix dimension.
    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies

    k = 20
    # regularization constants
    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    # learning rate
    eta = 0.01
    # Passed as the last argument of train_model (presumably its stopping
    # threshold -- confirm against train_model). 0.00005 was noted as best.
    epsilons = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.002]

    # E_ins[i][j] / E_outs[i][j] is the error for regs[i] and epsilons[j].
    E_ins = []
    E_outs = []
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []
        for ep in epsilons:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s, ep = %s"
                % (M, N, k, eta, reg, ep))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train, ep)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))
        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # Raw strings: '\l' is not a valid escape sequence, so the non-raw form
    # triggers a warning on current Python versions. The resulting text is
    # unchanged.
    for reg, errors in zip(regs, E_ins):
        plt.plot(epsilons, errors, label=r'$E_{in}, \lambda=$' + str(reg))
    plt.title('$E_{in}$ vs. Epsilon')
    plt.xlabel('Epsilon')
    plt.ylabel('Error')
    plt.xscale('log')
    plt.legend()
    plt.savefig('E_in.png')
    plt.clf()

    for reg, errors in zip(regs, E_outs):
        plt.plot(epsilons, errors, label=r'$E_{out}, \lambda=$' + str(reg))
    plt.title('$E_{out}$ vs. Epsilon')
    plt.xlabel('Epsilon')
    plt.ylabel('Error')
    plt.xscale('log')
    plt.legend()
    plt.savefig('E_out.png')
def main():
    """Sweep regularization and latent dimension K, plotting E_in and E_out.

    Loads the train/test rating triples from the working directory, trains
    one model for every (reg, k) combination at a fixed learning rate, and
    saves one curve per regularization value to ``2e_ein.png`` and
    ``2e_eout.png``.
    """
    Y_train = np.loadtxt('train.txt').astype(int)
    Y_test = np.loadtxt('test.txt').astype(int)

    # Column 0 holds user ids and column 1 movie ids; the largest id seen in
    # either split is used as the matrix dimension.
    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies

    Ks = [10, 20, 30, 50, 100]
    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    eta = 0.03  # learning rate

    # E_ins[i][j] / E_outs[i][j] is the error for regs[i] and Ks[j].
    E_ins = []
    E_outs = []
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []
        for k in Ks:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s"
                % (M, N, k, eta, reg))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))
        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # Plot values of E_in across k for each value of lambda.
    # Raw strings: '\l' is not a valid escape sequence, so the non-raw form
    # triggers a warning on current Python versions. The resulting text is
    # unchanged.
    for reg, errors in zip(regs, E_ins):
        plt.plot(Ks, errors, label=r'$E_{in}, \lambda=$' + str(reg))
    plt.title('$E_{in}$ vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2e_ein.png')
    plt.clf()

    # Plot values of E_out across k for each value of lambda.
    for reg, errors in zip(regs, E_outs):
        plt.plot(Ks, errors, label=r'$E_{out}, \lambda=$' + str(reg))
    plt.title('$E_{out}$ vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2e_eout.png')
def RunModel1(M, N, k, eta, reg, Y_train, Y_test, GraphFlag=True):
    """Train model 1, report its errors, and optionally plot 2-D projections.

    The model is trained with ``model_1.train_model`` on Y_train and scored
    on Y_test with ``model_1.get_err``. Unless GraphFlag is exactly ``False``,
    the factors are projected with ``project_to_2D`` and one figure is drawn
    via ``visualize`` for every (ids, category) pair in the module-level
    ``to_plot``.

    Returns:
        The tuple ``(e_in, e_out)`` of training and test error.
    """
    print("Training model 1 with M = %s, N = %s, k = %s, eta = %s, reg = %s"
          % (M, N, k, eta, reg))
    user_factors, movie_factors, train_err = model_1.train_model(
        M, N, k, eta, reg, Y_train)
    test_err = model_1.get_err(user_factors, movie_factors, Y_test)
    print("model 1 results: e_in = %.3f, e_out = %.3f" % (train_err, test_err))

    # Identity check on purpose: only the literal False disables plotting.
    if GraphFlag is not False:
        # Transform model 1 to 2D; only the second returned projection is
        # plotted.
        _, movie_proj = project_to_2D(user_factors, movie_factors)
        for ids, category in to_plot:
            visualize(movie_proj, ids, 'Model 1: ' + category)

    return train_err, test_err
def main():
    """Train the model with bias terms and print its test error.

    Loads the train/test rating triples from ``data/``, trains once for each
    K in ``Ks`` (currently a single value) and prints the error that
    ``get_err`` reports on the test split.
    """
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    # Column 0 holds user ids and column 1 movie ids; the largest id seen in
    # either split is used as the matrix dimension.
    M = max(Y_train[:, 0].max(), Y_test[:, 0].max()).astype(int)  # users
    N = max(Y_train[:, 1].max(), Y_test[:, 1].max()).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    Ks = [20]
    reg = 0.1
    eta = 0.01  # learning rate
    epsilon = 0.00005

    for K in Ks:
        # The training error (last returned value) is not reported here.
        U, V, a, b, _ = train_model(M, N, K, eta, reg, Y_train, epsilon)
        print(get_err(U, V, a, b, Y_test))
def main():
    """Train a factorization model and plot 2-D projections of the movies.

    Loads movie metadata and ratings from ``../data/``, trains a model with
    k = 20 and no regularization, projects the movie factors onto their top
    two left singular directions, and saves four scatter plots:

    * ``Standard-choice.png``  - ten hand-picked movies (rows 2..11)
    * ``Standard-popular.png`` - the ten movies with the most ratings
    * ``Standard-best.png``    - the ten movies with the highest mean rating
    * ``Standard-genres.png``  - ten movies from each of three genres

    Movie ids are 1-based while rows of the projection and of the metadata
    are 0-based, so movie id ``m`` lives in row ``m - 1``.
    """
    # Columns kept from movies.txt: id, title, and three genre flags that are
    # treated below as action, comedy and romance.
    movie_info = np.loadtxt('../data/movies.txt', dtype="str", delimiter="\t",
                            usecols=(0, 1, 3, 7, 16))
    data = np.loadtxt('../data/data.txt').astype(int)
    Y_train = np.loadtxt('../data/train.txt').astype(int)
    Y_test = np.loadtxt('../data/test.txt').astype(int)
    print(movie_info)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.0
    eta = 0.03  # learning rate
    k = 20

    U, V, err = train_model(M, N, k, eta, reg, Y_train)
    e_out = get_err(U, V, Y_test)
    print("e_in", err)
    print("e_out", e_out)

    # Assumes V has one column per movie, i.e. shape (k, N) -- the loops
    # below treat each row of the projection as one movie.
    a, _, _ = np.linalg.svd(V)
    print(V.shape, a.shape)
    # The left singular vectors are the COLUMNS of `a`, so the two leading
    # directions are a[:, :2]; slicing rows (a[:2]) would project onto
    # arbitrary mixtures of all directions.
    v_proj = np.dot(a[:, :2].T, V).T
    print(v_proj.shape)
    x = v_proj[:, 0]
    y = v_proj[:, 1]
    n_rows = len(v_proj)

    # Group every rating in the full data set by movie id.
    ratings = {}
    for _user, movie_id, rating in data:
        ratings.setdefault(movie_id, []).append(rating)

    # Setup
    ids = movie_info[:, 0].astype(int)
    movie_names = movie_info[:, 1]

    # 1. 10 movies of our choice from the MovieLens dataset
    chosen = slice(2, 12)
    plt.scatter(x[chosen], y[chosen])
    texts = [plt.text(px, py, name)
             for px, py, name in zip(x[chosen], y[chosen], movie_names[chosen])]
    adjust_text(texts)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Movies of Our Choice')
    plt.savefig('Standard-choice.png')
    plt.clf()

    # 2. All ratings of the ten most popular movies
    max_10 = dict(
        sorted(ratings.items(), key=lambda r: len(r[1]), reverse=True)[:10])
    top_ratings = max_10.keys()
    print(top_ratings)
    # Row index is movie id - 1; the same row index is used for the title so
    # the label belongs to the plotted point.
    pop_rows = [row for row in range(n_rows) if row + 1 in top_ratings]
    x_pop = [x[row] for row in pop_rows]
    y_pop = [y[row] for row in pop_rows]
    movie_title = [movie_names[row] for row in pop_rows]
    print(movie_title)
    plt.scatter(x_pop, y_pop)
    texts = [plt.text(px, py, name)
             for px, py, name in zip(x_pop, y_pop, movie_title)]
    adjust_text(texts)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Most Popular Movies')
    plt.savefig('Standard-popular.png')
    plt.clf()

    # 3. All ratings of the ten best movies
    best_10 = dict(
        sorted(ratings.items(),
               key=lambda r: sum(r[1]) / len(r[1]),
               reverse=True)[:10])
    best = best_10.keys()
    print(best)
    best_rows = [row for row in range(n_rows) if row + 1 in best]
    x_best = [x[row] for row in best_rows]
    y_best = [y[row] for row in best_rows]
    # Label with the best movies' own titles, not the popular-movie titles
    # left over from the previous plot.
    for row, px, py in zip(best_rows, x_best, y_best):
        plt.annotate(movie_names[row], (px, py))
    plt.scatter(x_best, y_best)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Best Movies')
    plt.savefig('Standard-best.png')
    plt.clf()

    # 4. 10 ratings of movies from three genres of your choice
    # (label, column in movie_info, scatter colour); None keeps matplotlib's
    # default colour for the first series.
    genres = (
        ("Action", 2, None),
        ("Comedy", 3, 'orange'),
        ("Romance", 4, 'green'),
    )
    for label, column, color in genres:
        flags = movie_info[:, column].astype(int)
        # Keep only rated movies; looking unrated ids up in `ratings` would
        # raise KeyError.
        genre_ids = {movie_id for movie_id, flag in zip(ids, flags)
                     if flag == 1 and movie_id in ratings}
        genre_rows = [row for row in range(n_rows) if row + 1 in genre_ids]
        shown = genre_rows[2:12]
        plt.scatter([x[row] for row in shown], [y[row] for row in shown],
                    color=color, label=label)
    plt.legend()
    plt.title("Three Genres")
    plt.savefig('Standard-genres.png')
    plt.clf()