def worker(fold, n_users, n_items, dataset_dir):
    traFilePath = dataset_dir + 'ratings__' + str(fold + 1) + '_tra.txt'
    trasR = loadSparseR(n_users, n_items, traFilePath)
    print(dataset_dir.split('/')[-2] + ':', trasR.shape, trasR.nnz,
          '%.2f' % (trasR.nnz / float(trasR.shape[0])))
    tra_tuple = np.array([(user, item, trasR[user, item])
                          for user, item in np.asarray(trasR.nonzero()).T])  # triad

    tstFilePath = dataset_dir + 'ratings__' + str(fold + 1) + '_tst.txt'
    tstsR = loadSparseR(n_users, n_items, tstFilePath)
    tst_tuple = np.array([(user, item, tstsR[user, item])
                          for user, item in np.asarray(tstsR.nonzero()).T])  # triad

    sampler = Sampler(trasR=trasR, negRatio=.0, batch_size=batch_size)

    mf = MF(n_users, n_items, eval_metrics, range_of_ratings, reg, n_factors, batch_size)
    scores = mf.train(fold + 1, tra_tuple, tst_tuple, sampler)

    print('fold=%d:' % fold,
          ','.join(['%s' % eval_metric for eval_metric in eval_metrics]), '=',
          ','.join(['%.6f' % score for score in scores]))
    return scores
import pickle


def train():
    print('Preprocessing raw data')
    preprocessor = Preprocessor()
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)

    print('Training MF')
    mf = MF(preprocessor, dataset)
    mf.train_or_load_if_exists()

    print('Building I2I')
    i2i = Item2Item(dataset)

    print('Generating candidates')
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    X_train, y_train, q_train, q_train_reader = candidate_generator.generate_train()
    X_val, y_val, q_val, q_val_reader = candidate_generator.generate_val()

    try:
        with open('puke.pkl', 'wb') as f:
            pickle.dump((X_train, y_train, q_train, q_train_reader,
                         X_val, y_val, q_val, q_val_reader), f)
    except Exception:
        print("Couldn't save puke")

    print('Training ranker')
    ranker = Ranker()
    ranker.train(X_train, y_train, q_train, X_val, y_val, q_val)
    ranker.save()

    print('Validating ranker')
    rank_scores = ranker.rank(X_val)
    print('ndcg', dataset.validate_ndcg(y_val, q_val, q_val_reader, rank_scores))
def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except Exception:
        print("Couldn't save submit_puke")
    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)

    base = 0
    entire_articles = []
    not_heavy_items = set(range(1, article_count + 1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50
    random.seed(0)
    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base + group_size]
            scores = rank_scores[base:base + group_size]
            # Sort this reader's candidates by ranker score, highest first, and keep the top `cut`
            articles = [a for _, a in sorted(zip(scores, articles), key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]

            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)

            # Pad with random non-heavy items up to 100 recommendations
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)

            entire_articles.extend(articles)
            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)
            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))
            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))
def train(reg):
    logdir = 'logs/mf/numpy'
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(n_samples=1000000)
    print("number of users with ratings: {}".format(len(np.unique(rating_indices[:, 0]))))
    print("number of movies with ratings: {}".format(len(np.unique(rating_indices[:, 1]))))
    method = 'als'
    if reg:
        print("Performing cross validation with reg: {}.".format(reg))
    else:
        print("Finding optimal regularization penalty.")
    reg_vals = [0.01, 0.1, 1, 10]
    best_reg = 0
    mean_loss = 0.0
    n_splits = 5
    n_features = 15
    loss_path = np.zeros((len(reg_vals), n_splits))
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)
    for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
        print("Fold {}".format(k))
        train_indices, test_indices = rating_indices[train_index], rating_indices[test_index]
        train_indices = (train_indices[:, 0], train_indices[:, 1], train_indices[:, 2])
        test_indices = (test_indices[:, 0], test_indices[:, 1], test_indices[:, 2])
        if reg:
            start = time.time()
            model = MF(n_users, n_items, n_features, method=method)
            model.fit(train_indices, verbose=1)
            acc, loss = model.predict(test_indices)
            print("val_loss: {:.4f} - val_acc: {:.4f}".format(loss, acc))
            mean_loss = (mean_loss * k + loss) / (k + 1)
        else:
            # Use a distinct loop variable: the original reused `reg` here,
            # which shadowed the argument and broke the final `if reg:` check.
            for i, reg_val in enumerate(reg_vals):
                print("lambda: {}".format(reg_val))
                start = time.time()
                model = MF(n_users, n_items, n_features, method=method)
                model.fit(train_indices, verbose=1)
                acc, loss = model.predict(test_indices)
                print("val_loss: {:.4f} - val_acc: {:.4f}".format(loss, acc))
                loss_path[i, k] = loss
    if reg:
        print("mean loss: {:.4f}".format(mean_loss))
    else:
        loss_means = np.mean(loss_path, axis=1)
        print(loss_means)
        best_reg = reg_vals[np.argmin(loss_means)]
        best_loss = np.amin(loss_means)
        print("best lambda: {} - loss: {}".format(best_reg, best_loss))
    print("Successfully finished training MF. See logs directory.")
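# The method='als' above refers to alternating least squares. For reference,
# a minimal sketch of one ALS sweep under the usual regularized least-squares
# formulation -- an assumption about what MF.fit does internally; the names
# als_sweep, U, V, and lam are illustrative, not from the source.
import numpy as np

def als_sweep(R, U, V, lam):
    """One ALS sweep: solve for U with V fixed, then for V with U fixed.

    R is a dense (n_users, n_items) matrix with 0 marking missing ratings.
    Each user row solves (Vo^T Vo + lam*I) u = Vo^T r_o over observed items o.
    """
    k = U.shape[1]
    for u in range(U.shape[0]):
        o = R[u, :] > 0  # items observed for user u
        if o.any():
            Vo = V[o]
            U[u] = np.linalg.solve(Vo.T @ Vo + lam * np.eye(k), Vo.T @ R[u, o])
    for i in range(V.shape[0]):
        o = R[:, i] > 0  # users who rated item i
        if o.any():
            Uo = U[o]
            V[i] = np.linalg.solve(Uo.T @ Uo + lam * np.eye(k), Uo.T @ R[o, i])
    return U, V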
def train_and_save(data, save_to='db'):
    start = time.time()
    print("> Training the NMF model over", data.shape, "items")
    mf = MF(data, K=20, alpha=0.001, beta=0.01, iterations=800)
    mf.train()
    saved_model = mf.full_matrix()
    end = time.time()
    print("> Elapsed Time to Train = ", end - start)
    if save_to == 'pickle':
        np.save('NMF', saved_model)
    elif save_to == 'db':
        savetodb(saved_model)
    return 0
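# savetodb is defined elsewhere in this project. Purely as an illustration, a
# hypothetical version that persists the dense prediction matrix to SQLite --
# the database path, table name, and schema here are invented for the sketch,
# not taken from the source.
import sqlite3

def savetodb(matrix, db_path='predictions.db'):
    # Store each (user, item, score) cell of the predicted matrix
    conn = sqlite3.connect(db_path)
    conn.execute('CREATE TABLE IF NOT EXISTS predictions '
                 '(user INTEGER, item INTEGER, score REAL)')
    conn.execute('DELETE FROM predictions')
    rows = [(u, i, float(matrix[u, i]))
            for u in range(matrix.shape[0])
            for i in range(matrix.shape[1])]
    conn.executemany('INSERT INTO predictions VALUES (?, ?, ?)', rows)
    conn.commit()
    conn.close()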
def matrix_factorization():
    prefix = 'Data/'

    # ------------------------------- Learning ------------------------------- #
    # Load training data
    training_user_movie_pairs = base.load_from_csv(
        os.path.join(prefix, 'data_train.csv'))
    training_labels = base.load_from_csv(
        os.path.join(prefix, 'output_train.csv'))

    # Concatenating data
    user_movie_rating_triplets = np.hstack(
        (training_user_movie_pairs, training_labels.reshape((-1, 1))))

    # Build the learning matrix
    rating_matrix = base.build_rating_matrix(user_movie_rating_triplets)

    # Build the model
    model = MF(rating_matrix, K=30, alpha=1e-5, beta=0.02, iterations=2000)

    with base.measure_time('Training'):
        print('Training matrix factorization...')
        model.train()

    # Save the predicted matrix
    predicted_matrix = np.matrix(model.full_matrix())
    with open('predicted_matrix.txt', 'wb') as f:
        for line in predicted_matrix:
            np.savetxt(f, line, fmt='%.5f')

    # ----------------- Submission: Running model on provided test_set ----------------- #
    df = pd.read_csv("Data/data_test.csv")
    R = pd.read_csv('predicted_matrix.txt', sep=" ", header=None)
    R = R.values

    users = df['user_id'].values
    movies = df['movie_id'].values
    ratings = []
    for u, m in zip(users, movies):
        if R[u - 1][m - 1] > 5.00:
            ratings.append(5.00)
        else:
            ratings.append(R[u - 1][m - 1])

    fname = base.make_submission(ratings, df.values.squeeze(), 'MatrixFactorization')
    print('Submission file "{}" successfully written'.format(fname))
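# Equivalent to the clipping loop above: with numpy fancy indexing, the
# per-pair lookup and the cap at 5.0 collapse to one line -- a sketch reusing
# R, users, and movies from the snippet; note the source clips only the upper
# bound, which this preserves.
ratings = np.minimum(R[users - 1, movies - 1], 5.00).tolist()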
def __init__(self):
    # load data
    print('start load data')
    data = pd.read_csv('data/movielens/user_ratedmovies.dat', delimiter='\t')
    item_info = pd.read_csv('data/movielens/movies.dat', delimiter='\t')
    self.itemid2name = dict(zip(item_info['id'].tolist(), item_info['title'].tolist()))

    N = len(set(data['userID'].tolist()))   # number of users
    M = len(set(data['movieID'].tolist()))  # number of movies
    rating_matrix = np.zeros([N, M])

    userid2index = {}
    itemid2index = {}
    userid2itemindexes = {}
    for i, row in data.iterrows():
        userid = row['userID']
        itemid = row['movieID']
        rating = row['rating']
        if userid in userid2index:
            userindex = userid2index[userid]
            userid2itemindexes[userid].append(itemid)
        else:
            userindex = len(userid2index)
            userid2index[userid] = userindex
            userid2itemindexes[userid] = [itemid]
        if itemid in itemid2index:
            itemindex = itemid2index[itemid]
        else:
            itemindex = len(itemid2index)
            itemid2index[itemid] = itemindex
        rating_matrix[userindex, itemindex] = rating

    self.userid2itemindexes = userid2itemindexes
    self.userid2index = userid2index
    self.itemid2index = itemid2index
    self.index2userid = {y: x for x, y in userid2index.items()}
    self.index2itemid = {y: x for x, y in itemid2index.items()}

    # np.nonzero returns (row_indices, col_indices); the original named them backwards
    nonzero_row, nonzero_col = rating_matrix.nonzero()
    inds = list(zip(nonzero_row.tolist(), nonzero_col.tolist()))
    print('finish load data')

    K = 10
    alpha = 0.0001
    lam = 0.01
    self.mf = MF(rating_matrix, inds, K, alpha, lam)
    self.is_training = False
    self.losses = []
    self.epochs = []
def doTrain(K, alpha, beta, gamma, iterations, maxError):
    print('>>> K=' + str(K) + ', alpha=' + str(alpha) + ', beta=' + str(beta) +
          ', gamma=' + str(gamma) + ', iterations=' + str(iterations) +
          ', maxError=' + str(maxError))
    inCsv = 'ml-latest-small/ratings.csv'
    inTrainCsv = 'ml-latest-small/trainRatings.csv'  # training subset (misnamed inTestCsv in the original)
    outModel = 'trainedModel.pkl'

    ratings = readCsv(inCsv)
    trainSubset = readCsv(inTrainCsv)
    maxUserId, maxMovieId = getMaxIds(ratings)
    R = getRatingsMatrix(trainSubset, maxUserId, maxMovieId)

    mf = MF(R, K, alpha, beta, gamma, iterations, maxError)
    print('Training...')
    training_process = mf.train()
    print('Done. Mse = ' + str(mf.get_mse()))

    print('Serializing model to ' + outModel)
    with open(outModel, 'wb') as output:
        pickle.dump(mf, output, pickle.HIGHEST_PROTOCOL)
    print('Done serializing model to ' + outModel)
def run(path_str, comb='', K=10):
    use_topK = path_str not in ['ratings_only']

    sim_filename = dir_ + 'sim_res/path_count/%s.res' % path_str
    if path_str == 'ratings_only':
        sim_filename = dir_ + 'ratings.txt'
    if use_topK:
        sim_filename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, topK)
    if comb:
        sim_filename = dir_ + 'sim_res/path_count/combs/%s_%s_top%s.res' % (path_str, comb, topK)

    start_time = time.time()
    data = np.loadtxt(sim_filename)
    uids = set(data[:, 0].flatten())
    bids = set(data[:, 1].flatten())
    uid2ind = {int(v): k for k, v in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)
    bid2ind = {int(v): k for k, v in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)
    data[:, 0] = [uid2ind[int(r)] for r in data[:, 0]]
    data[:, 1] = [bid2ind[int(r)] for r in data[:, 1]]
    print('finish load data from %s, cost %.2f seconds, users: %s, items=%s' %
          (sim_filename, time.time() - start_time, len(uids), len(bids)))

    eps, lamb, iters = 10, 10, 500
    print('start generate mf features, (K, eps, reg, iters) = (%s, %s, %s, %s)' %
          (K, eps, lamb, iters))
    mf = MF(data=data, train_data=data, test_data=[], K=K, eps=eps, lamb=lamb,
            max_iter=iters, call_logger=logger)
    U, V = mf.run()

    rank_dir = dir_ + 'mf_features/path_count/ranks/%s/' % K
    if K != 10 and not os.path.isdir(rank_dir):
        os.makedirs(rank_dir)

    start_time = time.time()
    if use_topK:
        wfilename = dir_ + 'mf_features/path_count/%s_top%s_user.dat' % (path_str, topK)
    else:
        wfilename = dir_ + 'mf_features/path_count/%s_user.dat' % path_str
    fw = open(wfilename, 'w+')
    res = []
    for ind, fs in enumerate(U):
        row = [ind2uid[ind]]
        row.extend(fs.flatten())
        res.append('\t'.join([str(t) for t in row]))
    fw.write('\n'.join(res))
    fw.close()
    print('User-Features: %s saved in %s, cost %.2f seconds' %
          (U.shape, wfilename, time.time() - start_time))

    start_time = time.time()
    if use_topK:
        wfilename = dir_ + 'mf_features/path_count/%s_top%s_item.dat' % (path_str, topK)
    else:
        wfilename = dir_ + 'mf_features/path_count/%s_item.dat' % path_str
    fw = open(wfilename, 'w+')
    res = []
    for ind, fs in enumerate(V):
        row = [ind2bid[ind]]
        row.extend(fs.flatten())
        res.append('\t'.join([str(t) for t in row]))
    fw.write('\n'.join(res))
    fw.close()
    print('Item-Features: %s saved in %s, cost %.2f seconds' %
          (V.shape, wfilename, time.time() - start_time))
import numpy as np
from mf import MF

# A rating matrix with ratings from 5 users on 4 items;
# zero entries are unknown values
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

# Perform training and obtain the user and item matrices
mf = MF(R, K=2, alpha=0.1, beta=0.01, iterations=20)
training_process = mf.train()
print(mf.P)
print(mf.Q)
print(mf.full_matrix())
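# Several snippets here share the constructor signature MF(R, K, alpha, beta,
# iterations) together with train(), full_matrix(), and factor matrices P and
# Q. For reference, a minimal sketch consistent with that interface, assuming
# plain SGD with L2 regularization -- an illustration of the technique, not
# necessarily the mf module imported above.
import numpy as np

class MF:
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R                  # (n_users, n_items); 0 marks unknown
        self.K = K                  # number of latent factors
        self.alpha = alpha          # SGD learning rate
        self.beta = beta            # L2 regularization strength
        self.iterations = iterations
        n_users, n_items = R.shape
        self.P = np.random.normal(scale=1. / K, size=(n_users, K))
        self.Q = np.random.normal(scale=1. / K, size=(n_items, K))

    def train(self):
        # Observed (user, item, rating) triples
        samples = [(u, i, self.R[u, i]) for u, i in zip(*self.R.nonzero())]
        history = []
        for it in range(self.iterations):
            np.random.shuffle(samples)
            for u, i, r in samples:
                e = r - self.P[u] @ self.Q[i]
                p_u = self.P[u].copy()  # keep the pre-update value for the Q step
                self.P[u] += self.alpha * (e * self.Q[i] - self.beta * self.P[u])
                self.Q[i] += self.alpha * (e * p_u - self.beta * self.Q[i])
            mask = self.R > 0
            mse = np.mean((self.R[mask] - self.full_matrix()[mask]) ** 2)
            history.append((it, mse))  # (iteration, mse) pairs, as plotted below
        return history

    def full_matrix(self):
        return self.P @ self.Q.T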
# (the opening of this pd.read_csv call was truncated in the source; the path
# below is an assumption -- ML-1M's users.dat matches these columns)
users = pd.read_csv('ml-1m/users.dat',
                    sep='::', engine='python', encoding='latin-1',
                    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'])
# users['age_desc'] = users['age'].apply(lambda x: AGES[x])
# users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])
# print(len(users), 'descriptions of', max_userid, 'users loaded.')

user_train = users.to_numpy()  # .as_matrix() was removed in pandas 1.0
all_users = user_train[:, :4]
# Encode gender as a binary feature
all_users[all_users[:, 1] == 'M', 1] = 1
all_users[all_users[:, 1] == 'F', 1] = 0
print(all_users)

rs = MF(rate_train, K=100, lam=.1, print_every=10, learning_rate=0.75,
        max_iter=100, user_based=1)
# print("X0:\n", rs.X)
# print("rate_test:\n", rate_test)
# in_file = open("MF.obj", "rb")  # opening for [r]eading as [b]inary
# rs = pickle.load(in_file)  # if you only wanted to read 512 bytes, do .read(512)
# in_file.close()
# print(type(rs))
rs.fit()

with open('MF_1m.obj', 'wb') as file_mf:
    pickle.dump(rs, file_mf)
print(type(rs))
# print("X1:\n", rs.X)
print("utility:\n", rs.X.dot(rs.W) + rs.mu)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from mf import MF

R1 = np.array(acq_data)
# Hold out values by zeroing them; R must start as a copy of R1
# (the source used R without defining it first)
R = R1.copy()

# Set the number of values to replace, for example 20%:
prop = int(R.size * 0.2)
# Randomly choose indices of the numpy array:
i = [np.random.choice(range(R.shape[0])) for _ in range(prop)]
j = [np.random.choice(range(R.shape[1])) for _ in range(prop)]
# Replace the chosen values with 0
R[i, j] = 0

print("Original:\n", R1)
print("Test Set:\n", R)
R = np.rint(R)
mse = mean_squared_error(R, R1)
print("RMSE=", mse ** 0.5)

print("\nTraining ...\n")
mf = MF(R, K=2, alpha=0.01, beta=0.01, iterations=100)
training_process = mf.train()
L = np.rint(mf.full_matrix())
print("\nDone\n")

# Plot training-set RMSE over iterations, sampling every 10th point
x = [x for x, y in training_process]
y = [y for x, y in training_process]
x = x[::10]
y = y[::10]
plt.figure(figsize=(16, 4))
plt.plot(x, np.sqrt(y))
plt.xticks(x, x)
print("Minimizing Error on Training Set:\n")
plt.xlabel("Iterations")
plt.ylabel("Root Mean Square Error")
plt.grid(axis="y")
print("Learnt=\n", mf.full_matrix())
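# The RMSE printed above compares the full matrices. To score only the
# held-out cells, the evaluation can be restricted to the zeroed entries --
# a sketch reusing R, R1, and the learnt matrix L from the snippet above.
mask = (R == 0) & (R1 != 0)  # held out: zeroed for training, observed originally
test_rmse = np.sqrt(np.mean((R1[mask] - L[mask]) ** 2))
print("Test RMSE =", test_rmse)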
        # (tail of the rating-matrix fill loop; see the __init__ above)
        rating_matrix[userindex, itemindex] = rating

index2userid = {y: x for x, y in userid2index.items()}
index2itemid = {y: x for x, y in itemid2index.items()}
nonzero_row, nonzero_col = rating_matrix.nonzero()
inds = list(zip(nonzero_row.tolist(), nonzero_col.tolist()))

import sys
sys.path.append('../tpmrec/')
from mf import MF

mf = MF(rating_matrix, inds, 10, 0.0001, 0.01)
mf.train(10)

pr = mf.predict()  # hoisted out of the loop; the prediction matrix does not change per user
for userindex in range(1000):
    userid = index2userid[userindex]
    if len(userid2itemindexes[userid]) > 20:
        continue
    user_predict = pr[userindex, :]
    top_item_indexes = np.argsort(user_predict)[::-1][:10]
    print("userid = ", userid)
    for itemid in userid2itemindexes[userid]:
        print(itemid, itemid2name[itemid])
    print("recommend item")
    for itemindex in top_item_indexes:
        # (loop body truncated in source; presumably it prints the recommended
        # item's id and title, mirroring the loop above)
        itemid = index2itemid[itemindex]
        print(itemid, itemid2name[itemid])
import pandas as pd
import numpy as np
from mf import MF

df_train = pd.read_csv('all/train.csv')
df_train = df_train[0:10000]
R = np.array(
    df_train.pivot(index='User', columns='Track', values='Rating').fillna(0))

d_mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=100)
training_process = d_mf.train()
print()
print("P x Q:")
print(d_mf.full_matrix())
print()
        # (method opening truncated in source; this appends a new user's
        # latent factors to P)
        self.P = np.vstack([self.P, new_lf])
        print('in MF')
        print(len(self.P))

    def remove_new_user(self):
        self.P = np.delete(self.P, 168, 0)

    def predict(self):
        prediction_mat = np.matmul(self.P, self.Q.T)
        return prediction_mat


rating_data = np.load('rating_data.npy')
mf = MF(rating_data, rating_data)
_, _ = mf.train(epoch=20, verbose=False)

num_total_rest = 80
restaurants = pickle.load(open('restaurants.dict', 'rb'))


@app.route('/get_restaurants')
def get_restaurants():
    num_sample_rest = 5
    # Draw num_sample_rest distinct random restaurant indices
    rand_ints = []
    while len(rand_ints) < num_sample_rest:
        rand_int = randrange(num_total_rest)
        if rand_int not in rand_ints:
            rand_ints.append(rand_int)
def run(path_str, K=10):
    use_topK = path_str not in ['ratings_only']

    sim_filename = os.path.join(data_dir, 'sim_res/path_count/%s.res' % path_str)
    if path_str == 'ratings_only':
        sim_filename = os.path.join(data_dir, 'tuples/ratings.txt')
    elif use_topK:
        sim_filename = os.path.join(
            data_dir, 'sim_res/path_count/%s_top%s.res' % (path_str, topK))

    start_time = time.time()
    # np.str and np.float were removed from numpy; use the builtins instead
    data = np.loadtxt(sim_filename, dtype=str, delimiter="\t")
    uids = set(data[:, 0].flatten())
    bids = set(data[:, 1].flatten())
    uid2ind = {int(v): k for k, v in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)
    bid2ind = {int(v): k for k, v in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)
    data[:, 0] = [uid2ind[int(r)] for r in data[:, 0]]
    data[:, 1] = [bid2ind[int(r)] for r in data[:, 1]]
    print('finish load data from %s, cost %.2f seconds, users: %s, items=%s' %
          (sim_filename, time.time() - start_time, len(uids), len(bids)))

    # must convert data type to float
    data = data.astype(dtype=float)
    print("data shape: ", data.shape, data.dtype)

    eps, lamb, iters = 10, 10, 500
    print('start generate mf features, (K, eps, reg, iters) = (%s, %s, %s, %s)' %
          (K, eps, lamb, iters))
    mf = MF(data=data, train_data=data, test_data=[], K=K, eps=eps, lamb=lamb,
            max_iter=iters, call_logger=logger)
    U, V = mf.run()

    start_time = time.time()
    wfilename = os.path.join(data_dir, 'mf_features/path_count/%s_user.dat' % path_str)
    if use_topK:
        wfilename = os.path.join(
            data_dir, 'mf_features/path_count/%s_top%s_user.dat' % (path_str, topK))
    fw = open(wfilename, 'w+')
    res = []
    for ind, fs in enumerate(U):
        row = [ind2uid[ind]]
        row.extend(fs.flatten())
        res.append('\t'.join([str(t) for t in row]))
    fw.write('\n'.join(res))
    fw.close()
    print('User-Features: %s saved in %s, cost %.2f seconds' %
          (U.shape, wfilename, time.time() - start_time))

    start_time = time.time()
    wfilename = os.path.join(data_dir, 'mf_features/path_count/%s_item.dat' % path_str)
    if use_topK:
        wfilename = os.path.join(
            data_dir, 'mf_features/path_count/%s_top%s_item.dat' % (path_str, topK))
    fw = open(wfilename, 'w+')
    res = []
    for ind, fs in enumerate(V):
        row = [ind2bid[ind]]
        row.extend(fs.flatten())
        res.append('\t'.join([str(t) for t in row]))
    fw.write('\n'.join(res))
    fw.close()
    print('Item-Features: %s saved in %s, cost %.2f seconds' %
          (V.shape, wfilename, time.time() - start_time))
# (the first rows of this array were truncated in the source; they are
# restored here from the identical 5x4 example matrix used above)
R1 = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])
R = R1.copy()

# Set the number of values to replace, for example 20%:
prop = int(R.size * 0.2)
# Randomly choose indices of the numpy array:
i = [np.random.choice(range(R.shape[0])) for _ in range(prop)]
j = [np.random.choice(range(R.shape[1])) for _ in range(prop)]
# Replace the chosen values with 0
R[i, j] = 0

print("Original:\n", R1)
print("Test Set:\n", R)
R = np.rint(R)

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(R, R1)
print("RMSE=", mse ** 0.5)

print("\nTraining ...\n")
mf = MF(R, K=10000, alpha=0.01, beta=0.01, iterations=10000)
training_process = mf.train()
L = np.rint(mf.full_matrix())
print("Learnt=\n", L)

print("\nFinding Error on test set...\n")
# Average squared error over the held-out (i, j) pairs; the source iterated
# over the full i x j cross product, which mixed in cells that were never
# held out
msef = 0.0
count = 0
for i1, j1 in zip(i, j):
    if R1.item(i1, j1) != 0:
        msef += (R1.item(i1, j1) - L.item(i1, j1)) ** 2
        count += 1
if count:
    msef /= count
print("RMSE f=", msef ** 0.5)
import pickle

from mf import MF
from recommender.utils.dataprocess import ratings, meanRatings

userCount = ratings['userId'].max()
movieCount = ratings['movieIndex'].max() + 1

mf = MF(movieCount, userCount, meanRatings,
        alpha=0.01, reg=0.01, iterations=20, K=20)
mf.train(ratings)

pickle.dump(mf, open('pkl/mfModel.pkl', 'wb'))
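# To reuse the serialized model later, load the pickle back -- a usage
# sketch; only the pickle round-trip is assumed here, not any particular
# MF method.
with open('pkl/mfModel.pkl', 'rb') as f:
    mf = pickle.load(f)
print(type(mf))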