def main():
    """Train an NMF model on the ratings file plus this user's own ratings,
    then print the titles of the 10 movies with the highest predicted rating.
    """
    # Read the user's own ratings; each line is: user<TAB>movie<TAB>rating.
    user_ratings = []
    with open("Python/user_rated_movies.tsv", "r") as f:
        for line in f:
            fields = line.split('\t')
            # .strip() is safer than the original [:-1]: a final line without
            # a trailing newline would otherwise lose its last digit.
            user_ratings.append((fields[0], fields[1], float(fields[2].strip()), None))

    # data = Dataset.load_builtin(name=u'ml-1m')
    reader = Reader(line_format='user item rating', sep='\t')
    datain = pd.read_csv("ratings.tsv", sep="\t")
    data = Dataset.load_from_df(datain, reader=reader)
    # Inject the user's ratings as extra raw ratings before training.
    data.raw_ratings.extend(user_ratings)

    movies = pd.read_csv("movies.tsv", sep="\t", header=None, low_memory=False)

    algo = NMF(n_factors=4, n_epochs=100, random_state=1)
    algo.fit(data.build_full_trainset())

    # Predict a rating for every movie for this user (user_ratings[0][0]);
    # column 1 of movies.tsv is assumed to hold the movie id, column 1 the
    # printable title as well — TODO confirm against movies.tsv schema.
    predictions = [algo.predict(user_ratings[0][0], row[1], r_ui=4)
                   for _, row in movies.iterrows()]

    # Prediction tuples are (uid, iid, r_ui, est, details); index 3 is the
    # estimated rating, so the last 10 after an ascending sort are the best.
    for pred in sorted(predictions, key=lambda p: p[3])[-10:]:
        print(pred[1])
def run_process(self, all_ips_data, ip_16_data, misclassifications, queue):
    """Score each historical IP's affinity for the "misclassifications" item.

    Builds a (user=IP, item=blacklist, rating=score) matrix from decayed
    history, factorizes it with NMF, and pushes "ip,score" strings onto
    `queue` for every IP seen in the history.
    """
    historical_item = generate_prefix_data(all_ips_data, ip_16_data,
                                           self.reference_end_time,
                                           self.half_life_duration)
    if len(historical_item) == 0:
        return
    # Too few IPs to factorize reliably: emit a neutral score of 0 each.
    if len(historical_item) < 5:
        for ip in historical_item:
            queue.put(ip + ",0")
        return

    # Assemble the ratings CSV with list + join (the original used repeated
    # string concatenation, which is quadratic in the number of rows).
    rows = ["userId,itemId,rating"]
    ip_order = set()
    for ip, bl_name_data in historical_item.items():
        ip_order.add(ip)
        for bl_name, score in bl_name_data.items():
            rows.append(ip + "," + bl_name + "," + str(score))
    # Known misclassified IPs get a strong rating on a synthetic item.
    for ip in misclassifications:
        if ip in ip_order:
            rows.append(ip + "," + "misclassifications,10")

    ratings = pd.read_csv(StringIO("\n".join(rows) + "\n"))
    df = pd.DataFrame({
        'itemID': list(ratings.itemId),
        'userID': list(ratings.userId),
        'rating': list(ratings.rating),
    })
    reader = Reader(rating_scale=(0, 10.0))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

    # Grow the epoch budget until CV RMSE is acceptable or limits are hit.
    epochs = 100
    while True:
        algo = NMF(n_epochs=epochs, n_factors=self.n_factors)
        try:
            res = model_selection.cross_validate(algo, data, measures=['RMSE'])
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed. NOTE(review): on this path `algo` was
            # never fitted and the predictions below will fail, exactly as
            # in the original code — consider returning early instead.
            break
        mean_rmse = sum(res["test_rmse"]) / len(res["test_rmse"])
        if mean_rmse <= 1:
            break
        epochs = epochs + 100
        if epochs >= self.epochs:
            break

    # Predicted affinity of each IP for the synthetic item is its score.
    for ip in ip_order:
        prediction = algo.predict(ip, "misclassifications").est
        queue.put(ip + "," + str(round(prediction, 2)))
    return
def nmf(train, test, ids, Xtest, Xids):
    """
    Non Negative Matrix Factorization

    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('NMF')
    model = NMF(n_factors=20,
                n_epochs=50,
                random_state=15,
                reg_pu=0.5,
                reg_qi=0.05)

    # Fit on the training set, then report RMSE on train and test.
    model.fit(train)

    train_predictions = model.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    test_predictions = model.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    # Estimated ratings for the testset, kept as a numpy array for blending.
    preds_test = np.array([p.est for p in test_predictions])

    # Estimated ratings for the unknown (user, item) pairs.
    preds_ids = [model.predict(str(ids[0][k]), str(ids[1][k])).est
                 for k in range(len(ids[0]))]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def nmf(self, namefile, uid, iid, rati, from_uid=None, value_iid=None, *, value_uid=None, **_ignored):
    """Fit NMF on a ratings CSV and predict a single (user, item) rating.

    Parameters:
        namefile:  CSV file name under ./container/ holding the ratings.
        uid, iid, rati:  column names for user id, item id and rating.
        value_uid, value_iid:  the user/item pair to predict for.

    Returns a dict with keys "uid", "idd" and "rati" (estimate, 2 decimals).
    """
    test_data = pd.read_csv('./container/' + namefile)
    dt = pd.DataFrame(test_data)

    # Build the full trainset from the three relevant columns.
    reader = Reader(rating_scale=(0, 100))
    data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
    trainset = data.build_full_trainset()

    algo = NMF()
    algo.fit(trainset)
    pred = algo.predict(int(value_uid), int(value_iid), r_ui=1, verbose=True)
    #var_rmse = accuracy.rmse(pred)

    # Return result as a JSON-serialisable dict (the original initialised
    # `jsondata = {}` twice; the duplicate is removed here).
    jsondata = {
        "uid": pred.uid,
        "idd": pred.iid,
        "rati": round(pred.est, 2),
    }
    return jsondata
def nmf_from_to(self, namefile, uid, iid, rati, from_uid, to_uid, from_iid,
                to_iid):
    """Fit NMF on a ratings CSV and predict ratings for every (user, item)
    pair in the half-open ranges [from_uid, to_uid) x [from_iid, to_iid).

    Returns a list of [uid, iid, rating] triples (rating to 2 decimals).
    """
    frame = pd.DataFrame(pd.read_csv('./container/' + namefile))

    # Build the full trainset from the three relevant columns.
    reader = Reader(rating_scale=(0, 100))
    dataset = Dataset.load_from_df(frame[[uid, iid, rati]], reader)
    model = NMF()
    model.fit(dataset.build_full_trainset())

    results = []
    for cur_uid in range(from_uid, to_uid):
        for cur_iid in range(from_iid, to_iid):
            pred = model.predict(cur_uid, cur_iid, r_ui=1, verbose=True)
            results.append([pred.uid, pred.iid, round(pred.est, 2)])

    # Result is already JSON-serialisable.
    return results
# Per-model prediction lists for the blending feature matrix.
Pred_Test_SVD = []
Pred_Test_NMF = []
Pred_Test_SL1 = []
Pred_Test_KNN = []
Pred_Test_BSL = []

start = time.time()
for line in data_test:
    # Note the column order: line[1] is the user id, line[0] the item id.
    uid = str(line[1])
    iid = str(line[0])
    Pred_Test_KNN.append(alg_KNN.predict(uid, iid, clip=False).est)
    Pred_Test_SVD.append(alg_SVD.predict(uid, iid, clip=False).est)
    Pred_Test_NMF.append(alg_NMF.predict(uid, iid, clip=False).est)
    Pred_Test_SL1.append(alg_SL1.predict(uid, iid, clip=False).est)
    Pred_Test_BSL.append(alg_BSL.predict(uid, iid, clip=False).est)
end = time.time()

print("***********************************************")
print("Exe time:")
print(end - start)

# Each row of X_Test is one model's predictions over the whole test set.
X_Test = np.matrix([
    Pred_Test_SVD, Pred_Test_NMF, Pred_Test_SL1, Pred_Test_KNN, Pred_Test_BSL
])
import zipfile
#Unzip the file
"""file = zipfile.ZipFile('/home/shanmukha/AnacondaProjects/Spyder_projects/Recommendation_trail/ml-100k.zip','r')
file.extractall()
file.close()
"""

# Load the MovieLens 100k ratings with an explicit line schema.
reader = Reader(line_format='user item rating timestamp', sep='\t')
dataset = Dataset.load_from_file(file_path='./ml-100k/u.data', reader=reader)

# 5-fold split (old surprise API; folds would be consumed by evaluate()).
dataset.split(n_folds=5)

# Two matrix-factorization models side by side.
algo1 = SVD()
algo2 = NMF()
#evaluate(algo,dataset,measures=['RMSE','MAE'])

# Train both on the complete dataset.
train_data = dataset.build_full_trainset()
algo1.fit(train_data)
algo2.fit(train_data)

# Compare both models' predictions for user 196 / item 302 (true rating 4).
user = str(196)
item = str(302)
actual_rating = 4
print(algo1.predict(user, item, actual_rating))
print(algo2.predict(user, item, actual_rating))
# Drop columns the model does not use from the test frame.
del dfTest['date']
del dfTest['test_id']

# Set the rating scale and create the data for Surprise to use.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    dfRatings[['user_id', 'business_id', 'rating']], reader)

# Split in 5 folds for cross-validation tuning (old surprise API).
data.split(5)

# Use all the data to train NMF and produce the submission output.
train_set = data.build_full_trainset()
algo = NMF()
algo.train(train_set)

# `with` guarantees the file is flushed and closed even if a prediction
# raises (the original relied on a manual close() that could be skipped).
with open('PMFOutput.csv', 'w') as f:
    f.write("test_id,rating\n")
    for i in range(len(dfTest)):
        prediction = algo.predict(dfTest.at[i, 'user_id'],
                                  dfTest.at[i, 'business_id'],
                                  r_ui=4,
                                  verbose=True)
        f.write(str(i) + "," + str(prediction.est) + '\n')
# Cross-validate SVD, then retrain it on the full dataset.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
trainset = data.build_full_trainset()
svd.fit(trainset)

# NOTE(review): the next two expression results are discarded — this looks
# like notebook residue where the REPL displayed the values.
ratings[ratings['userId'] == 1]
svd.predict(13, 238)

# Movie metadata: first five columns of the MovieLens u.item file.
m_cols = ['id', 'Title', 'release_date', 'video_release_date', 'imdb_url']
moviesdb = pd.read_csv('./ml-100k/u.item',
                       sep='|',
                       names=m_cols,
                       usecols=range(5),
                       encoding='latin-1')

# Repeat the cross-validate / fit / predict cycle for NMF and two KNN
# variants, always predicting user 13 / item 238 for comparison.
nmf = NMF()
cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
nmf.fit(trainset)
nmf.predict(13, 238)

knnb = KNNBasic()
cross_validate(knnb, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
knnb.fit(trainset)
knnb.predict(13, 238)

knnm = KNNWithMeans()
cross_validate(knnm, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
knnm.fit(trainset)
knnm.predict(13, 238)

# Per-movie ratings for user 13; the loop body continues beyond this chunk.
user = 13
user_rating_svd = pd.DataFrame()
for i in range(0, moviesdb.shape[0]):
# Load the movielens dataset (UserID::MovieID::Rating::Timestamp) and hold
# out 15% of the ratings for evaluation.
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Configure and fit a short NMF run (5 epochs).
algoritmo = NMF(n_epochs=5)
algoritmo.fit(trainset)

# Pick the user and movie to inspect.
# User 49: between 18 and 24 years old, a programmer living in Huston, Texas.
uid = str(49)
# Watched and rated movie: Negotiator, The (1998)::Action|Thriller, rated 4.
iid = str(2058)  # raw item id

# Single prediction for that specific (user, item) pair.
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# Run the trained model against the held-out testset.
test_pred = algoritmo.test(testset)

# Report RMSE and MAE on the testset.
print("Avaliação RMSE: ")
accuracy.rmse(test_pred, verbose=True)
print("Avaliação MAE: ")
accuracy.mae(test_pred, verbose=True)
# Build the full training set, then release the raw data to save memory.
trainset = data.build_full_trainset()
del data
print(time.asctime(), 'training set built, now training')

# algo = SlopeOne()
# # # MODEL DEFINITION
algo = NMF(verbose=True, biased=True, n_factors=fac, n_epochs=ep)
# # #
algo.fit(trainset)
print(time.asctime(), 'training complete, now loading prediction data')

# Keep only the first two columns (user, item) of the prediction file.
to_predict = pd.read_csv(file_path_test, delimiter=' ', header=None)
to_predict = to_predict.values.T[0:2].T

predicted = np.zeros(len(to_predict))
print(time.asctime(), 'prediction data loaded, now predicting')
for i in range(len(predicted)):
    predicted[i] = algo.predict(uid=to_predict[i][0],
                                iid=to_predict[i][1],
                                verbose=0).est
    # Progress report every half-million predictions.
    if i % 500000 == 0:
        print(i, 'of', len(predicted), 'predicted')

print(time.asctime(), 'now saving predictions')
np.savetxt('../custom_data/' + title + '.dta', predicted, fmt='%.3f')
print(time.asctime(), 'done')
# -*- coding:utf-8 -*-
__author__ = 'neuclil'
import surprise
from surprise import NMF, evaluate
from surprise import Dataset, Reader
from model.convertor import Convertor
import os

if __name__ == '__main__':
    convetor = Convertor()

    # Load the ratings file: comma-separated user,item,rating,timestamp.
    file_path = os.path.expanduser('../data/popular_music_suprise_format.txt')
    reader = Reader(line_format='user item rating timestamp', sep=',')
    music_data = Dataset.load_from_file(file_path, reader=reader)

    # Train NMF on the full trainset (old surprise .train() API).
    algo = NMF()
    trainset = music_data.build_full_trainset()
    algo.train(trainset)

    # For every item this user rated, print the model's prediction together
    # with the song title resolved from the raw item id.
    user_inner_id = 4
    for inner_iid, _rating in trainset.ur[user_inner_id]:
        raw_iid = algo.trainset.to_raw_iid(inner_iid)
        print(
            algo.predict(algo.trainset.to_raw_uid(user_inner_id),
                         raw_iid,
                         r_ui=1),
            convetor.get_song_name_by_iid(raw_iid))

    # Persist the trained model for later reuse.
    surprise.dump.dump('./nmf.model', algo=algo)
from surprise import Reader, Dataset
from surprise import NMF, evaluate

# Schema for the raw MovieLens file: tab-separated user/item/rating/timestamp.
data_reader = Reader(line_format="user item rating timestamp", sep="\t")

# u.data holds the ratings we want.
data = Dataset.load_from_file("./ml-100k/u.data", reader=data_reader)

# 5-fold split for cross-validation (old surprise API), evaluated with NMF.
data.split(n_folds=5)
algorithm = NMF()
evaluate(algorithm, data, measures=["RMSE", "MAE"])

# Retrain on every rating before making the final prediction.
training_set = data.build_full_trainset()
algorithm.train(training_set)

# Predict how user 200 would rate item 222 (actual rating: 5).
user_id = str(200)
item_id = str(222)
actual_rating = 5
print(algorithm.predict(user_id, item_id, actual_rating))
# SVM baseline accuracy on the held-out split.
predict = clf.predict(X_test)
print(
    f"SVM Accuracy Score: {metrics.accuracy_score(Y_test,predict)*100:0.4f}%")

#NMF
# Two identical frames: one is consumed by surprise, one is iterated below.
data = pd.concat([df['reviewerID'], df['asin'], df['overall']], axis=1)
data2 = pd.concat([df['reviewerID'], df['asin'], df['overall']], axis=1)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)

NMFModel = NMF()
NMFModel.fit(data.build_full_trainset())

# Predict a rating for every (reviewer, item) pair in the original frame.
predicted = [NMFModel.predict(uid=row['reviewerID'], iid=row['asin']).est
             for _, row in data2.iterrows()]

# Accuracy = share of predictions that round to the true integer rating.
true = df['overall'].tolist()
acc = sum(1 for t, p in zip(true, predicted) if int(round(p)) == t)
avg = acc / len(true)
print(f"NMF Accuracy: {avg*100:0.4f}%")

#K-nearest Neighbor
KNModel = KNeighborsClassifier(n_neighbors=3)
KNModel.fit(X_train, Y_train)
# %% Look at the prior on the train data
file_path = "Data/data_train_preprocessed.csv"
data_train = utils.load_data_desired(file_path)

# %% Labels for training
# Collect real labels plus clipped and unclipped model estimates.
Pred_NotCliped_label = []
Pred_Cliped_label = []
Real_label = []
Clip = False
for line in data_train:
    # Column order: line[1] is the user id, line[0] the item id.
    uid = str(line[1])
    iid = str(line[0])
    Real_label.append(line[2])
    Pred_NotCliped_label.append(alg.predict(uid, iid, clip=False).est)
    Pred_Cliped_label.append(alg.predict(uid, iid, clip=True).est)

Pred_NotCliped_label = np.array(Pred_NotCliped_label)
Pred_Cliped_label = np.array(Pred_Cliped_label)
Real_label = np.array(Real_label)

# %% Visualization
plt.figure()
plt.hist(Pred_NotCliped_label)
plt.grid()
plt.title('Histogram of Predicted Labels')
plt.xlabel('Label')
plt.figure()
# Fit on the full training set and report the elapsed fitting time.
alg.fit(data_train.build_full_trainset())
end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# %% Loading Test Data
file_path = "Data/sample_submission.csv"
data_test = utils.load_data_desired(file_path)

# %% Prediction
# NOTE(review): user id appears to be line[1] and item id line[0] — confirm
# against utils.load_data_desired's column order.
Predict_Test = []
for line in data_test:
    Predict_Test.append(alg.predict(str(line[1]), str(line[0])).est)

# %% Save Prediction
# Dump the grid-search summary (best score/params, CV results) to a report.
# NOTE(review): `file` shadows the builtin and is never closed in this view —
# presumably more writes follow this chunk; a `with` block would be safer.
file = open("Details.txt", "w")
file.write("+ Best Score: \n \n")
file.write(str(Train_CV.best_score) + "\n \n")
file.write("************************************************************ \n")
file.write("+ Best Param: \n \n")
file.write(str(Train_CV.best_params) + "\n \n")
file.write("************************************************************ \n")
file.write("+ CV Summary: \n \n")
file.write(str(Train_CV.cv_results) + "\n \n")
file.write("************************************************************ \n")
# Record which venues each test user actually visited.
for hist in testset:
    user_visiting_hist.setdefault(hist[0], []).append(hist[1])

algo.fit(trainset)

# Making recommendation for each user and all venues in the test data set,
# scoring the top-k list against the user's actual visits.
precision = 0.0
recall = 0.0
k = 20
for user in user_list:
    est_item_rating = {item: algo.predict(user, item, clip=False).est
                       for item in item_list}
    # NOTE: items are ordered by item id (dict key), not by estimated
    # rating — preserved exactly from the original implementation.
    sorted_items = list(OrderedDict(sorted(est_item_rating.items())).keys())
    hits = sum(1 for item in sorted_items[:k]
               if item in user_visiting_hist[user])
    precision += hits / float(k)
    recall += hits / float(len(user_visiting_hist[user]))

print('precision: ', precision / len(user_list), ' recall: ',
      recall / len(user_list))
overall_precision += precision / len(user_list)
overall_recall += recall / len(user_list)
print('overall_precision: ', overall_precision / 5, ' overall_recall: ',
      overall_recall / 5)
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    """Fit an NMF recommender on all ratings (DB table + seed CSV) and
    persist the top-10 recommendations for `user_id` into SQL tables.

    Parameters:
        user_id: id of the user to recommend for.
        prediction_table: SQL table receiving one row of top-10 item ids.
        numeric_prediction_table: SQL table receiving the matching ratings.
    """
    algo = 'NMF'  # label written into the numeric_predictions table below
    algorithm = NMF()
    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None) #pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(sessionmaker(bind=engine, autocommit = False, autoflush = False))

    # Ratings come from two sources: the live DB table and a seed CSV file;
    # both are reduced to (user_id, item_id, rating) and de-duplicated.
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con = engine)
    df_ratings=df_ratings[['user_id','item_id','rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns = {'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id','item_id','rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)

    trainset = data.build_full_trainset()
    # algorithm = eval(algo + "()")# set the algorithm...............................................
    algorithm.train(trainset)

    # Predict only for items the user has not already rated.
    items = pd.read_sql('SELECT distinct id FROM items;', con = engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])
    predicted_ratings = []
    for i in prediction_items:
        a = user_id
        b = i
        est = algorithm.predict(a, b)
        predicted_ratings.append(est[3])  # index 3 of the Prediction tuple is the estimate

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series([user_id for x in range(len(predictions.index))], index=predictions.index)
    predictions['prediction'] = predicted_ratings

    # Highest predicted ratings first; keep the full frame for logging and
    # the 10 best rows for the per-user recommendation tables.
    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    # One row per user: pred_1..pred_10 hold the recommended item ids.
    cols =['pred_1', 'pred_2','pred_3','pred_4', 'pred_5','pred_6','pred_7','pred_8', 'pred_9','pred_10']
    df_pred = predictions[['item_id']].T
    df_pred.columns = cols
    df_pred['id'] = user_id
    df_pred = df_pred[['id','pred_1', 'pred_2','pred_3','pred_4', 'pred_5','pred_6','pred_7','pred_8', 'pred_9','pred_10']]
    df_pred['id'] = df_pred['id'].astype(int)
    df_pred.to_sql(prediction_table, engine,if_exists='append', index=False)#if_exists='append'
    session.commit()

    # Log the 20 best numeric predictions, tagged with the algorithm label.
    df_num_ratings = test_prediction
    df_num_ratings = df_num_ratings.head(n=20)
    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction':'predicted_rating'}, inplace=True)
    df_num_ratings.to_sql('numeric_predictions',engine,if_exists='append', index=False)#if_exists='append'
    session.commit()

    # One row per user: num_1..num_10 hold the predicted ratings themselves.
    predcols =['num_1', 'num_2','num_3','num_4', 'num_5','num_6','num_7','num_8', 'num_9','num_10']
    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols
    df_num_ratings_transpose['id'] = user_id
    df_num_ratings_transpose = df_num_ratings_transpose[['id','num_1', 'num_2','num_3','num_4', 'num_5','num_6','num_7','num_8', 'num_9','num_10']]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)
    df_num_ratings_transpose.to_sql(numeric_prediction_table,engine,if_exists='append', index=False)#if_exists='append'
    session.commit()
n_epochs=params['n_epochs'], lr_all=params['lr_all'], reg_all=params['reg_all']) algo_SVD.train(data_full) #%% datamat_filled_SVD = datamat_missing.copy().astype(np.float) datamat_filled_NMF = datamat_missing.copy().astype(np.float) for i in range(0, datamat_full.shape[0]): # movie for j in range(0, datamat_full.shape[1]): # user val = algo_SVD.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est datamat_filled_SVD[i, j] = val val = algo_NMF.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est datamat_filled_NMF[i, j] = val #%% compute correlations between real and recovered ratings corvals_SVD = np.zeros(datamat_full.shape[1]) corvals_NMF = np.zeros(datamat_full.shape[1]) corvals_SVD_fancy = np.zeros(datamat_full.shape[1]) corvals_NNM_fancy = np.zeros(datamat_full.shape[1]) corvals_SOFT_fancy = np.zeros(datamat_full.shape[1]) for j in range(0, datamat_full.shape[1]): # user corvals_SVD[j] = np.corrcoef(datamat_full[:, j], datamat_filled_SVD[:, j])[0, 1] corvals_SVD_fancy[j] = np.corrcoef(datamat_full[:, j], datamat_filled_SVD_fancy[:, j])[0, 1] corvals_NMF[j] = np.corrcoef(datamat_full[:, j], datamat_filled_NMF[:, j])[0, 1]