def executeTraining(modelFileName, simOptions):
    # Train a KNN model and persist it under ./outputs.
    # Note: trainingSet is expected to be available in the enclosing scope.
    knn = KNNBasic(sim_options=simOptions, k=3)
    knn.train(trainingSet)
    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)
    os.makedirs('./outputs', exist_ok=True)
    # joblib.dump accepts a path directly; no need to open the file first.
    joblib.dump(knn, os.path.join('./outputs', modelFileName))
def run_train(trainingSet):
    # KNN model with item-based cosine similarity
    sim_options = {'name': 'cosine', 'user_based': False}
    knn = KNNBasic(sim_options=sim_options)
    knn.train(trainingSet)
    return knn
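# A minimal usage sketch (not from the original source): it assumes the
# ml-100k builtin dataset stands in for whatever data run_train was meant
# to receive.
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
trainingSet = data.build_full_trainset()
knn_model = run_train(trainingSet)
print(knn_model.predict('196', '302'))  # Prediction for a raw user/item id pair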
def knn_running_time(data):
    '''
    Calculates the running times for training and predictions for Basic KNN

    Args:
        data(list of Dataset): datasets with different numbers of users
    Returns:
        elapsedtime_KnnBasictrain: running time for training
        elapsedtime_KnnBasictest: running time for predictions on testset
    '''
    elapsedtime_KnnBasictrain = []
    elapsedtime_KnnBasictest = []

    # tune the parameters on the entire data
    param_grid = {'k': [5, 10, 20],
                  'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                  'min_support': [1, 5],
                                  'user_based': [False]}}
    grid_search = GridSearch(KNNBasic, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    sim = param['sim_options']['name']
    min_support = param['sim_options']['min_support']
    user_based = param['sim_options']['user_based']

    # using the tuned parameters, calculate running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        # similarity settings must be passed via sim_options, not as bare kwargs
        knn = KNNBasic(k=k, sim_options={'name': sim,
                                         'min_support': min_support,
                                         'user_based': user_based})
        knn.train(training)
        elapsedtime_KnnBasictrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knn.test(testing)
        elapsedtime_KnnBasictest.append(time.time() - test_start)
    return elapsedtime_KnnBasictrain, elapsedtime_KnnBasictest
def user_based_cf(co_pe):
    # INITIALIZE REQUIRED PARAMETERS
    # path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t', rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.train(trainset)
    print("ALGORITHM USED", co_pe)

    # --------------------------------------------- MARKERS
    f = io.open("_AlgoHist_ub.txt", "w")
    f.write(repr(co_pe))
    f.close()
    # --------------------------------------------- MARKERS END

    print("CF Type:", prnt, "BASED")

    # PEEKING PREDICTED VALUES
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))
    print(algo.predict(str(search_key), item_id, actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)
    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
    # user-based similarity: get_neighbors expects an inner *user* id
    inner_id = algo.trainset.to_inner_uid(search_key)
    neighbors = algo.get_neighbors(inner_id, k=k)
    print("Nearest Matching users are:")
    for i in neighbors:
        print("\t " * 6, i)
    return top_n, result_u
def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader)
    trainset = data.build_full_trainset()

    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.train(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.train(trainset)
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
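# Hypothetical contents of the 'custom_train' file the test above reads
# (an assumption, shown only to illustrate the Reader configuration:
# three skipped header lines, then space-separated "user item rating"):
#
#     this header line is skipped
#     so is this one
#     and this one
#     user0 item0 4
#     user0 item1 2
#     user1 item0 3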
class FactPrediction:
    """FactPrediction definition."""

    def train(self):
        """Trains the model."""
        from os import path
        from pandas import read_csv
        from surprise import Reader, Dataset, KNNBasic
        directory = path.dirname(path.realpath(__file__))
        ratings = read_csv(path.join(directory, 'fact_ratings.csv'))
        ratings = Dataset.load_from_df(ratings[['userId', 'factId', 'rating']],
                                       Reader())
        trainset = ratings.build_full_trainset()
        self.model = KNNBasic()
        self.model.train(trainset)

    def predict(self, u_id, f_id):
        """Performs a prediction."""
        return self.model.predict(u_id, f_id)
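# A hedged usage sketch for FactPrediction (not in the source): the raw ids
# passed to predict() should match the userId/factId values in
# fact_ratings.csv; '1' and '42' here are placeholders.
fact_model = FactPrediction()
fact_model.train()
prediction = fact_model.predict('1', '42')
print(prediction.est)  # the estimated rating lives in the .est field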
def item_based_cf(self, co_pe, df_path):
    # INITIALIZE REQUIRED PARAMETERS
    path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
    prnt = "ITEM"
    sim_op = {'name': co_pe, 'user_based': False}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep=',', rating_scale=(1, 5))
    df = Dataset.load_from_file(df_path, reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    res = algo.train(trainset)
    print("\t\t >>>TRAINED SET<<<<\n\n", res)

    # Read the mappings raw id <-> movie name
    rid_to_name, name_to_rid = self.read_item_names(path)

    print("CF Type:", prnt, "BASED")
    search_key = input(
        "Enter a Movie Name, \n ex. Toy Story (1995) or Seven (Se7en) (1995)\n Movie name:"
    )
    print("ALGORITHM USED : ", co_pe)
    raw_id = name_to_rid[search_key]

    # --------------------------------------------- MARKERS
    f = io.open("cluster/AlgoHist_ib.txt", "w")
    f.write(repr(co_pe))
    f.close()
    # --------------------------------------------- MARKERS END

    print("\t\t RAW ID>>>>>>>", raw_id, "<<<<<<<")
    inner_id = algo.trainset.to_inner_iid(raw_id)
    print("INNER ID >>>>>", inner_id)

    # Retrieve inner ids of the nearest neighbors of the chosen movie.
    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
    neighbors = algo.get_neighbors(inner_id, k=k)
    neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in neighbors)
    neighbors = (rid_to_name[rid] for rid in neighbors)
    print("Nearest", k, "Matching Items are:")
    for i in neighbors:
        print("\t " * 6, i)
def gen_pred_matrix_ubcf(co_pe):
    # ---------------------------------------------------- UBCF as is
    # INITIALIZE REQUIRED PARAMETERS
    # path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t', rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.train(trainset)
    print("ALGORITHM USED", co_pe)
    print("CF Type:", prnt, "BASED")

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)
    top_n = get_top_n(predictions, 5)

    # ---------------------------------------------------- UBCF as is
    csvfile = 'pred_matrix-full_ubcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                writer.writerow((uid, iid, r))
    print("Done! You may now check the file in the same directory as the program.")
def knn(data, training, testing):
    '''
    Tune Basic KNN parameters, then calculate RMSE, coverage and running
    time of Basic KNN

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Trainset): training dataset
        testing(list): test dataset
    Returns:
        rmse: RMSE of Basic KNN with optimized parameters
        top_n: number of unique predictions for top n items
    '''
    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20],
                      'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                      'min_support': [1, 5],
                                      'user_based': [False]}}

    # optimize parameters
    knn_grid_search = GridSearch(KNNBasic, knn_param_grid, measures=['RMSE'],
                                 verbose=False)
    knn_grid_search.evaluate(data)
    param = knn_grid_search.best_params['RMSE']
    print('KNNBasic:', param)

    # RMSE against parameters
    result_df = pd.DataFrame.from_dict(knn_grid_search.cv_results)
    result_df.to_csv('data/knn_rmse_against_param.csv')

    # fit model using the optimized parameters; similarity settings belong
    # in the sim_options dict, not as bare keyword arguments
    knn = KNNBasic(k=param['k'], sim_options=param['sim_options'])
    knn.train(training)

    # evaluate the model using test data
    predictions = knn.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
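# A minimal sketch of how knn() above might be driven (assumed, not from the
# source). GridSearch's evaluate() in the old surprise API expects the
# dataset to be split into folds first; a data/ output directory is assumed
# to exist for the CSV the function writes.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=5)
training = data.build_full_trainset()
testing = training.build_anti_testset()
rmse, top_n = knn(data, training, testing)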
def gen_pred_matrix_ibcf(co_pe):
    # ---------------------------------------------------- IBCF as is
    # INITIALIZE REQUIRED PARAMETERS
    path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
    prnt = "ITEM"
    sim_op = {'name': co_pe, 'user_based': False}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t', rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    res = algo.train(trainset)
    print("\t\t >>>TRAINED SET<<<<\n\n", res)

    # Read the mappings raw id <-> movie name
    # rid_to_name, name_to_rid = read_item_names(path)

    print("CF Type:", prnt, "BASED")
    print("Please be patient while 'pred_matrix-full_ibcf.csv' is being generated")
    for i in range(5):
        print(".")
        time.sleep(0.5)

    # --------------------------------------------------------- EXPERIMENTAL
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)
    top_n = get_top_n(predictions, 5)
    # --------------------------------------------------------- EXPERIMENTAL

    # ---------------------------------------------------- IBCF as is
    csvfile = 'pred_matrix-full_ibcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                writer.writerow((uid, iid, r))
    print("Done! You may now check the file in the same directory as the program.")
def compute_recommendations():
    # connect to the database
    # engine = create_engine("mysql://*****:*****@localhost/ratingsx?charset=utf8", echo=True)
    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    # disable print
    blockPrint()

    # read in the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    # format the dataset using the surprise library
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    training_set = data.build_full_trainset()

    algorithm = KNNBasic()          # basic k-nearest-neighbours collaborative filtering
    algorithm.train(training_set)   # fit the data to the model
    testing_set = training_set.build_anti_testset()
    predictions = algorithm.test(testing_set)  # make predictions

    # function for top predictions
    def get_top_n(predictions, n=10):
        """Return the top-N recommendations for each user from a set of
        predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions,
                as returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user.
                Default is 10.
        Returns:
            A dict where keys are user (raw) ids and values are lists of
            tuples: [(raw item id, rating estimation), ...] of size n.
        """
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the n highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]
        return top_n

    # get the top 10 predictions
    top_n = get_top_n(predictions, n=10)

    # collect the recommended items for each user
    a = []
    for uid, user_ratings in top_n.items():
        a.append([uid, [iid for (iid, _) in user_ratings]])

    df_list_pred = pd.DataFrame.from_records(a, columns=['A', 'B'])
    df_user = pd.DataFrame(df_list_pred.A.values.tolist())
    df_pred = pd.DataFrame(df_list_pred.B.values.tolist())
    df_pred.columns = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
                       'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']

    df_items = pd.read_sql('SELECT * FROM items;', con=engine)
    # df_pred = df_pred.applymap(lambda x: df_items.loc[x, 'title'])

    df_pred[['id']] = df_user
    df_pred = df_pred[['id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
                       'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']]
    df_pred['id'] = df_pred['id'].astype(int)

    # append recommendations
    df_pred.to_sql('recommendations', engine, if_exists='append', index=False)
    session.commit()

    # log the predictions
    df_log = df_pred
    df_log['algorithm'] = 'KNNBasic'
    df_log = df_log.rename(columns={'id': 'user_id'})
    df_log = df_log[['user_id', 'pred_1', 'pred_2', 'pred_3', 'pred_4',
                     'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9',
                     'pred_10', 'algorithm']]
    df_log.to_sql('predictionlogs', engine, if_exists='append', index=False)
    session.commit()

    global mae1
    global rmse1
    mae1 = float(accuracy.mae(predictions))
    rmse1 = float(accuracy.rmse(predictions))
surprise_cross_validate(algo, data, sim_options)

# Grid search over k for KNNBasic
param_grid = {'k': [18, 19, 20, 21, 22]}
print(surprise_gridsearch(param_grid, KNNBasic, data))

# Cross-validate KNNBaseline
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(k=19, sim_options=sim_options)
surprise_cross_validate(algo, data, sim_options)

# Predictions
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(k=19, sim_options=sim_options)
algo.train(trainset)
predictions = algo.test(trainset.build_testset())

# Build a pandas DataFrame of ratings and predictions
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['#_of_Movies_Rated_By_User'] = df.uid.apply(get_Iu)
df['#_of_Users_That_Rated_This_Movie'] = df.iid.apply(get_Ui)
df['Error_in_Rating_Prediction'] = abs(df.est - df.rui)
df.rename(columns={'uid': 'User_ID',
                   'iid': 'Movie_ID',
                   'rui': 'User_Rating',
                   'est': 'Predicted_Rating'}, inplace=True)
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    algo = 'Item-based KNN'
    sim_options = {'user_based': False}
    algorithm = KNNBasic(sim_options=sim_options)

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions",
    #              activity_type="Initialize Predictions - " + algo, rating=None)  # pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(sessionmaker(bind=engine, autocommit=False,
                                          autoflush=False))

    # read ratings from the database and the bundled CSV, then combine them
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    trainset = data.build_full_trainset()
    # algorithm = eval(algo + "()")  # set the algorithm
    algorithm.train(trainset)

    # predict ratings for every item the user has not rated yet
    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])
    predicted_ratings = []
    for i in prediction_items:
        est = algorithm.predict(user_id, i)
        predicted_ratings.append(est[3])  # index 3 of a Prediction is the estimate

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series([user_id for x in range(len(predictions.index))],
                                       index=predictions.index)
    predictions['prediction'] = predicted_ratings
    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    # write the top-10 item ids, one row per user
    cols = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
            'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']
    df_pred = predictions[['item_id']].T
    df_pred.columns = cols
    df_pred['id'] = user_id
    df_pred = df_pred[['id'] + cols]
    df_pred['id'] = df_pred['id'].astype(int)
    df_pred.to_sql(prediction_table, engine, if_exists='append', index=False)
    session.commit()

    # log the top-20 numeric predictions
    df_num_ratings = test_prediction.head(n=20)
    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'}, inplace=True)
    df_num_ratings.to_sql('numeric_predictions', engine, if_exists='append',
                          index=False)
    session.commit()

    # write the top-10 predicted ratings, one row per user
    predcols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5',
                'num_6', 'num_7', 'num_8', 'num_9', 'num_10']
    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols
    df_num_ratings_transpose['id'] = user_id
    df_num_ratings_transpose = df_num_ratings_transpose[['id'] + predcols]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)
    df_num_ratings_transpose.to_sql(numeric_prediction_table, engine,
                                    if_exists='append', index=False)
    session.commit()
pred[3]

#########################
# http://surprise.readthedocs.io/en/stable/prediction_algorithms.html
# change the prediction algorithm to KNN
sim_options = {'name': 'cosine',
               'user_based': False  # compute similarities between items
               }
# http://surprise.readthedocs.io/en/stable/similarities.html
# sim_options = {'name': 'pearson',
#                'user_based': True
#                }

algo_1 = KNNBasic(sim_options=sim_options)
trainset = data.build_full_trainset()
algo_1.train(trainset)

pred = algo_1.predict('374', '500')
print("Prediction Object:")
pred
print("Predicted Rating:")
pred[3]

# print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
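# For reference, a surprise Prediction is a namedtuple of
# (uid, iid, r_ui, est, details), so pred[3] above is the same as the more
# readable pred.est:
print("Predicted Rating:", pred.est)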
from surprise import Reader, Dataset, KNNBasic

# break the data file down into an array of strings
with open('./data.txt') as f:
    all_lines = f.readlines()

# load information from the file into a dataset using a reader
reader = Reader(line_format='item user rating', sep=',', rating_scale=(1, 5))
data = Dataset.load_from_file('./data.txt', reader=reader)

# split the dataset into n folds (can be changed)
data.split(n_folds=5)

# mean squared difference similarity, with min_support set to 1 so that
# only items with at least 1 user in common are considered
sim_options = {'name': 'msd', 'user_based': False, 'min_support': 1}
trainingset = data.build_full_trainset()

# fit the basic KNN algorithm on the training set
algorithm = KNNBasic(sim_options=sim_options)
algorithm.train(trainingset)

# predict a rating using an item and user ID as input
userid = str(input("Please enter user ID: "))
itemid = str(input("Please enter movie ID: "))
print(algorithm.predict(userid, itemid))
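# Hypothetical first lines of ./data.txt (an assumption for illustration):
# the Reader above expects comma-separated "item user rating" per line.
#
#     50,196,3
#     50,186,4
#     181,22,1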
class BaselineMF:
    def __init__(self, cf_algo=None, logit=False):
        """
        A baseline recommender wrapping a surprise collaborative-filtering
        algorithm.

        Parameters
        ----------
        cf_algo: the surprise algorithm to fit and predict with
            (defaults to KNNBasic with k=2)
        logit (bool): if True, ratings are passed through a logit transform
            before fitting and predictions through a sigmoid afterwards
        """
        self.logit = logit
        self.question_truth_dict = {}
        self.average_true_rating = 0.5
        self.average_false_rating = 0.5
        self.loss_fn = nn.MSELoss(size_average=True)
        if cf_algo is None:
            self.cf_algo = KNNBasic(k=2)
        else:
            self.cf_algo = cf_algo
        # self.svd = SVD(n_epochs=500, verbose=True, lr_all=0.001, n_factors=50)

    def dataloader_extract(self, sample):
        ratings = pd.Series(np.array(list(sample['rating'])))
        user_ids = pd.Series(sample['user_id']).astype(str)
        item_ids = pd.Series(sample['item_id']).astype(str)
        return ratings, user_ids, item_ids

    def logit_fn(self, p, epsilon=1e-3):
        # clip ratings away from 0 and 1 so the logit stays finite
        p = np.clip(p, epsilon, 1 - epsilon)
        return np.log(p / (1 - p))

    def sigmoid_fn(self, x):
        return 1 / (1 + np.exp(-x))

    def fit(self, dataset, train_sampler):
        """Runs the fit method, which simply works out the average response
        for 'true' and 'false' questions, where 'true' questions are those
        where the average rating is greater than 0.5."""
        t0 = time.time()
        data_loader = DataLoader(dataset, batch_size=len(train_sampler),
                                 sampler=train_sampler)
        sample = next(iter(data_loader))
        ratings, user_ids, item_ids = self.dataloader_extract(sample)
        if self.logit:
            ratings = self.logit_fn(ratings)
        possible_ratings = ratings.unique()
        ratings_dict = {'itemID': item_ids,
                        'userID': user_ids,
                        'rating': ratings}
        df = pd.DataFrame(ratings_dict)
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        trainset = data.build_full_trainset()
        self.cf_algo.train(trainset)

    def predict(self, dataset, sampler, batch_size=64):
        # I'm not entirely sure that the build_full_testset function works as
        # I'd expect, so instead we loop through all the test ids and predict
        # one at a time.
        preds = []
        data_loader = DataLoader(dataset, batch_size=len(dataset), sampler=sampler)
        sample = next(iter(data_loader))
        ratings, user_ids, item_ids = self.dataloader_extract(sample)
        for user_id, item_id in zip(user_ids, item_ids):
            pred = self.cf_algo.predict(str(user_id), str(item_id))[3]
            if self.logit:
                pred = self.sigmoid_fn(pred)
            preds.append(pred)
        return preds

    def score(self, dataset, sampler, batch_size=64, only_slow=True):
        """Scores the baseline on predictions made on the dataset provided,
        sampled with the given sampler.

        If `only_slow` is true, then only the slow judgments in the sampled
        part of the dataset are scored."""
        predictions = self.predict(dataset, sampler, batch_size)
        data_loader = DataLoader(dataset, batch_size=len(dataset), sampler=sampler)
        testset = next(iter(data_loader))
        ratings, user_ids, item_ids = self.dataloader_extract(testset)
        user_ids = user_ids.astype(int)
        ratings = torch.Tensor(ratings)
        predictions = torch.Tensor(predictions)
        # Note that all baselines are passed flattened datasets, so we have
        # to work out which of the users correspond to the latest times.
        if only_slow:
            long_time_uids = [i for i in np.unique(user_ids) if i % 3 == 2]
            new_ratings = []
            new_preds = []
            for index, rating in enumerate(ratings):
                if user_ids[index] in long_time_uids:
                    new_ratings.append(rating)
            for index, pred in enumerate(predictions):
                if user_ids[index] in long_time_uids:
                    new_preds.append(pred)
            loss = self.loss_fn(torch.Tensor(new_preds),
                                torch.Tensor(new_ratings).cpu())
            return loss.cpu().data.item()
        else:
            loss = self.loss_fn(predictions, ratings.cpu())
            return loss.cpu().data.item()
from surprise import KNNBasic, Reader, Prediction
from surprise import Dataset
from surprise.model_selection import KFold
from surprise import accuracy

reader = Reader(line_format='user item rating', sep=' ', skip_lines=1,
                rating_scale=(1, 40000))
data = Dataset.load_from_file('collaborative.csv', reader=reader)

sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)

# 10-fold cross-validation: fit on each training fold, evaluate on the test fold
kf = KFold(n_splits=10)
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset, verbose=True)
    rmse = accuracy.rmse(predictions, verbose=True)
    mae = accuracy.mae(predictions, verbose=True)
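# A small extension sketch (an assumption, not from the source): collect the
# per-fold scores instead of overwriting them each iteration, then report
# the cross-validated averages.
rmses, maes = [], []
for trainset, testset in KFold(n_splits=10).split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmses.append(accuracy.rmse(predictions, verbose=False))
    maes.append(accuracy.mae(predictions, verbose=False))
print('mean RMSE:', sum(rmses) / len(rmses))
print('mean MAE:', sum(maes) / len(maes))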
for i in range(0, len(k_neig)):
    knnbasic_ambiente = KNNBasic(k=k_neig[i])
    perf = evaluate(knnbasic_ambiente, data, measures=['RMSE', 'MAE'], verbose=0)
    print('k =', k_neig[i], 'mean RMSE:', np.array(perf['rmse']).mean())

# the best k for the ambience model is 40
knnbasic_ambiente = KNNBasic(k=40)

# Retrieve the trainset.
trainset = data.build_full_trainset()
knnbasic_ambiente.train(trainset)

from sklearn.externals import joblib
joblib.dump(knnbasic_ambiente, 'knnbasic_ambiente.pkl')

# ###### food KNN ######
train_reducido[['id_usuario', 'id_restaurante', 'rating_comida',
                'fecha']].to_csv('knn_comida.csv', index=False)
file_path = 'knn_comida.csv'
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# importing surprise package and builtin data
from surprise import Dataset, evaluate
from surprise import KNNBasic
from collections import defaultdict

# loading data
dataset = Dataset.load_builtin("ml-100k")
trainingSet = dataset.build_full_trainset()
trainingSet

# cosine similarity between item vectors
sim_options = {'name': 'cosine', 'user_based': False}
knn = KNNBasic(sim_options=sim_options)

# training the model
knn.train(trainingSet)

# movie recommendations for users
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

# top five movie recommendations for each user
def get_top5_recommendations(predictions, topN=5):
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        # sort each user's predictions by estimated rating, keep the top N
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]
    return top_recs
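# A hedged usage sketch (not in the original): print each user's top-5 list
# produced by the function above.
top5 = get_top5_recommendations(predictions)
for uid, user_ratings in top5.items():
    print(uid, [iid for (iid, _) in user_ratings])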