def user_factorization(data_raw, user_clusters, params):
    """Train a local NMF model for every user cluster and gather estimates.

    Each cluster's known ratings are fitted separately; estimates are then
    produced for both the observed pairs (testset) and the unobserved pairs
    (anti-testset) of that cluster, and finally left-joined onto the full
    user/movie grid.

    Args:
        data_raw: DataFrame with "User", "Movie", "Prediction" and a
            "user cluster" column.
        user_clusters: number of user clusters to process.
        params: dict with "LOCAL_U_NMF_K" (latent factors) and
            "LOCAL_U_NMF_EPOCHS" (SGD epochs).

    Returns:
        DataFrame with columns ["uid", "iid", "est"] covering the full grid
        (NaN "est" where no cluster produced an estimate).
    """
    latent_factors = params["LOCAL_U_NMF_K"]
    pieces = [pd.DataFrame()]
    for cluster_id in range(user_clusters):
        cluster_rows = data_raw[data_raw["user cluster"] == cluster_id]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            cluster_rows[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        model = NMF(n_factors=latent_factors,
                    n_epochs=params["LOCAL_U_NMF_EPOCHS"],
                    verbose=True)
        model.fit(trainset)
        # Estimates for the pairs seen during training ...
        pieces.append(pd.DataFrame(model.test(trainset.build_testset())))
        # ... and for every unseen pair within this cluster.
        pieces.append(pd.DataFrame(model.test(trainset.build_anti_testset())))
    user_df = pd.concat(pieces, ignore_index=False, copy=False)
    # Align the per-cluster estimates with the complete user/movie grid.
    all_u_m = get_all_u_m()
    user_df = all_u_m.merge(user_df, how="left", on=["uid", "iid"])
    user_df = user_df[["uid", "iid", "est"]]
    logging.info("return from user_factorization")
    return user_df
def item_factorization(data_raw, item_clusters, user_df, params):
    """Train a local NMF model for every item cluster and gather estimates.

    Each cluster's known ratings are fitted separately; estimates are
    produced for both the observed pairs (testset) and the unobserved pairs
    (anti-testset), then aligned to the (uid, iid) grid of ``user_df``.

    Args:
        data_raw: DataFrame with "User", "Movie", "Prediction" and an
            "item cluster" column.
        item_clusters: number of item clusters to process.
        user_df: DataFrame whose ["uid", "iid"] pairs define the output grid.
        params: dict with "LOCAL_I_NMF_K" (latent factors) and
            "LOCAL_I_NMF_EPOCHS" (SGD epochs).

    Returns:
        DataFrame aligned with user_df's pairs; "est" is 0 where no cluster
        produced an estimate.
    """
    n_factors = params["LOCAL_I_NMF_K"]
    item_df = pd.DataFrame()
    for i in range(item_clusters):
        i_i = data_raw[data_raw["item cluster"] == i]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            i_i[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        algo = NMF(n_factors=n_factors,
                   n_epochs=params["LOCAL_I_NMF_EPOCHS"],
                   verbose=True)
        algo.fit(trainset)
        # Estimates for the pairs observed during training.
        testset = trainset.build_testset()
        predictions_train = pd.DataFrame(algo.test(testset))
        # Estimates for all remaining (unseen) pairs within this cluster.
        testset = trainset.build_anti_testset()
        predictions_rest = pd.DataFrame(algo.test(testset))
        item_df = pd.concat([item_df, predictions_train, predictions_rest],
                            ignore_index=False, copy=False)
    item_df = user_df[["uid", "iid"]].merge(item_df, how="left",
                                            on=["uid", "iid"])
    # BUG FIX: the original used chained indexing
    # (item_df["est"].loc[mask] = 0), which assigns through an intermediate
    # object and triggers SettingWithCopyWarning; it is not guaranteed to
    # update the frame. Assign through a single .loc call instead.
    item_df.loc[item_df["est"].isnull(), "est"] = 0
    logging.info("return from item_factorization")
    return item_df
def nmf(data, training, testing): ''' Tune NMF parameters then calculates RMSE, coverage and running time of NMF Args: data(Dataset): the whole dataset divided into 5 folds training(Dataset): training dataset testing(Dataset): test dataset Returns: rmse: RMSE of NMF with optimized parameters top_n: number of unique predictions for top n items ''' # candidate parameters nmf_param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]} # optimize parameters grid_search = GridSearch(NMF, nmf_param_grid, measures=['RMSE'], verbose=False) grid_search.evaluate(data) param = grid_search.best_params['RMSE'] print('NMF:', param) # fit model using the optimized parameters nmf = NMF(n_factors=param['n_factors'], n_epochs=param['n_epochs']) nmf.train(training) # evaluate the model using test data predictions = nmf.test(testing) top_n = get_top_n(predictions, n=5) rmse = accuracy.rmse(predictions, verbose=True) return rmse, top_n
def predict_NMF(userid):
    """Recommend movies for ``userid`` with an out-of-the-box surprise NMF.

    Fits NMF on a random 75% split of ``ratings_small.csv``, takes the
    top-5 estimated items per user from the test fold, and maps the ids of
    ``userid``'s recommendations to titles from ``movies_metadata.csv``.

    Args:
        userid: raw user id to look up in the top-N predictions.

    Returns:
        DataFrame of recommended titles for ``userid`` (rows after the
        first two), or None when the user is absent from the test fold.
    """
    df = pd.read_csv('ratings_small.csv').drop(['timestamp'], axis=1)
    # NOTE(review): MovieLens ratings are 0.5–5; rating_scale=(1, 30)
    # looks wrong — confirm against the data.
    reader = Reader(rating_scale=(1, 30))
    # Load the ratings into a surprise Dataset.
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']],
                                reader=reader)
    # 75% training / 25% test split.
    trainset, testset = train_test_split(data, test_size=.25)
    # Fit NMF and rank the test-fold estimates per user.
    algo = NMF()
    algo.fit(trainset)
    pred_nmf = algo.test(testset)
    top_nmf_n = get_top_n(pred_nmf, n=5)

    movie_titles = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])
    movie_titles = movie_titles.rename(columns={'id': 'movieId'})
    movie_titles['movieId'] = pd.to_numeric(movie_titles['movieId'],
                                            errors='coerce').fillna(0)
    movie_titles['movieId'] = movie_titles['movieId'].astype('int')
    # BUG FIX: drop_duplicates() returns a new frame; the original call
    # discarded its result, so duplicates were never removed.
    movie_titles = movie_titles.drop_duplicates()

    for uid, user_ratings in top_nmf_n.items():
        if (uid == userid):
            title_list = [iid for (iid, _) in user_ratings]
            titles = movie_titles[movie_titles.movieId.isin(title_list)]
            # NOTE(review): [2:] skips the first two matches — confirm intent.
            print(titles[2:])
            return titles[2:]
def recommender_nmf_baseline(self, train_file, test_file, output):
    """Fit a default-parameter NMF recommender and print its test RMSE/MAE.

    Args:
        train_file: path to the training ratings file.
        test_file: path to the test ratings file.
        output: unused here; kept for interface parity with sibling methods.

    Returns:
        The fitted surprise NMF model.
    """
    train, test, train_dataset, test_dataset = prepare_datasets(
        train_file, test_file)

    # Out-of-the-box NMF acts as the baseline; user/item orientation is
    # whatever the default sim options provide.
    algo_nmf_baseline = NMF()
    algo_nmf_baseline.fit(train)
    predictions_nmf_baseline = algo_nmf_baseline.test(test, verbose=False)

    rmse_score = rmse(predictions_nmf_baseline, verbose=False)
    mae_score = mae(predictions_nmf_baseline, verbose=False)
    print('NMF BASELINE: ' + ' RMSE ' + str(rmse_score) + ' MAE ' +
          str(mae_score))

    return algo_nmf_baseline
def do_nmf(data_raw, impute_params):
    """Impute the missing entries of the User x Movie matrix with NMF.

    Fits surprise's NMF on all observed ratings, predicts every unobserved
    (user, movie) pair via the anti-testset, and returns the completed
    matrix as a numpy array.

    Args:
        data_raw: DataFrame with "User", "Movie", "Prediction" columns.
        impute_params: dict with "FACTORS" (latent size) and "EPOCHS".

    Returns:
        numpy array pivoted User x Movie containing observed ratings plus
        NMF estimates for every previously missing cell.
    """
    # BUG FIX: the original first pivoted data_raw into `data` and
    # immediately discarded that result (it was rebuilt from scratch at the
    # end) — dead work on a potentially large frame, now removed.
    reader = surprise.Reader(rating_scale=(1, 5))
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()
    algo = NMF(n_factors=impute_params["FACTORS"],
               n_epochs=impute_params["EPOCHS"],
               verbose=True)
    algo.fit(trainset)
    # Predict every (user, movie) pair without an observed rating.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    predictions = pd.DataFrame(predictions)
    predictions.rename(columns={
        "uid": "User",
        "iid": "Movie",
        "est": "Prediction"
    }, inplace=True)
    predictions = predictions[["User", "Movie", "Prediction"]]
    # Observed ratings plus estimates cover the full grid exactly once each.
    data = pd.concat([data_raw, predictions], ignore_index=True)
    data = data.pivot(index="User", columns="Movie",
                      values="Prediction").to_numpy()
    return data
def colaborative_filtering_based_model(path, config, engine, df_valid_games):
    """Build top-10 game recommendations per user with surprise NMF.

    Reads one JSON inventory per line from ``path``, converts playtimes to
    1-5 ratings, fits NMF on all users, predicts each user's not-yet-owned
    games, and writes the per-user top-10 to the configured MySQL table.

    Args:
        path: file with one {user_id: inventory} JSON object per line.
        config: provides ``mysql_user_like_table`` (target table name).
        engine: SQLAlchemy engine for the output ``to_sql`` call.
        df_valid_games: DataFrame with a ``steam_appid`` column defining the
            candidate item universe.
    """
    with open(path, 'r') as f:
        raw_strings = f.readlines()
    total_count = len(raw_strings)
    current_count = 0
    user_ratings = []
    # Playtimes are log-compressed then min-max scaled into the 1-5 range.
    scaler = MinMaxScaler((1, 5))
    for raw_string in raw_strings:
        user_id, user_inventory = list(json.loads(raw_string).items())[0]
        if user_inventory is not None:
            app_ids = [item['appid'] for item in user_inventory]
            app_scores = [item['playtime_forever'] for item in user_inventory]
            # NOTE(review): fit_transform is re-fit per user, so each user's
            # ratings are scaled within that user's own playtime range —
            # confirm this is intended rather than a global scale.
            app_scores = scaler.fit_transform(
                np.log1p(app_scores).reshape(-1, 1))
            user_ratings_temp = [[user_id, app_ids[i], app_scores[i].item()]
                                 for i in range(len(app_ids))]
            user_ratings += user_ratings_temp
        show_work_status(1,total_count,current_count)
        current_count+=1
    user_item_ratings = pd.DataFrame(user_ratings)
    user_item_ratings.columns = ['user_id', 'item_id', 'rating']
    # Prediction part
    game_ids_set = set(df_valid_games.steam_appid)
    grouped_user_item_ratings = user_item_ratings.groupby('user_id')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_item_ratings[['user_id', 'item_id', 'rating']], reader)
    alg = NMF(n_factors=20)
    alg.fit(data.build_full_trainset())
    total_count = len(user_item_ratings.user_id.unique())
    current_count = 0
    dict_user_recommendations = {}
    for user in user_item_ratings.user_id.unique().tolist():
        temp = grouped_user_item_ratings.get_group(user)
        # Candidate items = valid games the user does not already own.
        not_purchased_ids = game_ids_set - set([str(x) for x in temp.item_id])
        # Dummy rating 0 — only the estimates produced by test() are used.
        user_test_temp = [[user, not_purchased_id, 0]
                          for not_purchased_id in not_purchased_ids]
        user_test_temp = pd.DataFrame(user_test_temp)
        user_test_temp.columns = ['user_id', 'item_id', 'rating']
        data = Dataset.load_from_df(
            user_test_temp[['user_id', 'item_id', 'rating']], reader)
        user_test = data.build_full_trainset().build_testset()
        results = alg.test(user_test)
        # Keep only the 10 highest-estimated unseen games (column 1 = iid).
        dict_user_recommendations.update({
            user: pd.DataFrame(results).sort_values(
                'est', ascending=False).iloc[:10, 1].values.tolist()
        })
        show_work_status(1,total_count,current_count)
        current_count+=1
    df_cf_based_results = pd.DataFrame(dict_user_recommendations).T
    df_cf_based_results.index.name = 'user_id'
    df_cf_based_results.reset_index(inplace=True)
    df_cf_based_results.to_sql(config.mysql_user_like_table, engine,
                               if_exists='replace')
def nmf(train, test, ids, Xtest, Xids):
    """Non Negative Matrix Factorization for blending.

    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('NMF')
    model = NMF(n_factors=20,
                n_epochs=50,
                random_state=15,
                reg_pu=0.5,
                reg_qi=0.05)

    # Fit the algorithm on the training ratings.
    model.fit(train)

    # In-sample error on the training pairs.
    train_preds = model.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_preds, verbose=False))

    # Held-out error on the test pairs.
    test_preds = model.test(test)
    rmse = accuracy.rmse(test_preds, verbose=False)
    print(' Test RMSE: ', rmse)

    # Collect the testset estimates as a dense vector for blending.
    preds_test = np.array([pred.est for pred in test_preds])

    # Estimate every requested unknown (user, item) pair.
    preds_ids = [
        model.predict(str(ids[0][k]), str(ids[1][k])).est
        for k in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def nmf_running_time(data):
    '''
    Calculates the running times for training and predictions for NMF

    Args:
        data(Dataset): a list of datasets with different numbers of users

    Returns:
        elapsedtime_NMFtrain: running time for training
        elapsedtime_NMFtest: running time for predictions on testset
    '''
    elapsedtime_NMFtrain = []
    elapsedtime_NMFtest = []

    # tune the parameters on the entire data
    # NOTE(review): GridSearch and nmf.train() belong to the pre-1.0 surprise
    # API; newer releases use GridSearchCV and fit() — confirm the pinned
    # version. Tuning uses data[3] only (presumably the largest dataset —
    # verify against the caller).
    param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}
    grid_search = GridSearch(NMF, param_grid, measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        # training running time
        # NOTE(review): the timed span also covers building the trainset and
        # the (potentially huge) anti-testset, not just fitting — confirm
        # that is the intended measurement.
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        nmf = NMF(n_factors=n_factors, n_epochs=n_epochs)
        nmf.train(training)
        elapsedtime_NMFtrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        nmf.test(testing)
        elapsedtime_NMFtest.append(time.time() - test_start)
    return elapsedtime_NMFtrain, elapsedtime_NMFtest
def trainFinalModels(ratingsTrainDataset, ratingsTest, bestParamsNMF,
                     bestParamsKNN):
    """Fit the final NMF and KNN models on the full training data.

    Each model is trained with its tuned parameters, persisted via
    saveModel, and its RMSE/MAE on the held-out ratings recorded via
    saveFinalResult.

    Args:
        ratingsTrainDataset: surprise Dataset used to build the trainset.
        ratingsTest: testset for the final evaluation.
        bestParamsNMF: keyword parameters for NMF.
        bestParamsKNN: keyword parameters for KNNWithMeans.
    """
    trainset = ratingsTrainDataset.build_full_trainset()

    for label, algo_cls, algo_params in (('NMF', NMF, bestParamsNMF),
                                         ('KNN', KNNWithMeans,
                                          bestParamsKNN)):
        model = algo_cls(**algo_params)
        model.fit(trainset)
        saveModel(model, label)
        predictions = model.test(ratingsTest)
        rmseValue = rmse(predictions)
        maeValue = mae(predictions)
        saveFinalResult(label, rmseValue, maeValue)
class NonNegative_MF(BaseSurpriseSTLEstimator):
    """
    Nonnegative Matrix Factorization

    Args:
        :attr:`n_factors` (int): number of latent vectors/factors for matrix factorization
        :attr:`n_epochs` (int): Integer, The number of iteration of the SGD procedure. Default is 20

    see https://surprise.readthedocs.io/en/stable/matrix_factorization.html for more info
    """
    def __init__(self, n_factors, n_epochs=50, name='NonNegative_MF'):
        super().__init__(name, 'non_feature_based')
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        # Wrapped surprise estimator; rebuilt whenever hyper-params change.
        self.model = NMF(n_factors=self.n_factors, n_epochs=self.n_epochs)

    def _fit(self, x):
        # Delegate training to the wrapped surprise NMF model.
        self.model.fit(x)

    def _predict(self, x):
        # Delegate prediction (surprise's test()) to the wrapped model.
        return self.model.test(x)

    def get_hyper_params(self):
        """Return the tunable hyper-parameter search space."""
        hparams = {
            'n_factors': {
                'type': 'integer',
                'values': [2, 150]
            },
            'n_epochs': {
                'type': 'integer',
                'values': [2, 150]
            }
        }
        return hparams

    def set_hyper_params(self, **kwargs):
        """Apply tuned hyper-parameters to this estimator.

        BUG FIX: previously only ``n_factors`` was stored; ``n_epochs`` was
        silently dropped even though get_hyper_params advertises it, and the
        wrapped model was never rebuilt, so new settings had no effect on
        subsequent fits.
        """
        self.n_factors = kwargs['n_factors']
        # Optional for backward compatibility with callers that only tuned
        # n_factors.
        if 'n_epochs' in kwargs:
            self.n_epochs = kwargs['n_epochs']
        # Rebuild the underlying model so the new settings take effect.
        self.model = NMF(n_factors=self.n_factors, n_epochs=self.n_epochs)
def algoFunc(train_data, test_data):
    """Fit five recommenders and report RMSE/MAE for each.

    Evaluates SVD, PMF (unbiased SVD), NMF, and user-/item-based KNN on the
    same train/test split.

    Args:
        train_data: surprise trainset.
        test_data: surprise testset.

    Returns:
        Tuple of (RMSE, MAE) pairs, flattened, in the order:
        SVD, PMF, NMF, user-based KNN, item-based KNN.
    """
    def _evaluate(algo):
        # Fit on the trainset, predict the testset, return (RMSE, MAE).
        # accuracy.* print their own values (verbose=True), matching the
        # original output.
        algo.fit(train_data)
        predict_var = algo.test(test_data)
        return (accuracy.rmse(predict_var, verbose=True),
                accuracy.mae(predict_var, verbose=True))

    print("Singular Value Decomposition :\n")
    SVD_RMSE_var, SVD_MAE_var = _evaluate(SVD())

    print("\nProbabilistic Matrix Factorization :\n")
    PMF_RMSE_var, PMF_MAE_var = _evaluate(SVD(biased=False))

    print("\nNon-negative Matrix Factorization :\n")
    NMF_RMSE_var, NMF_MAE_var = _evaluate(NMF())

    print("\nUser based Collaborative Filtering algorithm :\n")
    user_RMSE_var, user_MAE_var = _evaluate(
        KNNBasic(sim_options={'user_based': True}))

    print("\nItem based Collaborative Filtering algorithm :\n")
    item_RMSE_var, item_MAE_var = _evaluate(
        KNNBasic(sim_options={'user_based': False}))

    print("\n")
    return SVD_RMSE_var, SVD_MAE_var, PMF_RMSE_var, PMF_MAE_var, \
        NMF_RMSE_var, NMF_MAE_var, user_RMSE_var, user_MAE_var, \
        item_RMSE_var, item_MAE_var
def train_algo(this_data):
    """
    Fit a Non-negative Matrix Factorization algo to the data.

    Args:
        this_data - surprise.dataset; the loaded json data.
    Returns:
        predictions - surprise library object; all predictions generated by algo.
    """
    print("Running algo...")
    full_trainset = this_data.build_full_trainset()

    # Unbiased NMF with fixed capacity/epochs.
    model = NMF(biased=False, n_epochs=50, n_factors=35)
    model.fit(full_trainset)

    # Score every (user, item) pair absent from the training data.
    unseen_pairs = full_trainset.build_anti_testset()
    predictions = model.test(unseen_pairs)
    print("Getting predictions...")
    return predictions
class NMF_Cosine_Recommender:
    """Hybrid recommender combining NMF rating prediction with TF-IDF
    cosine similarity over movie names, genres, and user tags.

    @author Will Jobs
    """
    def __init__(self, df_users, df_movies, df_ratings, df_movie_lens_tags,
                 biased=False):
        """Store the input frames and initialize (empty) model state.

        Args:
            df_users: DataFrame of users; must expose a ``userID`` column.
            df_movies: DataFrame of movies; must expose ``movieID``, ``name``
                and ``genre1``/``genre2``/``genre3`` columns.
            df_ratings: DataFrame of (user, movie, rating) triples.
            df_movie_lens_tags: MovieLens tag frame with ``userId``/``movieId``
                /``tag`` columns.
            biased: passed through to surprise's NMF ``biased`` option.
        """
        self.users = df_users
        self.movies = df_movies
        self.ratings = df_ratings
        self.ml_tags = df_movie_lens_tags
        self.biased = biased
        # Stage flags: each pipeline step runs at most once.
        self.trained_nmf = False
        self.preprocessed = False
        self.trained_cosine = False
        self.cv_score = None
        self.cv_fit_time = None
        self.movies_merged = pd.DataFrame()
        self.nmf_predictions = pd.DataFrame()
        self.tfidf_matrix = None
        self.algo = None
        self.W = None
        self.H = None

    def preprocess_tags(self, verbose=True):
        """Aggregate per-movie tags and merge them with names/genres.

        Populates ``movies_merged`` (with a punctuation-stripped ``tags_strip``
        text column used for TF-IDF) and appends a combined ``tags`` column to
        ``self.movies``. Idempotent: returns immediately once run.

        Args:
            verbose (bool, optional): print progress. Defaults to True.
        """
        if self.preprocessed:  # only do this once
            return

        if verbose:
            print('Preprocessing tags and movie information...', end='')

        self.ml_tags.rename(columns={
            'userId': 'userID',
            'movieId': 'movieID'
        }, inplace=True)
        self.ml_tags = self.ml_tags.astype({'tag': str})

        tmp_tags = self.ml_tags.copy()
        tmp_movies = self.movies.copy()

        # replace punctuation in tags (a space), movie name (a space), and
        # genres (no space). These will eventually be folded into the tags list
        # doing it this way to avoid altering the original tags during
        # presentation later
        tmp_tags['new_tag'] = tmp_tags.tag.str.replace(r'[^\w\s]', ' ')
        tmp_movies['new_name'] = tmp_movies.name.str.replace(r'[^\w\s]', ' ')
        tmp_movies['new_genre1'] = tmp_movies.genre1.str.replace(
            r'[^\w\s]', '')
        tmp_movies['new_genre2'] = tmp_movies.genre2.str.replace(
            r'[^\w\s]', '')
        tmp_movies['new_genre3'] = tmp_movies.genre3.str.replace(
            r'[^\w\s]', '')

        # aggregate all users' tags up per movie
        tags_nostrip = tmp_tags.groupby('movieID').tag.apply(
            ' '.join).reset_index()
        tags_nostrip.rename(columns={'tag': 'tags'}, inplace=True)

        tags_strip = tmp_tags.groupby('movieID').new_tag.apply(
            ' '.join).reset_index()
        tags_strip = tags_nostrip.merge(tags_strip, on='movieID')

        # merge name, genres, and tags together
        self.movies_merged = tmp_movies.merge(tags_strip,
                                              on='movieID',
                                              how='left')
        # missing genres arrive as NaN (float) after the merge — hence the
        # type(...) != float guards below
        self.movies_merged['tags_strip'] = self.movies_merged.apply(
            lambda x: '{} {} {} {} {}'.format(
                x['new_name'], x['new_genre1'],
                x['new_genre2'] if type(x['new_genre2']) != float else "",
                x['new_genre3'] if type(x['new_genre3']) != float else "",
                x['new_tag']),
            axis=1)

        self.movies_merged.drop(columns=[
            'new_tag', 'new_name', 'new_genre1', 'new_genre2', 'new_genre3'
        ], inplace=True)

        # merge in the combined tags (with punctuation)
        self.movies = self.movies.merge(tags_nostrip,
                                        on='movieID',
                                        how='left')
        self.preprocessed = True

        if verbose:
            print('Done')

    def train_cosine_similarity(self, seed=None, verbose=True):
        """Build the TF-IDF matrix over ``movies_merged.tags_strip``.

        Idempotent: returns immediately once run.

        Args:
            seed (optional): seeds ``random`` before vectorizing.
            verbose (bool, optional): print progress. Defaults to True.

        Raises:
            RuntimeError: if preprocess_tags has not been run yet.
        """
        if not self.preprocessed:
            raise RuntimeError(
                'Cannot train cosine similarity until preprocessing is done (via preprocess_tags)'
            )

        if self.trained_cosine:  # only do this once
            return

        if seed is not None:
            random.seed(seed)

        # ignore English stop words and terms in fewer than 3 documents
        vectorizer = TfidfVectorizer(stop_words='english', min_df=3)

        if verbose:
            print('Cosine similarity training...', end='')

        self.tfidf_matrix = vectorizer.fit_transform(
            self.movies_merged['tags_strip'])
        self.trained_cosine = True

        if verbose:
            print('Done')

    def run_nmf(self,
                n_factors=15,
                run_cross_validation=True,
                cv_metric='RMSE',
                seed=None,
                verbose=True):
        """Fit NMF on all ratings and predict every user-movie pair.

        Optionally cross-validates first (5 folds), storing the average CV
        score and fit time. Stores the factor matrices in ``W``/``H`` and all
        predictions (with actual ratings merged in) in ``nmf_predictions``.

        Args:
            n_factors (int, optional): latent dimension. Defaults to 15.
            run_cross_validation (bool, optional): Defaults to True.
            cv_metric (str, optional): which CV measure to store. Defaults to 'RMSE'.
            seed (optional): seeds ``random`` and NMF's ``random_state``.
            verbose (bool, optional): print progress. Defaults to True.
        """
        # ratings get clipped from 1 to 5
        reader = Reader(rating_scale=(1.0, 5.0))
        data = Dataset.load_from_df(self.ratings, reader)

        # first, calculate CV on a fraction of the dataset
        if run_cross_validation:
            if verbose:
                print('Running cross-validation...', end='')

            if seed is not None:
                random.seed(seed)

            algo = NMF(n_factors=n_factors,
                       biased=self.biased,
                       random_state=seed)
            cv_results = cross_validate(algo,
                                        data,
                                        measures=['RMSE'],
                                        cv=5,
                                        verbose=False)
            avg_cv_result = pd.DataFrame.from_dict(cv_results).mean(axis=0)
            self.cv_score = avg_cv_result['test_' + cv_metric.lower()]
            self.cv_fit_time = avg_cv_result['fit_time']

            if verbose:
                print('Done')
                print('Average CV score: {}\nAverage fit time: {} seconds'.
                      format(round(self.cv_score, 4),
                             round(self.cv_fit_time, 4)))

        if seed is not None:
            random.seed(seed)

        # ratings must have 3 cols: users, items, ratings (in that order)
        train_set = data.build_full_trainset()
        self.algo = NMF(n_factors=n_factors,
                        biased=self.biased,
                        random_state=seed)

        if verbose:
            print('NMF Fitting...', end='')

        self.algo.fit(train_set)
        self.W = self.algo.pu
        self.H = np.transpose(self.algo.qi)

        # get predictions for *every* user/movie combo. These will be also
        # compared to the actual ratings
        if verbose:
            print('Done')
            print('Generating all user-movie pairs for predictions...',
                  end='')

        # dummy true rating 0 — only the estimates are used afterwards
        all_pairs = [(x, y, 0) for x in self.users.userID
                     for y in self.movies.movieID]

        # getting predictions for ALL user/movie combos
        # took 40 seconds on 3.4 million rows
        if verbose:
            print('Done')
            print('Calculating predictions on all user-movie pairs...',
                  end='')

        all_preds = self.algo.test(all_pairs)
        all_preds = pd.DataFrame([{
            'userID': y.uid,
            'movieID': y.iid,
            'nmf_prediction': y.est
        } for y in all_preds])

        # keep the actual rating (NaN where unseen) next to the estimate
        self.nmf_predictions = all_preds.merge(self.ratings,
                                               on=['userID', 'movieID'],
                                               how='left')
        self.nmf_predictions = self.nmf_predictions[[
            'userID', 'movieID', 'rating', 'nmf_prediction'
        ]]

        self.trained_nmf = True

        if verbose:
            print('Done')

    def train(self,
              n_factors=15,
              run_cross_validation=True,
              seed=None,
              verbose=True):
        """Run the full pipeline: preprocess, TF-IDF, then NMF.

        Args:
            n_factors (int, optional): latent dimension. Defaults to 15.
            run_cross_validation (bool, optional): Defaults to True.
            seed (optional): passed to the individual steps.
            verbose (bool, optional): print progress. Defaults to True.
        """
        self.preprocess_tags(verbose=verbose)
        self.train_cosine_similarity(seed=seed, verbose=verbose)
        self.run_nmf(n_factors=n_factors,
                     run_cross_validation=run_cross_validation,
                     seed=seed,
                     verbose=verbose)

    def get_similar_movies(self, movieID, number_of_movies=None,
                           verbose=True):
        """Rank all movies by TF-IDF cosine similarity to ``movieID``.

        Args:
            movieID: movie to find neighbors for.
            number_of_movies (int, optional): cap on returned rows; all
                positive-similarity movies when None.
            verbose (bool, optional): print progress. Defaults to True.

        Raises:
            RuntimeError: if preprocessing or cosine training has not run.

        Returns:
            DataFrame sorted by descending similarity, excluding ``movieID``
            itself (the top row is dropped).
        """
        if not (self.preprocessed and self.trained_cosine):
            raise RuntimeError(
                'Cannot make recommendations without training NMF, preprocessing, and training cosine first.'
            )

        # get the index of the movie
        idx = np.where(self.movies_merged.movieID == movieID)[0][0]

        if verbose:
            print('Getting similar movies to ' +
                  self.movies_merged.iloc[idx]['name'] + '...',
                  end='')

        y = cosine_similarity(self.tfidf_matrix[idx], self.tfidf_matrix)
        idx_scores = pd.DataFrame(
            [(idx, score) for (idx, score) in enumerate(list(y[0]))
             if score > 0],
            columns=['idx', 'similarity'])

        result = pd.concat([
            self.movies_merged.iloc[idx_scores.idx].reset_index(), idx_scores
        ],
                           axis=1).sort_values(by='similarity',
                                               ascending=False)

        # get rid of transformed columns from movies_merged (except tag), and
        # get the *original* name and genres with punctuation
        result.drop(columns=[
            x for x in [*self.movies_merged.columns, 'index', 'idx']
            if x != 'movieID'
        ], inplace=True)
        result = result.merge(self.movies, on='movieID', how='left')
        result = result[[
            'movieID', 'name', 'year', 'genre1', 'genre2', 'genre3', 'tags',
            'similarity'
        ]]

        if verbose:
            print('Done')

        # don't include the movie we're finding similarities for
        if number_of_movies is not None:
            return result[1:].head(number_of_movies)
        else:
            return result[1:]

    def get_recommendations(self,
                            userID,
                            number_of_recs=5,
                            seed=None,
                            show_user_likes=True,
                            verbose=True):
        """Recommend movies for ``userID`` via similarity-weighted ratings.

        Algorithm:
        1. Get 20 of the users' top ratings. Start with 5s, if > 20 exist,
           sample 20 randomly.
        2. If fewer than 20 5s exist, sample 4s until get to 20 (or use up
           all 4s).
            - If there are no 5s or 4s, ignore the user's ratings, and just
              return the <number_of_recs> top predicted ratings for this
              user. Done.
        3. For each movie in the top list, calculate cosine similarity, and
           get the 10 most-similar movies which the user has NOT seen.
        4. Combine the 20 most-similar lists of 10 movies into a single list.
        5. Remove duplicates from this list, choosing the highest-similarity
           achieved
        6. For each movie, look up the predicted rating for this user.
        7. Multiply each movie's similarity times the predicted rating.
        8. Return the top <number_of_recs> predicted movies (or all if not
           enough). Done.

        Args:
            userID ([type]): user to recommend for.
            number_of_recs (int, optional): [description]. Defaults to 5.
            seed ([type], optional): [description]. Defaults to None.
            show_user_likes (bool, optional): print the seed movies.
            verbose (bool, optional): [description]. Defaults to True.

        Raises:
            RuntimeError: if any pipeline stage has not been run.
            ValueError: if ``userID`` is unknown.

        Returns:
            pandas DataFrame: expected ratings. Columns: movieID, name,
            genres, weighted_rating
        """
        MAX_CONSIDERED_RATINGS = 20
        CONSIDER_N_SIMILAR = 10

        def combine_genres(df):
            # combine genres into a single column. Note that NaNs parse as
            # float during apply
            df['genres'] = df.apply(lambda row: (row['genre1'] if not type(row['genre1'])==float else "") + \
                ("/" + row['genre2'] if not type(row['genre2'])==float else "") + \
                ("/" + row['genre3'] if not type(row['genre3'])==float else ""), axis=1)
            df.drop(columns=['genre1', 'genre2', 'genre3'], inplace=True)

        def get_subset_ratings():
            # pick up to MAX_CONSIDERED_RATINGS seed movies: 5-star ratings
            # first, topped up with 4-star ratings
            if verbose:
                print("Getting user's highest rated movies to start from...",
                      end='')

            all_5s = self.ratings[(self.ratings.userID == userID)
                                  & (self.ratings.rating == 5)]

            if len(all_5s) >= MAX_CONSIDERED_RATINGS:
                subset_ratings = all_5s.sample(MAX_CONSIDERED_RATINGS,
                                               random_state=seed)
            else:
                # use all 5s, and add in 4s until we have
                # <MAX_CONSIDERED_RATINGS>
                subset_ratings = all_5s.copy()
                all_4s = self.ratings[(self.ratings.userID == userID)
                                      & (self.ratings.rating == 4)]
                count_needed = MAX_CONSIDERED_RATINGS - len(all_5s)
                subset_ratings = pd.concat([
                    subset_ratings,
                    all_4s.sample(min(count_needed, len(all_4s)),
                                  random_state=seed)
                ],
                                           ignore_index=True)

            subset_ratings = subset_ratings.merge(
                self.movies[['movieID', 'name']], on='movieID')

            if verbose:
                print('Done')

            return subset_ratings[['userID', 'movieID', 'name', 'rating']]

        def get_most_similar_movies(subset_ratings):
            # union of the top-N similar unseen movies for every seed movie,
            # deduplicated keeping the highest similarity
            if verbose:
                print("Finding similar movies to {} movies the user liked...".
                      format(len(subset_ratings)))

            seen_movies = list(
                self.ratings[(self.ratings.userID == userID)].movieID)
            similar_movies = pd.DataFrame()

            for movie in subset_ratings.movieID:
                tmp_similar = self.get_similar_movies(movie, verbose=verbose)

                # limit to movies the user hasn't seen, and limit to top
                # <CONSIDER_N_SIMILAR>
                tmp_similar = tmp_similar[~tmp_similar['movieID'].isin(
                    seen_movies)].head(CONSIDER_N_SIMILAR)
                tmp_similar['similar_to'] = subset_ratings[
                    subset_ratings['movieID'] == movie].name.values[0]
                similar_movies = pd.concat([similar_movies, tmp_similar],
                                           ignore_index=True)

            # now remove duplicates, and get the top similarity for each movie
            similar_movies.sort_values(by='similarity',
                                       ascending=False,
                                       inplace=True)
            similar_movies.drop_duplicates(subset='movieID',
                                           keep='first',
                                           inplace=True)

            return similar_movies

        if not (self.trained_nmf and self.preprocessed
                and self.trained_cosine):
            raise RuntimeError(
                'Cannot make recommendations without training NMF, preprocessing, and training cosine first.'
            )

        if userID not in self.users.userID.values:
            raise ValueError(
                'User {} does not exist in ratings dataset. If this is a new user, create a new user using the average ratings.'
                .format(userID))

        if seed is not None:
            random.seed(seed)

        review_counts = self.ratings[self.ratings.userID ==
                                     userID].rating.value_counts()

        if review_counts.get(5, 0) + review_counts.get(4, 0) == 0:
            # ignore user's ratings, and just get the user's top
            # <number_of_recs> ratings
            if verbose:
                print(
                    "User has no ratings >= 4. Ignoring user's ratings, returning top predicted ratings."
                )

            # get only predicted ratings for ones the user hasn't seen
            subset_ratings = self.nmf_predictions.loc[
                (self.nmf_predictions.userID == userID)
                & (self.nmf_predictions.rating.isna())].copy()
            subset_ratings = subset_ratings.merge(self.movies, on='movieID')
            combine_genres(subset_ratings)

            # add in columns that would have been calculated
            subset_ratings['similar_to'] = ""
            subset_ratings['similarity'] = np.nan
            subset_ratings['weighted_rating'] = subset_ratings[
                'nmf_prediction']

            # reorder columns
            subset_ratings = subset_ratings[[
                'movieID', 'name', 'year', 'genres', 'tags', 'similar_to',
                'similarity', 'nmf_prediction', 'weighted_rating'
            ]]
            subset_ratings.sort_values(by='nmf_prediction',
                                       ascending=False,
                                       inplace=True)

            return subset_ratings.head(number_of_recs)

        # get up to <MAX_CONSIDERED_RATINGS> 5s
        subset_ratings = get_subset_ratings()

        if show_user_likes:
            print('\n---------------\nHighest-reviewed movies for userID {}:'.
                  format(userID))
            print(subset_ratings)
            print('\n---------------\n')

        # get the similarity for each movie in subset_ratings
        similar_movies = get_most_similar_movies(subset_ratings)

        # now we have the similarity scores for the movies most like the
        # movies the user rated highest
        # get the predicted ratings, and multiply those by the similarity
        # scores
        if verbose:
            print(
                "Getting user's predicted ratings and calculated expected rating...",
                end='')

        user_predictions = self.nmf_predictions[
            self.nmf_predictions['userID'] == userID]
        similar_movies = similar_movies.merge(user_predictions,
                                              on='movieID',
                                              how='inner')
        similar_movies['weighted_rating'] = similar_movies[
            'similarity'] * similar_movies['nmf_prediction']

        if verbose:
            print('Done')
            print("Finalizing output...", end='')

        # combine genres and reorder columns
        combine_genres(similar_movies)
        similar_movies = similar_movies[[
            'movieID', 'name', 'year', 'genres', 'tags', 'similar_to',
            'similarity', 'nmf_prediction', 'weighted_rating'
        ]]
        similar_movies.sort_values(by='weighted_rating',
                                   ascending=False,
                                   inplace=True)

        if verbose:
            print('Done')

        return similar_movies.head(number_of_recs)
# --- Script fragment: build dense prediction lookups from surprise output ---
# NOTE(review): `predictions`, `trainset`, `testset`, `movie_matrix`, and
# `app` are defined earlier in the file (outside this excerpt).
all_predictions = {}
# seed entry with placeholder string value — purpose unclear; verify intent
all_predictions['uid'] = {}
all_predictions['uid']['iid'] = 'est'
# NOTE(review): each iteration rebinds all_predictions[uid] to a fresh dict,
# so only the LAST (uid, iid) pair per user survives the loop — looks like a
# bug; accumulation via setdefault/defaultdict was probably intended.
for uid, iid, true_r, est, _ in predictions:
    all_predictions[uid] = {}
    all_predictions[uid][iid] = est
# all_predictions[uid] =
# NOTE(review): fillna with a scalar fills every NaN cell with the single
# estimate left over from the loop's final iteration — confirm intent.
svd_movie_matrix = movie_matrix.fillna(all_predictions[uid][iid])

# NMF - non negative matrix factorization
nmfAlgo = NMF()
nmfAlgo.fit(trainset)
nmf_predictions = nmfAlgo.test(testset)
# print(nmf_predictions)

all_nmf_predictions = {}
all_nmf_predictions['uid'] = {}
all_nmf_predictions['uid']['iid'] = 'est'
# NOTE(review): same per-iteration overwrite issue as the loop above.
for uid, iid, true_r, est, _ in nmf_predictions:
    all_nmf_predictions[uid] = {}
    all_nmf_predictions[uid][iid] = est

nmf_movie_matrix = movie_matrix.fillna(all_nmf_predictions[uid][iid])

# evaluate(algo, data, measures=['RMSE'])

if __name__ == '__main__':
    app.run(debug=True)
# --- Script fragment: RMSE vs. number of NMF factors (Question 19) ---
# NOTE(review): `testset`, `to_delete`, `trimmed_testset`, `total_trimmed`,
# and `total_train` are defined earlier in the file (outside this excerpt);
# this first loop may itself sit inside an outer fold loop — confirm.
for i in range(len(testset)):
    # keep only test entries whose item is not in the deletion list
    if (testset[i][1] not in to_delete):
        trimmed_testset.append(testset[i])
total_trimmed.append(trimmed_testset)

rmse_mean = []
# sweep the latent dimension k over 2, 4, ..., 50
for i in range(2, 52, 2):
    print("%d---------------------------------" % i)
    algo = NMF(n_factors=i, random_state=1)
    for j in range(0, len(total_train)):
        algo.fit(total_train[j])
        predictions = algo.test(total_trimmed[j])
        # Compute and print Root Mean Squared Error
        # NOTE(review): `mean` is reset on every fold, so np.mean(mean) is
        # just the single fold's RMSE and rmse_mean gets one entry per
        # (k, fold) pair; the plot below assumes one entry per k — this is
        # consistent only when len(total_train) == 1. Likely the reset and
        # the rmse_mean.append were meant to sit outside the fold loop.
        mean = []
        mean.append(accuracy.rmse(predictions, verbose=True))
        rmse_mean.append(np.mean(mean))
# NOTE(review): rmse_mean_array is never used afterwards in this fragment.
rmse_mean_array = np.asarray(rmse_mean)

print('============================================')
print('Question 19:')
fig = plt.figure()
plt.plot(range(2, 52, 2), rmse_mean, 'b-')
plt.xlabel('K Value')
plt.ylabel('RMSE')
plt.show()
data.raw_ratings = train_raw_ratings # Finds best parameters for NMF model with bias # Scores using MSE params = {"biased": [True], "n_factors": np.arange(2, 12, 2)} nmf = GridSearchCV(NMF, params, measures=["mse"], cv=3) nmf.fit(data) print("\nBest number of factors found:", nmf.best_params['mse']['n_factors']) # Trains NVM using best parameters found best_nmf = NMF(biased=True, n_factors=nmf.best_params['mse']['n_factors']) best_nmf.fit(data.build_full_trainset()) # Tests on training set predictions = best_nmf.test(data.build_full_trainset().build_testset()) mse = accuracy.mse(predictions, verbose=False) print("Training Set MSE:", mse) # Scores on test set predictions = best_nmf.test(data.construct_testset(test_raw_ratings)) mse = accuracy.mse(predictions, verbose=False) print("Test Set MSE:", mse) # Checks Recommendations # ----------------------- recs = defaultdict(list) # List of recommendations for each user num_recs = 5 # Number of recommendations to get for each user for uid, iid, true_r, est, _ in predictions: recs[uid].append((iid, est))
def compute_recommendations():
    """Fit an NMF model on the full ratings table, write each user's top-10
    recommended items to the `recommendations` table, log them to
    `predictionlogs`, and record global MAE/RMSE for the run."""
    #connecting to the database
    # engine = create_engine("mysql://*****:*****@localhost/ratingsx?charset=utf8", echo=True)
    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    blockPrint()

    #reading in the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    #formatting the dataset using the surprise library
    reader = Reader(line_format='user item rating', sep=',',
                    rating_scale=(1, 5))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    training_set = data.build_full_trainset()

    algorithm = NMF()  # non-negative matrix factorization
    # Bug fix: AlgoBase.train() was removed in surprise 1.1; fit() is the
    # supported API (and what the rest of this file uses).
    algorithm.fit(training_set)  # fit the data to the model
    testing_set = training_set.build_anti_testset()
    predictions = algorithm.test(testing_set)  # make prediction

    #writing the function for top predictions
    def get_top_n(predictions, n=10):
        # Return the top-N recommendation for each user from a set of predictions.
        # Args:
        #   predictions(list of Prediction objects): The list of predictions, as
        #       returned by the test method of an algorithm.
        #   n(int): The number of recommendation to output for each user. Default
        #       is 10.
        # Returns:
        #   A dict where keys are user (raw) ids and values are lists of tuples:
        #   [(raw item id, rating estimation), ...] of size n.

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    # getting the top 10 predictions
    top_n = get_top_n(predictions, n=10)

    # Collect the recommended item ids for each user
    a = []
    for uid, user_ratings in top_n.items():
        a.append([uid, [iid for (iid, _) in user_ratings]])

    df_list_pred = pd.DataFrame.from_records(a, columns=['A', 'B'])
    df_user = pd.DataFrame(df_list_pred.A.values.tolist())
    df_pred = pd.DataFrame(df_list_pred.B.values.tolist())
    df_pred.columns = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]

    df_items = pd.read_sql('SELECT * FROM items;', con=engine)
    # df_pred = df_pred.applymap(lambda x: df_items.loc[x, 'title'])
    df_pred[['id']] = df_user
    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]
    df_pred['id'] = df_pred['id'].astype(int)

    # Append recomemndations
    df_pred.to_sql('recommendations', engine, if_exists='append',
                   index=False)  #if_exists='append'
    session.commit()

    #logging the predictions
    # Bug fix: `df_log = df_pred` aliased the same DataFrame, so adding the
    # 'algorithm' column also mutated df_pred; copy first.
    df_log = df_pred.copy()
    df_log['algorithm'] = 'NMF'
    df_log = df_log.rename(columns={'id': 'user_id'})
    df_log = df_log[[
        'user_id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10', 'algorithm'
    ]]
    df_log.to_sql('predictionlogs', engine, if_exists='append',
                  index=False)  #if_exists='append'
    session.commit()

    global mae2
    global rmse2
    mae2 = accuracy.mae(predictions)
    rmse2 = accuracy.rmse(predictions)
    mae2 = float(mae2)
    rmse2 = float(rmse2)
# Binary-relevance threshold on ratings.
thres = 3

file_path = os.path.expanduser('ratings.csv')
reader = Reader(line_format='user item rating timestamp', sep=',',
                skip_lines=1)
data_raw = Dataset.load_from_file(file_path, reader=reader)
train, test = train_test_split(data_raw, test_size=0.1)

# Three candidate models: user-based KNN, NMF and biased SVD.
sim_options = {'name': 'pearson', 'user_based': True}
algo1 = KNNWithMeans(k = 24, sim_options=sim_options)
# !!!!!! need to revise after part 4 done
algo2 = NMF(n_factors = 18, random_state = 1)
algo3 = SVD(n_factors = 16, random_state = 1, biased=True)

for model in (algo1, algo2, algo3):
    model.fit(train)

predictions1 = algo1.test(test)
predictions2 = algo2.test(test)
predictions3 = algo3.test(test)

# target1: true ratings binarised at the threshold (1 = liked).
target1 = [1 if rating > thres else 0 for _, _, rating in test]

# target2: the raw true ratings themselves.
target2 = [rating for _, _, rating in test]
# define a cross-validation iterator kf = KFold(n_splits=10) for k in k_values: algo = NMF(n_factors=k) this_k_RMSE = list() this_k_MAE = list() # use cross-validation iterator, perform CV manually for trainset, testset in kf.split(data): # fit the whole trainset algo.fit(trainset) #trim testset here testset_df = pd.DataFrame(testset, columns=['userId', 'movieId', 'rating']) testset_popular = testset_df.groupby("movieId").filter( lambda x: len(x) > 2).values.tolist() # testset_unpopular = testset_df.groupby("movieId").filter(lambda x: len(x) <= 2).values.tolist() # testset_highvariance =testset_df.groupby("movieId").filter(lambda x: np.var(x['rating'])>=2 and len(x)>=5 ).values.tolist() predictions = algo.test(testset_popular) this_k_RMSE.append(accuracy.rmse(predictions, verbose=True)) RMSE.append(np.mean(this_k_RMSE)) plt.figure(figsize=[6, 5]).set_tight_layout(True) plt.plot(k_values, RMSE, label='RMSE') plt.xlabel('k') plt.legend(loc="upper right")
# Load the movielens-1m dataset (UserID::MovieID::Rating::Timestamp).
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Fit an NMF model for a few epochs.
algoritmo = NMF(n_epochs=5)
algoritmo.fit(trainset)

# User and movie under inspection:
# user 49 is 18-24 years old, a programmer living in Houston, Texas.
uid = str(49)
# Movie watched and rated: Negotiator, The (1998)::Action|Thriller, rated 4.
iid = str(2058)  # raw item id

# Predict that single (user, item) rating.
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# Score the trained model on the held-out test set.
test_pred = algoritmo.test(testset)

# RMSE on the test set
print("Avaliação RMSE: ")
accuracy.rmse(test_pred, verbose=True)

# MAE on the test set
print("Avaliação MAE: ")
accuracy.mae(test_pred, verbose=True)
# Evaluate KNNWithMeans and NMF on this split and collect per-group
# measures for user groups U1-U4.

print("KNNMean")
knnmean = KNNWithMeans(sim_options={"name": "cosine"})
knnmean.fit(trainset)
knnmean_predictions = knnmean.test(testset)
# Bug fix: U4 was passed U3_users (copy-paste), silently duplicating
# group 3's metrics for group 4. The disabled TOP block below already
# passes U4=U4_users.
results = get_group_measures(preds_all=knnmean_predictions,
                             U1=U1_users,
                             U2=U2_users,
                             U3=U3_users,
                             U4=U4_users)
knnmean_results.append(results)

print("NMF")
nmf = NMF()
nmf.fit(trainset)
nmf_predictions = nmf.test(testset)
# Same U4 copy-paste fix as above.
results = get_group_measures(preds_all=nmf_predictions,
                             U1=U1_users,
                             U2=U2_users,
                             U3=U3_users,
                             U4=U4_users)
nmf_results.append(results)

"""print("TOP")
top = TOP()
top.fit(trainset)
top_predictions = top.test(testset)
results = get_group_measures(preds_all=top_predictions,
                             U1=U1_users,
                             U2=U2_users,
                             U3=U3_users,
                             U4=U4_users)
top_results.append(results)
"""

print("NormalPredictor")
rand = NormalPredictor()
def matrix_factorize_for_latent_feats():
    """Return a DataFrame of the episodes strongest in each NMF latent feature.

    Employs matrix factorization (NMF) of user scores for the episodes
    they've seen. Typically, this type of recommender algorithm aims to
    predict reviewer scores for episodes that users *haven't* seen, with
    user and episode latent features as a byproduct. This function returns
    a dataframe of episodes strongest in each latent feature instead.

    Inputs:
    ---
    None as yet. This code should just run once the function is called.

    Output:
    ---
    *matfact_lf_recs: Pandas DataFrame; DF containing the top episodes for
    each latent feature (as determined by the algorithm)
    """
    # First, we must build our reviewer DFs from which we will factorize user ratings:
    rev_rvr_df = pd.read_pickle("data/combined_rev_rvr_df.pkl.bz2")
    # Drop duplicate values in reviews and reviewer DF:
    rev_rvr_df.drop_duplicates(inplace=True)
    summary_df = pd.read_pickle("data/combined_info_rev_df.pkl.bz2")

    # Here, we create dictionaries for episode IDs and reviewer IDs to be applied
    # to the recommendation DFs we'll be creating below:
    id_ep = dict(enumerate(rev_rvr_df["ep_title"].unique()))
    ep_id = dict((values, keys) for keys, values in id_ep.items())
    id_rvr = dict(enumerate(rev_rvr_df["reviewer"].unique()))
    rvr_id = dict((values, keys) for keys, values in id_rvr.items())

    # Append reviewer and episode IDs to reviews DF using the dictionaries created above:
    rev_rvr_df["rvr_id"] = None
    rev_rvr_df["ep_id"] = None
    rev_rvr_df.loc[:, 'ep_id'] = rev_rvr_df["ep_title"].map(
        lambda x: ep_id.get(x, np.nan))
    rev_rvr_df.loc[:, "rvr_id"] = rev_rvr_df["reviewer"].map(
        lambda x: rvr_id.get(x, np.nan))

    # Fill empty reviewer scores with mean values so we can create our utility matrix
    # and so the NMF algo can work without throwing errors:
    rev_rvr_df["rvr_rating"].fillna(value=rev_rvr_df["rvr_rating"].mean(),
                                    inplace=True)

    # Creating reader object to read in our DF. This is a necessary step to create
    # the Dataset object from our DF that the Surprise library likes to work with:
    reader = Reader(rating_scale=(1.0, 10.0))
    data = Dataset.load_from_df(rev_rvr_df[["rvr_id", "ep_id", "rvr_rating"]],
                                reader=reader)

    # Splitting our data into training and testing sets for the algorithm:
    trainset, testset = train_test_split(data, test_size=0.2)

    # Creating and fitting our NMF algorithm with optimal hyperparameters
    # (per gridsearching at a previous date):
    optimal_nmf = NMF(n_factors=20,
                      n_epochs=100,
                      reg_bi=0.15,
                      reg_qi=0.3,
                      lr_bu=0.005,
                      lr_bi=0.001,
                      biased=True,
                      verbose=False)
    optimal_nmf.fit(trainset)

    # Brief aside to check our error metrics, for those curious:
    predicted_nmf = optimal_nmf.test(testset)
    print(accuracy.mae(predicted_nmf))
    print(accuracy.rmse(predicted_nmf))

    # Now we create a DF for ALL episodes and their latent feature scores in the columns:
    latent_feat = pd.DataFrame(optimal_nmf.qi)

    # Append episode titles to the big latent_feat DF using the id_ep dictionary we
    # created above (as a reminder: that's the dictionary where keys are episode
    # indices in the big latent_feat DF and vals are episode titles):
    # NOTE(review): qi rows are indexed by Surprise *inner* item ids, which are not
    # guaranteed to follow the enumeration order behind id_ep — confirm the
    # alignment before trusting the appended titles.
    latent_feat["ep_title"] = pd.Series(id_ep)

    # Creating a list of episodes that have the highest value for each latent feature
    # so that we can create a separate DF of just these episodes:
    max_lf_vals = list(latent_feat.iloc[:, :-1].max())

    # Here, we create a new DF to contain ONLY the episodes that are highest in each
    # latent feature:
    max_lf_df = pd.DataFrame(columns=latent_feat.columns)
    for col_idx, vals in enumerate(max_lf_vals):
        temp_df = pd.DataFrame(latent_feat[latent_feat.loc[:, col_idx] == vals])
        max_lf_df = pd.concat([max_lf_df, temp_df])

    # Appending series-season-episode identifiers, IMDb user rating, and episode
    # titles from the summary_df by merging on episode titles (now that both have
    # ep titles):
    max_lf_df = max_lf_df.merge(summary_df[[
        "series", "season", "episode", "ep_title", "identifier",
        "IMDB_user_rating"
    ]],
                                on="ep_title")
    matfact_lf_recs = max_lf_df.loc[:, [
        "series", "season", "episode", "ep_title", "identifier",
        "IMDB_user_rating"
    ]]
    return matfact_lf_recs
#knnoptm=30
#nnmfoptm=20
#mfoptm=8

trainset, testset = train_test_split(data, test_size=0.1)

#def make_prediction(filter_type):
#if filter_type=='KNN':
#sim_options={'name': 'pearson','user_based':True}
algo = NMF(n_factors=20)
#elif filter_type='NNMF':
#filter_=NMF(n_factors=20)
#elif filter_type='MFbias':
#filter_=SVD(n_factors=8,biased=True)
#filter_.fit(trainset)
algo.fit(trainset)
predictions = algo.test(testset)

# Binarise the true ratings at 3 (>= 3 counts as relevant).
truth_table = np.array([prediction.r_ui for prediction in predictions])
truth_table_copy = truth_table.copy()
truth_table_copy[truth_table < 3] = 0
truth_table_copy[truth_table >= 3] = 1

# Group (estimate, true rating) pairs per user.
user_dic = defaultdict(list)
for uid, _, tru_r, est, _ in predictions:
    user_dic[uid].append((est, tru_r))
# Bug fix: `print 'dic created'` is Python 2 statement syntax and a
# SyntaxError under Python 3; use the print() function.
print('dic created')

def precision_recall(predictions, t):
    precision = dict()
    recall = dict()
class MovieRecommender:
    """Movie recommender combining an item-based KNN (similar-movie queries)
    with an NMF model (per-user recommendations), trained in a background
    thread after initialize()."""

    def __init__(self):
        self._knn = None          # KNNBaseline for item-item similarity
        self._nmf = None          # NMF for rating estimates on unseen pairs
        self._trainset = None
        self._predictions = None
        self.initialized = False  # set True once _train() completes

    def initialize(self, data_filepath):
        """Load the ratings file, build both models and train asynchronously."""
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()
        # Bug fix: _thread.start_new_thread(function, args) requires the
        # args tuple; calling it with only the function raises TypeError.
        start_new_thread(self._train, ())

    def get_similar_movies(self, movie_id, k=10):
        """Return up to k movies most similar to movie_id via item-based KNN."""
        if not self.initialized:
            return []
        model = self._knn
        movie_inner_id = model.trainset.to_inner_iid(movie_id)
        similar_movie_inner_ids = model.get_neighbors(movie_inner_id, k=k)
        to_raw_iid = model.trainset.to_raw_iid
        similar_movie_ids = (to_raw_iid(inner_id)
                             for inner_id in similar_movie_inner_ids)
        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        """Return the num_movies unseen movies with the highest NMF estimate."""
        if not self.initialized:
            return []
        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction[0] == user_id
        ]
        sorted_predictions = sorted(user_predictions,
                                    key=lambda x: x.est,
                                    reverse=True)
        top_n_predictions = sorted_predictions[:num_movies]
        similar_movie_ids = (prediction.iid
                             for prediction in top_n_predictions)
        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        """Insert or replace a user's rating, then retrain synchronously."""
        if not self.initialized:
            return
        rating = float(rating)
        has_previous_rating = False
        # NOTE(review): knows_user()/ur[] expect Surprise *inner* ids —
        # confirm user_id is an inner id at this point.
        if self._trainset.knows_user(user_id):
            trainset_dict = dict(self._trainset.ur[user_id])
            has_previous_rating = movie_id in trainset_dict
        user_id = str(user_id)
        movie_id = str(movie_id)
        new_rating = (user_id, movie_id, rating, time())
        if has_previous_rating:
            # Replace the existing raw rating in place. (Loop variable
            # renamed from `rating`, which shadowed the parameter.)
            for i, raw in enumerate(self._data.raw_ratings):
                if raw[0] == user_id and raw[1] == movie_id:
                    self._data.raw_ratings[i] = new_rating
                    break
        else:
            self._data.raw_ratings.append(new_rating)
        self._trainset = self._data.build_full_trainset()
        self._train()

    def _train(self):
        # Bug fix: AlgoBase.train() was removed in surprise 1.1; fit() is
        # the supported API (and what the rest of this file uses).
        self._nmf.fit(self._trainset)
        self._knn.fit(self._trainset)
        self._predictions = self._nmf.test(
            self._trainset.build_anti_testset())
        self.initialized = True
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Fit SVD, SVD++, NMF (plus larger-capacity variants) on the score
    matrix, 5-fold cross-validate each, and build top-5 recommendation
    frames merged with report names.

    NOTE(review): despite the return annotation, the original function
    ends without a return statement; left as-is pending a decision on
    which frame to return.
    """
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    print (score_df)
    print (svd_data.raw_ratings)

    # ---- SVD --------------------------------------------------------
    algo_svd = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svd, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    # Fit on the full trainset and predict every unseen pair
    algo_svd.fit(full_train_set)
    predictions = algo_svd.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    latent_usr_factor = algo_svd.pu
    latent_item_factor = algo_svd.qi
    user_bias = algo_svd.bu
    item_bias = algo_svd.bi
    recomendation_reportname_df_svd = pd.merge(recommendation_df_svd,
                                               df_reports_id,
                                               how='left', on='report_id')

    # ---- SVD++ ------------------------------------------------------
    algo_svdpp = SVDpp()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    score = cross_validate(algo_svdpp, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    algo_svdpp.fit(full_train_set)
    predictions = algo_svdpp.test(test_set)
    accuracy.rmse(predictions)
    recommendation_df_svdpp = get_top_n(predictions, n=5)
    # Bug fix: the latent factors/biases below were copy-pasted from
    # algo_svd; read them from the SVD++ model itself.
    latent_usr_factor_pp = algo_svdpp.pu
    latent_item_factor_pp = algo_svdpp.qi
    user_bias_pp = algo_svdpp.bu
    item_bias_pp = algo_svdpp.bi
    recomendation_reportname_df_svdpp = pd.merge(recommendation_df_svdpp,
                                                 df_reports_id,
                                                 how='left', on='report_id')

    # ---- SVD++ with more factors (default is 20) --------------------
    algo_svdpp_mod = SVDpp(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # Bug fix: the modified model itself is validated and fitted here
    # (the original reused algo_svdpp, leaving algo_svdpp_mod untrained).
    score = cross_validate(algo_svdpp_mod, svd_data,
                           measures=['RMSE', 'MAE'], cv=5, verbose=True)
    algo_svdpp_mod.fit(full_train_set)
    predictions = algo_svdpp_mod.test(test_set)
    accuracy.rmse(predictions)
    print (score)

    # ---- NMF --------------------------------------------------------
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo_nmf = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    score = cross_validate(algo_nmf, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    algo_nmf.fit(full_train_set)
    predictions = algo_nmf.test(test_set)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    recommendation_df_nmf = get_top_n(predictions, n=5)
    # Bug fix: NMF factors were copy-pasted from algo_svd.
    latent_usr_factor_nmf = algo_nmf.pu
    latent_item_factor_nmf = algo_nmf.qi
    user_bias_nmf = algo_nmf.bu
    item_bias_nmf = algo_nmf.bi
    recomendation_reportname_df_mmf = pd.merge(recommendation_df_nmf,
                                               df_reports_id,
                                               how='left', on='report_id')
    # Bug fix: the original indexed the undefined name
    # `recomendation_reportname_df`.
    sidd_recmidation = recomendation_reportname_df_mmf.loc[
        recomendation_reportname_df_mmf['user_sso'] == 212568816]

    # ---- NMF with non-default capacity ------------------------------
    algo_nmf_mod = NMF(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # Bug fix: validate/fit the modified model, not algo_nmf again.
    score = cross_validate(algo_nmf_mod, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    algo_nmf_mod.fit(full_train_set)
    predictions = algo_nmf_mod.test(test_set)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    recommendation_df_nmf = get_top_n(predictions, n=5)
    latent_usr_factor_nmf = algo_nmf_mod.pu
    latent_item_factor_nmf = algo_nmf_mod.qi
    user_bias_nmf = algo_nmf_mod.bu
    item_bias_nmf = algo_nmf_mod.bi
    recomendation_reportname_df_mmf = pd.merge(recommendation_df_nmf,
                                               df_reports_id,
                                               how='left', on='report_id')
    sidd_recmidation = recomendation_reportname_df_mmf.loc[
        recomendation_reportname_df_mmf['user_sso'] == 212568816]

    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(algo_svd, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(algo_svdpp, svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(algo_nmf, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv_mod = cross_validate(algo_svdpp_mod, svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv_mod = cross_validate(algo_nmf_mod, svd_data, cv=5, n_jobs=5, verbose=False)
from surprise import NMF
from surprise.model_selection import train_test_split
from surprise import Dataset
from surprise import Reader

# Ratings file format: "user,item,rating" on a 0-10 scale.
rating_reader = Reader(line_format='user item rating', sep=',',
                       rating_scale=(0, 10))
data = Dataset.load_from_file('user_ratings1.data', reader=rating_reader)
data.raw_ratings

# 30% train / 70% test split with a fixed seed for reproducibility.
trainset, testset = train_test_split(data,
                                     test_size=.7,
                                     train_size=.3,
                                     random_state=1)

model = NMF(n_factors=25, n_epochs=200, random_state=1)
#trainset = data.build_full_trainset()
model.fit(trainset)

from surprise import accuracy

pred = model.test(testset)
accuracy.rmse(pred), accuracy.mae(pred)

# Show the first 20 individual predictions.
for idx in range(20):
    p = pred[idx]
    print('user: %-10s item: %-10s r_ui: %-10s est: %.2f %-10s'
          % (p.uid, p.iid, p.r_ui, p.est, p[-1]))
def surpriseNMF(mode,
                DataPath='../data/data_clean.txt',
                TrainPath='../data/train_clean.txt',
                TestPath='../data/test_clean.txt',
                n_factors=15,
                n_epochs=50,
                reg_pu=0.06,
                reg_qi=0.06,
                reg_bu=0.02,
                reg_bi=0.02,
                lr_bu=0.005,
                lr_bi=0.005,
                init_low=0,
                init_high=1,
                biased=False,
                verbose=True):
    """Fit Surprise's NMF on the cleaned rating files.

    mode == 'evaluation': train on TrainPath and return
        (V=qi, U=pu, train_err, test_err) where the errors are RMSE.
    mode == 'visualization': train on DataPath and return the 2-D SVD
        projections (Vproj, Uproj, train_err).
    """
    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    def _load_trainset(path):
        # Read a tab-separated ratings file into a full Surprise trainset.
        frame = pd.read_csv(path, sep="\t", header=None)
        frame.columns = ["User Id", "Movie Id", "Rating"]
        loaded = Dataset.load_from_df(
            frame[["User Id", "Movie Id", "Rating"]], reader)
        return loaded.build_full_trainset()

    def _fit_model(trainset):
        # Build and fit NMF with the configured hyperparameters.
        model = NMF(n_factors=n_factors,
                    n_epochs=n_epochs,
                    reg_pu=reg_pu,
                    reg_qi=reg_qi,
                    reg_bu=reg_bu,
                    reg_bi=reg_bi,
                    lr_bu=lr_bu,
                    lr_bi=lr_bi,
                    init_low=init_low,
                    init_high=init_high,
                    biased=biased,
                    verbose=verbose)
        model.fit(trainset)
        return model

    def _rmse_on(model, trainset):
        # RMSE of the model over every known rating in `trainset`.
        preds = model.test(trainset.build_testset())
        return accuracy.rmse(preds, verbose=False)

    if mode == 'evaluation':
        trainset = _load_trainset(TrainPath)
        algo = _fit_model(trainset)
        train_err = _rmse_on(algo, trainset)
        test_err = _rmse_on(algo, _load_trainset(TestPath))
        # Return V (qi), U (pu), train_err (RMSE), test_err (RMSE)
        return algo.qi, algo.pu, train_err, test_err
    elif mode == 'visualization':
        trainset = _load_trainset(DataPath)
        algo = _fit_model(trainset)
        train_err = _rmse_on(algo, trainset)
        U = algo.pu
        V = algo.qi
        # Project U and V onto the top-2 left-singular directions of V^T.
        A, _, B = np.linalg.svd(V.T)
        A = A.T
        Asub = A[:, :2]
        Uproj = np.dot(Asub.T, U.T)
        Vproj = np.dot(Asub.T, V.T)
        # Return Vproj, Uproj, train_err (RMSE of Y = U^T V)
        return Vproj, Uproj, train_err
def main(rec='SVD', threshold=4, topK=10):
    """Train the chosen recommender ('SVD', 'NMF', or item-based KNN as the
    fallback) on the local MovieLens-100k file, build top-N
    recommendations, and print ranking metrics (F1, NDCG, MAP, MRR)."""
    print("load data...")
    '''
    data = Dataset.load_builtin('ml-1m')

    # test set is made of 40% of the ratings.
    test_size = 0.4
    trainset, testset = train_test_split(data, test_size=test_size)
    '''
    # path to dataset file
    test_data_path = r'C:\Users\abc\.surprise_data\ml-100k\ml-100k\u.data'
    file_path = os.path.expanduser(
        r'C:\Users\abc\.surprise_data\ml-100k\ml-100k\u.data')
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()
    # held-out triples consumed by evaluate_model_new below
    test_user, test_item, test_rate = read_data(test_data_path)

    print("training...")
    sim_options = {
        'name': 'cosine',
        'user_based': False  # item-item similarity
    }
    # choose the algorithm
    if rec == 'NMF':
        algo = NMF()
        name = ['NMF']  # consistency fix: every branch now defines `name`
    elif rec == 'SVD':
        algo = SVD()
        name = ['SVD']
    else:
        algo = KNNBaseline(sim_options=sim_options)
        name = ['ItemKNN']

    train_start = time.time()
    algo.fit(trainset)
    train_end = time.time()
    print('train time:%.1f s' % (train_end - train_start))

    # Predict ratings for all (u, i) pairs that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    test_end = time.time()
    print('test time:%.1f s' % (test_end - train_end))

    # top_n_est: per user, a list of (item id, estimated rating) tuples
    top_n_est, true_ratings = get_top_n(predictions, n=10,
                                        threshold=threshold)

    # model evaluation (map_ renamed: `map` shadowed the builtin)
    f1, map_, mrr, mndcg = evaluate_model_new(algo, test_user, test_item,
                                              test_rate, topK)
    eval_end = time.time()
    print('evaluate time:%.1f s' % (eval_end - test_end))
    print("algorithm : %s" % rec)
    print(
        'recommendation metrics: F1 : %0.4f, NDCG : %0.4f, MAP : %0.4f, MRR : %0.4f'
        % (f1, mndcg, map_, mrr))
    # Bug fix: pu.shape / qi.shape are tuples, and '%0.4f' % tuple raises
    # TypeError; report the user/item counts instead.
    # NOTE(review): KNNBaseline has no pu/qi — these prints fail for the
    # item-KNN branch; confirm intended usage.
    print('%d个用户' % algo.pu.shape[0])
    print('%d个物品' % algo.qi.shape[0])
    return top_n_est