def __init__(self, data):
    """Derive every train/test partition used for evaluation from ``data``.

    Args:
        data: Dataset object accepted by ``train_test_split`` and
            ``LeaveOneOut.split`` that also offers ``build_full_trainset()``.
    """
    # Seeded 75/25 split.
    self.trainSet, self.testSet = train_test_split(
        data, test_size=0.25, random_state=1)

    # A single LeaveOneOut split is requested; the pair yielded last is
    # the one that ends up stored on the instance.
    splitter = LeaveOneOut(1, random_state=1)
    for loo_train, loo_test in splitter.split(data):
        self.LOOX_trainSet = loo_train
        self.LOOX_testSet = loo_test
    del loo_train, loo_test

    self.LOOX_antitestSet = self.LOOX_trainSet.build_anti_testset()

    # Trainset built from all of ``data`` plus its anti-testset.
    self.full_trainSet = data.build_full_trainset()
    self.full_antitestSet = self.full_trainSet.build_anti_testset()
def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
    """Train one algorithm on the stored ratings and return its test RMSE.

    Args:
        method: Integer selecting the algorithm: 1 SVD, 2 SlopeOne, 3 NMF,
            4 NormalPredictor, 5 KNNBaseline, 6 KNNBasic, 7 KNNWithMeans,
            8 KNNWithZScore, 9 BaselineOnly; any other value selects
            CoClustering.
        similarityMeasure: 1 for "cosine", 2 for "pearson", anything else
            for "pearson_baseline". Only passed to the KNN algorithms.
        isUserBased: The string "Yes" selects user-based similarities; any
            other value selects item-based ones.

    Returns:
        float: RMSE on a random 20% hold-out, rounded to 4 decimals.
    """
    # Release the connection as soon as the ratings are in memory, and also
    # when the query itself fails, instead of holding it during training.
    conn = sqlite3.connect(DATABASE_NAME)
    try:
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)
    finally:
        conn.close()

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        df[['userID', 'glassID', 'relativeRating']], reader)
    trainset, testset = train_test_split(data, test_size=.20)

    similarity_names = {1: "cosine", 2: "pearson"}
    sim_options = {
        'name': similarity_names.get(similarityMeasure, "pearson_baseline"),
        'user_based': isUserBased == "Yes",
    }

    # Factories keep construction lazy: only the chosen algorithm is built.
    factories = {
        1: SVD,
        2: SlopeOne,
        3: NMF,
        4: NormalPredictor,
        5: lambda: KNNBaseline(sim_options=sim_options),
        6: lambda: KNNBasic(sim_options=sim_options),
        7: lambda: KNNWithMeans(sim_options=sim_options),
        8: lambda: KNNWithZScore(sim_options=sim_options),
        9: BaselineOnly,
    }
    algo = factories.get(method, CoClustering)()

    algo.fit(trainset)
    predictions = algo.test(testset)
    return round(accuracy.rmse(predictions, verbose=False), 4)
def __init__(self, data):
    """Build all evaluation partitions and fit an item-similarity model.

    Args:
        data: Dataset object accepted by ``train_test_split`` and
            ``LeaveOneOut.split`` that also offers ``build_full_trainset()``.
    """
    # Trainset over all of ``data`` and the matching anti-testset.
    full_train = data.build_full_trainset()
    self.fullTrainSet = full_train
    self.fullAntiTestSet = full_train.build_anti_testset()

    # Seeded 75/25 split.
    self.trainSet, self.testSet = train_test_split(
        data, test_size=0.25, random_state=1)

    # One LeaveOneOut split; the pair yielded last is what gets kept.
    splitter = LeaveOneOut(n_splits=1, random_state=1)
    for train, test in splitter.split(data):
        self.LOOCVTrain = train
        self.LOOCVTest = test
    self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

    # KNNBaseline with cosine, non-user-based similarities, fit on the
    # full trainset.
    self.simsAlgo = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.simsAlgo.fit(self.fullTrainSet)
def __init__(self, data, popularityRankings):
    """Store the rankings, build the data partitions and fit a similarity model.

    Args:
        data: Dataset object accepted by ``train_test_split`` that also
            offers ``build_full_trainset()``.
        popularityRankings: Stored unchanged as ``self.rankings``.
    """
    self.rankings = popularityRankings

    full_train = data.build_full_trainset()
    self.fullTrainSet = full_train
    self.fullAntiTestSet = full_train.build_anti_testset()

    # Seeded 75/25 split.
    self.trainSet, self.testSet = train_test_split(
        data, test_size=.25, random_state=1)

    # Compute the similarity matrix between items so diversity can be
    # measured.
    self.simsAlgo = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.simsAlgo.fit(self.fullTrainSet)
def test_svd(data):
    """Fit a fixed-hyperparameter SVD on ``data`` and return its test MSE.

    Args:
        data: DataFrame handed directly to ``Dataset.load_from_df`` with a
            1-5 rating scale.

    Returns:
        The value of ``accuracy.mse`` on a seeded 10% hold-out.
    """
    reader = Reader(rating_scale=(1, 5))
    svd_data = Dataset.load_from_df(data, reader)
    train_part, test_part = train_test_split(
        svd_data, test_size=.10, random_state=24)

    svd_model = SVD(n_factors=150, n_epochs=20, lr_all=0.008, reg_all=0.1,
                    random_state=24)
    svd_model.fit(train_part)

    return accuracy.mse(svd_model.test(test_part), verbose=False)
def binary_value(data, threshold):
    """Binarise true and predicted ratings of a KNNWithMeans model.

    Args:
        data: Dataset accepted by ``train_test_split``.
        threshold: A rating strictly above this value counts as 1, else 0.

    Returns:
        tuple: ``(like0, like)`` where ``like0`` holds the binarised true
        ratings and ``like`` the binarised predicted ratings, in the order
        of the test predictions.
    """
    trainset, testset = train_test_split(data, test_size=.1)
    algo = KNNWithMeans(k=30)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Field 2 of a prediction is the real rating, field 3 the estimate.
    like0 = [1 if pred[2] > threshold else 0 for pred in predictions]
    like = [1 if pred[3] > threshold else 0 for pred in predictions]
    return like0, like
def split_train_test(self,
                     surprise_dataset: Optional[Dataset] = None,
                     test_size: Optional[float] = None,
                     inplace: Optional[bool] = True):  # -> Optional[Tuple]
    """Split a dataset into train and test parts.

    Args:
        surprise_dataset: Dataset to split; ``self.surprise_dataset`` is
            used when this is ``None``.
        test_size: Passed to ``train_test_split``; ``self.test_size`` is
            used when this is ``None``.
        inplace: When truthy, store the parts on ``self.trainset`` and
            ``self.testset`` and return ``None``.

    Returns:
        ``(trainset, testset)`` when ``inplace`` is falsy, else ``None``.
    """
    dataset = (self.surprise_dataset if surprise_dataset is None
               else surprise_dataset)
    size = self.test_size if test_size is None else test_size

    train_part, test_part = train_test_split(dataset, size)
    if not inplace:
        return train_part, test_part
    self.trainset = train_part
    self.testset = test_part
def chose_yahoo(file_path):
    """Compare single-criterion and multi-criteria MAE on six rating files.

    Six tab-separated rating files (story, role, show, image, music, total)
    are loaded, split with the same seed, and each is fit with its own
    KNNWithMeans model. The per-criterion predictions are merged with
    ``combine`` and passed through ``totalRegModel`` / ``totalReg``.

    Args:
        file_path: String prefix to which the file names are appended
            directly (no path separator is inserted).

    Returns:
        tuple: ``(mae_single, mae_multi)`` - MAE of the model trained on the
        overall rating alone and MAE of the regression-combined predictions.
    """
    reader = Reader(line_format='timestamp user item rating', sep='\t')

    # Multi-criteria ratings: story, role, acting, visuals, music, and the
    # overall rating. The overall rating comes last on purpose.
    criteria = ('story', 'role', 'show', 'image', 'music', 'total')
    datasets = [
        Dataset.load_from_file(file_path + (name + '.txt'), reader=reader)
        for name in criteria
    ]

    # Original note: split the data 2:8. No test_size is passed, so the
    # library default applies.
    random_states = 180
    splits = [
        train_test_split(dataset, random_state=random_states)
        for dataset in datasets
    ]

    # NOTE(review): the original comments mention cosine, Pearson-baseline
    # and user-based filtering; the code uses plain 'pearson' with
    # user_based=False.
    sim_options = {'name': 'pearson', 'user_based': False}
    models = [KNNWithMeans(sim_options=sim_options) for _ in criteria]

    for model, (train_part, _) in zip(models, splits):
        model.fit(train_part)
    per_criterion = [
        model.test(test_part)
        for model, (_, test_part) in zip(models, splits)
    ]
    single_p = per_criterion[-1]

    # Overall regression across all criteria.
    P = combine(*per_criterion)
    df = pd.read_csv(file_path + 'all.txt', sep='\t',
                     names=['id', 'uid', 'mid', 'total', 'story', 'role',
                            'show', 'image', 'music'])
    k, b = totalRegModel(df)
    multi_p = totalReg(P, k, b, single_p)

    mae = (accuracy.mae(single_p), accuracy.mae(multi_p))
    return mae
def new_recommendations(df, new_ratings):
    """Rank every known book for the user who supplied ``new_ratings``.

    Args:
        df: DataFrame with at least ``user_id``, ``isbn`` and ``rating``.
        new_ratings: Anything ``pd.DataFrame`` accepts that yields the same
            three columns; the user is taken from its first row.

    Returns:
        list: ``(isbn, estimated_rating)`` tuples sorted by estimate,
        highest first.
    """
    columns = ['user_id', 'isbn', 'rating']
    df = df[columns]
    new_ratings = pd.DataFrame(new_ratings)[columns]
    new_df = pd.concat([df, new_ratings]).reset_index(drop=True)

    data = Dataset.load_from_df(new_df, Reader(rating_scale=(1, 5)))

    # NOTE(review): the model trains on a random 80% of the ratings, as
    # before; the held-out part was only used for a prediction pass whose
    # result was discarded, so that pass has been removed.
    train, _ = train_test_split(data, test_size=.2)
    model = SVD(n_epochs=17, lr_all=.015, reg_all=.125, n_factors=17)
    model.fit(train)

    # Positional access: the first row, whatever the index labels are.
    user_id = new_ratings.user_id.iloc[0]
    book_list = [
        (isbn, model.predict(user_id, isbn).est)
        for isbn in new_df.isbn.unique()
    ]
    return sorted(book_list, key=lambda pair: pair[1], reverse=True)
def __init__(self, data, withSim=False):
    """Build the evaluation partitions and, optionally, a similarity model.

    Args:
        data: Dataset object accepted by ``train_test_split`` and
            ``LeaveOneOut.split`` that also offers ``build_full_trainset()``.
        withSim: When truthy, also fit a cosine, non-user-based KNNBaseline
            on the full trainset and keep it as ``self.simAlgo``.
    """
    # Seeded 75/25 split.
    self.trainSet, self.testSet = train_test_split(
        data, test_size=0.25, random_state=0)

    # A single LeaveOneOut split; the pair yielded last is kept.
    splitter = LeaveOneOut(1, random_state=1)
    for loo_train, loo_test in splitter.split(data):
        self.LOOX_trainSet = loo_train
        self.LOOX_testSet = loo_test
    del loo_train, loo_test
    self.LOOX_antitestSet = self.LOOX_trainSet.build_anti_testset()

    self.full_trainSet = data.build_full_trainset()
    self.full_antitestSet = self.full_trainSet.build_anti_testset()

    if not withSim:
        return
    self.simAlgo = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.simAlgo.fit(self.full_trainSet)
def surprise_build_train_test(data_frame, reader, full=True):
    """Return a ``(train_set, test_set)`` pair built from a data-frame.

    Args:
        data_frame: Pandas data-frame passed to ``Dataset.load_from_df``.
        reader: `surprise` `Reader` instance.
        full: When truthy, the train set covers the whole data-frame and
            the test set is produced from that train set via
            ``build_testset()``. Otherwise a random split with
            ``test_size=0.2`` is returned.
    """
    raw_data = Dataset.load_from_df(data_frame, reader=reader)
    if not full:
        return train_test_split(raw_data, test_size=0.2)

    complete_train = raw_data.build_full_trainset()
    return complete_train, complete_train.build_testset()
def train(data):
    """Evaluate an ALS ``BaselineOnly`` model and print its extreme predictions.

    Prints the 3-fold cross-validation result, the RMSE on a random 25%
    hold-out, and the ten hold-out predictions with the smallest and the
    largest absolute error.

    Args:
        data: DataFrame with ``userID``, ``ISBN`` and ``bookRating`` columns.
    """
    reader = Reader(rating_scale=(0, 9))
    data = Dataset.load_from_df(data[['userID', 'ISBN', 'bookRating']], reader)

    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo = BaselineOnly(bsl_options=bsl_options)
    print(cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False))

    trainset, testset = train_test_split(data, test_size=0.25)
    # Fresh instance for the hold-out evaluation.
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)
    print(accuracy.rmse(predictions))

    def get_Iu(uid):
        """Return the number of items rated by the given user.

        Args:
            uid: The raw id of the user.

        Returns:
            The number of items the user rated in the trainset, or 0 when
            the user was not part of it.
        """
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(iid):
        """Return the number of users that have rated the given item.

        Args:
            iid: The raw id of the item.

        Returns:
            The number of users who rated the item in the trainset, or 0
            when the item was not part of it.
        """
        try:
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:  # item was not part of the trainset
            return 0

    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['Iu'] = df.uid.apply(get_Iu)
    df['Ui'] = df.iid.apply(get_Ui)
    df['err'] = abs(df.est - df.rui)

    # Both rankings are by absolute error. The previous code sorted the
    # "best" list by estimate, which lists the lowest estimates instead of
    # the most accurate predictions.
    by_error = df.sort_values(by='err')
    best_predictions = by_error[:10]
    worst_predictions = by_error[-10:]
    print(best_predictions)
    print(worst_predictions)
def predict():
    """Fit SVDpp on the default ratings and refresh the global ``top_n``.

    The ratings come from ``get_default_ratings()``; the model is fit on a
    random 70% of them and the predictions for the remaining 30% are
    reduced with ``get_top_n(..., n=10)``.
    """
    global top_n
    global user_id

    print("--predict start--------------------------------")

    ratings_frame = pd.DataFrame(get_default_ratings())
    dataset = Dataset.load_from_df(df=ratings_frame,
                                   reader=Reader(rating_scale=(0, 5)))
    train_part, test_part = train_test_split(dataset, test_size=0.3)

    model = SVDpp()
    model.fit(train_part)
    top_n = get_top_n(model.test(test_part), n=10)
def main():
    """Train an item-based cosine KNN on song ratings and dump top picks.

    Reads the first 5000 rows of the ratings file, fits ``KNNBasic`` on a
    random 75% of them, reports FCP on the rest and writes the top four
    predicted songs per user to ``predictions.txt``.
    """
    row_num = 5000
    # Ratings file as a DataFrame so that surprise can consume it.
    ratings_data = pd.read_csv('datasets/song_dataset_ranking.txt', sep="\t",
                               header=None, nrows=row_num)
    ratings_data.columns = ['userId', 'songId', 'rating']

    # Map each song key (column 0) to columns 1 and 3 of its row, e.g.
    # keysonisonioiaofnai: ['Smoke on the water', 'Deep purple'].
    # The context manager closes the file, which was previously left open.
    song_dict = {}
    with open('datasets/song_data.csv', 'rt') as song_data:
        for row in csv.reader(song_data, delimiter=',', quotechar='|'):
            song_dict[row[0]] = [row[1], row[3]]

    reader = Reader(rating_scale=(1, 100))
    data = Dataset.load_from_df(ratings_data, reader)
    training_set, testSet = train_test_split(data, test_size=.25)

    # Similarity settings must be passed through ``sim_options``. Given as
    # plain keyword arguments they were silently ignored, so the library's
    # default similarity was used instead of item-based cosine.
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    knn.fit(training_set)
    print("Done training")
    print("Test set length", len(testSet))
    print("testing")
    predictions = knn.test(testSet)
    print("getting recommendations")

    # Reports FCP (Fraction of Concordant Pairs).
    accuracy.fcp(predictions)

    top_n = get_top_n(predictions, 4)
    with open('predictions.txt', 'w') as out:
        for uid, user_ratings in top_n.items():
            out.write("prediction for " + str(uid) + ":\n")
            result_array = [find_song_info_in_data(iid, song_dict)
                            for (iid, _) in user_ratings]
            for item in result_array:
                out.write("\t")
                out.write('-'.join(item))
                out.write("\n")
def __init__(self, data, popularityRankings, movies):
    """Store the inputs and build the evaluation partitions.

    Args:
        data: Dataset object accepted by ``train_test_split`` that also
            offers ``build_full_trainset()``.
        popularityRankings: Stored unchanged as ``self.rankings``.
        movies: Stored unchanged as ``self.movies``; its length and
            contents are printed.
    """
    self.rankings = popularityRankings
    self.movies = movies
    print(len(movies), movies)

    # Full training set for evaluating overall properties.
    full_train = data.build_full_trainset()
    self.fullTrainSet = full_train
    self.fullAntiTestSet = full_train.build_anti_testset()

    # 75/25 train/test split for measuring accuracy.
    self.trainSet, self.testSet = train_test_split(
        data, test_size=.25, random_state=1)

    # Item-similarity model intended for measuring diversity.
    # NOTE(review): the model is only constructed here, never fit - confirm
    # that ``fit`` is called elsewhere before the similarities are used.
    self.simsAlgo = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
def perform_operation(self):
    """Run every collaborative-filtering wrapper and plot the test RMSEs.

    The latest ratings file is loaded, split once into train and test
    parts, and each wrapper is evaluated on that split and then
    grid-searched on the whole dataset. The collected RMSE values are
    shown as a scatter plot.
    """
    self.LOG_HANDLE.info(
        "Running the collaborative filtering algorithms...")

    ratings_file_name = self.get_latest_output_file_name(
        configurations.RATINGS_FILE_IN_REQUIRED_FORMAT_FILE_NAME,
        next=False)[1]
    ratings_path = os.path.join(
        configurations.OUTPUT_FILES_DIRECTORY, ratings_file_name)
    self.LOG_HANDLE.info(
        "Running recommender models on the file here: " + ratings_path)
    print("Running all recommender models")

    # Reader params: http://surprise.readthedocs.io/en/stable/reader.html
    reader = Reader(sep=constants.COMMA_STR)
    # Dataset params: http://surprise.readthedocs.io/en/stable/dataset.html
    ratings_dataset = Dataset.load_from_file(ratings_path, reader)

    trainset, testset = train_test_split(
        ratings_dataset, test_size=model_params.test_set_size)

    # Add further algorithms here (the SVD++ wrapper was removed).
    wrappers = [
        normal_algo_wrapper(),
        knn_algo_wrapper(),
        svd_algo_wrapper(),
    ]

    rmse_values = {}
    for wrapper in wrappers:
        print("Started Algorithm: " + wrapper.algo_name)
        rmse_values[wrapper.algo_name] = wrapper.evaluate_on_test(
            trainset, testset)
        wrapper.perform_grid_search_with_cv(ratings_dataset)
        print("Completed Algorithm: " + wrapper.algo_name)
    print("All recommender models have been run...")

    plt.scatter(rmse_values.keys(), rmse_values.values())
    plt.xlabel('Collaborative filtering algorithm')
    plt.ylabel('Root mean square error (RMSE) on test predictions')
    plt.show()
def plot_all_ROC():
    """Plot ROC curves of KNNWithMeans, NMF and SVD on one random split.

    Uses the module-level ``data``. A true rating of 3 or more is the
    positive class; the score is the estimate divided by ``rang``.
    """
    rang = 5.0  # divisor that scales estimates into the score
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0,  # no shrinkage
    }
    trainset, testset = train_test_split(data, test_size=0.1)

    # Legend label and model stay together, so plotting no longer depends
    # on several containers being iterated in the same order.
    models = [
        ('k-NN', KNNWithMeans(22, sim_options=sim_options)),
        ('NNMF', NMF(n_factors=18)),
        ('MF', SVD(n_factors=8)),
    ]

    curves = []
    for label, model in models:
        model.fit(trainset)
        pred = model.test(testset)
        # Build each array in one step; growing it with np.append copied
        # the whole array for every prediction.
        np_true = np.array([1 if t >= 3 else 0 for _, _, t, _, _ in pred],
                           dtype=float)
        np_score = np.array([p / rang for _, _, _, p, _ in pred],
                            dtype=float)
        fpr, tpr, _ = metrics.roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        curves.append((label, fpr, tpr))

    plt.figure()
    lw = 2
    for label, fpr, tpr in curves:
        plt.plot(fpr, tpr, lw=lw, label='%s' % label)
    plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.show()
def train_test(data):
    """Evaluate SVD on a single train/test split and print the RMSE.

    When a full cross-validation run is not wanted, ``train_test_split()``
    samples a train set and a test set of a given size. The algorithm is
    trained with ``fit()`` on the train set and ``test()`` returns the
    predictions for the test set, which are scored with an accuracy metric.

    Args:
        data: Dataset accepted by ``train_test_split``.

    Returns:
        None
    """
    train_part, test_part = train_test_split(data, test_size=0.25)

    model = SVD()
    model.fit(train_part)

    # Predict, then report the accuracy.
    predictions = model.test(test_part)
    print(accuracy.rmse(predictions))
    return None
def GetAccuracy():
    """Return hold-out MAE and RMSE of ``KNNBasic`` on the loaded data.

    The data is split 75/25 with a fixed seed; the model is trained on the
    75% part only and scored on the remaining 25%. It was previously
    trained on the full trainset, which contains the test ratings, so the
    reported errors did not measure generalisation.

    Returns:
        tuple: ``(mae, rmse)`` on the held-out ratings.
    """
    d = Data()
    data = d.loadData()

    trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)

    # ``sim_options`` is defined at module level.
    model = KNNBasic(sim_options=sim_options, verbose=False)
    model.fit(trainSet)
    predictions = model.test(testSet)

    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)
    return mae, rmse
def SVDFun(data, userSet, movieSet, userID):
    """Estimate ``userID``'s rating for every movie with the saved model.

    Args:
        data: Unused; kept so existing callers keep working.
        userSet: Unused; kept so existing callers keep working.
        movieSet: Iterable of movie ids to score.
        userID: User id the estimates are computed for.

    Returns:
        dict: Movie id mapped to the estimated rating.
    """
    # The model comes from disk, so the throw-away ``SVD()`` instance and
    # the train/test split the old code built first were dead work.
    # NOTE: joblib files are pickle-based and can run code on load; only
    # load 'svdmodel.pkl' from a trusted location.
    algo = joblib.load('svdmodel.pkl')

    return {movie: algo.predict(userID, movie).est for movie in movieSet}
def ComputeCollaborativeFiltering_User_User(recipe_df, train_rating_df, pd,
                                            benchmark, knnmeans=False):
    """Fit a user-based cosine KNN model and hand its predictions to the evaluators.

    Args:
        recipe_df: DataFrame joined with the ratings on ``recipe_id``.
        train_rating_df: DataFrame providing ``user_id``, ``recipe_id`` and
            ``rating`` after the inner join.
        pd: Object providing ``merge`` (the pandas module is expected).
        benchmark: Passed through to ``Evaluators.RunAllEvals``.
        knnmeans: Use ``KNNWithMeans`` when truthy, else ``KNNBasic``.
    """
    print("\n###### Compute CollaborativeFiltering_User_User ######")

    merged = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        merged[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # Similarities are computed between users (user_based=True).
    sim_options = {'name': 'cosine', 'user_based': True}
    algo_class = KNNWithMeans if knnmeans else KNNBasic
    algo = algo_class(sim_options=sim_options, verbose=False)

    algo.fit(trainSet)
    predictions = algo.test(testSet)
    Evaluators.RunAllEvals(predictions, benchmark)
def main():
    """Run each candidate algorithm on one split and print its metrics.

    Loads the filtered book ratings, splits them 80/20, and prints the
    precision, recall and F1 returned by ``basic_model_based`` for every
    algorithm in the list.
    """
    book_df = pd.read_csv("../../data/processed/filtered_ratings.csv")
    # Drop the index column written out with the CSV.
    book_df = book_df.drop('Unnamed: 0', axis=1)

    # Reader object and rating scale specification.
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        book_df[["user_id", "book_id", "rating"]], reader)

    # Split the data into train and test sets.
    train_set, test_set = train_test_split(data, test_size=0.20)

    algorithm_list = [
        NormalPredictor(),
        BaselineOnly(),
        KNNWithZScore(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBaseline(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)),
        SVDpp(),
        SVD(),
        NMF(),
    ]

    for candidate in algorithm_list:
        # The RMSE is returned as well but is not reported here.
        _, preci, recall, f1 = basic_model_based(
            train_set, test_set, candidate)
        print("Algorithm:", candidate, preci, recall, f1)
        print(
            "**------------------------------------------------------------------------------------------**"
        )
def algo_metrics(df):
    '''
    Return the RMSE of an out-of-the-box SVD on a random 80/20 split of df.

    ---Parameters---
    df (Pandas DataFrame) ratings frame handed to Dataset.load_from_df

    ---Returns---
    RMSE as returned by sp.accuracy.rmse on the test predictions
    '''
    reader = sp.Reader(line_format='user item rating', sep=',', skip_lines=1)
    dataset = sp.Dataset.load_from_df(df, reader=reader)
    train_part, test_part = train_test_split(dataset, test_size=.2)

    # Default SVD: fit on the train part, predict the test part.
    model = sp.SVD()
    model.fit(train_part)
    return sp.accuracy.rmse(model.test(test_part))
def subswipe_data(df, n_train, n_test, n_users):
    """Sample a sparse swipe matrix and split it into train and test parts.

    For each of the first ``n_users`` synthetic users, ``n_train + n_test``
    distinct ids are drawn from ``df["id"]`` and the matching swipe values
    are read from that user's column.

    Args:
        df: DataFrame with an ``id`` column and one ``synthetic_<i>`` column
            per user; rows are looked up with ``.loc`` by the drawn id.
        n_train: Number of training swipes per user.
        n_test: Number of test swipes per user.
        n_users: Number of synthetic users to include.

    Returns:
        tuple: ``(train, test)`` as returned by ``train_test_split`` with a
        test part of ``n_test * n_users`` ratings.
    """
    # Reseeds NumPy's global RNG so the sample is reproducible.
    np.random.seed(0)

    # Use only the first n users.
    users = [f"synthetic_{i}" for i in range(n_users)]
    user_ids = {
        user: np.random.choice(df["id"], n_train + n_test, replace=False)
        for user in users
    }

    # Collect the rows first and build the frame once. Per-row
    # ``DataFrame.append`` copied the frame every time and no longer exists
    # in pandas 2.0.
    records = [
        {"uid": user, "iid": item_id, "swipe": df[user].loc[item_id]}
        for user in users
        for item_id in user_ids[user]
    ]
    sparse = pd.DataFrame(records, columns=["uid", "iid", "swipe"])

    reader = surprise.Reader(rating_scale=(0, 1))
    data = surprise.Dataset.load_from_df(sparse, reader)
    train, test = train_test_split(data, test_size=n_test * n_users,
                                   random_state=0)
    return train, test
def get_rating_predictions(dataset_no_zeros, dataset_zeros, dataset):
    """Predict ratings for the rows of ``dataset_zeros`` with SVD.

    Two models are fit on ``dataset_no_zeros``: one on a random 80% to
    report a held-out RMSE, and one on every rating to produce the returned
    predictions. Previously a single model trained on all ratings was also
    scored on the "test" split, so the reported RMSE included training data.

    Args:
        dataset_no_zeros: DataFrame with ``userId``, ``artistId`` and
            ``rating`` columns, used for training.
        dataset_zeros: DataFrame with the same columns; one prediction is
            returned per row.
        dataset: Unused; kept so existing callers keep working.

    Returns:
        list: ``(userId, artistId, estimated_rating)`` tuples in the row
        order of ``dataset_zeros``.
    """
    from surprise import Dataset, Reader, SVD, accuracy
    from surprise.model_selection import train_test_split

    reader = Reader(rating_scale=(0, 1))
    data_no_zeros = Dataset.load_from_df(
        dataset_no_zeros[['userId', 'artistId', 'rating']], reader)

    # Held-out evaluation on a model that has not seen the test ratings.
    trainset, testset = train_test_split(data_no_zeros, test_size=0.2)
    eval_algo = SVD()
    eval_algo.fit(trainset)
    accuracy.rmse(eval_algo.test(testset))

    # Final model trained on every available rating.
    algo = SVD()
    algo.fit(data_no_zeros.build_full_trainset())

    # ``predict`` takes raw ids, so the data-frame values are passed as-is.
    # They used to go through ``to_raw_uid`` / ``to_raw_iid``, which expect
    # inner ids and would raise or return a different user/item.
    return [
        (uid, iid, algo.predict(uid, iid, r_ui, verbose=False).est)
        for uid, iid, r_ui in zip(dataset_zeros['userId'],
                                  dataset_zeros['artistId'],
                                  dataset_zeros['rating'])
    ]
def main():
    """Fit ``KNNBasic`` on part of the training CSV and print MAE and RMSE."""
    # Load the datasets into data-frames.
    train = pd.read_csv('../data/train_update.csv', sep=';')
    test = pd.read_csv('../data/test_update.csv', sep=';')

    reader = Reader(rating_scale=(0, 10))
    # Blank separator line; the bare ``print`` statement only did this
    # under Python 2.
    print("")
    train_set = Dataset.load_from_df(
        train[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)
    # NOTE(review): the test dataset is loaded but not used below - confirm
    # whether the final evaluation was meant to run on it.
    test_set = Dataset.load_from_df(
        test[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)

    # Basic k-nearest-neighbours approach.
    algo = KNNBasic()
    # NOTE(review): test_size=.9 keeps only 10% of the ratings for
    # training - confirm this is intended.
    trainset, testset = train_test_split(train_set, test_size=.9,
                                         random_state=1234)
    # Fit on the trainset from the split. ``train_set`` is the Dataset
    # wrapper, which ``fit`` cannot consume.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Report MAE and RMSE.
    accuracy.mae(predictions)
    accuracy.rmse(predictions)
def SVDFun(data, userSet, movieSet, userID):
    """Fit SVD on ``data`` and estimate ``userID``'s rating for every movie.

    Args:
        data: Dataset providing ``build_full_trainset()``.
        userSet: Unused; kept so existing callers keep working.
        movieSet: Iterable of movie ids to score.
        userID: User id the estimates are computed for.

    Returns:
        dict: Movie id mapped to the estimated rating.
    """
    algo = SVD()
    # The model must be fit before ``predict`` can be called; with the fit
    # and the pickle load both commented out it never was. No evaluation
    # happens here, so all ratings are used for training rather than a
    # 75% split whose test part would be discarded.
    algo.fit(data.build_full_trainset())

    return {movie: algo.predict(userID, movie).est for movie in movieSet}
def PR(algo, data, num_fold):
    """Print mean precision@5 and recall@5 of ``algo`` on one ordered split.

    Args:
        algo: Algorithm offering ``fit`` and ``test``.
        data: Dataset accepted by ``train_test_split``.
        num_fold: Number of splits for the ``KFold`` object.

    Returns:
        int: Always 0.
    """
    # NOTE(review): the KFold object is created but not used afterwards.
    kf = KFold(n_splits=num_fold)

    # Evaluated only once, without the k-fold technique.
    train_part, test_part = train_test_split(data, test_size=.2,
                                             shuffle=False)
    algo.fit(train_part)
    precisions, recalls = precision_recall_at_k(
        algo.test(test_part), k=5, threshold=3.5)

    # Precision and recall are averaged over all users.
    mean_precision = sum(precisions.values()) / len(precisions)
    print(' Precision')
    print(mean_precision)
    mean_recall = sum(recalls.values()) / len(recalls)
    print(' Recall')
    print(mean_recall)
    return 0
def rec():
    """Handle the recommendation form and render ``recommend.html``.

    On POST, the submitted form values are written into a one-row frame
    with one column per wine name, the model is refit on a random 75% of
    ``reviews``, and a prediction is printed. ``resp`` is always rendered
    empty.
    """
    resp = {}
    if request.method == 'POST':
        submitted = request.form.to_dict()
        wine_names = reviews.index.values.tolist()

        # One row ("givenValues"), one column per wine; wines that were not
        # submitted become 0.
        newvector = pd.DataFrame(index=["givenValues"], columns=wine_names)
        for wine, score in submitted.items():
            newvector[wine] = score
        newvector.fillna(0, inplace=True)

        # NOTE(review): the model is refit on every POST request.
        trainset, testset = train_test_split(reviews, test_size=0.25)
        algo.fit(trainset)
        # NOTE(review): predict receives the whole frame as its only
        # argument - confirm the intended call once the TODO is resolved.
        predictions = algo.predict(newvector, )
        print(predictions)
        # TODO: make a vector from the input and recommend from the scores.
    return render_template('recommend.html', resp=resp)
def evaluate(self, test_size=.25):
    """Cross-validate the algorithm, then print MAE and RMSE on a hold-out.

    Args:
        test_size: Size of the hold-out passed to ``train_test_split``.
            It was previously accepted but never forwarded, so the
            library's default split size was always used.
    """
    from surprise.model_selection import cross_validate, train_test_split
    from surprise import accuracy

    recommendation_dataset = RecommendationsDataset()
    cross_validate(self.algorithm, recommendation_dataset.dataset,
                   measures=['RMSE', 'MSE'], cv=5, verbose=True)

    train, test = train_test_split(recommendation_dataset.dataset,
                                   test_size=test_size)
    self.fit(train)
    test_predictions = self.test(test)

    print("MAE: ", accuracy.mae(test_predictions, verbose=False))
    print("RMSE: ", accuracy.rmse(test_predictions, verbose=False))
def test_unknown_user_or_item(toy_data):
    """Ensure every algorithm copes with unknown users and items.

    Each algorithm is asked for a prediction with an unknown item, an
    unknown user, and both unknown; none of these calls may raise.
    """
    algo_classes = (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                    KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering,
                    KNNWithZScore)
    queries = (
        ('user0', 'unknown_item'),
        ('unkown_user', 'item0'),
        ('unkown_user', 'unknown_item'),
    )

    trainset = toy_data.build_full_trainset()
    for algo_class in algo_classes:
        algo = algo_class()
        algo.fit(trainset)
        for uid, iid in queries:
            algo.predict(uid, iid, None)

    # Unrelated, but also exercise the fit().test() one-liner, and check
    # that train() emits a UserWarning.
    trainset, testset = train_test_split(toy_data, test_size=2)
    for algo_class in algo_classes:
        algo = algo_class()
        algo.fit(trainset).test(testset)
        with pytest.warns(UserWarning):
            algo.train(trainset).test(testset)
""" This module describes how to use the train_test_split() function. """ from __future__ import (absolute_import, division, print_function, unicode_literals) from surprise import SVD from surprise import Dataset from surprise import accuracy from surprise.model_selection import train_test_split # Load the movielens-100k dataset (download it if needed), data = Dataset.load_builtin('ml-100k') # sample random trainset and testset # test set is made of 25% of the ratings. trainset, testset = train_test_split(data, test_size=.25) # We'll use the famous SVD algorithm. algo = SVD() # Train the algorithm on the trainset, and predict ratings for the testset algo.fit(trainset) predictions = algo.test(testset) # Then compute RMSE accuracy.rmse(predictions)
# Register the candidate algorithms. ``algos_name`` and ``algos`` are
# parallel lists created earlier in the script.
algos_name.append('NMF')
# The closing parenthesis of append() had been typed as "_", which made the
# whole script a syntax error.
algos.append(NMF(n_factors=2, n_epochs=10, biased=True, reg_pu=0.06,
                 reg_qi=0.06, reg_bu=0.01, reg_bi=0.01, lr_bu=0.01,
                 lr_bi=0.01, random_state=1))
algos_name.append('SVD')
algos.append(SVD(n_factors=5, n_epochs=20, lr_all=0.005, reg_all=0.02,
                 random_state=1))
algos_name.append('SVDpp')
algos.append(SVDpp(n_factors=1, random_state=1))
#algos_name.append('KNN')
#algos.append(KNNBasic())

for name, algo in zip(algos_name, algos):
    print('===', name)
    trainset, testset = train_test_split(data, test_size=0.2, random_state=1)

    # Train and test the algorithm.
    predictions = algo.fit(trainset).test(testset)

    # Compute and print the Mean Absolute Error.
    accuracy.mae(predictions, verbose=True)

    # Predict the (user, book) pairs and clamp the estimates to [1, 10].
    # NOTE(review): assumed to belong inside the loop (once per algorithm);
    # the original layout was lost - confirm.
    pred_test = np.array([algo.predict(u, b).est
                          for u, b in zip(user_test, book_test)])
    pred_test[pred_test > 10] = 10
    pred_test[pred_test < 1] = 1
def test_train_test_split(toy_data):
    """Check the size arguments, seeding and shuffling of train_test_split.

    The size assertions imply that ``toy_data`` holds five ratings.
    """
    # test_size as int, train_size None (complement)
    trainset, testset = train_test_split(toy_data, test_size=2,
                                         train_size=None)
    assert len(testset) == 2
    assert trainset.n_ratings == 3

    # test_size as float, train_size None (complement)
    trainset, testset = train_test_split(toy_data, test_size=.2,
                                         train_size=None)
    assert len(testset) == 1
    assert trainset.n_ratings == 4

    # test_size as int, train_size as int
    trainset, testset = train_test_split(toy_data, test_size=2, train_size=3)
    assert len(testset) == 2
    assert trainset.n_ratings == 3

    # test_size None (complement), train_size as int
    trainset, testset = train_test_split(toy_data, test_size=None,
                                         train_size=2)
    assert len(testset) == 3
    assert trainset.n_ratings == 2

    # test_size None (complement), train_size as float
    trainset, testset = train_test_split(toy_data, test_size=None,
                                         train_size=.2)
    assert len(testset) == 4
    assert trainset.n_ratings == 1

    # With random_state=None each call draws a different split.
    # NOTE(review): this depends on the state of the global RNG, so the
    # test module presumably seeds it - confirm.
    _, testset_a = train_test_split(toy_data, random_state=None)
    _, testset_b = train_test_split(toy_data, random_state=None)
    assert testset_a != testset_b

    # Repeated calls with the same random_state give the same split.
    _, testset_a = train_test_split(toy_data, random_state=1)
    _, testset_b = train_test_split(toy_data, random_state=1)
    assert testset_a == testset_b

    # With shuffling disabled the split must be the same regardless of
    # random_state. The seeds differ on purpose: with identical seeds (as
    # before) the assertion would hold even if shuffle were ignored.
    _, testset_a = train_test_split(toy_data, random_state=1, shuffle=False)
    _, testset_b = train_test_split(toy_data, random_state=2, shuffle=False)
    assert testset_a == testset_b