def baseline_bias_model(df):
    """
    Shows the performance of model based on just bias.

    The ``date`` and ``text`` columns are dropped from ``df`` and the
    remaining columns are loaded into a Surprise dataset with a 1-5 rating
    scale.  A default ``BaselineOnly`` is fitted on the train part of a
    ``train_test_split`` (default settings) and evaluated on the test part.

    Returns:
        tuple: ``(trainset, testset, predictions, dusers, ditems)``.
        ``dusers`` and ``ditems`` are the ``_raw2inner_id_users`` and
        ``_raw2inner_id_items`` attributes of a trainset built from the
        *full* data, not from the returned train split.
    """
    ratings_only = df.drop(columns=['date', 'text'])

    # TODO figure out (note carried over from the original code)
    scale_reader = Reader(rating_scale=(1, 5))
    dataset = surprise.dataset.Dataset.load_from_df(df=ratings_only,
                                                    reader=scale_reader)

    # Id mappings come from the full trainset, before any splitting.
    full_trainset = dataset.build_full_trainset()
    dusers = full_trainset._raw2inner_id_users
    ditems = full_trainset._raw2inner_id_items

    trainset, testset = train_test_split(dataset)

    bias_model = BaselineOnly()
    bias_model.fit(trainset)
    predictions = bias_model.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
def use_als():
    """Fit an ALS-configured ``BaselineOnly`` on ml-100k and time the run.

    The model is trained on the full trainset and then scored on the
    trainset's anti-testset.

    NOTE(review): per the Surprise docs, anti-testset entries carry a fill
    value rather than a real rating, so the reported RMSE/MAE are presumably
    not a held-out accuracy estimate - confirm this is intended.

    Returns:
        list: ``[rmse, mae, elapsed_seconds]``.
    """
    started_at = time.time()

    dataset = Dataset.load_builtin('ml-100k')
    full_trainset = dataset.build_full_trainset()

    print('Using ALS')
    als_options = {
        'method': 'als',
        'n_epochs': 20,
        'reg_u': 12,
        'reg_i': 5,
    }
    als_model = BaselineOnly(bsl_options=als_options)
    als_model.fit(full_trainset)

    unseen_pairs = full_trainset.build_anti_testset()
    als_predictions = als_model.test(unseen_pairs)

    rmse_value = accuracy.rmse(als_predictions)
    mae_value = accuracy.mae(als_predictions)

    elapsed = time.time() - started_at
    return [rmse_value, mae_value, elapsed]
def normalize_affinity_scores_by_user_item_bs(
        user_item_affinities: List[Tuple[str, str, float]],
        rating_scale=(1, 5)
) -> Tuple[float, Dict[str, float], Dict[str, float], float, List[Tuple[str, str, float]]]:
    """Remove global-mean and user/item bias from affinity scores.

    A ``BaselineOnly`` model (SGD method) is fitted on all of
    ``user_item_affinities`` and then asked to predict those same triples.
    The residual ``r_ui - est`` of each prediction is the normalised score.

    Args:
        user_item_affinities: ``(user, item, score)`` triples.
        rating_scale: ``(low, high)`` scale passed to the Surprise ``Reader``.

    Returns:
        A 5-tuple ``(mean, bu, bi, spread, residuals)``:

        * ``mean`` - the trainset's global mean.
        * ``bu`` / ``bi`` - ``defaultdict(float)`` mapping each raw user / item
          id to its fitted bias; unknown ids yield ``0.0``.
        * ``spread`` - the larger of the maximum residual and the absolute
          value of the minimum residual.
        * ``residuals`` - list of ``(user, item, r_ui - est)`` triples.
    """
    train = pd.DataFrame(user_item_affinities)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    trainset_for_testing = trainset.build_testset()

    algo = BaselineOnly(bsl_options={'method': 'sgd'})
    algo.fit(trainset)
    predictions = algo.test(trainset_for_testing)

    mean = algo.trainset.global_mean
    # Bias vectors are indexed by Surprise's inner ids, so translate raw ids.
    bu = {
        u: algo.bu[algo.trainset.to_inner_uid(u)]
        for u in {u for u, _, _ in user_item_affinities}
    }
    bi = {
        i: algo.bi[algo.trainset.to_inner_iid(i)]
        for i in {i for _, i, _ in user_item_affinities}
    }

    residuals = pd.DataFrame(
        [[p.uid, p.iid, p.r_ui - p.est] for p in predictions],
        columns=["user", "item", "rating"])
    spread = max(residuals["rating"].max(),
                 np.abs(residuals["rating"].min()))
    uid = list(zip(residuals['user'], residuals['item'], residuals['rating']))

    # Callers may look up users/items that were never seen: default to 0.0.
    bu = defaultdict(float, bu)
    bi = defaultdict(float, bi)
    return mean, bu, bi, spread, uid
def test_dump():
    """Round-trip predictions and a fitted algorithm through ``dump``.

    Fits ``BaselineOnly`` on the predefined u1 ml-100k fold, dumps the
    predictions together with the algorithm, reloads both, and checks that
    the reloaded predictions and the predictions recomputed by the reloaded
    algorithm equal the original ones.
    """
    random.seed(0)

    here = os.path.dirname(__file__)
    fold_files = [(os.path.join(here, './u1_ml100k_train'),
                   os.path.join(here, './u1_ml100k_test'))]
    dataset = Dataset.load_from_folds(fold_files, Reader('ml-100k'))

    trainset, testset = next(PredefinedKFold().split(dataset))

    model = BaselineOnly()
    model.fit(trainset)
    expected = model.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, expected, model)
        reloaded_predictions, reloaded_model = dump.load(tmp_file.name)

        recomputed = reloaded_model.test(testset)
        assert expected == reloaded_predictions
        assert expected == recomputed
def predict(trainset):
    """Fit ``BaselineOnly`` on ``trainset`` and predict its anti-testset.

    The model is configured from the module-level ``bsl_options``.  The
    anti-testset holds the (user, item) pairs that are NOT in the training
    set.

    Returns:
        The predictions returned by ``algo.test`` for those pairs.
    """
    print("Training the model for prediction .....")

    model = BaselineOnly(bsl_options=bsl_options)
    # Every (u, i) pair that has no rating in the training data.
    unseen_pairs = trainset.build_anti_testset()

    fitted = model.fit(trainset)
    return fitted.test(unseen_pairs)
def surprise_baseline(train_file, test_file):
    """
    Baseline with Surprise library.

    Compute the predictions on a test_set after training on a train_set using
    the method Baseline from Surprise.

    Args:
        train_file (string): path to created train file
        test_file (string): path to created test file

    Hyperparameters:
        - (none; ``BaselineOnly`` is used with its defaults)

    Returns:
        numpy array: estimated ratings, one per test-set entry, in the order
        returned by ``algo.test``
    """
    print("baseline")
    algo = BaselineOnly()

    # Both files are comma-separated "user,item,rating" lines.
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds([(train_file, test_file)], reader=reader)

    # Exactly one predefined fold is supplied, so take it directly.
    trainset, testset = next(PredefinedKFold().split(data))

    # Train
    algo.fit(trainset)
    # Predict
    predictions = algo.test(testset)

    return np.array([p.est for p in predictions], dtype=float)
def use_sgd():
    """Fit an SGD-configured ``BaselineOnly`` on ml-100k and time the run.

    The model is trained on the full trainset and then scored on the
    trainset's anti-testset.

    NOTE(review): per the Surprise docs, anti-testset entries carry a fill
    value rather than a real rating, so the reported RMSE/MAE are presumably
    not a held-out accuracy estimate - confirm this is intended.

    Returns:
        list: ``[rmse, mae, elapsed_seconds]``.
    """
    started_at = time.time()

    dataset = Dataset.load_builtin('ml-100k')
    full_trainset = dataset.build_full_trainset()

    print('Using SGD')
    sgd_options = {'method': 'sgd', 'learning_rate': .005}
    sgd_model = BaselineOnly(bsl_options=sgd_options)
    sgd_model.fit(full_trainset)

    unseen_pairs = full_trainset.build_anti_testset()
    sgd_predictions = sgd_model.test(unseen_pairs)

    rmse_value = accuracy.rmse(sgd_predictions)
    mae_value = accuracy.mae(sgd_predictions)

    elapsed = time.time() - started_at
    return [rmse_value, mae_value, elapsed]
def baseline(trainset, testset):
    """Fit a default ``BaselineOnly`` and report RMSE and MAE on ``testset``.

    The metric values are computed through ``accuracy.rmse`` and
    ``accuracy.mae`` but are not returned.

    Returns:
        The predictions produced by ``algo.test(testset)``.
    """
    model = BaselineOnly()
    model.fit(trainset)

    print("Predictions")
    predictions = model.test(testset)

    for report_metric in (accuracy.rmse, accuracy.mae):
        report_metric(predictions)

    return predictions
class ALSModelSurprise(ALSModel):
    """``ALSModel`` variant that delegates to Surprise's ``BaselineOnly``."""

    def __init__(self, params):
        super().__init__(params)
        # self.params is expected to be provided by the base-class initialiser.
        self.algo = BaselineOnly(bsl_options=self.params)

    def parse_data(self, ratings):
        """Load ``ratings`` into a Surprise dataset with a 1-5 scale."""
        self.data = Dataset.load_from_df(ratings, Reader(rating_scale=(1, 5)))

    def update_parameters(self):
        """Push the current ``self.params`` into the wrapped algorithm."""
        self.algo.bsl_options = self.params

    def fit(self):
        """Train the wrapped algorithm on the full trainset of ``self.data``."""
        full_trainset = self.data.build_full_trainset()
        self.train = full_trainset
        self.algo.fit(full_trainset)

    def predict(self, uid, iid):
        '''
        Return the estimated rating for one (user, item) pair.

        uid, iid should be consistent with ratings['UID','IID']
        '''
        prediction = self.algo.predict(uid, iid)
        return prediction.est

    def top_n_recommendations(self, uid, n=5):
        '''
        Obtain the top n recommendation for any user. Method for the
        surprise library.

        Every item of the fitted trainset is scored for ``uid``; the ``n``
        highest estimates are returned as ``(item_ids, estimates)``.
        '''
        raw_item_ids = (self.train.to_raw_iid(inner_iid)
                        for inner_iid in range(self.train.n_items))
        ranked = sorted(((raw_iid, self.predict(uid, raw_iid))
                         for raw_iid in raw_item_ids),
                        key=lambda pair: pair[1],
                        reverse=True)
        best = ranked[:n]
        top_n_iid = [raw_iid for raw_iid, _ in best]
        pred = [estimate for _, estimate in best]
        return top_n_iid, pred

    def cross_validate(self, cv=5, verbose=False):
        """Return the mean test RMSE over ``cv`` cross-validation folds."""
        # Inside a method the bare name refers to the module-level
        # cross_validate function, not to this method.
        fold_results = cross_validate(self.algo, self.data,
                                      cv=cv, verbose=verbose)
        return fold_results['test_rmse'].mean()

    def grid_search(self):
        """Exhaustively try the hard-coded (n_epochs, reg_u, reg_i) grid.

        The best RMSE and its parameters end up in ``self._best_rmse`` and
        ``self._best_params``.

        NOTE(review): ``_best_params`` stores a reference to ``self.params``;
        if ``set_params`` (not visible here) mutates that object in place,
        the stored "best" would track later trials - confirm.
        """
        self._best_params = self.params
        self._best_rmse = self.cross_validate(cv=5)

        candidates = [(n_epochs, reg_u, reg_i)
                      for n_epochs in [5, 10, 15, 20, 25]
                      for reg_u in [5, 10, 15, 20]
                      for reg_i in [5, 10, 15]]
        for n_epochs, reg_u, reg_i in candidates:
            self.set_params(n_epochs=n_epochs, reg_u=reg_u, reg_i=reg_i)
            rmse = self.cross_validate(cv=5)
            print(n_epochs, reg_u, reg_i, rmse)
            if not rmse < self._best_rmse:
                continue
            self._best_rmse = rmse
            self._best_params = self.params
def baseline(trainset, testset):
    """Fit a default ``BaselineOnly`` and score it on ``testset``.

    Returns:
        tuple: ``(rmse, mae, predictions)`` where the metrics come from
        ``accuracy.rmse`` / ``accuracy.mae`` applied to the predictions.
    """
    rule = "-" * 5
    print("\n" + rule + " Baseline algorithm using surprise package " + rule)

    model = BaselineOnly()
    model.fit(trainset)
    predictions = model.test(testset)

    return accuracy.rmse(predictions), accuracy.mae(predictions), predictions
def fit(self, train):
    """
    Fit the model.

    Creates a ``BaselineOnly`` using the SGD method (30 epochs,
    regularisation 0.01, learning rate 0.01), stores it on ``self.model``
    and trains it on ``train``.
    """
    sgd_options = {
        'method': 'sgd',
        'n_epochs': 30,
        'reg': 0.01,
        'learning_rate': 0.01,
    }
    self.model = BaselineOnly(bsl_options=sgd_options)
    self.model.fit(train)
def fit(self, train):
    """
    Fit the model.

    Trains a ``BaselineOnly`` using the ALS method (25 epochs, ``reg_u`` 5,
    ``reg_i`` 3) on ``train``.  ``self.model`` is assigned only after
    training has completed.
    """
    als_options = {
        'method': 'als',
        'n_epochs': 25,
        'reg_u': 5,
        'reg_i': 3,
    }
    als_model = BaselineOnly(bsl_options=als_options)
    als_model.fit(train)
    self.model = als_model
def grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is, file_name):
    """Manual grid search over ALS baseline options, exported to Excel.

    Despite the name and the printed banner, the algorithm built here is
    ``BaselineOnly`` with the ALS method, not a KNN model.

    For every ``(n_epoch, reg_u, reg_i)`` combination the algorithm is
    trained on the full trainset of ``data_train`` and passed to
    ``evaluate`` on both ``data_train`` and ``data_test``.  Each result is
    tagged with the parameters, its ``rmse`` entry is replaced by its mean,
    and it is appended to a results frame.  The train results are written to
    'Sheet1' and the test results to 'Sheet2' of ``file_name``.
    """
    print('KNN Surprise manual grid search')

    def _tag_with_params(perf, n_epoch, reg_u, reg_i):
        # Record the parameters and collapse the per-fold RMSE to its mean.
        perf["n_epoch"] = n_epoch
        perf["reg_u"] = reg_u
        perf["reg_i"] = reg_i
        perf["rmse"] = np.mean(perf['rmse'])

    train_rows = pd.DataFrame()
    test_rows = pd.DataFrame()

    # loops on the parameters
    for n_epoch in n_epochs:
        for reg_u in reg_us:
            for reg_i in reg_is:
                algo = BaselineOnly(bsl_options={
                    'method': 'als',
                    'n_epochs': n_epoch,
                    'reg_u': reg_u,
                    'reg_i': reg_i,
                })

                # Build the trainset and train on it.
                full_trainset = data_train.build_full_trainset()
                algo.train(full_trainset)

                # Evaluate the performance
                perf_train = evaluate(algo, data_train, measures=['RMSE'])
                perf_test = evaluate(algo, data_test, measures=['RMSE'])

                _tag_with_params(perf_train, n_epoch, reg_u, reg_i)
                _tag_with_params(perf_test, n_epoch, reg_u, reg_i)

                train_rows = train_rows.append(perf_train, ignore_index=True)
                test_rows = test_rows.append(perf_test, ignore_index=True)

    # Save the dataframes so the differences can be inspected or plotted.
    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    train_rows.to_excel(writer, 'Sheet1')
    test_rows.to_excel(writer, 'Sheet2')
    writer.save()
def train(trainset, testset):
    """
    Train a ``BaselineOnly`` recommender and predict the given test set.

    The algorithm is configured from the module-level ``bsl_options``; the
    original author notes that BaselineOnly gave the best RMSE and that
    Alternating Least Squares (ALS) is intended (the method actually used
    depends on ``bsl_options``).

    :param trainset: the train set the model is fitted on
    :param testset: the test set the fitted model is asked to predict
    :return: the predictions returned by ``algo.test(testset)``
    """
    print("Training the model for prediction ....")

    model = BaselineOnly(bsl_options=bsl_options)
    fitted = model.fit(trainset)
    return fitted.test(testset)
def baseline_only(self):
    """
    Basic baseline prediction with a default ``BaselineOnly``.

    The model is fitted on ``self.train_data`` and evaluated on
    ``self.test_data``.  When ``self.test_purpose`` is truthy the predicted
    ratings are also passed to ``self.evalueate_model``.

    Returns:
        predictions_df: a copy of ``self.data.test_df`` whose 'Rating'
        column holds the estimated ratings
    """
    model = BaselineOnly()
    fitted = model.fit(self.train_data)
    estimates = [prediction.est for prediction in fitted.test(self.test_data)]

    predictions_df = self.data.test_df.copy()
    predictions_df['Rating'] = estimates

    if self.test_purpose:
        self.evalueate_model(predictions_df['Rating'],
                             'Surprise baseline_only')
    return predictions_df
def als_predictions(trainset, dataset_test):
    """Fit an ALS ``BaselineOnly`` and predict each row of ``dataset_test``.

    Each row is indexed positionally: ``row[0]`` and ``row[1]`` are always
    passed to ``predict``; ``row[2]`` is passed as a third argument unless
    the module-level ``mode`` equals 'test'.

    Returns:
        list: one 4-tuple per row holding the first four fields of the
        value returned by ``predict``.
    """
    als_options = {
        'method': 'als',
        'n_epochs': 30,
        'reg_u': 6,
        'reg_i': 4,
    }
    fitted = BaselineOnly(bsl_options=als_options).fit(trainset)

    rows = []
    for sample in dataset_test:
        if mode == 'test':
            outcome = fitted.predict(sample[0], sample[1])
        else:
            outcome = fitted.predict(sample[0], sample[1], sample[2])
        rows.append((outcome[0], outcome[1], outcome[2], outcome[3]))
    return rows
def test_sgd_n_epoch_field():
    """Ensure the n_epoch field is taken into account.

    Cross-validates SGD baselines with 1 and with 20 epochs on the
    module-level ``data`` / ``pkf`` and expects different test RMSE.
    """

    def rmse_with_epochs(n_epochs):
        options = {'method': 'sgd', 'n_epochs': n_epochs}
        model = BaselineOnly(bsl_options=options)
        return cross_validate(model, data, ['rmse'], pkf)['test_rmse']

    rmse_one_epoch = rmse_with_epochs(1)
    rmse_twenty_epochs = rmse_with_epochs(20)

    assert rmse_one_epoch != rmse_twenty_epochs
def get_surprise_base_model(trainset, testset, train_reg, test_reg,
                            model_train_evaluation, model_test_evaluation,
                            error_table):
    """Run the SGD baseline, then feed its predictions into XGBoost.

    Steps:
      1. Run ``BaselineOnly`` (SGD, learning rate 0.01, 25 epochs) through
         ``run_surprise`` and store the results under "BaselineOnly".
      2. Add the baseline's "Prediction" values as a "BaselineOnly" column
         of ``train_reg`` / ``test_reg`` (both mutated in place) and report
         the NaN counts through ``st.write``.
      3. Train/evaluate via ``train_test_xgboost`` on those frames minus the
         "User_ID", "Movie_ID" and "Rating" columns, storing the results
         under "XGB_BSL".

    Returns:
        tuple: ``(model_test_evaluation, model_train_evaluation,
        error_table, fig)``.
    """
    baseline_key = "BaselineOnly"
    xgb_key = "XGB_BSL"
    nan_message = "Number of nan values = "
    non_feature_columns = ["User_ID", "Movie_ID", "Rating"]

    sgd_model = BaselineOnly(bsl_options={
        "method": "sgd",
        "learning_rate": 0.01,
        "n_epochs": 25,
    })
    train_result, test_result, error_table = run_surprise(
        sgd_model, trainset, testset, baseline_key, error_table)
    model_train_evaluation[baseline_key] = train_result
    model_test_evaluation[baseline_key] = test_result

    train_reg[baseline_key] = model_train_evaluation[baseline_key]["Prediction"]
    st.write(nan_message + str(train_reg.isnull().sum().sum()))

    test_reg[baseline_key] = model_test_evaluation[baseline_key]["Prediction"]
    test_reg.head()  # result is discarded; kept from the original
    st.write(nan_message + str(test_reg.isnull().sum().sum()))

    x_train = train_reg.drop(non_feature_columns, axis=1)
    x_test = test_reg.drop(non_feature_columns, axis=1)
    y_train = train_reg["Rating"]
    y_test = test_reg["Rating"]

    train_result, test_result, error_table, fig = train_test_xgboost(
        x_train, x_test, y_train, y_test, xgb_key, error_table)
    model_train_evaluation[xgb_key] = train_result
    model_test_evaluation[xgb_key] = test_result

    return model_test_evaluation, model_train_evaluation, error_table, fig
def __init__(self, train_data,
             model_to_use=("baselineonly", "svd", "coClustering", "knn")):
    """Initialize class with full dataset and a set of base models to use.

    Args:
        train_data: dataset exposing ``build_full_trainset()``; the result
            is stored on ``self.trainset``.
        model_to_use: iterable of keys into ``self.available_models``
            ("baselineonly", "svd", "coClustering", "knn").  The default is
            an immutable tuple so it cannot be shared-and-mutated between
            instances.

    Raises:
        KeyError: if ``model_to_use`` contains a name that is not a key of
            ``self.available_models``.
    """
    AlgoBase.__init__(self)
    self.available_models = {
        "baselineonly": BaselineOnly(
            bsl_options={
                "method": "sgd",
                "n_epochs": 30,
                "reg": 0.1,
                "learning_rate": 0.005,
            }),
        "svd": SVD(lr_all=0.005, n_factors=50, reg_all=0.1),
        "coClustering": CoClustering(n_epochs=3, n_cltr_u=3, n_cltr_i=3),
        "knn": KNNWithMeans(k=40,
                            sim_options={
                                "name": "cosine",
                                "user_based": False,
                            }),
    }
    # [name, estimator] pairs, in the order requested by the caller.
    self.model_selection = [[model, self.available_models[model]]
                            for model in model_to_use]
    self.model_rmse = {}
    self.model_mae = {}
    self.model_list = {}
    self.trainset = train_data.build_full_trainset()
def test_surprise(train, test, items,
                  algo=("baseline", "svd", "svdpp"),
                  algo_params=None,
                  rating_scale=(1, 5)):
    """Benchmark Surprise algorithms on the given train/test affinities.

    Args:
        train: training ``(user, item, rating)`` records; also forwarded
            unchanged to ``extraction_efficiency``.
        test: validation records in the same format.
        items: forwarded to ``extraction_efficiency``.
        algo: names of the algorithms to run; each must be one of
            "baseline", "svd" or "svdpp".
        algo_params: optional mapping ``name -> constructor kwargs`` for
            "svd" / "svdpp".  ``None`` (the default) means no overrides.
        rating_scale: ``(low, high)`` scale passed to the Surprise ``Reader``.

    Returns:
        list of dict: one entry per requested algorithm with the keys
        "algo", "rmse", "mae", "map", "retrieval_time", "train_rmse",
        "train_mae" and "time" (seconds spent fitting and predicting the
        test set).
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    algo_params = {} if algo_params is None else algo_params

    train_affinities = train
    validation_affinities = test
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    testset = Dataset.load_from_df(test, reader).build_full_trainset().build_testset()
    trainset_for_testing = trainset.build_testset()

    def use_algo(algo, name):
        start = time.time()
        algo.fit(trainset)
        predictions = algo.test(testset)
        end = time.time()
        total_time = end - start

        rmse = accuracy.rmse(predictions, verbose=False)
        mae = accuracy.mae(predictions, verbose=False)
        ex_ee = extraction_efficiency(algo, train_affinities,
                                      validation_affinities,
                                      surprise_get_topk, items)

        # Re-predict the training ratings to obtain the training error.
        predictions = algo.test(trainset_for_testing)
        train_rmse = accuracy.rmse(predictions, verbose=False)
        train_mae = accuracy.mae(predictions, verbose=False)
        return {"algo": name, "rmse": rmse, "mae": mae,
                "map": ex_ee["map"],
                "retrieval_time": ex_ee["retrieval_time"],
                "train_rmse": train_rmse, "train_mae": train_mae,
                "time": total_time}

    algo_map = {
        "svd": SVD(**algo_params.get("svd", {})),
        "svdpp": SVDpp(**algo_params.get("svdpp", {})),
        "baseline": BaselineOnly(bsl_options={'method': 'sgd'}),
    }
    results = [use_algo(algo_map[a], a) for a in algo]
    return results
def run_surprise():
    """Cross-validate four stock Surprise algorithms on ml-100k.

    SVD, NormalPredictor, BaselineOnly and KNNBasic are each run through
    5-fold cross-validation reporting RMSE and MAE (``verbose=True``).
    """
    # Load the movielens-100k dataset (download it if needed).
    dataset = Dataset.load_builtin('ml-100k')

    candidates = [SVD(), NormalPredictor(), BaselineOnly(), KNNBasic()]

    # Run 5-fold cross-validation and print results.
    for candidate in candidates:
        cross_validate(candidate, dataset, measures=['RMSE', 'MAE'],
                       cv=5, verbose=True)
def benchmark(data):
    """Cross-validate a fixed list of algorithms and store a ranking.

    Each algorithm is run through 3-fold cross-validation (RMSE, MAE, FCP).
    The per-fold results are averaged, labelled with the algorithm's class
    name, indexed by that name, sorted by ascending ``test_rmse`` and handed
    to ``store_dataframe`` under 'Algorithm_Benchmark.csv'.

    Args:
        data: dataset passed to ``cross_validate``.
    """
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum(),
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm, data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3, verbose=False)
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        # Class name taken from the default object repr,
        # e.g. "<pkg.module.SVD object at ...>" -> "SVD".
        algorithm_name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        output = pd.concat(
            [output, pd.Series([algorithm_name], index=['Algorithm'])])
        performance.append(output)

    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
def test_als_reg_i_field():
    """Ensure the reg_i field is taken into account.

    Cross-validates ALS baselines with ``reg_i`` 0 and 10 on the
    module-level ``data`` / ``pkf`` and expects different test RMSE.
    """

    def rmse_with_reg_i(reg_i):
        options = {'method': 'als', 'reg_i': reg_i}
        model = BaselineOnly(bsl_options=options)
        return cross_validate(model, data, ['rmse'], pkf)['test_rmse']

    rmse_reg_i_zero = rmse_with_reg_i(0)
    rmse_reg_i_ten = rmse_with_reg_i(10)

    assert rmse_reg_i_zero != rmse_reg_i_ten
def __init__(self, train_data):
    """Set up the fixed list of base models and the full trainset.

    ``self.model_selection`` holds ``[name, estimator]`` pairs for an ALS
    baseline, SVD, co-clustering and an item-based cosine KNN.
    ``self.trainset`` is built from all of ``train_data``.
    """
    AlgoBase.__init__(self)

    baseline_model = BaselineOnly(bsl_options={
        'method': 'als',
        'n_epochs': 25,
        'reg_u': 5,
        'reg_i': 3,
    })
    svd_model = SVD(lr_all=0.01, n_epochs=25, reg_all=0.2)
    co_clustering_model = CoClustering(n_epochs=3, n_cltr_u=3, n_cltr_i=3)
    knn_model = KNNBasic(k=40,
                         sim_options={'name': 'cosine', 'user_based': False})

    self.model_selection = [
        ['baselineonly', baseline_model],
        ['svd', svd_model],
        ['coClustering', co_clustering_model],
        ['knn', knn_model],
    ]
    self.model_rmse = {}
    self.model_list = {}
    self.trainset = train_data.build_full_trainset()
def batchrunSVDpp(data, al, folds):
    '''
    Run one algorithm over a data file with n-fold cross-validation.

    Relies on the module-level ``path``, ``reader`` and ``bsl_options``.

    Args:
        data: data file name in string (appended to ``path``).
        al: algorithm name in string, either 'SVDpp' or 'Base'
            (``BaselineOnly``).
        folds: split the data into x folds for cross-validation, integer.
    Returns:
        None
    Raises:
        ValueError: if ``al`` is not one of the supported names.
    '''
    # Single-argument, parenthesised print behaves the same on Python 2 and 3.
    # load the data with given data format
    print("load data...")
    data = Dataset.load_from_file(path + data, reader=reader)

    # split the data into x folds for cross-validation.
    print("Split data....")
    data.split(n_folds=folds)

    if al == 'SVDpp':
        algo = SVDpp()
    elif al == 'Base':
        algo = BaselineOnly(bsl_options=bsl_options)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``algo``.
        raise ValueError(
            "unknown algorithm name %r (expected 'SVDpp' or 'Base')" % (al,))

    # Evaluate performances of the algorithm on the dataset.
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def check_for_args():
    """Append an algorithm instance for each recognised command-line word.

    Every entry of ``sys.argv`` that exactly matches a supported algorithm
    name adds a default-constructed instance to the module-level
    ``alg_list``; other entries are ignored.

    Returns:
        The module-level ``alg_list``.
    """
    constructors = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }
    for word in sys.argv:
        constructor = constructors.get(word)
        if constructor is not None:
            alg_list.append(constructor())
    return alg_list
def crossvalidate(data):
    """Compare a fixed set of algorithms by 5-fold cross-validated RMSE.

    The KNN variants use ``k=15`` with the options returned by
    ``similarity_measure('pearson', 1)``; the other algorithms use their
    defaults.

    Args:
        data: dataset passed to ``cross_validate``.

    Returns:
        pandas.DataFrame: one row per algorithm holding the fold-averaged
        cross-validation results, indexed by algorithm class name and
        sorted by ascending ``test_rmse``.
    """
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(),
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            CoClustering(),
    ]:
        result = cross_validate(algorithm, data, measures=['RMSE'],
                                cv=5, verbose=False)
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        # Class name taken from the default object repr,
        # e.g. "<pkg.module.SVD object at ...>" -> "SVD".
        algorithm_name = str(algorithm).split(' ')[0].split(".")[-1]
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        temp = pd.concat(
            [temp, pd.Series([algorithm_name], index=['Algorithm'])])
        results.append(temp)

    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
def EvaluateDifferentAlgorithms():
    """Print a 3-fold cross-validated RMSE ranking of stock algorithms.

    Every algorithm is cross-validated on the module-level ``data_6months``.
    The fold-averaged results are labelled with the algorithm's class name
    and printed as a table indexed by that name, sorted by ascending
    ``test_rmse``.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering(),
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, data_6months, measures=['RMSE'],
                                 cv=3, verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        algorithm_name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        tmp = pd.concat(
            [tmp, pd.Series([algorithm_name], index=['Algorithm'])])
        benchmark.append(tmp)

    print(
        pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse'))
def test_method_field(u1_ml100k, pkf):
    """Ensure the method field is taken into account.

    ALS and SGD must give different test RMSE, and an unknown method name
    must raise ``ValueError``.
    """

    def rmse_with_method(method):
        model = BaselineOnly(bsl_options={'method': method})
        return cross_validate(model, u1_ml100k, ['rmse'], pkf)['test_rmse']

    rmse_als = rmse_with_method('als')
    rmse_sgd = rmse_with_method('sgd')
    assert rmse_als != rmse_sgd

    with pytest.raises(ValueError):
        rmse_with_method('wrong_name')
def _hyperopt(self, params):
    """Return the mean cross-validated score of one parameter set.

    ``params`` is unpacked as keyword arguments of ``BaselineOnly``.  The
    model is cross-validated on ``self._data`` with the module-level
    ``ACCURACY_METRICS`` and the instance's cv / n_jobs / verbosity
    settings; the mean of the ``self._metric`` entry is returned.
    """
    candidate = BaselineOnly(**params)
    scores = cross_validate(candidate,
                            self._data,
                            measures=ACCURACY_METRICS,
                            cv=self._cv,
                            n_jobs=self._cv_n_jobs,
                            verbose=self._debug)
    return scores[self._metric].mean()
def test_dump(u1_ml100k):
    """Round-trip predictions and a fitted algorithm through ``dump``.

    Fits ``BaselineOnly`` on the first predefined fold of ``u1_ml100k``,
    dumps the predictions together with the algorithm, reloads both, and
    checks that the reloaded predictions and the predictions recomputed by
    the reloaded algorithm equal the original ones.
    """
    random.seed(0)

    folds = PredefinedKFold().split(u1_ml100k)
    trainset, testset = next(folds)

    model = BaselineOnly()
    model.fit(trainset)
    expected = model.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, expected, model)
        reloaded_predictions, reloaded_model = dump.load(tmp_file.name)

        recomputed = reloaded_model.test(testset)
        assert expected == reloaded_predictions
        assert expected == recomputed
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods.

    Loads the custom train/test fold next to this file and checks the
    trainset's rating containers, counts, id conversions, and the
    ``build_testset`` / ``build_anti_testset`` outputs.  Statement order
    matters: the inner->raw caches are asserted to be ``None`` before the
    first ``to_raw_*`` call and non-``None`` afterwards.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    fold_paths = [(here + '/custom_train', here + '/custom_test')]
    dataset = Dataset.load_from_folds(folds_files=fold_paths,
                                      reader=toy_data_reader,
                                      rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(dataset.folds())

    # test ur
    user_ratings = trainset.ur
    assert user_ratings[0] == [(0, 4)]
    assert user_ratings[1] == [(0, 4), (1, 2)]
    assert user_ratings[40] == []  # not in the trainset

    # test ir
    item_ratings = trainset.ir
    assert item_ratings[0] == [(0, 4), (1, 4), (2, 1)]
    assert item_ratings[1] == [(1, 2), (2, 1), (3, 5)]
    assert item_ratings[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for inner_uid in range(4):
        raw_uid = 'user' + str(inner_uid)
        assert trainset.to_inner_uid(raw_uid) == inner_uid
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')

    for inner_iid in range(2):
        raw_iid = 'item' + str(inner_iid)
        assert trainset.to_inner_iid(raw_iid) == inner_iid
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # test inner2raw: caches are unset until the first to_raw_* call
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for inner_uid in range(4):
        assert trainset.to_raw_uid(inner_uid) == 'user' + str(inner_uid)
    for inner_iid in range(2):
        assert trainset.to_raw_iid(inner_iid) == 'item' + str(inner_iid)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    model = BaselineOnly()
    model.fit(trainset)
    testset = trainset.build_testset()
    model.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    model = BaselineOnly()
    model.fit(trainset)
    testset = trainset.build_anti_testset()
    model.test(testset)  # ensure an algorithm can manage the data
    fill = trainset.global_mean
    assert ('user0', 'item0', fill) not in testset
    assert ('user3', 'item1', fill) not in testset
    assert ('user0', 'item1', fill) in testset
    assert ('user3', 'item0', fill) in testset