def test_index_query(self):
    """Check that an IndexQuery recommender ranks but refuses to predict.

    Score prediction is expected to raise ``NotPredictionAlg`` both with and
    without a filter list. Ranking is exercised unrestricted, restricted to
    a filter list, and truncated to a fixed number of recommendations.
    """
    user_id = 'A000'
    top_n = 3
    candidate_items = ['tt0114319', 'tt0114388']
    index_path = os.path.join(dir_test_files, 'complex_contents', 'index/')

    recsys = ContentBasedRS(
        IndexQuery({'Plot': ['index_original', 'index_preprocessed']}),
        ratings,
        index_path,
    )

    # Unrestricted: predicting must fail, ranking must yield something.
    self.assertRaises(NotPredictionAlg, recsys.fit_predict, user_id)
    full_ranking = recsys.fit_rank(user_id)
    self.assertGreater(len(full_ranking), 0)

    # Restricted to the candidate items: same expectations as above.
    self.assertRaises(
        NotPredictionAlg,
        recsys.fit_predict,
        user_id,
        filter_list=candidate_items,
    )
    filtered_ranking = recsys.fit_rank(user_id, filter_list=candidate_items)
    self.assertGreater(len(filtered_ranking), 0)

    # Top-n: exactly the requested number of recommendations comes back.
    truncated_ranking = recsys.fit_rank(user_id, recs_number=top_n)
    self.assertEqual(len(truncated_ranking), top_n)
def test_linear_predictor(self):
    """Check score prediction and ranking with a LinearPredictor recommender.

    With a filter list, both prediction and ranking must return one row per
    filtered item; a top-n ranking must return exactly the requested number
    of rows.
    """
    user_id = 'A000'
    top_n = 3

    recsys = ContentBasedRS(
        LinearPredictor({'Plot': ['tfidf', 'embedding']}, SkLinearRegression()),
        ratings,
        self.movies_multiple,
    )

    # Score prediction restricted to the filter list.
    predictions = recsys.fit_predict(user_id, filter_list=self.filter_list)
    self.assertEqual(len(predictions), len(self.filter_list))

    # Ranking restricted to the filter list.
    filtered_ranking = recsys.fit_rank(user_id, filter_list=self.filter_list)
    self.assertEqual(len(filtered_ranking), len(self.filter_list))

    # Top-n ranking over all items.
    truncated_ranking = recsys.fit_rank(user_id, recs_number=top_n)
    self.assertEqual(len(truncated_ranking), top_n)
def test_fit_cb_w_testrating_methodology(self):
    """Fit an EvalModel without an explicit methodology argument.

    The evaluation wraps a CentroidVector recommender and a single Precision
    metric; both values returned by ``fit`` must be pandas DataFrames.
    """
    algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
    recsys = ContentBasedRS(algorithm, ratings, items_dir)

    evaluation = EvalModel(
        recsys,
        KFoldPartitioning(),
        metric_list=[Precision()],
    )
    outcome = evaluation.fit()
    system_frame, per_user_frame = outcome

    self.assertIsInstance(system_frame, pd.DataFrame)
    self.assertIsInstance(per_user_frame, pd.DataFrame)
def test_all(self):
    """Run EvalModel end to end with every metric on a LinearPredictor system.

    Ratings are imported from the example ``new_ratings.csv`` file and the
    catalogue is the set of ``xz`` file names (without extension) found in
    ``items_dir``. The evaluation uses ``TestItemsMethodology`` and the
    result of ``fit`` is checked the same way the sibling EvalModel tests
    check it: a pair of pandas DataFrames.
    """
    ratings_filename = os.path.join(
        contents_path, '..', 'datasets', 'examples', 'new_ratings.csv')
    ratings_frame = RatingsImporter(CSVFile(ratings_filename)).import_ratings()

    rs = ContentBasedRS(
        LinearPredictor(
            {"Plot": ['tfidf', 'embedding']},
            SkLinearRegression(),
        ),
        ratings_frame,
        items_dir,
    )

    # Item ids are the serialized-content file names stripped of extension.
    catalog = {
        os.path.splitext(f)[0]
        for f in os.listdir(items_dir)
        if os.path.isfile(os.path.join(items_dir, f)) and f.endswith('xz')
    }

    em = EvalModel(
        rs,
        KFoldPartitioning(),
        metric_list=[
            Precision(sys_average='micro'),
            PrecisionAtK(1, sys_average='micro'),
            RPrecision(),
            Recall(),
            RecallAtK(3),
            FMeasure(1, sys_average='macro'),
            FMeasureAtK(2, beta=1, sys_average='micro'),
            NDCG(),
            NDCGAtK(3),
            MRR(),
            MRRAtK(5),
            Correlation('pearson', top_n=5),
            Correlation('kendall', top_n=3),
            Correlation('spearman', top_n=4),
            MAE(),
            MSE(),
            RMSE(),
            CatalogCoverage(catalog),
            CatalogCoverage(catalog, k=2),
            CatalogCoverage(catalog, top_n=3),
            GiniIndex(),
            GiniIndex(top_n=3),
            DeltaGap({'primo': 0.5, 'secondo': 0.5}),
        ],
        methodology=TestItemsMethodology(),
    )

    # Previously the result was stored and never inspected, so the test could
    # only fail on an exception; verify the returned structure as well.
    sys_result, users_result = em.fit()
    self.assertIsInstance(sys_result, pd.DataFrame)
    self.assertIsInstance(users_result, pd.DataFrame)
def test_multiple(self):
    """Check the multi-user prediction and ranking entry points.

    For every call the ``from_id`` column of the result must contain exactly
    the requested users, and each user must own the expected number of rows:
    one per filtered item, or ``top_n`` for the top-n ranking.
    """
    users = ['A000', 'A001']
    top_n = 3

    recsys = ContentBasedRS(
        LinearPredictor({'Plot': ['tfidf', 'embedding']}, SkLinearRegression()),
        ratings,
        self.movies_multiple,
    )

    # NOTE: the loop variable must stay named ``user`` because the query
    # strings below reference it through pandas' ``@user`` syntax.

    # Score prediction restricted to the filter list.
    predictions = recsys.multiple_fit_predict(users, filter_list=self.filter_list)
    self.assertEqual(set(users), set(predictions['from_id']))
    for user in users:
        user_rows = predictions.query('from_id == @user')
        self.assertEqual(len(user_rows), len(self.filter_list))

    # Ranking restricted to the filter list.
    filtered_ranking = recsys.multiple_fit_rank(users, filter_list=self.filter_list)
    self.assertEqual(set(users), set(filtered_ranking['from_id']))
    for user in users:
        user_rows = filtered_ranking.query('from_id == @user')
        self.assertEqual(len(user_rows), len(self.filter_list))

    # Top-n ranking over all items.
    truncated_ranking = recsys.multiple_fit_rank(users, recs_number=top_n)
    self.assertEqual(set(users), set(truncated_ranking['from_id']))
    for user in users:
        user_rows = truncated_ranking.query('from_id == @user')
        self.assertEqual(len(user_rows), top_n)
def test_calc_scores_content_based(self):
    """Check calc_predictions when only a score-based metric is requested.

    With a LinearPredictor recommender and ``MAE`` as the sole metric, the
    metric list must come back unchanged, the score truth list must be
    non-empty and the ranking truth list must be empty.
    """
    recsys = ContentBasedRS(
        LinearPredictor({'Plot': 'tfidf'}, SkLinearRegression()),
        self.ratings_original,
        movies_dir,
    )

    # A single ScoresNeededMetric is enough for this scenario.
    requested_metrics = [MAE()]

    calculator = PredictionCalculator(self.split_list, recsys)
    accepted_metrics = calculator.calc_predictions(
        self.test_items_list, requested_metrics)

    self.assertEqual(accepted_metrics, requested_metrics)
    self.assertGreater(len(ScoresNeededMetric.score_truth_list), 0)
    # No RankingNeededMetric was requested, so nothing is expected here.
    self.assertEqual(len(RankingNeededMetric.rank_truth_list), 0)
def test_pop_invalid_metric(self):
    """Check that metrics the recommender cannot serve are dropped.

    The recommender is a ClassifierRecommender, which this test treats as a
    ranking-only algorithm. First only ``MAE`` is requested and nothing is
    expected to survive; then ``MAE`` and ``NDCG`` are requested together
    and only ``NDCG`` is expected to survive.
    """
    recsys = ContentBasedRS(
        ClassifierRecommender({'Plot': 'tfidf'}, SkKNN(), threshold=3),
        self.ratings_original,
        movies_dir,
    )

    # Scenario 1: a score metric alone. It is discarded, and since no other
    # metric remains neither truth list is expected to hold anything.
    score_only = [MAE()]
    calculator = PredictionCalculator(self.split_list, recsys)
    accepted = calculator.calc_predictions(self.test_items_list, score_only)

    self.assertEqual(len(accepted), 0)
    self.assertEqual(len(ScoresNeededMetric.score_truth_list), 0)
    self.assertEqual(len(RankingNeededMetric.rank_truth_list), 0)

    # Scenario 2: a score metric next to a ranking metric. MAE is discarded
    # while NDCG is kept, so only ranking predictions are expected.
    ranking_metric = NDCG()
    score_metric = MAE()
    mixed = [score_metric, ranking_metric]
    calculator = PredictionCalculator(self.split_list, recsys)
    accepted = calculator.calc_predictions(self.test_items_list, mixed)

    self.assertIn(ranking_metric, accepted)
    self.assertNotIn(score_metric, accepted)
    self.assertEqual(len(ScoresNeededMetric.score_truth_list), 0)
    self.assertGreater(len(RankingNeededMetric.rank_truth_list), 0)
def test_fit_cb_w_allitems_methodology(self):
    """Fit an EvalModel that uses AllItemsMethodology.

    The methodology receives the set of ``xz`` file names (without
    extension) found in ``items_dir``. Both values returned by ``fit`` must
    be pandas DataFrames.
    """
    algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
    recsys = ContentBasedRS(algorithm, ratings, items_dir)

    # Collect the item ids from the regular files ending in 'xz'.
    all_items = set()
    for entry in os.listdir(items_dir):
        if not os.path.isfile(os.path.join(items_dir, entry)):
            continue
        if not entry.endswith('xz'):
            continue
        all_items.add(os.path.splitext(entry)[0])

    evaluation = EvalModel(
        recsys,
        KFoldPartitioning(),
        metric_list=[Precision()],
        methodology=AllItemsMethodology(all_items),
    )
    system_frame, per_user_frame = evaluation.fit()

    self.assertIsInstance(system_frame, pd.DataFrame)
    self.assertIsInstance(per_user_frame, pd.DataFrame)
def test_calc_rank_content_based(self):
    """Check calc_predictions when only a ranking-based metric is requested.

    With a ClassifierRecommender and ``NDCG`` as the sole metric, the metric
    list must come back unchanged, the ranking truth list must be non-empty
    and the score truth list must be empty.
    """
    recsys = ContentBasedRS(
        ClassifierRecommender({'Plot': 'tfidf'}, SkKNN(), threshold=3),
        self.ratings_original,
        movies_dir,
    )

    # A single RankingNeededMetric is enough for this scenario.
    requested_metrics = [NDCG()]

    calculator = PredictionCalculator(self.split_list, recsys)
    accepted_metrics = calculator.calc_predictions(
        self.test_items_list, requested_metrics)

    self.assertEqual(accepted_metrics, requested_metrics)
    self.assertGreater(len(RankingNeededMetric.rank_truth_list), 0)
    # No ScoresNeededMetric was requested, so nothing is expected here.
    self.assertEqual(len(ScoresNeededMetric.score_truth_list), 0)
def test_classifier_recommender(self):
    """Check that a ClassifierRecommender system ranks but refuses to predict.

    Score prediction must raise ``NotPredictionAlg``. Ranking with a filter
    list must return one row per filtered item, and a top-n ranking must
    return exactly the requested number of rows.
    """
    user_id = 'A000'
    top_n = 3

    recsys = ContentBasedRS(
        ClassifierRecommender({'Plot': ['tfidf', 'embedding']}, SkSVC()),
        ratings,
        self.movies_multiple,
    )

    # The algorithm is not a score predictor, so predicting must fail.
    self.assertRaises(NotPredictionAlg, recsys.fit_predict, user_id)

    # Ranking restricted to the filter list.
    filtered_ranking = recsys.fit_rank(user_id, filter_list=self.filter_list)
    self.assertEqual(len(filtered_ranking), len(self.filter_list))

    # Top-n ranking over all items.
    truncated_ranking = recsys.fit_rank(user_id, recs_number=top_n)
    self.assertEqual(len(truncated_ranking), top_n)
def test_empty_frame(self):
    """Check that degenerate ratings produce an empty ranking.

    Three single-row ratings frames are used: one with score 5, one with
    score 1, and one whose item id is 'not exists' (presumably absent from
    the catalogue). ClassifierRecommender is expected to return an empty
    frame for all three, CentroidVector for the last two.
    """
    def single_rating(item_id, score):
        # One-row ratings frame for user A000 with a fixed timestamp.
        return pd.DataFrame.from_records(
            [("A000", item_id, score, "54654675")],
            columns=["from_id", "to_id", "score", "timestamp"])

    ratings_only_positive = single_rating("tt0114576", 5)
    ratings_only_negative = single_rating("tt0114576", 1)
    ratings_item_inexistent = single_rating("not exists", 1)

    # ClassifierRecommender returns an empty frame.
    classifier_cases = (
        ratings_only_positive,
        ratings_only_negative,
        ratings_item_inexistent,
    )
    for ratings_frame in classifier_cases:
        algorithm = ClassifierRecommender(
            {'Plot': ['tfidf', 'embedding']}, SkSVC(), threshold=3)
        recsys = ContentBasedRS(algorithm, ratings_frame, self.movies_multiple)
        ranking = recsys.fit_rank('A000')
        self.assertTrue(ranking.empty)

    # CentroidVector returns an empty frame.
    centroid_cases = (
        ratings_only_negative,
        ratings_item_inexistent,
    )
    for ratings_frame in centroid_cases:
        algorithm = CentroidVector(
            {'Plot': ['tfidf', 'embedding']}, CosineSimilarity(), threshold=3)
        recsys = ContentBasedRS(algorithm, ratings_frame, self.movies_multiple)
        ranking = recsys.fit_rank('A000')
        self.assertTrue(ranking.empty)