class TestContentBasedAlgorithm(TestCase):
    # Tests for the helper methods that are reached here through a
    # CentroidVector instance.

    def setUp(self) -> None:
        # ContentBasedAlgorithm is abstract, so its methods are exercised
        # through a concrete subclass. Nothing beyond construction is done,
        # since the methods tested here do not need any further initialization.
        field_representations = {'Plot': 'tfidf'}
        self.alg = CentroidVector(field_representations, CosineSimilarity(), 0)

    def test__bracket_representation(self):
        # Values that are not lists ('tfidf', 5) are expected to come back
        # wrapped in a list, while values that already are lists are
        # expected to come back unchanged.
        raw_fields = {
            'Plot': 'tfidf',
            'Genre': [0],
            'Title': [0, 'trybracket'],
            'Director': 5,
        }
        expected_fields = {
            'Plot': ['tfidf'],
            'Genre': [0],
            'Title': [0, 'trybracket'],
            'Director': [5],
        }

        self.assertEqual(expected_fields,
                         self.alg._bracket_representation(raw_fields))

    def test_extract_features_item(self):
        # Load the serialized item 'tt0112281' and extract its features with
        # the algorithm built in setUp: the result must hold exactly one
        # entry, and that entry must be a dict.
        codified_movies = os.path.join(contents_path, 'movies_codified/')
        movie = load_content_instance(codified_movies, 'tt0112281')

        features = self.alg.extract_features_item(movie)

        self.assertEqual(1, len(features))
        self.assertIsInstance(features[0], dict)
def test_rank_multiple_representations(self):
    # Several representations per field. No threshold is passed to the
    # constructor; per the original author's note the algorithm then uses an
    # automatic threshold based on the mean rating of the user.
    representations = {
        'Plot': ['tfidf', 'embedding'],
        "Genre": ['tfidf', 'embedding'],
        'imdbRating': [0],
    }
    alg = CentroidVector(representations, CosineSimilarity())

    ratings_a000 = self.ratings.query('from_id == "A000"')
    alg.process_rated(ratings_a000, self.movies_dir)
    alg.fit()

    def rated_items_in(ranking):
        # Items of the given ranking that the user had already rated
        already_rated = set(ratings_a000['to_id'])
        ranked = set(ranking['to_id'])
        return ranked.intersection(already_rated)

    # Ranking restricted to filter_list: exactly those items must be ranked
    filtered_ranking = alg.rank(ratings_a000, self.movies_dir,
                                filter_list=self.filter_list)
    filtered_ids = set(filtered_ranking['to_id'])
    self.assertEqual(len(filtered_ids), len(self.filter_list))
    self.assertCountEqual(filtered_ids, self.filter_list)

    # Unrestricted ranking: only unrated items may be ranked (rated items
    # can appear only when explicitly requested through a filter list)
    full_ranking = alg.rank(ratings_a000, self.movies_dir)
    self.assertEqual(len(rated_items_in(full_ranking)), 0)

    # Ranking limited to n_recs items: right size, and still no rated item
    n_recs = 5
    top_n_ranking = alg.rank(ratings_a000, self.movies_dir, n_recs)
    self.assertEqual(len(top_n_ranking), n_recs)
    self.assertEqual(len(rated_items_in(top_n_ranking)), 0)
def test_rank_single_representation(self):
    # One field with one representation, threshold explicitly set to 0
    representations = {'Genre': ['embedding']}
    alg = CentroidVector(representations, CosineSimilarity(), threshold=0)

    ratings_a000 = self.ratings.query('from_id == "A000"')
    alg.process_rated(ratings_a000, self.movies_dir)
    alg.fit()

    def rated_items_in(ranking):
        # Items of the given ranking that the user had already rated
        already_rated = set(ratings_a000['to_id'])
        ranked = set(ranking['to_id'])
        return ranked.intersection(already_rated)

    # Ranking restricted to filter_list: exactly those items must be ranked
    filtered_ranking = alg.rank(ratings_a000, self.movies_dir,
                                filter_list=self.filter_list)
    filtered_ids = set(filtered_ranking['to_id'])
    self.assertEqual(len(filtered_ids), len(self.filter_list))
    self.assertCountEqual(filtered_ids, self.filter_list)

    # Unrestricted ranking: only unrated items may be ranked (rated items
    # can appear only when explicitly requested through a filter list)
    full_ranking = alg.rank(ratings_a000, self.movies_dir)
    self.assertEqual(len(rated_items_in(full_ranking)), 0)

    # Ranking limited to n_recs items: right size, and still no rated item
    n_recs = 5
    top_n_ranking = alg.rank(ratings_a000, self.movies_dir, n_recs)
    self.assertEqual(len(top_n_ranking), n_recs)
    self.assertEqual(len(rated_items_in(top_n_ranking)), 0)
def test_empty_frame(self):
    # Each scenario below gives user 'A000' a single rating and expects
    # fit_rank() to hand back an empty frame.

    def one_rating_frame(item_id, score):
        # Ratings frame holding a single rating given by user 'A000'
        return pd.DataFrame.from_records(
            [("A000", item_id, score, "54654675")],
            columns=["from_id", "to_id", "score", "timestamp"])

    ratings_only_positive = one_rating_frame("tt0114576", 5)
    ratings_only_negative = one_rating_frame("tt0114576", 1)
    ratings_item_inexistent = one_rating_frame("not exists", 1)

    # ClassifierRecommender returns an empty frame for all three profiles
    classifier_cases = (ratings_only_positive,
                        ratings_only_negative,
                        ratings_item_inexistent)
    for ratings_frame in classifier_cases:
        alg = ClassifierRecommender({'Plot': ['tfidf', 'embedding']}, SkSVC(), threshold=3)
        rs = ContentBasedRS(alg, ratings_frame, self.movies_multiple)
        ranking = rs.fit_rank('A000')
        self.assertTrue(ranking.empty)

    # CentroidVector returns an empty frame for the negative-only profile
    # and for the profile whose only rated item does not exist
    centroid_cases = (ratings_only_negative, ratings_item_inexistent)
    for ratings_frame in centroid_cases:
        alg = CentroidVector({'Plot': ['tfidf', 'embedding']}, CosineSimilarity(), threshold=3)
        rs = ContentBasedRS(alg, ratings_frame, self.movies_multiple)
        ranking = rs.fit_rank('A000')
        self.assertTrue(ranking.empty)
def test_predict(self):
    # Fit a CentroidVector on the ratings of user 'A000', then check that
    # asking it for score predictions is rejected.
    representations = {'Genre': ['embedding']}
    alg = CentroidVector(representations, CosineSimilarity(), threshold=0)

    ratings_a000 = self.ratings.query('from_id == "A000"')
    alg.process_rated(ratings_a000, self.movies_dir)
    alg.fit()

    # CentroidVector is not a score prediction algorithm, so predict()
    # must raise NotPredictionAlg
    with self.assertRaises(NotPredictionAlg):
        alg.predict(ratings_a000, self.movies_dir)
def test_fit_cb_w_testrating_methodology(self):
    # Evaluate a content-based recommender with EvalModel, leaving the
    # methodology argument at its default, and check the type of both results.
    algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
    rs = ContentBasedRS(algorithm, ratings, items_dir)

    em = EvalModel(rs, KFoldPartitioning(), metric_list=[Precision()])
    sys_result, users_result = em.fit()

    # Both the system-wide and the per-user results must be DataFrames
    for outcome in (sys_result, users_result):
        self.assertIsInstance(outcome, pd.DataFrame)
def test_centroid_vector(self):
    # Prediction and ranking with the Centroid Vector algorithm, driven
    # through ContentBasedRS for user 'A000'.
    recs_number = 3

    centroid = CentroidVector({'Plot': ['tfidf', 'embedding']}, CosineSimilarity())
    rs = ContentBasedRS(centroid, ratings, self.movies_multiple)

    # Not a score prediction algorithm: fit_predict() must raise
    with self.assertRaises(NotPredictionAlg):
        rs.fit_predict('A000')

    # Ranking restricted to filter_list yields one row per listed item
    filtered_ranking = rs.fit_rank('A000', filter_list=self.filter_list)
    self.assertEqual(len(filtered_ranking), len(self.filter_list))

    # Top-n ranking yields exactly recs_number rows
    top_n_ranking = rs.fit_rank('A000', recs_number=recs_number)
    self.assertEqual(len(top_n_ranking), recs_number)
def test_raise_errors(self):
    # Each scenario replaces self.ratings with a single rating by user 'A000'
    # and expects process_rated() to raise the paired exception.
    scenarios = (
        # Only a negative rating is available
        ("tt0112281", -1, OnlyNegativeItems),
        # The rated item is not available locally
        ("non existent", 1, NoRatedItems),
    )

    for item_id, score, expected_error in scenarios:
        self.ratings = pd.DataFrame.from_records(
            [("A000", item_id, score, "54654675")],
            columns=["from_id", "to_id", "score", "timestamp"])

        alg = CentroidVector({'Plot': 'embedding'}, CosineSimilarity(), 0)
        ratings_a000 = self.ratings.query('from_id == "A000"')

        with self.assertRaises(expected_error):
            alg.process_rated(ratings_a000, self.movies_dir)
def test_fit_cb_w_allitems_methodology(self):
    # Evaluate a content-based recommender with EvalModel using an
    # AllItemsMethodology built from the item files found in items_dir.
    algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
    rs = ContentBasedRS(algorithm, ratings, items_dir)

    # Item ids are the file names, without extension, of the regular files
    # in items_dir whose name ends with 'xz'
    items = {
        os.path.splitext(filename)[0]
        for filename in os.listdir(items_dir)
        if os.path.isfile(os.path.join(items_dir, filename))
        and filename.endswith('xz')
    }

    em = EvalModel(rs, KFoldPartitioning(), metric_list=[Precision()],
                   methodology=AllItemsMethodology(items))
    sys_result, users_result = em.fit()

    # Both the system-wide and the per-user results must be DataFrames
    for outcome in (sys_result, users_result):
        self.assertIsInstance(outcome, pd.DataFrame)
def setUp(self) -> None:
    # ContentBasedAlgorithm is abstract, so its methods are exercised
    # through a concrete subclass. Nothing beyond construction is done,
    # since the methods tested here do not need any further initialization.
    field_representations = {'Plot': 'tfidf'}
    self.alg = CentroidVector(field_representations, CosineSimilarity(), 0)