def test__set_dataframe(self):
    """Setting an empty DataFrame on a 2-split ``KFoldPartitioning`` must raise ``PartitionError``."""
    no_rows = pd.DataFrame()
    partitioner = KFoldPartitioning(n_splits=2)

    # Callable form of assertRaises: invokes partitioner.set_dataframe(no_rows)
    # and fails the test unless PartitionError is raised.
    self.assertRaises(PartitionError, partitioner.set_dataframe, no_rows)
def test_iter(self):
    """Check that each (train, test) pair yielded by ``KFoldPartitioning`` partitions the data.

    For every fold, no row of the train frame may appear in the test frame
    (and vice versa), and the rows of the two frames taken together must be
    exactly the rows of ``original_frame``.
    """
    kf = KFoldPartitioning()
    kf.set_dataframe(original_frame)

    # The source rows are the same for every fold, so build the list once
    # instead of rebuilding it on each iteration.
    original_list = [list(row) for row in original_frame.itertuples(index=False)]

    n_folds = 0
    for train, test in kf:
        n_folds += 1

        train_list = [list(row) for row in train.itertuples(index=False)]
        test_list = [list(row) for row in test.itertuples(index=False)]

        # Check that train and test are a partition: filtering out the rows
        # shared with the other side must leave each side unchanged.
        train_not_in_test = [row for row in train_list if row not in test_list]
        self.assertCountEqual(train_list, train_not_in_test)  # Count so regardless of order

        test_not_in_train = [row for row in test_list if row not in train_list]
        self.assertCountEqual(test_list, test_not_in_train)  # Count so regardless of order

        # Check that the union of the two gives the original data
        union_list = train_list + test_list
        self.assertCountEqual(original_list, union_list)  # Count so regardless of order

    # Without this, a partitioner that yields nothing would pass the test
    # without a single assertion being executed.
    self.assertGreater(n_folds, 0)
def test_fit_graph_w_testrating_methodology(self):
    """Fit an ``EvalModel`` over a graph-based recommender without passing a ``methodology``.

    ``fit`` must return two objects, both of which are pandas DataFrames.
    """
    full_graph = NXFullGraph(ratings)
    recommender = GraphBasedRS(NXPageRank(), full_graph)
    eval_model = EvalModel(
        recommender,
        KFoldPartitioning(),
        metric_list=[Precision()],
    )

    system_scores, per_user_scores = eval_model.fit()

    for outcome in (system_scores, per_user_scores):
        self.assertIsInstance(outcome, pd.DataFrame)
def test_split__single_kfold(self):
    """``PartitionModule._split_single`` must produce one split per requested fold."""
    single_user_frame = pd.DataFrame.from_dict({
        'from_id': ["001", "001", "001", "001"],
        'to_id': ["iphone", "ps4", "ps5", "xbox"],
        'rating': [0.8, 0.7, -0.4, 1.0],
    })
    n_split = 2

    partition_module = PartitionModule(KFoldPartitioning(n_split))
    produced_splits = partition_module._split_single(single_user_frame)

    # Only the number of splits is checked here: the partitioning technique
    # itself is tested separately.
    self.assertEqual(len(produced_splits), n_split)
def test_split_all_kfold(self):
    """``PartitionModule.split_all`` over several users must return one split per fold."""
    ratings_frame = pd.DataFrame({
        'from_id': ["001", "001", "001", "001", "002", "002", "002", "003", "003"],
        'to_id': ["iphone", "ps4", "ps5", "xbox", "realme", "airpods", "ps4", "beats", "dvd"],
        'rating': [0.8, 0.7, -0.4, 1.0, 0.8, 0.7, -0.4, 1.0, 0.65],
    })
    n_split = 2

    partition_module = PartitionModule(KFoldPartitioning(n_split))
    # Every distinct user id found in the frame is passed as a user to split.
    user_ids = set(ratings_frame.from_id)
    produced_splits = partition_module.split_all(ratings_frame, user_ids)

    # Only the number of splits is checked here: the partitioning technique
    # itself is tested separately.
    self.assertEqual(len(produced_splits), n_split)
def test_fit_cb_w_testrating_methodology(self):
    """Fit an ``EvalModel`` over a content-based recommender without passing a ``methodology``.

    ``fit`` must return two objects, both of which are pandas DataFrames.
    """
    algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
    recommender = ContentBasedRS(algorithm, ratings, items_dir)
    eval_model = EvalModel(
        recommender,
        KFoldPartitioning(),
        metric_list=[Precision()],
    )

    system_scores, per_user_scores = eval_model.fit()

    for outcome in (system_scores, per_user_scores):
        self.assertIsInstance(outcome, pd.DataFrame)
def test_all(self):
    """Fit an ``EvalModel`` with a broad list of metrics on a content-based recommender.

    Ratings are imported from ``new_ratings.csv``; the catalog handed to the
    ``CatalogCoverage`` metrics is built from the file names found in
    ``items_dir``. As in the other ``fit`` tests, both results returned by
    ``fit`` must be pandas DataFrames.
    """
    ratings_filename = os.path.join(
        contents_path, '..', 'datasets', 'examples', 'new_ratings.csv')
    ratings_frame = RatingsImporter(CSVFile(ratings_filename)).import_ratings()

    rs = ContentBasedRS(
        LinearPredictor(
            {"Plot": ['tfidf', 'embedding']},
            SkLinearRegression(),
        ),
        ratings_frame,
        items_dir)

    # Item ids are the names (extension stripped) of the regular files in
    # items_dir whose name ends with 'xz'. A set comprehension avoids the
    # throwaway list that set([...]) would build.
    catalog = {
        os.path.splitext(f)[0]
        for f in os.listdir(items_dir)
        if os.path.isfile(os.path.join(items_dir, f)) and f.endswith('xz')
    }

    em = EvalModel(
        rs,
        KFoldPartitioning(),
        metric_list=[
            Precision(sys_average='micro'),
            PrecisionAtK(1, sys_average='micro'),
            RPrecision(),
            Recall(),
            RecallAtK(3, ),
            FMeasure(1, sys_average='macro'),
            FMeasureAtK(2, beta=1, sys_average='micro'),
            NDCG(),
            NDCGAtK(3),
            MRR(),
            MRRAtK(5, ),
            Correlation('pearson', top_n=5),
            Correlation('kendall', top_n=3),
            Correlation('spearman', top_n=4),
            MAE(),
            MSE(),
            RMSE(),
            CatalogCoverage(catalog),
            CatalogCoverage(catalog, k=2),
            CatalogCoverage(catalog, top_n=3),
            GiniIndex(),
            GiniIndex(top_n=3),
            DeltaGap({'primo': 0.5, 'secondo': 0.5}),
        ],
        methodology=TestItemsMethodology())

    # Previously the result was bound to an unused local, so the test
    # asserted nothing; check it the same way the sibling fit tests do.
    sys_result, users_result = em.fit()

    self.assertIsInstance(sys_result, pd.DataFrame)
    self.assertIsInstance(users_result, pd.DataFrame)
def test_graph(self):
    """Run an evaluation of a PageRank recommender built on a graph with exogenous properties.

    This is a smoke test: nothing is asserted on what ``fit`` returns, so
    the test only fails if building the graph, building the ``EvalModel``
    or fitting it raises.
    """
    item_catalog = set(ratings.to_id)
    users_contents = os.path.join(dir_test_files, 'complex_contents', 'users_codified/')

    full_graph = NXFullGraph(
        ratings,
        user_contents_dir=users_contents,
        item_contents_dir=items_dir,
        item_exo_representation="dbpedia",
        user_exo_representation='local',
        item_exo_properties=['starring'],
        # '1' is the column in the users .DAT which identifies the gender
        user_exo_properties=['1'],
    )
    recommender = GraphBasedRS(NXPageRank(), full_graph)

    # Collaborators are created in the same order in which the original
    # single call expression evaluated them.
    partitioning = KFoldPartitioning()
    metrics = [
        Precision(relevant_threshold=3),
        Recall(),
        FMeasure(beta=1),
        FMeasure(beta=2, sys_average='micro'),
        MRR(),
        Correlation('pearson'),
        GiniIndex(),
        DeltaGap({'popular': 0.5, 'niche': 0.5}),
        PredictionCoverage(item_catalog),
        PopProfileVsRecs(user_groups={'popular': 0.5, 'niche': 0.5}, out_dir='plots/'),
        LongTailDistr('plots/', format='svg'),
        PopRecsCorrelation('plots/'),
    ]
    methodology = TestItemsMethodology()

    eval_model = EvalModel(
        recommender,
        partitioning,
        metric_list=metrics,
        verbose_predictions=True,
        methodology=methodology,
    )
    eval_model.fit()
def test_fit_graph_w_allitems_methodology(self):
    """Fit an ``EvalModel`` over a graph-based recommender using ``AllItemsMethodology``.

    The item set given to the methodology is built from the file names found
    in ``items_dir``. ``fit`` must return two objects, both of which are
    pandas DataFrames.
    """
    graph = NXFullGraph(ratings)
    rs = GraphBasedRS(NXPageRank(), graph)

    # Item ids are the names (extension stripped) of the regular files in
    # items_dir whose name ends with 'xz'. A set comprehension avoids the
    # throwaway list that set([...]) would build.
    items = {
        os.path.splitext(f)[0]
        for f in os.listdir(items_dir)
        if os.path.isfile(os.path.join(items_dir, f)) and f.endswith('xz')
    }

    em = EvalModel(
        rs,
        KFoldPartitioning(),
        metric_list=[Precision()],
        methodology=AllItemsMethodology(items))

    sys_result, users_result = em.fit()

    self.assertIsInstance(sys_result, pd.DataFrame)
    self.assertIsInstance(users_result, pd.DataFrame)
def test_fit_cb_w_allitems_methodology(self):
    """Fit an ``EvalModel`` over a content-based recommender using ``AllItemsMethodology``.

    The item set given to the methodology is built from the file names found
    in ``items_dir``. ``fit`` must return two objects, both of which are
    pandas DataFrames.
    """
    rs = ContentBasedRS(
        CentroidVector(
            {"Plot": "tfidf"},
            CosineSimilarity(),
        ),
        ratings,
        items_dir)

    # Item ids are the names (extension stripped) of the regular files in
    # items_dir whose name ends with 'xz'. A set comprehension avoids the
    # throwaway list that set([...]) would build.
    items = {
        os.path.splitext(f)[0]
        for f in os.listdir(items_dir)
        if os.path.isfile(os.path.join(items_dir, f)) and f.endswith('xz')
    }

    em = EvalModel(
        rs,
        KFoldPartitioning(),
        metric_list=[Precision()],
        methodology=AllItemsMethodology(items))

    sys_result, users_result = em.fit()

    self.assertIsInstance(sys_result, pd.DataFrame)
    self.assertIsInstance(users_result, pd.DataFrame)
def test_all_skipping_user_exception(self):
    """``split_all`` must leave out a user that cannot be split and keep every other user.

    User ``003`` has a single rating in the input frame; it is expected to
    be missing from both train and test of every split, while users ``001``,
    ``002`` and ``004`` must appear in both.
    """
    ratings_frame = pd.DataFrame({
        'from_id': ["001", "001", "001", "001", "002", "002", "002", "003", "004", "004"],
        'to_id': ["iphone", "ps4", "ps5", "xbox", "realme", "airpods", "ps4", "beats", "ps4", "ps5"],
        'rating': [0.8, 0.7, -0.4, 1.0, 0.8, 0.7, -0.4, 1.0, 0.3, 0.6],
    })
    n_split = 2

    partition_module = PartitionModule(KFoldPartitioning(n_split))
    produced_splits = partition_module.split_all(ratings_frame, set(ratings_frame.from_id))

    # Only the number of splits is checked here: the partitioning technique
    # itself is tested separately.
    self.assertEqual(len(produced_splits), n_split)

    # Check that there are all users except 003, which is skipped since it
    # has only 1 rating. Each entry pairs a user id with the membership
    # assertion that must hold for it in both train and test.
    membership_checks = (
        ('001', self.assertIn),
        ('002', self.assertIn),
        ('003', self.assertNotIn),
        ('004', self.assertIn),
    )
    for one_split in produced_splits:
        for user_id, check in membership_checks:
            check(user_id, one_split.train['from_id'].values)
            check(user_id, one_split.test['from_id'].values)