def test_graph_creation_exo_missing(self):
    # Graph creation must still succeed when the requested exogenous
    # representations and/or properties are names that are not expected to
    # exist: user and item nodes are created, property nodes are not.

    # Load the ratings into a DataFrame
    importer = RatingsImporter(
        source=CSVFile(ratings_filename),
        from_id_column='user_id',
        to_id_column='item_id',
        score_column='points',
        timestamp_column='timestamp',
        score_processor=NumberNormalizer(),
    )
    ratings_frame = importer.import_ratings()

    # One entry per scenario; each entry is the set of exo keyword arguments
    # handed to NXFullGraph on top of the common ones.
    exo_settings = (
        # non-existent exo_properties only
        {
            'item_exo_properties': ['asdds', 'dsdds'],
            'user_exo_properties': ['vvvv'],
        },
        # non-existent exo_representations only
        {
            'item_exo_representation': "asdsa",
            'user_exo_representation': "dsdssd",
        },
        # non-existent exo_representations AND non-existent exo_properties
        {
            'user_exo_representation': 'not_exist',
            'item_exo_representation': 'not_Exist2',
            'item_exo_properties': ["asdsa"],
            'user_exo_properties': ["dsdssd"],
        },
    )

    for settings in exo_settings:
        graph = NXFullGraph(
            source_frame=ratings_frame,
            item_contents_dir=movies_dir,
            user_contents_dir=user_dir,
            **settings,
        )

        # The graph exists (users and items are there) but nothing could be
        # attached as a property node.
        self.assertGreater(len(graph.user_nodes), 0)
        self.assertGreater(len(graph.item_nodes), 0)
        self.assertEqual(len(graph.property_nodes), 0)
def test_graph_creation(self):
    # Exercise several ways of building a NXFullGraph from the same ratings.

    def check_users_and_items(graph):
        # Minimal sanity check: the graph holds at least one user and one item.
        self.assertGreater(len(graph.user_nodes), 0)
        self.assertGreater(len(graph.item_nodes), 0)

    # Load the ratings into a DataFrame
    importer = RatingsImporter(
        source=CSVFile(ratings_filename),
        from_id_column='user_id',
        to_id_column='item_id',
        score_column='points',
        timestamp_column='timestamp',
        score_processor=NumberNormalizer(),
    )
    ratings_frame = importer.import_ratings()

    # 1) Properties given, representation left unset: 'producer' and
    #    'starring' are looked up across every exo representation, since
    #    several representations may expose the same property.
    by_properties = NXFullGraph(
        source_frame=ratings_frame,
        item_contents_dir=movies_dir,
        user_contents_dir=user_dir,
        item_exo_properties=['producer', 'starring'],
        # '1' is the column of the users DAT file identifying the gender
        user_exo_properties=['1'],
    )
    check_users_and_items(by_properties)
    self.assertGreater(len(by_properties.property_nodes), 0)

    # 2) Representation given, properties left unset: per the original
    #    note, every exo property of that representation is retrieved.
    by_representation = NXFullGraph(
        source_frame=ratings_frame,
        item_contents_dir=movies_dir,
        user_contents_dir=user_dir,
        item_exo_representation="dbpedia",
        user_exo_representation="local",
    )
    check_users_and_items(by_representation)
    self.assertGreater(len(by_representation.property_nodes), 0)

    # 3) Ratings only: no exogenous information requested, so the graph
    #    must not contain any property node.
    ratings_only = NXFullGraph(ratings_frame)
    check_users_and_items(ratings_only)
    self.assertEqual(len(ratings_only.property_nodes), 0)
def setUp(self) -> None:
    # Build the fixtures shared by the tests of this case: a small ratings
    # frame, two graphs built from it with different exogenous settings, the
    # items that can be recommended to user '4', and the labels of the
    # property expected to be the least important one.
    contents_path = os.path.join(root_path, 'contents')
    movies_dir = os.path.join(contents_path, 'movies_codified/')
    user_dir = os.path.join(contents_path, 'users_codified/')

    self.df = pd.DataFrame.from_dict({
        'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
        'to_id': [
            "tt0113228", "tt0113041", "tt0113228", "tt0112346",
            "tt0112453", "tt0112453", "tt0112346", "tt0112453"
        ],
        'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]
    })

    # Item representation fixed to 'dbpedia', item properties left to None:
    # all properties of the dbpedia item representation are extracted.
    self.g_None_item_prop: NXFullGraph = NXFullGraph(
        self.df,
        user_contents_dir=user_dir,
        item_contents_dir=movies_dir,
        item_exo_representation='dbpedia',
        user_exo_representation='local',
        item_exo_properties=None,
        user_exo_properties=['1'])

    # Item properties fixed, item representation left to None: the listed
    # properties are extracted from all representations.
    self.g_None_item_repr: NXFullGraph = NXFullGraph(
        self.df,
        user_contents_dir=user_dir,
        item_contents_dir=movies_dir,
        item_exo_representation=None,
        user_exo_representation='local',
        item_exo_properties=[
            'starring', 'editing', 'producer', 'production_company'
        ],
        user_exo_properties=['1'])

    # NOTE: `user_4_items` is referenced by name inside the second query
    # string (`@user_4_items`), which pandas resolves against the local
    # variables of this function, so the variable must keep this name.
    user_4_items = list(set(self.df.query("from_id == '4'")['to_id']))
    recommendable_items_for_user_4 = list(
        set(self.df.query("to_id not in @user_4_items")['to_id']))
    self.target_item_nodes = recommendable_items_for_user_4

    # production_company is the label that appears less often, so, since
    # the example graph is pretty simple, the result of feature selection
    # algorithms should not include it.
    self.less_important_property_label_None_prop = 'production_company'

    # Same property for the graph built with item_exo_representation=None.
    # The label was previously misspelled ('production_comapny'), so it
    # could never match a real label. NOTE(review): the '#0dbpedia' suffix
    # is presumably how labels are disambiguated when no representation is
    # given - confirm against the graph implementation.
    self.less_important_property_label_None_repr = 'production_company#0dbpedia'
def test_populate_from_dataframe_w_labels(self):
    # When the source frame carries a 'label' column, every user-item link
    # of the resulting graph must expose that label together with the score
    # stored as 'weight'.
    raters = ["1", "1", "2", "2", "2", "3", "4", "4"]
    rated_items = [
        "tt0112281", "tt0112302", "tt0112281", "tt0112346",
        "tt0112453", "tt0112453", "tt0112346", "tt0112453",
    ]
    scores = [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]

    df_label = pd.DataFrame.from_dict({
        'from_id': raters,
        'to_id': rated_items,
        'score': scores,
        # the same label for each of the eight interactions
        'label': ['score_df'] * 8,
    })

    g: NXFullGraph = NXFullGraph(
        df_label,
        user_contents_dir=user_dir,
        item_contents_dir=movies_dir,
        item_exo_representation="dbpedia",
        user_exo_representation="local",
        item_exo_properties=['starring'],
        # '1' is the column of the users .DAT file identifying the gender
        user_exo_properties=['1'],
    )

    interactions = zip(df_label['from_id'], df_label['to_id'], df_label['score'])
    for user_id, item_id, weight in interactions:
        expected_link = {'label': 'score_df', 'weight': weight}
        actual_link = g.get_link_data(user_id, item_id)
        self.assertEqual(expected_link, actual_link)
def setUp(self) -> None:
    # Fixtures: a graph built from eight ratings plus one extra property
    # node ('Nolan'), and the algorithm instance whose methods are tested.
    column_names = ["from_id", "to_id", "score", "timestamp"]
    records = [
        ("A000", "tt0114576", 1, "54654675"),
        ("A000", "tt0112453", -0.2, "54654675"),
        ("A001", "tt0114576", 0.8, "54654675"),
        ("A001", "tt0112896", -0.4, "54654675"),
        ("A000", "tt0113041", 0.6, "54654675"),
        ("A002", "tt0112453", -0.2, "54654675"),
        ("A002", "tt0113497", 0.5, "54654675"),
        ("A003", "tt0112453", -0.8, "54654675"),
    ]
    ratings = pd.DataFrame.from_records(records, columns=column_names)

    self.graph = NXFullGraph(ratings)
    self.graph.add_property_node('Nolan')

    # Per the original note the algorithm base class is abstract, so a
    # concrete subclass (NXPageRank) is instantiated to reach its methods.
    self.alg = NXPageRank()
def setUp(self) -> None:
    # Fixtures: a small ratings frame, two graphs built from it with
    # different exogenous settings, and the user/item targets of the tests.
    contents_path = os.path.join(root_path, 'contents')
    content_dirs = {
        'user_contents_dir': os.path.join(contents_path, 'users_codified/'),
        'item_contents_dir': os.path.join(contents_path, 'movies_codified/'),
    }

    raters = ["1", "1", "2", "2", "2", "3", "4", "4"]
    rated_items = [
        "tt0113228", "tt0113041", "tt0113228", "tt0112346",
        "tt0112453", "tt0112453", "tt0112346", "tt0112453",
    ]
    scores = [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]
    self.df = pd.DataFrame.from_dict({
        'from_id': raters,
        'to_id': rated_items,
        'score': scores,
    })

    # Representation fixed to 'dbpedia', no property list: all properties
    # of the dbpedia item representation are extracted.
    self.g_None_item_prop: NXFullGraph = NXFullGraph(
        self.df,
        **content_dirs,
        item_exo_representation='dbpedia',
        user_exo_representation='local',
        item_exo_properties=None,
        user_exo_properties=['1'],
    )

    # Property list fixed, no representation: the listed properties are
    # extracted from all representations.
    wanted_item_properties = [
        'starring', 'editing', 'producer', 'production_company'
    ]
    self.g_None_item_repr: NXFullGraph = NXFullGraph(
        self.df,
        **content_dirs,
        item_exo_representation=None,
        user_exo_representation='local',
        item_exo_properties=wanted_item_properties,
        user_exo_properties=['1'],
    )

    # `user_4_items` is looked up by name from the query string below
    # ('@user_4_items'), so this local must keep exactly this name.
    user_4_items = list(set(self.df.query("from_id == '4'")['to_id']))
    not_rated_by_user_4 = self.df.query("to_id not in @user_4_items")

    # Every distinct user is a target; target items are the ones user '4'
    # has not rated yet.
    self.target_user_nodes = list(set(self.df['from_id']))
    self.target_item_nodes = list(set(not_rated_by_user_4['to_id']))
def test_fit_graph_w_testrating_methodology(self):
    # Fit an evaluation model on a graph-based recommender without passing
    # an explicit methodology; both returned results must be DataFrames.
    graph = NXFullGraph(ratings)
    recommender = GraphBasedRS(NXPageRank(), graph)

    eval_model = EvalModel(
        recommender,
        KFoldPartitioning(),
        metric_list=[Precision()],
    )

    sys_result, users_result = eval_model.fit()

    for outcome in (sys_result, users_result):
        self.assertIsInstance(outcome, pd.DataFrame)
def setUp(self) -> None:
    # Fixtures: eight ratings, the graph built from them and a list of
    # three item ids used as filter list by the tests.
    column_names = ["from_id", "to_id", "score", "timestamp"]
    records = [
        ("A000", "tt0114576", 1, "54654675"),
        ("A000", "tt0112453", 0.5, "54654675"),
        ("A001", "tt0114576", 0.5, "54654675"),
        ("A001", "tt0112896", 0, "54654675"),
        ("A000", "tt0113041", 0.75, "54654675"),
        ("A002", "tt0112453", 0.5, "54654675"),
        ("A002", "tt0113497", 0.5, "54654675"),
        ("A003", "tt0112453", 0, "54654675"),
    ]

    self.ratings = pd.DataFrame.from_records(records, columns=column_names)
    self.graph = NXFullGraph(self.ratings)
    self.filter_list = ['tt0114576', 'tt0112453', 'tt0113497']
def setUp(self) -> None:
    # Fixtures: a small ratings frame and a full graph built from it with
    # the 'film director' item property and the '1' user property.
    raters = ["1", "1", "2", "2", "2", "3", "4", "4"]
    rated_items = [
        "tt0112281", "tt0112302", "tt0112281", "tt0112346",
        "tt0112453", "tt0112453", "tt0112346", "tt0112453",
    ]
    scores = [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]

    self.df = pd.DataFrame.from_dict({
        'from_id': raters,
        'to_id': rated_items,
        'score': scores,
    })

    self.g: NXFullGraph = NXFullGraph(
        self.df,
        user_contents_dir=user_dir,
        item_contents_dir=movies_dir,
        item_exo_representation="dbpedia",
        user_exo_representation='local',
        item_exo_properties=['film director'],
        # '1' is the column of the users .DAT file identifying the gender
        user_exo_properties=['1'],
    )
def test_graph(self):
    # End-to-end run: evaluate a PageRank graph-based recommender with a
    # wide list of metrics. The test only checks that fit() completes.
    catalog = set(ratings.to_id)

    users_dir = os.path.join(dir_test_files, 'complex_contents', 'users_codified/')

    graph = NXFullGraph(
        ratings,
        user_contents_dir=users_dir,
        item_contents_dir=items_dir,
        item_exo_representation="dbpedia",
        user_exo_representation='local',
        item_exo_properties=['starring'],
        # '1' is the column of the users .DAT file identifying the gender
        user_exo_properties=['1'],
    )

    graph_rs = GraphBasedRS(NXPageRank(), graph)

    partitioning = KFoldPartitioning()

    metrics = [
        Precision(relevant_threshold=3),
        Recall(),
        FMeasure(beta=1),
        FMeasure(beta=2, sys_average='micro'),
        MRR(),
        Correlation('pearson'),
        GiniIndex(),
        DeltaGap({'popular': 0.5, 'niche': 0.5}),
        PredictionCoverage(catalog),
        PopProfileVsRecs(
            user_groups={'popular': 0.5, 'niche': 0.5},
            out_dir='plots/',
        ),
        LongTailDistr('plots/', format='svg'),
        PopRecsCorrelation('plots/'),
    ]

    em = EvalModel(
        graph_rs,
        partitioning,
        metric_list=metrics,
        verbose_predictions=True,
        methodology=TestItemsMethodology(),
    )

    em.fit()
def test_fit_graph_w_allitems_methodology(self):
    # Fit an evaluation model using AllItemsMethodology; both returned
    # results must be DataFrames.
    graph = NXFullGraph(ratings)
    recommender = GraphBasedRS(NXPageRank(), graph)

    # Item ids are the names, stripped of their last extension, of the
    # regular files inside items_dir whose name ends with 'xz'.
    items = {
        os.path.splitext(entry)[0]
        for entry in os.listdir(items_dir)
        if os.path.isfile(os.path.join(items_dir, entry)) and entry.endswith('xz')
    }

    eval_model = EvalModel(
        recommender,
        KFoldPartitioning(),
        metric_list=[Precision()],
        methodology=AllItemsMethodology(items),
    )

    sys_result, users_result = eval_model.fit()

    for outcome in (sys_result, users_result):
        self.assertIsInstance(outcome, pd.DataFrame)
def test_page_rank_with_feature_selection(self):
    # PageRank is run with the NXTopKPageRank feature selection algorithm.
    # Feature selection has its own dedicated test file, so here we only
    # check that ranking works when a feature selection algorithm is set.
    complex_contents = ('complex_contents',)
    movies_dir = os.path.join(dir_test_files, *complex_contents, 'movies_codified/')
    user_dir = os.path.join(dir_test_files, *complex_contents, 'users_codified/')

    df = pd.DataFrame.from_dict({
        'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
        'to_id': [
            "tt0113228", "tt0113041", "tt0113228", "tt0112346",
            "tt0112453", "tt0112453", "tt0112346", "tt0112453",
        ],
        'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7],
    })

    # item_exo_properties is None: no explicit subset of the dbpedia item
    # properties is requested.
    graph_with_properties: NXFullGraph = NXFullGraph(
        df,
        user_contents_dir=user_dir,
        item_contents_dir=movies_dir,
        item_exo_representation='dbpedia',
        user_exo_representation='local',
        item_exo_properties=None,
        user_exo_properties=['1'],
    )

    # (algorithm kwargs, rank kwargs, expected number of ranked items).
    # User '4' rated two of the four items in df, so two remain rankable.
    scenarios = (
        # fs + standard algorithm
        ({}, {}, 2),
        # fs + personalized algorithm
        ({'personalized': True}, {}, 2),
        # fs + personalized algorithm + filter list
        ({'personalized': True}, {'filter_list': ['tt0113228']}, 1),
        # fs + personalized algorithm + empty filter list
        ({'personalized': True}, {'filter_list': []}, 0),
    )

    for alg_kwargs, rank_kwargs, expected_size in scenarios:
        alg = NXPageRank(**alg_kwargs, feature_selection=NXTopKPageRank())
        ranking = alg.rank('4', graph_with_properties, **rank_kwargs)
        self.assertEqual(len(ranking), expected_size)
def test_calc_rank_graph_based(self):
    # Compute predictions for a graph-based recommender when only a
    # ranking metric is requested.
    graph = NXFullGraph(self.ratings_original)
    recsys = GraphBasedRS(NXPageRank(), graph)

    # A single metric of the RankingNeededMetric family is enough here
    metric_list = [NDCG()]

    calculator = PredictionCalculator(self.split_list, recsys)
    valid_metric = calculator.calc_predictions(self.test_items_list, metric_list)

    self.assertEqual(valid_metric, metric_list)

    # Ranking truths must have been collected ...
    self.assertGreater(len(RankingNeededMetric.rank_truth_list), 0)

    # ... while score truths are expected to be empty, since no
    # ScoresNeededMetric appears in the metric list.
    self.assertEqual(len(ScoresNeededMetric.score_truth_list), 0)
class TestGraphBasedAlgorithm(TestCase):
    # Tests for the helper methods (clean_result, filter_result,
    # extract_profile) reached through an NXPageRank instance.

    def setUp(self) -> None:
        # Fixtures: a graph built from eight ratings plus one extra property
        # node ('Nolan'), and the algorithm instance whose methods are tested.
        column_names = ["from_id", "to_id", "score", "timestamp"]
        records = [
            ("A000", "tt0114576", 1, "54654675"),
            ("A000", "tt0112453", -0.2, "54654675"),
            ("A001", "tt0114576", 0.8, "54654675"),
            ("A001", "tt0112896", -0.4, "54654675"),
            ("A000", "tt0113041", 0.6, "54654675"),
            ("A002", "tt0112453", -0.2, "54654675"),
            ("A002", "tt0113497", 0.5, "54654675"),
            ("A003", "tt0112453", -0.8, "54654675"),
        ]
        ratings = pd.DataFrame.from_records(records, columns=column_names)

        self.graph = NXFullGraph(ratings)
        self.graph.add_property_node('Nolan')

        # Per the original note the algorithm base class is abstract, so a
        # concrete subclass (NXPageRank) is instantiated to reach its methods.
        self.alg = NXPageRank()

    def test_clean_rank(self):
        # clean_result is called on the same rank with different flags; every
        # node in the rank has the same score so only membership matters.
        rank = {
            UserNode("A000"): 0.5,
            ItemNode("tt0114576"): 0.5,
            UserNode("A001"): 0.5,
            ItemNode("tt0113497"): 0.5,
            ItemNode("tt0112453"): 0.5,
            PropertyNode("Nolan"): 0.5,
        }

        # (extra keyword arguments, expected cleaned rank)
        cases = (
            # defaults: only item nodes survive (the expected rank holds the
            # single item not rated by A000)
            ({}, {"tt0113497": 0.5}),
            # user nodes are kept as well
            (
                {'remove_users': False},
                {"tt0113497": 0.5, "A001": 0.5, "A000": 0.5},
            ),
            # items rated by the user are kept as well
            (
                {'remove_profile': False},
                {'tt0112453': 0.5, 'tt0113497': 0.5, 'tt0114576': 0.5},
            ),
            # property nodes are kept as well
            (
                {'remove_properties': False},
                {'tt0113497': 0.5, 'Nolan': 0.5},
            ),
        )

        for extra_kwargs, expected in cases:
            cleaned = self.alg.clean_result(
                self.graph, rank, user_id="A000", **extra_kwargs)
            self.assertEqual(expected, cleaned)

    def test_filter_result(self):
        # filter_result keeps only the entries whose node is in the filter
        # list, whatever the node type.
        result_page_rank = {
            ItemNode("i1"): 0.8,
            ItemNode("i2"): 0.7,
            UserNode('u1'): 0.2,
            PropertyNode("p1"): 0.1,
        }

        # a single item
        only_item = self.alg.filter_result(result_page_rank, ['i1'])
        self.assertEqual({ItemNode("i1"): 0.8}, only_item)

        # a user and a property
        user_and_property = self.alg.filter_result(result_page_rank, ['u1', 'p1'])
        self.assertEqual(
            {UserNode('u1'): 0.2, PropertyNode("p1"): 0.1},
            user_and_property,
        )

        # nodes that are not in the rank: nothing is left
        nothing_left = self.alg.filter_result(result_page_rank, ['not exists', 'i20'])
        self.assertEqual({}, nothing_left)

    def test_extract_profile(self):
        # The profile of user A000 maps each item rated by A000 to its score.
        profile = self.alg.extract_profile(self.graph, "A000")

        rated_by_a000 = (
            ('tt0112453', -0.2),
            ('tt0113041', 0.6),
            ('tt0114576', 1.0),
        )

        # Plain string keys compare equal to the profile ...
        as_plain_ids = dict(rated_by_a000)
        self.assertEqual(as_plain_ids, profile)

        # ... and so do keys wrapped in the matching node type (ItemNode).
        as_item_nodes = {
            ItemNode(item_id): score for item_id, score in rated_by_a000
        }
        self.assertEqual(as_item_nodes, profile)

        # Keys wrapped in the wrong node type (UserNode) must NOT compare
        # equal, since the profile holds items, not users.
        as_user_nodes = {
            UserNode(item_id): score for item_id, score in rated_by_a000
        }
        self.assertNotEqual(as_user_nodes, profile)