def test_graph_creation_exo_missing(self):
        # Graph creation must still succeed when the requested exogenous
        # representations and/or properties do not exist: user and item nodes
        # are expected, property nodes are not.

        # Load the ratings CSV into a DataFrame
        importer = RatingsImporter(
            source=CSVFile(ratings_filename),
            from_id_column='user_id',
            to_id_column='item_id',
            score_column='points',
            timestamp_column='timestamp',
            score_processor=NumberNormalizer()
        )
        frame = importer.import_ratings()

        # Each entry holds the exo-related keyword arguments of one scenario;
        # none of the names below is meant to exist in the codified contents.
        missing_exo_scenarios = [
            # non-existent exo_properties only
            {
                'item_exo_properties': ['asdds', 'dsdds'],
                'user_exo_properties': ['vvvv'],
            },
            # non-existent exo_representations only
            {
                'item_exo_representation': "asdsa",
                'user_exo_representation': "dsdssd",
            },
            # non-existent exo_representations AND non-existent exo_properties
            {
                'user_exo_representation': 'not_exist',
                'item_exo_representation': 'not_Exist2',
                'item_exo_properties': ["asdsa"],
                'user_exo_properties': ["dsdssd"],
            },
        ]

        for exo_kwargs in missing_exo_scenarios:
            graph = NXFullGraph(
                source_frame=frame,
                item_contents_dir=movies_dir,
                user_contents_dir=user_dir,
                **exo_kwargs
            )

            # The graph is created from the ratings alone: users and items
            # are present, while no property node could be added
            self.assertGreater(len(graph.user_nodes), 0)
            self.assertGreater(len(graph.item_nodes), 0)
            self.assertEqual(len(graph.property_nodes), 0)
    def test_graph_creation(self):
        # Exercise the supported ways of building a full graph: by property
        # names, by representation names, and from the ratings alone.

        # Load the ratings CSV into a DataFrame
        frame = RatingsImporter(
            source=CSVFile(ratings_filename),
            from_id_column='user_id',
            to_id_column='item_id',
            score_column='points',
            timestamp_column='timestamp',
            score_processor=NumberNormalizer()
        ).import_ratings()

        def assert_users_and_items_present(graph):
            # Minimal sanity check that the graph was actually populated
            self.assertGreater(len(graph.user_nodes), 0)
            self.assertGreater(len(graph.item_nodes), 0)

        # Properties given, representation left unset.
        # E.g. 'producer' and 'starring' are looked up in every exo
        # representation, since several representations may contain the
        # same property.
        by_properties = NXFullGraph(
            source_frame=frame,
            item_contents_dir=movies_dir,
            user_contents_dir=user_dir,
            item_exo_properties=['producer', 'starring'],
            # '1' is the column of the users DAT file holding the gender
            user_exo_properties=['1']
        )
        assert_users_and_items_present(by_properties)
        self.assertGreater(len(by_properties.property_nodes), 0)

        # Representation given, properties left unset: ALL exo properties of
        # the chosen representation are retrieved.
        by_representation = NXFullGraph(
            source_frame=frame,
            item_contents_dir=movies_dir,
            user_contents_dir=user_dir,
            item_exo_representation="dbpedia",
            user_exo_representation="local"
        )
        assert_users_and_items_present(by_representation)
        self.assertGreater(len(by_representation.property_nodes), 0)

        # Neither contents directories nor exo arguments: ratings only,
        # so no property node is expected.
        ratings_only = NXFullGraph(frame)
        assert_users_and_items_present(ratings_only)
        self.assertEqual(len(ratings_only.property_nodes), 0)
示例#3
0
    def setUp(self) -> None:
        # Fixture: a small ratings frame, two graphs built from it with
        # different exo settings, the items recommendable to user '4', and
        # the property labels the tests treat as least important.
        contents_path = os.path.join(root_path, 'contents')
        movies_dir = os.path.join(contents_path, 'movies_codified/')
        user_dir = os.path.join(contents_path, 'users_codified/')

        self.df = pd.DataFrame.from_dict({
            'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
            'to_id': [
                "tt0113228", "tt0113041", "tt0113228", "tt0112346",
                "tt0112453", "tt0112453", "tt0112346", "tt0112453"
            ],
            'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]
        })

        # all properties from the dbpedia item repr extracted
        self.g_None_item_prop: NXFullGraph = NXFullGraph(
            self.df,
            user_contents_dir=user_dir,
            item_contents_dir=movies_dir,
            item_exo_representation='dbpedia',
            user_exo_representation='local',
            item_exo_properties=None,
            user_exo_properties=['1'])

        # all representations for the defined item properties are extracted
        self.g_None_item_repr: NXFullGraph = NXFullGraph(
            self.df,
            user_contents_dir=user_dir,
            item_contents_dir=movies_dir,
            item_exo_representation=None,
            user_exo_representation='local',
            item_exo_properties=[
                'starring', 'editing', 'producer', 'production_company'
            ],
            user_exo_properties=['1'])

        # Items user '4' has NOT rated yet are the only recommendable ones.
        # NOTE: the second query string references the local name
        # `user_4_items` through '@', so that local must keep its name.
        user_4_items = list(set(self.df.query("from_id == '4'")['to_id']))
        recommendable_items_for_user_4 = list(
            set(self.df.query("to_id not in @user_4_items")['to_id']))

        self.target_item_nodes = recommendable_items_for_user_4

        # production_company is the label that appears less often, so, since the example graph is pretty simple,
        # the result of feature selection algorithms should not include it
        self.less_important_property_label_None_prop = 'production_company'

        # Same property as above, in the form used when no item representation
        # is specified. The previous value misspelled it as
        # 'production_comapny', which could never match a real label.
        # NOTE(review): the '#0dbpedia' suffix is kept exactly as it was --
        # confirm it matches the label format produced by NXFullGraph.
        self.less_important_property_label_None_repr = 'production_company#0dbpedia'
    def test_populate_from_dataframe_w_labels(self):
        # When the source frame carries a 'label' column, every user-item
        # link is expected to expose that label together with its score
        # as the link weight.
        user_ids = ["1", "1", "2", "2", "2", "3", "4", "4"]
        item_ids = [
            "tt0112281", "tt0112302", "tt0112281", "tt0112346",
            "tt0112453", "tt0112453", "tt0112346", "tt0112453"
        ]
        scores = [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]

        df_label = pd.DataFrame.from_dict({
            'from_id': user_ids,
            'to_id': item_ids,
            'score': scores,
            # the same label for each of the eight ratings
            'label': ['score_df'] * 8
        })

        g: NXFullGraph = NXFullGraph(
            df_label,
            user_contents_dir=user_dir,
            item_contents_dir=movies_dir,
            item_exo_representation="dbpedia",
            user_exo_representation="local",
            item_exo_properties=['starring'],
            # '1' is the column of the users .DAT file holding the gender
            user_exo_properties=['1']
        )

        rating_rows = zip(df_label['from_id'], df_label['to_id'],
                          df_label['score'])
        for user_id, item_id, weight in rating_rows:
            self.assertEqual({'label': 'score_df', 'weight': weight},
                             g.get_link_data(user_id, item_id))
示例#5
0
    def setUp(self) -> None:
        # Fixture: a ratings-only graph with one extra property node, plus
        # the page rank algorithm instance used by the tests.
        rating_records = [
            ("A000", "tt0114576", 1, "54654675"),
            ("A000", "tt0112453", -0.2, "54654675"),
            ("A001", "tt0114576", 0.8, "54654675"),
            ("A001", "tt0112896", -0.4, "54654675"),
            ("A000", "tt0113041", 0.6, "54654675"),
            ("A002", "tt0112453", -0.2, "54654675"),
            ("A002", "tt0113497", 0.5, "54654675"),
            ("A003", "tt0112453", -0.8, "54654675"),
        ]
        ratings_frame = pd.DataFrame.from_records(
            rating_records,
            columns=["from_id", "to_id", "score", "timestamp"])

        self.graph = NXFullGraph(ratings_frame)

        # A property node is added by hand so that the tests can work with
        # all three kinds of node (user, item, property)
        self.graph.add_property_node('Nolan')

        # NXPageRank is the concrete algorithm through which the tested
        # methods are called
        self.alg = NXPageRank()
示例#6
0
    def setUp(self) -> None:
        # Fixture: a small ratings frame, two graphs built from it with
        # different exo settings, and the user/item targets used by the tests.
        movies_dir = os.path.join(root_path, 'contents', 'movies_codified/')
        user_dir = os.path.join(root_path, 'contents', 'users_codified/')

        rating_columns = {
            'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
            'to_id': [
                "tt0113228", "tt0113041", "tt0113228", "tt0112346",
                "tt0112453", "tt0112453", "tt0112346", "tt0112453"
            ],
            'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]
        }
        self.df = pd.DataFrame.from_dict(rating_columns)

        # item representation fixed to dbpedia, item properties left to None
        self.g_None_item_prop: NXFullGraph = NXFullGraph(
            self.df,
            user_contents_dir=user_dir,
            item_contents_dir=movies_dir,
            item_exo_representation='dbpedia',
            user_exo_representation='local',
            item_exo_properties=None,
            user_exo_properties=['1'])

        # item properties fixed, item representation left to None
        self.g_None_item_repr: NXFullGraph = NXFullGraph(
            self.df,
            user_contents_dir=user_dir,
            item_contents_dir=movies_dir,
            item_exo_representation=None,
            user_exo_representation='local',
            item_exo_properties=[
                'starring', 'editing', 'producer', 'production_company'
            ],
            user_exo_properties=['1'])

        # Items already rated by user '4' ...
        rated_by_user_4 = list(
            set(self.df.loc[self.df['from_id'] == '4', 'to_id']))
        # ... and every other item of the frame, i.e. the ones that can
        # still be recommended to that user
        is_unrated = ~self.df['to_id'].isin(rated_by_user_4)
        unrated_by_user_4 = list(set(self.df.loc[is_unrated, 'to_id']))

        all_users = list(set(self.df['from_id']))

        self.target_user_nodes = all_users
        self.target_item_nodes = unrated_by_user_4
    def test_fit_graph_w_testrating_methodology(self):
        # Evaluate a page rank graph recommender without passing an explicit
        # methodology and check that both results come back as DataFrames.
        full_graph = NXFullGraph(ratings)
        recsys = GraphBasedRS(NXPageRank(), full_graph)

        evaluation = EvalModel(recsys,
                               KFoldPartitioning(),
                               metric_list=[Precision()])

        # fit() is expected to return exactly two results
        sys_result, users_result = evaluation.fit()

        for outcome in (sys_result, users_result):
            self.assertIsInstance(outcome, pd.DataFrame)
    def setUp(self) -> None:
        # Fixture: ratings frame, the graph built from it and a list of item
        # ids to be used as filter list.
        rating_records = [
            ("A000", "tt0114576", 1, "54654675"),
            ("A000", "tt0112453", 0.5, "54654675"),
            ("A001", "tt0114576", 0.5, "54654675"),
            ("A001", "tt0112896", 0, "54654675"),
            ("A000", "tt0113041", 0.75, "54654675"),
            ("A002", "tt0112453", 0.5, "54654675"),
            ("A002", "tt0113497", 0.5, "54654675"),
            ("A003", "tt0112453", 0, "54654675"),
        ]
        column_names = ["from_id", "to_id", "score", "timestamp"]

        self.ratings = pd.DataFrame.from_records(rating_records,
                                                 columns=column_names)
        self.graph = NXFullGraph(self.ratings)

        self.filter_list = ['tt0114576', 'tt0112453', 'tt0113497']
    def setUp(self) -> None:
        # Fixture: ratings frame and a full graph built from it using the
        # 'film director' item property and the '1' user property.
        rating_columns = {
            'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
            'to_id': [
                "tt0112281", "tt0112302", "tt0112281", "tt0112346",
                "tt0112453", "tt0112453", "tt0112346", "tt0112453",
            ],
            'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7],
        }
        self.df = pd.DataFrame.from_dict(rating_columns)

        self.g: NXFullGraph = NXFullGraph(
            self.df,
            user_contents_dir=user_dir,
            item_contents_dir=movies_dir,
            item_exo_representation="dbpedia",
            user_exo_representation='local',
            item_exo_properties=['film director'],
            # '1' is the column of the users .DAT file holding the gender
            user_exo_properties=['1'],
        )
示例#10
0
    def test_graph(self):
        # Smoke test: run the evaluation of a graph based recommender with a
        # broad list of metrics. No result is asserted, the run just has to
        # complete.
        catalog = set(ratings.to_id)

        users_dir = os.path.join(dir_test_files, 'complex_contents',
                                 'users_codified/')

        full_graph = NXFullGraph(
            ratings,
            user_contents_dir=users_dir,
            item_contents_dir=items_dir,
            item_exo_representation="dbpedia",
            user_exo_representation='local',
            item_exo_properties=['starring'],
            # '1' is the column of the users .DAT file holding the gender
            user_exo_properties=['1']
        )

        recsys = GraphBasedRS(NXPageRank(), full_graph)

        metrics = [
            Precision(relevant_threshold=3),
            Recall(),
            FMeasure(beta=1),
            FMeasure(beta=2, sys_average='micro'),
            MRR(),
            Correlation('pearson'),
            GiniIndex(),
            DeltaGap({'popular': 0.5, 'niche': 0.5}),
            PredictionCoverage(catalog),
            PopProfileVsRecs(user_groups={'popular': 0.5, 'niche': 0.5},
                             out_dir='plots/'),
            LongTailDistr('plots/', format='svg'),
            PopRecsCorrelation('plots/'),
        ]

        evaluation = EvalModel(recsys,
                               KFoldPartitioning(),
                               metric_list=metrics,
                               verbose_predictions=True,
                               methodology=TestItemsMethodology())

        evaluation.fit()
示例#11
0
    def test_fit_graph_w_allitems_methodology(self):
        # Evaluate a page rank graph recommender with the AllItemsMethodology
        # and check that both results come back as DataFrames.
        full_graph = NXFullGraph(ratings)
        recsys = GraphBasedRS(NXPageRank(), full_graph)

        # Item ids are the names (extension stripped) of the regular files
        # in items_dir whose name ends with 'xz'
        all_item_ids = {
            os.path.splitext(filename)[0]
            for filename in os.listdir(items_dir)
            if os.path.isfile(os.path.join(items_dir, filename))
            and filename.endswith('xz')
        }

        evaluation = EvalModel(recsys,
                               KFoldPartitioning(),
                               metric_list=[Precision()],
                               methodology=AllItemsMethodology(all_item_ids))

        # fit() is expected to return exactly two results
        sys_result, users_result = evaluation.fit()

        for outcome in (sys_result, users_result):
            self.assertIsInstance(outcome, pd.DataFrame)
    def test_page_rank_with_feature_selection(self):
        # PageRank is run together with the NXTopKPageRank feature selection
        # algorithm. Feature selection has its own dedicated test file, so
        # here it is only checked that ranking works when a feature selection
        # algorithm is set, by looking at the size of the returned rank.

        complex_contents = os.path.join(dir_test_files, 'complex_contents')
        movies_dir = os.path.join(complex_contents, 'movies_codified/')
        user_dir = os.path.join(complex_contents, 'users_codified/')

        ratings_frame = pd.DataFrame.from_dict({
            'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
            'to_id': [
                "tt0113228", "tt0113041", "tt0113228", "tt0112346",
                "tt0112453", "tt0112453", "tt0112346", "tt0112453",
            ],
            'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7],
        })

        # dbpedia item representation with item_exo_properties=None
        # (presumably every property of that representation is extracted --
        # verify against NXFullGraph)
        graph_with_properties: NXFullGraph = NXFullGraph(
            ratings_frame,
            user_contents_dir=user_dir,
            item_contents_dir=movies_dir,
            item_exo_representation='dbpedia',
            user_exo_representation='local',
            item_exo_properties=None,
            user_exo_properties=['1'],
        )

        # (extra NXPageRank kwargs, extra rank() kwargs, expected rank length)
        scenarios = [
            # fs standard algorithm
            ({}, {}, 2),
            # fs personalized algorithm
            ({'personalized': True}, {}, 2),
            # fs personalized algorithm and filter list
            ({'personalized': True}, {'filter_list': ['tt0113228']}, 1),
            # fs personalized algorithm and empty filter list
            ({'personalized': True}, {'filter_list': []}, 0),
        ]

        for alg_kwargs, rank_kwargs, expected_length in scenarios:
            page_rank = NXPageRank(feature_selection=NXTopKPageRank(),
                                   **alg_kwargs)
            ranking = page_rank.rank('4', graph_with_properties,
                                     **rank_kwargs)
            self.assertEqual(len(ranking), expected_length)
    def test_calc_rank_graph_based(self):
        # Compute predictions for a graph based recommender when only a
        # ranking metric is requested.
        full_graph = NXFullGraph(self.ratings_original)
        recsys = GraphBasedRS(NXPageRank(), full_graph)

        # A single metric of the RankingNeededMetric family is enough here
        requested_metrics = [NDCG()]

        calculator = PredictionCalculator(self.split_list, recsys)
        accepted_metrics = calculator.calc_predictions(self.test_items_list,
                                                       requested_metrics)

        ranking_truths = RankingNeededMetric.rank_truth_list
        # Expected to stay empty: no ScoresNeededMetric was requested
        score_truths = ScoresNeededMetric.score_truth_list

        self.assertEqual(accepted_metrics, requested_metrics)
        self.assertGreater(len(ranking_truths), 0)
        self.assertEqual(len(score_truths), 0)
示例#14
0
class TestGraphBasedAlgorithm(TestCase):
    # Tests for the rank cleaning, rank filtering and profile extraction
    # methods, called through an NXPageRank instance.

    def setUp(self) -> None:
        # Fixture: a ratings-only graph with one extra property node, plus
        # the algorithm instance under test.
        rating_records = [
            ("A000", "tt0114576", 1, "54654675"),
            ("A000", "tt0112453", -0.2, "54654675"),
            ("A001", "tt0114576", 0.8, "54654675"),
            ("A001", "tt0112896", -0.4, "54654675"),
            ("A000", "tt0113041", 0.6, "54654675"),
            ("A002", "tt0112453", -0.2, "54654675"),
            ("A002", "tt0113497", 0.5, "54654675"),
            ("A003", "tt0112453", -0.8, "54654675"),
        ]
        ratings_frame = pd.DataFrame.from_records(
            rating_records,
            columns=["from_id", "to_id", "score", "timestamp"])

        self.graph = NXFullGraph(ratings_frame)

        # A property node is added by hand so that the graph holds all three
        # kinds of node (user, item, property)
        self.graph.add_property_node('Nolan')

        # NXPageRank is the concrete algorithm through which the tested
        # methods are called
        self.alg = NXPageRank()

    def test_clean_rank(self):
        # Every node has the same score: only the kind of node decides what
        # is kept by clean_result
        rank = {
            UserNode("A000"): 0.5,
            ItemNode("tt0114576"): 0.5,
            UserNode("A001"): 0.5,
            ItemNode("tt0113497"): 0.5,
            ItemNode("tt0112453"): 0.5,
            PropertyNode("Nolan"): 0.5
        }

        # (extra clean_result kwargs, expected cleaned rank)
        cases = [
            # defaults: only items not rated by the user are kept
            ({}, {"tt0113497": 0.5}),
            # user nodes are kept as well
            ({'remove_users': False},
             {"tt0113497": 0.5, "A001": 0.5, "A000": 0.5}),
            # items already rated by the user are kept as well
            ({'remove_profile': False},
             {'tt0112453': 0.5, 'tt0113497': 0.5, 'tt0114576': 0.5}),
            # property nodes are kept as well
            ({'remove_properties': False},
             {'tt0113497': 0.5, 'Nolan': 0.5}),
        ]

        for extra_kwargs, expected in cases:
            result = self.alg.clean_result(self.graph,
                                           rank,
                                           user_id="A000",
                                           **extra_kwargs)
            self.assertEqual(expected, result)

    def test_filter_result(self):
        result_page_rank = {
            ItemNode("i1"): 0.8,
            ItemNode("i2"): 0.7,
            UserNode('u1'): 0.2,
            PropertyNode("p1"): 0.1
        }

        # (filter list, expected filtered rank)
        cases = [
            (['i1'], {ItemNode("i1"): 0.8}),
            (['u1', 'p1'], {UserNode('u1'): 0.2, PropertyNode("p1"): 0.1}),
            # none of these nodes is in the rank, so the result is empty
            (['not exists', 'i20'], {}),
        ]

        for filter_list, expected in cases:
            result = self.alg.filter_result(result_page_rank, filter_list)
            self.assertEqual(expected, result)

    def test_extract_profile(self):
        profile = self.alg.extract_profile(self.graph, "A000")

        # The profile of user A000 maps each rated item to its score
        plain_expected = {
            'tt0112453': -0.2,
            'tt0113041': 0.6,
            'tt0114576': 1.0
        }
        self.assertEqual(plain_expected, profile)

        # The comparison is expected to hold also when the item ids are
        # wrapped in their node type
        item_wrapped = {
            ItemNode(item_id): score
            for item_id, score in [('tt0112453', -0.2),
                                   ('tt0113041', 0.6),
                                   ('tt0114576', 1.0)]
        }
        self.assertEqual(item_wrapped, profile)

        # Wrapping the same ids as users must NOT compare equal, because
        # the profile is made of items
        user_wrapped = {
            UserNode(item_id): score
            for item_id, score in [('tt0112453', -0.2),
                                   ('tt0113041', 0.6),
                                   ('tt0114576', 1.0)]
        }
        self.assertNotEqual(user_wrapped, profile)