Example #1
0
    def add_link(self,
                 start_node: object,
                 final_node: object,
                 weight: float = None,
                 label: str = None):
        """
        Creates a weighted link connecting the 'start_node' to the 'final_node'.
        Both nodes must be present in the graph before calling this method.

        'weight' and 'label' are optional parameters; if not specified, default values
        will be used.

        Args:
            start_node (object): starting node of the link
            final_node (object): ending node of the link
            weight (float): weight of the link, default is 0.5
            label (str): label of the link, default is 'score_label'
        """
        if label is None:
            label = self.get_default_score_label()

        if weight is None:
            weight = self.get_default_weight()

        if self.node_exists(start_node) and self.node_exists(final_node):

            self.__graph.add_edge(start_node,
                                  final_node,
                                  weight=weight,
                                  label=label)
        else:
            logger.warning(
                "One or both of the nodes don't exist in the graph! Add them before "
                "calling this method.")
Example #2
0
    def produce_content(
            self, field_name: str,
            preprocessor_list: List[InformationProcessor],
            source: RawInformationSource) -> List[FieldRepresentation]:
        representation_list: List[FieldRepresentation] = []

        # if the embedding source is an EmbeddingLearner (meaning it can be trained) and it has no model yet,
        # the embedding source is trained on the raw source
        if isinstance(
                self.__embedding_source,
                EmbeddingLearner) and self.__embedding_source.model is None:
            logger.warning(
                "The model %s wasn't found, so it will be created and trained now"
                % self.__embedding_source.reference)
            logger.warning("The model will be trained on the %s field "
                           "and the data will be processed with %s" %
                           (field_name, preprocessor_list))
            self.__embedding_source.fit(source, [field_name],
                                        preprocessor_list)

        # iterates over all contents contained in the source in order to retrieve the raw data;
        # the data contained in field_name is processed using each information processor in preprocessor_list,
        # then the processed data is passed to the method that will create the single representation
        for content_data in source:
            processed_data = self.process_data(content_data[field_name],
                                               preprocessor_list)
            representation_list.append(
                self.produce_single_repr(processed_data))

        return representation_list
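A plausible sketch of the per-field processing step above, assuming process_data() applies each InformationProcessor in order via a process() method (the method name is an assumption, not the library's confirmed API):

from typing import List

def process_data(field_data, preprocessor_list: List):
    # hypothetical: each preprocessor transforms the field data in turn
    for preprocessor in preprocessor_list:
        field_data = preprocessor.process(field_data)   # assumed InformationProcessor method
    return field_data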
Example #3
0
    def _prop_by_rep(self, content: Content, node: object, exo_rep: str, exo_props: List[str], row: dict):
        """
        Private method that extracts the 'exo_props' passed from the 'exo_rep' representation
        of the loaded 'content', then creates a link between the 'node' passed and the
        extracted properties.

            EXAMPLE:
                exo_rep = 0
                exo_props = ['producer', 'director']

                will extract the 'producer' and 'director' properties from representation '0'
                of the 'content' parameter and create a link from the 'node' passed to said
                properties

        Args:
            content (Content): content loaded
            node (object): node to add properties to
            exo_rep (str): representation from where to extract the 'exo_props'
            exo_props (list): the properties list to extract from 'content'
            row (dict): dict-like object containing the score, if any, for the properties
        """
        properties = None
        try:
            properties = content.get_exogenous_representation(exo_rep).value
        except KeyError:
            logger.warning("Representation " + exo_rep + " not found for content " + content.content_id)

        if properties is not None:
            for prop in exo_props:
                if prop in properties.keys():
                    preference = self.get_preference(prop, row)
                    self.add_property_node(properties[prop])
                    self.add_link(node, properties[prop], preference, prop)
                else:
                    logger.warning("Property " + prop + " not found for content " + content.content_id)
Example #4
0
    def _all_prop_in_rep(self, content, node, exo_rep, row):
        """
        Private method that extracts ALL properties from the 'exo_rep' representation
        of the loaded 'content', then creates a link between the 'node' passed and the
        extracted properties.

            EXAMPLE:
                exo_rep = 0

                will extract ALL properties from representation '0' of the 'content' parameter and
                create a link from the 'node' passed to said properties

        Args:
            content (Content): content loaded
            node (object): node to add properties to
            exo_rep (str): representation from where to extract the 'exo_props'
            row (dict): dict-like object containing the score, if any, for the properties
        """
        properties = None

        try:
            properties = content.get_exogenous_representation(exo_rep).value
        except KeyError:
            logger.warning("Representation " + exo_rep + " not found for content " + content.content_id)

        if properties is not None:
            for prop_key in properties.keys():
                preference = self.get_preference(prop_key, row)
                self.add_property_node(properties[prop_key])
                self.add_link(node, properties[prop_key], preference, prop_key)

            if len(properties) == 0:
                logger.warning("The chosen representation doesn't have any property!")
Example #5
0
    def remove_link(self, start_node: object, final_node: object):
        try:
            self._graph.remove_edge(start_node, final_node)
        except nx.NetworkXError:
            logger.warning(
                "No link exists between the start node and the final node!\n"
                "No link will be removed")
Example #6
0
    def get_successors(self, node: object) -> List[object]:
        """
        Returns a list containing the successors of the node passed.
        Returns None if the node doesn't exist in the graph.

        Taken from networkx library:
        "A successor of n is a node m such that there exists a directed
        edge from n to m"

        EXAMPLE:
                U1 --> I2
                ↓
                I1

            get_successors(U1) ---> [I1, I2]


        Args:
            node(object): node of which we want to calculate successors
        """
        if not self.node_exists(node):
            logger.warning(
                "The node specified is not in the graph! Returning None")
            return None
        else:
            return list(self.__graph.successors(node))
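A standalone networkx sketch reproducing the EXAMPLE above (plus the mirror case for predecessors, used by get_predecessors() further below):

import networkx as nx

g = nx.DiGraph()
g.add_edges_from([('U1', 'I2'), ('U1', 'I1')])

print(list(g.successors('U1')))     # ['I2', 'I1']
print(list(g.predecessors('I1')))   # ['U1']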
Example #7
0
    def add_link(self, start_node: object, final_node: object, weight: float = None, label: str = None):
        """
        Creates a weighted link connecting the 'start_node' to the 'final_node'.
        Both nodes must be present in the graph before calling this method.

        'weight' and 'label' are optional parameters; if not specified, default values
        will be used.

        Args:
            start_node (object): starting node of the link
            final_node (object): ending node of the link
            weight (float): weight of the link, default is 0.5
            label (str): label of the link, default is 'score_label'
        """
        if label is None:
            label = self.get_default_score_label()

        if weight is None:
            weight = self.get_default_weight()

        if self.node_exists(start_node) and self.node_exists(final_node):

            # We must do this so that if the 'final_node' passed is 'i1' and in the graph it's an 'ItemNode',
            # we get its instance and link the start node to that instance; otherwise networkx
            # would link the start node to the string 'i1' and not to the ItemNode!
            nodes_list = list(self._graph.nodes)
            index_first = nodes_list.index(start_node)
            index_second = nodes_list.index(final_node)

            self._graph.add_edge(nodes_list[index_first], nodes_list[index_second], weight=weight, label=label)
        else:
            logger.warning("One of the nodes or both don't exist in the graph! Add them before "
                           "calling this method.")
Example #8
0
    def calc_folds(self, labels: list):
        """
        Private function that computes how many folds the SVM classifier should perform.

        By default SVM does 5 folds, so if there are fewer ratings we decrease the number of
        folds because it would throw an exception otherwise.
        Every class must have at least 2 rated items, otherwise no folds can be executed.

        EXAMPLE:
                labels = [1 1 0 1 0]

            We count how many different values there are in the list with
            collections.Counter(labels), so:
                count = {"1": 3, "0": 2} # There are 3 rated_items of class 1
                                        # and 2 rated_items of class 0

            Then we search the min value in the dict with min(count.values()):
                min_fold = 2

        Args:
            labels: list of labels of the rated_items
        Returns:
            Number of folds to do.

        """
        count = collections.Counter(labels)
        min_fold = min(count.values())

        if min_fold < 2:
            logger.warning("There's too few rating for a class! There needs to be at least 2!\n"
                           "No folds will be executed")
        elif min_fold >= 5:
            min_fold = 5

        self.__folds = min_fold
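A standalone sketch of the fold computation described in the docstring above:

import collections

labels = [1, 1, 0, 1, 0]
count = collections.Counter(labels)     # Counter({1: 3, 0: 2})
min_fold = min(count.values())          # 2 -> at most 2 folds can be executed
print(count, min_fold)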
Example #9
0
    def perform(self, split: Split):
        """
        Calculates the correlation between the two frames and stores
        the correlation plot

        Args:
            split (Split): split containing the 'truth' frame (original rating frame used for
                the recsys config) and the 'pred' frame with recommendations for multiple users
        """

        predictions = split.pred
        truth = split.truth

        # Calculating popularity by item
        items = truth[['to_id']].values.flatten()
        pop_by_items = Counter(items)

        # Calculating num of recommendations by item
        pop_by_items = pop_by_items.most_common()
        recs_by_item = Counter(predictions[['to_id']].values.flatten())
        popularities = list()
        recommendations = list()
        popularities_no_zeros = list()
        recommendations_no_zeros = list()

        at_least_one_zero = False
        for item, pop in pop_by_items:
            num_of_recs = recs_by_item[item]

            popularities.append(pop)
            recommendations.append(num_of_recs)

            if num_of_recs != 0:
                popularities_no_zeros.append(pop)
                recommendations_no_zeros.append(num_of_recs)
            else:
                at_least_one_zero = True

        # Both when possible
        if self.__mode == 'both':
            self.build_w_zeros_plot(popularities, recommendations)
            if at_least_one_zero:
                self.build_no_zeros_plot(popularities_no_zeros,
                                         recommendations_no_zeros)
            else:
                logger.warning(
                    "There are no items with zero recommendations!\n"
                    "The 'no-zero' graph won't be created; it would be identical to the 'w-zero' one!"
                )

        elif self.__mode == 'w_zeros':
            self.build_w_zeros_plot(popularities, recommendations)

        elif self.__mode == 'no_zeros':
            self.build_no_zeros_plot(popularities_no_zeros,
                                     recommendations_no_zeros)

        return pd.DataFrame()
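A minimal pandas sketch of the popularity / recommendation counting above (toy frames; the column name follows the method):

import pandas as pd
from collections import Counter

truth = pd.DataFrame({'to_id': ['i1', 'i1', 'i2', 'i3']})
predictions = pd.DataFrame({'to_id': ['i1', 'i2']})

pop_by_items = Counter(truth[['to_id']].values.flatten()).most_common()
recs_by_item = Counter(predictions[['to_id']].values.flatten())

print(pop_by_items)                                        # [('i1', 2), ('i2', 1), ('i3', 1)]
print([recs_by_item[item] for item, _ in pop_by_items])    # [1, 1, 0] -> 'i3' has zero recommendations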
Example #10
0
    def _prop_in_all_rep(self, content, node, exo_props, row):
        """
        Private method that extracts the 'exo_props' passed from ALL exogenous representations
        of the loaded 'content', then creates a link between the 'node' passed and the extracted
        properties. To avoid conflicts between multiple representations containing the same properties,
        the extracted properties will be renamed as name_prop + exo_rep:

            EXAMPLE:
                exo_props = ['producer', 'director']

                will extract the 'producer' and 'director' properties from ALL exogenous representations
                of the 'content' parameter and create a link from the 'node' passed to said properties.
                The properties will be renamed as 'producer_0', 'director_0', 'producer_1', 'director_1'
                if, for example, the content has those two properties in exogenous representations
                0 and 1


        Args:
            content (Content): content loaded
            node (object): node to add properties to
            exo_props (list): the properties list to extract from 'content'
            row (dict): dict-like object containing the score, if any, for the properties
        """
        properties = None
        properties_not_found = []
        for rep in content.exogenous_rep_dict:
            for prop in exo_props:
                if prop in content.get_exogenous_rep(rep).value:
                    if properties is None:
                        properties = {}
                    # properties = {director_0: aaaaa, director_1:bbbbb}
                    properties[prop + "_" + rep] = content.get_exogenous_rep(rep).value[prop]
                else:
                    properties_not_found.append(prop)

        if properties is not None:
            for prop_key in properties.keys():
                # e.g. 'producer_0' -> 'producer', so the preference can be searched
                # in the original frame source
                original_prop_name = '_'.join(prop_key.split('_')[:-1])
                preference = self.get_preference(original_prop_name, row)

                self.add_property_node(properties[prop_key])
                self.add_link(node, properties[prop_key], preference, prop_key)

            if len(properties_not_found) != 0:
                for prop in properties_not_found:
                    logger.warning("Property " + prop + " not found for " + content.content_id)
        else:
            logger.warning("None of the property chosen was found for " + content.content_id)
Example #11
0
    def predict(self, ratings: pd.DataFrame, recs_number: int, items_directory: str,
                candidate_item_id_list: list = None):
        """
        Finds the documents that the user liked by comparing the score given by the user to the item
        against the positive_threshold of the index_query object (if the rating is greater than the threshold,
        the document it refers to is considered liked by the user)
        After that, calls __recs_query to execute the prediction
        Args:
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            recs_number (int): how long the ranking will be
            items_directory (str): name of the directory where the items are stored
            candidate_item_id_list (list): list of the items that can be recommended, if None
                all unrated items will be used
        Returns:
            (pd.DataFrame) dataframe that for each row has a suggested item id and a rating of
                said item. This rating represents how much the item matches the query used for
                retrieving the recommendation list
        EXAMPLES:
            Find a recommendation list with two items for a user:
                predict(ratings, 2, '../../example')
            Find a recommendation list with one item for a user, considering a candidate list containing two items:
                predict(ratings, 1, '../../example', ['tt0114885', 'tt0114388'])
            Ratings is a variable containing a dataframe with the user ratings
            Ratings dataframe columns example: "from_id", "to_id", "original_rating", "score", "timestamp"
        """
        index_path = os.path.join(items_directory, 'search_index')
        if not DEVELOPING:
            index_path = os.path.join(home_path, items_directory, 'search_index')

        scores = []
        positive_rated_document_list = []
        for item_id, score in zip(ratings.to_id, ratings.score):
            if score > self.__positive_threshold:
                item = load_content_instance(items_directory, item_id)
                if item is not None:
                    positive_rated_document_list.append(item.index_document_id)
                    scores.append(score)

        try:
            return self.__recs_query(positive_rated_document_list,
                                     ratings.to_id,
                                     scores,
                                     recs_number,
                                     index_path,
                                     candidate_item_id_list)
        except (ValueError, EmptyIndexError) as e:
            logger.warning(str(e))
            columns = ["to_id", "rating"]
            score_frame = pd.DataFrame(columns=columns)
            return score_frame
Example #12
0
    def remove_link(self, start_node: object, final_node: object):
        """
        Removes the link connecting the 'start_node' to the 'final_node'.
        If there's no link between the two nodes, then a warning is printed

        Args:
            start_node (object): starting node of the link to remove
            final_node (object): ending node of the link to remove
        """
        try:
            self._graph.remove_edge(start_node, final_node)
        except nx.NetworkXError:
            logger.warning("No link exists between the start node and the final node!\n"
                           "No link will be removed")
Example #13
0
    def perform(self, split: Split):
        predictions = split.pred
        truth = split.truth

        # Calculating popularity by item
        items = truth[['to_id']].values.flatten()
        pop_by_items = Counter(items)

        # Calculating num of recommendations by item
        pop_by_items = pop_by_items.most_common()
        recs_by_item = Counter(predictions[['to_id']].values.flatten())
        popularities = list()
        recommendations = list()
        popularities_no_zeros = list()
        recommendations_no_zeros = list()

        at_least_one_zero = False
        for item, pop in pop_by_items:
            num_of_recs = recs_by_item[item]

            popularities.append(pop)
            recommendations.append(num_of_recs)

            if num_of_recs != 0:
                popularities_no_zeros.append(pop)
                recommendations_no_zeros.append(num_of_recs)
            else:
                at_least_one_zero = True

        # Both when possible
        if self.__mode == 'both':
            self.build_w_zeros_plot(popularities, recommendations)
            if at_least_one_zero:
                self.build_no_zeros_plot(popularities_no_zeros,
                                         recommendations_no_zeros)
            else:
                self.build_no_zeros_plot(popularities, recommendations)
                logger.warning(
                    "There are no items with zero recommendations!\n"
                    "The 'no-zero' graph is identical to the 'w-zero' one!"
                )

        elif self.__mode == 'w_zeros':
            self.build_w_zeros_plot(popularities, recommendations)

        elif self.__mode == 'no_zeros':
            self.build_no_zeros_plot(popularities_no_zeros,
                                     recommendations_no_zeros)

        return pd.DataFrame()
Example #14
0
    def _prop_in_all_rep(self, content, node, exo_props, row):
        """
        Private method that extracts the 'exo_props' passed from ALL exogenous representations
        of the loaded 'content', then creates a link between the 'node' passed and the extracted
        properties. To avoid conflicts between multiple representations containing the same properties,
        the extracted properties will be renamed as name_prop + exo_rep:

            EXAMPLE:
                exo_props = ['producer', 'director']

                will extract the 'producer' and 'director' properties from ALL exogenous representations
                of the 'content' parameter and create a link from the 'node' passed to said properties.
                The properties will be renamed as 'producer_0', 'director_0', 'producer_1', 'director_1'
                if, for example, the content has those two properties in exogenous representations
                0 and 1


        Args:
            content (Content): content loaded
            node (object): node to add properties to
            exo_props (list): the properties list to extract from 'content'
            row (dict): dict-like object containing the score, if any, for the properties
        """
        internal_id_list = content.exogenous_rep_container.get_internal_index()
        external_id_list = content.exogenous_rep_container.get_external_index()
        for prop in exo_props:
            property_found = False
            for id_int, id_ext in zip(internal_id_list, external_id_list):
                if prop in content.get_exogenous_representation(id_int).value:
                    property_found = True

                    # edge_label = director#0#dbpedia, director#1#datasetlocal
                    # OR edge_label = director#0, edge_label = director#1 if external id is NaN
                    edge_label = "{}#{}".format(prop, str(id_int))
                    if pd.notna(id_ext):
                        edge_label += '#{}'.format(id_ext)

                    property_node = content.get_exogenous_representation(id_int).value[prop]

                    # search preference for the property in the original frame source
                    preference = self.get_preference(prop, row)

                    self.add_property_node(property_node)
                    self.add_link(node, property_node, preference, edge_label)

            if not property_found:
                logger.warning("Property {} not found in any representation of content {}".format(prop, content.content_id))
Example #15
0
    def add_user_tree(self, user_node: object):
        """
        Adds a 'user' node if it is not already in the graph and loads its properties
        from disk, if the node has any.
        The method will try to load the content from the 'user_contents_dir' and extract
        from the loaded content the properties specified in the constructor (user_exo_representation,
        user_exo_properties)

        Args:
            user_node (object): 'user' node to add to the graph with its properties
        """
        self.add_user_node(user_node)

        if self.get_user_contents_dir() is not None:
            self._add_usr_properties({'from_id': user_node})
        else:
            logger.warning("The dir is not specified! The node will be added with no "
                           "properties")
Example #16
0
    def get_predecessors(self, node: object) -> List[object]:
        """
        Returns a list containing the predecessors of the node passed.
        Returns None if the node doesn't exist in the graph.

        Taken from networkx library:
        "A predecessor of n is a node m such that there exists a directed
        edge from m to n"

        EXAMPLE:
                I1 <-- U1
                ↑
                U2

            get_predecessors(I1) ---> [U1, U2]

        Args:
            node(object): node of which we want to calculate predecessors
        """
        if not self.node_exists(node):
            logger.warning("The node specified is not in the graph! Returning None")
            return None
        else:
            return list(self._graph.predecessors(node))
Example #17
0
    def predict(self,
                ratings: pd.DataFrame,
                recs_number: int,
                items_directory: str,
                candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        After computing the centroid of the positive rated items by the user and getting the similarity scores
        of said centroid compared with every unrated item, creates and returns a recommendation list of unrated
        items ordered by their similarity score with the centroid. A candidate_item_id_list can be passed
        which will be used instead of the unrated items.

            EXAMPLE:
                Creates a recommendation list of length 1 with the similarity to the centroid as score, only considering
                the item tt0114319 instead of all the unrated items. (Ratings is a DataFrame containing the ratings
                given by the user)
                    predict(ratings=ratings, recs_number=1, items_directory='.../somedir',
                    candidate_item_id_list=['tt0114319'])

        Args:
            candidate_item_id_list (list): list of the items that can be recommended, if None
                all unrated items will be used
            recs_number (int): how long the ranking will be
            ratings (pd.DataFrame): ratings of a user
            items_directory (str): name of the directory where the items are stored.

        Returns:
             scores (pd.DataFrame): DataFrame whose columns are the ids of the items (to_id),
                and the similarities between the items and the centroid (rating)
        """
        # Loads the items and extracts features from the unrated items, then
        # extracts features from the positive rated items
        # If exception, returns an empty score_frame
        try:
            rated_items, unrated_items, unrated_features_bag_list = \
                super().preprocessing(items_directory, ratings, candidate_item_id_list)
            positive_rated_features_bag_list = self.__calc_positive_rated_baglist(
                rated_items, ratings)
        except (ValueError, FileNotFoundError) as e:
            logger.warning(str(e))
            columns = ["to_id", "rating"]
            score_frame = pd.DataFrame(columns=columns)
            return score_frame

        logger.info("Computing rated items centroid")
        positive_rated_items_array = transform(
            self.__transformer, positive_rated_features_bag_list)
        centroid = np.array(positive_rated_items_array).mean(axis=0)

        columns = ["to_id", "rating"]
        score_frame = pd.DataFrame(columns=columns)

        logger.info("Computing similarity between centroid and unrated items")
        unrated_items_array = transform(self.__transformer,
                                        unrated_features_bag_list)
        similarities = [
            self.__similarity.perform(centroid, item)
            for item in unrated_items_array
        ]

        for item, similarity in zip(unrated_items, similarities):
            score_frame = pd.concat([
                score_frame,
                pd.DataFrame.from_records([(item.content_id, similarity)],
                                          columns=columns)
            ],
                                    ignore_index=True)

        score_frame = score_frame.sort_values(
            ['rating'], ascending=False).reset_index(drop=True)
        score_frame = score_frame[:recs_number]

        return score_frame
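A minimal numpy sketch of the centroid and similarity steps above (cosine similarity is used here only as an example; the actual measure is whatever self.__similarity implements, and the feature vectors are toy values):

import numpy as np

positive_rated_items_array = np.array([[1.0, 0.0, 1.0],
                                       [0.0, 1.0, 1.0]])
centroid = positive_rated_items_array.mean(axis=0)      # [0.5, 0.5, 1.0]

unrated_item = np.array([1.0, 1.0, 1.0])
similarity = np.dot(centroid, unrated_item) / (np.linalg.norm(centroid) * np.linalg.norm(unrated_item))
print(centroid, similarity)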
Example #18
0
    def predict(self, ratings: pd.DataFrame, recs_number: int, items_directory: str,
                candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        Get recommendations for a specified user.

        You must pass the DataFrame which contains the ratings of the user, how many
        recommended items the method predict() must return, and the path of the items.
        If recommendations for certain items are needed, specify them in the candidate_item_id_list
        parameter. In this case, the recommender system will return only scores for the items
        in the list, ignoring the recs_number parameter.
         EXAMPLE
            # Instantiate the ClassifierRecommender object, check its documentation if needed
             alg = ClassifierRecommender(...)

            # Get the 5 most recommended items for the user whose ratings are in 'ratings'
             alg.predict(ratings, 5, path)

            # Get the score for the item 'tt0114576' for the same user
             alg.predict(ratings, 1, path, ['tt0114576'])

        Args:
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            recs_number (int): How long the ranking will be
            items_directory (str): Path to the directory where the items are stored.
            candidate_item_id_list: list of the items that can be recommended, if None
                all unrated items will be used
        Returns:
            (pd.DataFrame): DataFrame with a 'to_id' column and a 'rating' column containing the predicted scores
        """

        # Loads the items and extracts features from the unrated items, then
        # calculates labels and extracts features from the rated items
        # If exception, returns an empty score_frame
        try:
            rated_items, unrated_items, unrated_features_bag_list = \
                super().preprocessing(items_directory, ratings, candidate_item_id_list)
            labels, rated_features_bag_list = self.__calc_labels_rated_baglist(rated_items, ratings)
        except(ValueError, FileNotFoundError) as e:
            logger.warning(str(e))
            columns = ["to_id", "rating"]
            score_frame = pd.DataFrame(columns=columns)
            return score_frame

        # If the classifier chosen is SVM we compute how many folds the classifier
        # can do. If no fold is possible, none will be executed
        if isinstance(self.__classifier, SVM):
            self.__classifier.calc_folds(labels)

        self.__classifier.fit(rated_features_bag_list, labels)

        columns = ["to_id", "rating"]
        score_frame = pd.DataFrame(columns=columns)

        logger.info("Predicting scores")
        score_labels = self.__classifier.predict_proba(unrated_features_bag_list)

        for score, item in zip(score_labels, unrated_items):
            score_frame = pd.concat(
                [score_frame, pd.DataFrame.from_records([(item.content_id, score[1])], columns=columns)],
                ignore_index=True)

        score_frame = score_frame.sort_values(['rating'], ascending=False).reset_index(drop=True)
        score_frame = score_frame[:recs_number]

        return score_frame
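A minimal scikit-learn sketch of the scoring step above: predict_proba() returns one row per item, and score[1] is taken as the probability of the positive class (toy feature vectors; the real self.__classifier wraps an sklearn-like model):

from sklearn.svm import SVC

rated_features = [[0.0, 1.0], [0.1, 0.9], [0.2, 0.8], [0.1, 0.8], [0.0, 0.9],
                  [1.0, 0.0], [0.9, 0.1], [0.8, 0.2], [0.8, 0.1], [0.9, 0.0]]
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
unrated_features = [[0.85, 0.15], [0.15, 0.85]]

classifier = SVC(probability=True).fit(rated_features, labels)
score_labels = classifier.predict_proba(unrated_features)
print([score[1] for score in score_labels])   # probability of the positive class for each unrated item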
Example #19
0
    def fit(self):
        """
        This method performs the evaluation by initializing
        internally a recommender system that produces
        recommendations for all the users in the directory
        specified in the configuration phase.
        The evaluation is performed by creating a training set
        and a test set with its corresponding
        truth base. The ranking algorithm will use the test set as the candidate items list.

        Returns:
            ranking_metric_results: has a 'from' column, representing the user_ids for
                which the metrics were computed, and then one different column for every metric
                performed. The returned DataFrame contains one row per user, and the corresponding
                metric values are given by the mean of the values obtained for that user.
        """
        # initialize recommender to call for prediction computing
        recsys = RecSys(self.config)

        # get all users in specified directory
        logger.info("Loading user instances")
        user_id_list = \
            [os.path.splitext(filename)[0]
             for filename in os.listdir(self.config.users_directory)]

        # define empty structure which will contain
        # the results
        ranking_alg_metrics_results = pd.DataFrame()

        # calculate metrics on ranking algorithm results
        if self.config.ranking_algorithm is None:
            raise ValueError(
                "You must set ranking algorithm to compute ranking metrics")
        for user_id in user_id_list:
            logger.info("Computing ranking metrics for user %s", user_id)
            user_ratings = self.config.rating_frame[
                self.config.rating_frame['from_id'] == user_id]
            user_ratings = remove_not_existent_items(
                user_ratings, self.config.items_directory)

            try:
                self.partitioning.dataframe = user_ratings
            except ValueError as e:
                logger.warning(e)
                logger.warning(
                    "The user %s doesn't have enough valid ratings. "
                    "The user will be skipped", user_id)
                continue

            for partition_index in self.partitioning:
                result_dict = {}
                train = user_ratings.iloc[partition_index[0]]
                test = user_ratings.iloc[partition_index[1]]

                truth = test.loc[:, 'to_id':'score']
                truth.columns = ["to_id", "rating"]
                recs_number = len(truth['rating'].values)
                predictions = recsys.fit_eval_ranking(train,
                                                      truth['to_id'].tolist(),
                                                      recs_number)
                for metric in self.metrics:
                    result_dict['from'] = user_id
                    result_dict[str(metric)] = metric.perform(
                        predictions, truth)

                ranking_alg_metrics_results = pd.concat(
                    [ranking_alg_metrics_results, pd.DataFrame([result_dict])],
                    ignore_index=True)

        ranking_alg_metrics_results = \
            ranking_alg_metrics_results.groupby('from').mean().reset_index()

        return ranking_alg_metrics_results
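A minimal pandas sketch of the final aggregation above: per-partition rows are averaged into one row per user ('Precision' is just a placeholder metric name):

import pandas as pd

results = pd.DataFrame({'from': ['u1', 'u1', 'u2'],
                        'Precision': [0.5, 0.7, 0.4]})
print(results.groupby('from').mean().reset_index())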
Example #20
0
    def split_user_in_groups(score_frame: pd.DataFrame, groups: Dict[str, float],
                             pop_items: Set[str]) -> Dict[str, Set[str]]:
        """
        Splits the users of the DataFrame into different sets, based on the recommendation popularity of each user

        Args:
            score_frame (pd.DataFrame): DataFrame with columns = ['from_id', 'to_id', 'rating']
            groups (Dict[str, float]): each key contains the name of the group and each value contains the
                percentage of users for that group. If the groups don't cover the entire user collection,
                the rest of the users are put into a 'default_diverse' group
            pop_items (Set[str]): set of most popular 'to_id' labels

        Returns:
            groups_dict (Dict[str, Set[str]]): key = group_name, value = Set of 'from_id' labels
        """
        num_of_users = len(set(score_frame['from_id']))
        if num_of_users < len(groups):
            raise NotEnoughUsers("You can't split {} users into {} groups! "
                                 "Try reducing the number of groups".format(
                                     num_of_users, len(groups)))

        for percentage_chosen in groups.values():
            if not 0 < percentage_chosen <= 1:
                raise PercentageError(
                    'Incorrect percentage! Valid percentage range: 0 < percentage <= 1'
                )
        total = sum(groups.values())
        if total > 1:
            raise PercentageError(
                "Incorrect percentage! The sum of the percentages is greater than 1")
        elif total < 1:
            logger.warning(
                "The sum of the percentages is less than 1: "
                "the remaining {} of the users will be inserted into the "
                "'default_diverse' group".format(1 - total))

        pop_ratio_by_users = pop_ratio_by_user(score_frame,
                                               most_pop_items=pop_items)
        pop_ratio_by_users.sort_values(['popularity_ratio'],
                                       inplace=True,
                                       ascending=False)

        groups_dict: Dict[str, Set[str]] = {}
        last_index = 0
        percentage = 0.0
        for group_name in groups:
            percentage += groups[group_name]
            group_index = round(num_of_users * percentage)
            if group_index == 0:
                logger.warning(
                    'Not enough users for group {}! It will be discarded'.
                    format(group_name))
            else:
                groups_dict[group_name] = set(
                    pop_ratio_by_users['from_id'][last_index:group_index])
                last_index = group_index
        if percentage < 1:
            group_index = round(num_of_users)
            groups_dict['default_diverse'] = set(
                pop_ratio_by_users['from_id'][last_index:group_index])
        return groups_dict
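A worked example of the group index arithmetic above: 10 users, groups covering 80% of them, the remaining 20% end up in 'default_diverse':

num_of_users = 10
groups = {'niche': 0.3, 'diverse': 0.5}

last_index = 0
percentage = 0.0
for group_name, group_percentage in groups.items():
    percentage += group_percentage
    group_index = round(num_of_users * percentage)
    print(group_name, '-> users[{}:{}]'.format(last_index, group_index))   # niche -> [0:3], diverse -> [3:8]
    last_index = group_index
if percentage < 1:
    print('default_diverse -> users[{}:{}]'.format(last_index, num_of_users))   # [8:10]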
Example #21
0
    def predict(self, ratings: pd.DataFrame = None, recs_number: int = 10, items_directory: str = None,
                candidate_item_id_list: List = None):
        """
        Creates a recommendation list containing the top items retrieved by the PageRank algorithm. Networkx provides
        a method to compute PageRank on networkx graphs. Two types of PageRank computations are possible.
        The first one, in case the ranking is made for a user, will be PageRank with Priors considering the user profile
        as personalization vector. The second one, in case no user is defined (empty ratings or None) will be standard
        PageRank.
        If only a subset of the user ratings is passed as an argument, the links representing the
        ratings not in that subset will be pruned from the graph.
        For any case in which the graph will be modified (such as Feature Selection), a copy of the original graph will
        be created, so that the original graph may be preserved for future operations.
        It's also possible to include a candidate_item_id_list, in order to consider in the ranking only nodes specified
        in that list.
        Exceptions may be raised by the feature selection algorithms or if a recommendations number <= 0
        is chosen; in these cases an empty recommendation list will be returned.

        Args:
            ratings (pd.Dataframe): ratings of the user for which compute the prediction, if None or empty dataframe
                standard PageRank will be computed instead of personalized PageRank
            recs_number (int): length of the recommendation list
            items_directory (str): not used
            candidate_item_id_list (list): if a candidate list is specified, only items in the candidate list will
                be considered for recommendations (also ignoring the recommendations number)

        Returns:
            score_frame (pd.Dataframe): dataframe containing the recommendation list
        """
        try:
            graph = self.fullgraph

            if recs_number <= 0:
                raise ValueError("You must set a valid number of recommendations (> 0) in order to compute PageRank")

            if candidate_item_id_list is None:
                candidate_item_id_list = []
            if ratings is None:
                ratings = pd.DataFrame()

            if len(ratings) != 0:
                user_id = ratings['from_id'].iloc[0]
                personalized = True

                # in case only a subset of ratings from the user is passed, it first checks that
                # the ratings in the dataframe are a subset of the ratings in the graph's user profile;
                # this is done to ensure that there aren't items rated by the user in the dataframe
                # but not in the graph's user profile
                user_ratings = set(ratings['to_id'].values)
                user_graph = set([node for node in graph.get_successors(user_id) if graph.is_item_node(node)])

                if not user_ratings.issubset(user_graph):
                    raise ValueError("There are ratings in the dataframe not available in the graph for the user")

                # after that it checks whether the ratings in the dataframe are equal to the ratings in the
                # graph's user profile. If they are equal no further operation is done, otherwise
                # the graph is simplified so that only items considered in the dataframe are
                # represented in the graph
                if not user_ratings == user_graph:

                    additional_nodes = user_graph.difference(user_ratings)
                    graph = deepcopy(self.fullgraph)

                    logger.warning("The ratings passed are less than the ratings in the graph's user profile.\n"
                                   "The graph will be pruned in order to consider only the ratings passed")

                    self.remove_links_for_user(graph, additional_nodes, user_id)

            else:
                personalized = False
                user_id = None

            # if the item or the user feature selection algorithm is instantiated, it initializes the list of nodes
            # to consider in the feature selection process (nodes not referred to in the user ratings, so it
            # doesn't consider the user the ratings refer to, nor the items that user voted) and performs the
            # feature selection, which will return a list of the new properties to consider (one list for items and
            # one for users)
            if self.__item_feature_selection_algorithm is not None:
                logger.info('Computing feature selection on items')
                if len(ratings) != 0:
                    recommended_items = list(set(ratings['to_id']))
                else:
                    recommended_items = []
                recommended_items = [item for item in graph.item_nodes if item not in recommended_items]
                new_item_prop = self.__item_feature_selection_algorithm.perform(graph, recommended_items)
            else:
                new_item_prop = graph.get_item_exogenous_properties()

            if self.__user_feature_selection_algorithm is not None:
                logger.info('Computing feature selection on users')
                if len(ratings) != 0:
                    recommended_users = list(set(ratings['from_id']))
                else:
                    recommended_users = []
                recommended_users = [user for user in graph.user_nodes if user not in recommended_users]
                new_user_prop = self.__user_feature_selection_algorithm.perform(graph, recommended_users)
            else:
                new_user_prop = graph.get_user_exogenous_properties()

            # the lists created by the feature selection algorithms will be used to remove nodes from the graph so that
            # only the specified user and/or item exogenous properties will be considered
            if self.__user_feature_selection_algorithm is not None or\
                    self.__item_feature_selection_algorithm is not None:

                if graph is self.fullgraph:
                    graph = deepcopy(self.fullgraph)

                nodes_to_remove = set()
                for property_node in graph.property_nodes:
                    for predecessor in graph.get_predecessors(property_node):
                        label = graph.get_link_data(predecessor, property_node)['label']
                        label = '_'.join(label.split('_')[:-1])
                        if (new_item_prop is not None and label not in new_item_prop) and\
                                (new_user_prop is not None and label not in new_user_prop):
                            nodes_to_remove.add(property_node)
                graph._graph.remove_nodes_from(nodes_to_remove)

            # runs PageRank: either personalized via the user profile, or the standard one
            if personalized:
                profile = self.extract_profile(user_id, graph)
                if sum(profile.values()) == 0.0:
                    logger.warning("Cannot compute personalized PageRank if all the weights are the minimum "
                                   "possible value, standard PageRank will be calculated instead")
                    scores = nx.pagerank(graph._graph)
                else:
                    scores = nx.pagerank(graph._graph.to_undirected(), personalization=profile)
            else:
                scores = nx.pagerank(graph._graph)

            # cleans the results removing nodes (they can be user nodes, items in the user profile and properties)
            scores = self.clean_rank(scores, graph, user_id)
            scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
            if len(candidate_item_id_list) == 0:
                ks = list(scores.keys())
                ks = ks[:recs_number]
            else:
                ks = candidate_item_id_list
            new_scores = {k: scores[k] for k in scores.keys() if k in ks}

            columns = ["to_id", "rating"]
            score_frame = pd.DataFrame(columns=columns)

            for item, score in new_scores.items():
                score_frame = pd.concat(
                    [score_frame,
                     pd.DataFrame.from_records([(item.value, score)], columns=columns)],
                    ignore_index=True)

            return score_frame

        except ValueError as e:
            logger.warning(str(e))
            columns = ["to_id", "rating"]
            score_frame = pd.DataFrame(columns=columns)
            return score_frame
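A standalone networkx sketch of the two PageRank modes used above: standard, and personalized with a user-profile-like personalization vector (toy graph and weights; the real profile comes from extract_profile()):

import networkx as nx

g = nx.DiGraph()
g.add_weighted_edges_from([('u1', 'i1', 0.9), ('u1', 'i2', 0.4), ('u2', 'i2', 0.8)])

standard_scores = nx.pagerank(g)
profile = {'i1': 0.9, 'i2': 0.4}   # plays the role of extract_profile(user_id, graph)
personalized_scores = nx.pagerank(g.to_undirected(), personalization=profile)

print(standard_scores)
print(personalized_scores)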