예제 #1
0
    def fit(self):
        """
        Build the textual query representing the user profile and store it in a private attribute.

        The query is built from the positive items ONLY, i.e. the documents and scores stored in private
        attributes by process_rated(), which must therefore be called before this method.

        For every positive document a parenthesized group is created, containing one
        ``field_name:(term term ...)`` clause for each field of the document (the 'content_id' field is skipped).
        Each document group is boosted with ``^score``, where score is the rating paired with that document.
        """
        recsys_logger.info("Building query")

        liked_docs = self.__positive_user_docs

        # The query is assembled as a list of pieces and joined once at the end
        pieces = ["("]
        for doc_id, rating in zip(liked_docs, self.__scores):
            fields = liked_docs[doc_id]

            pieces.append("(")
            for field_name in fields:
                if field_name != 'content_id':
                    # every term of the field is followed by a single space
                    terms = "".join(word + " " for word in fields[field_name].split())
                    pieces.append(field_name + ":(" + terms + ") ")
            # the whole group of the document is boosted by the rating given to it
            pieces.append(")^" + str(rating) + " ")
        pieces.append(") ")

        self.__string_query = "".join(pieces)
예제 #2
0
    def process_rated(self, user_ratings: pd.DataFrame, index_directory: str):
        """
        Collect the representations of the positively rated items ONLY.

        An item is considered positive when its score is greater than or equal to the threshold. If no threshold
        is set on the instance, the one computed by _calc_mean_user_threshold() on the user ratings is used.

        Every positive item is looked up in the index; items for which the index returns nothing are skipped.
        The result is stored in two private attributes: a dictionary mapping each item id to the value returned
        by the representation getter, and the list of the corresponding scores (same order).
        These attributes are later used by fit() to build the query.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            index_directory (str): path of the index folder
        """
        min_positive_score = self.threshold
        if min_positive_score is None:
            min_positive_score = self._calc_mean_user_threshold(user_ratings)

        liked_scores = []
        liked_docs = {}

        recsys_logger.info("Processing rated items")
        index = SearchIndex(index_directory)
        for item_id, score in zip(user_ratings.to_id, user_ratings.score):
            # written as a negated '>=' so that non comparable scores (e.g. NaN) are skipped too
            if not score >= min_positive_score:
                continue

            # the index answers with {item_id: {"item": item_dictionary, "score": item_score}}
            found = index.query(item_id, 1, classic_similarity=self.__classic_similarity)
            if len(found) == 0:
                continue

            item = found.pop(item_id).get('item')
            liked_scores.append(score)
            liked_docs[item_id] = self.__get_representations(item)

        self.__positive_user_docs = liked_docs
        self.__scores = liked_scores
    def rank(self, user_ratings: pd.DataFrame, items_directory: str, recs_number: int = None,
             filter_list: List[str] = None) -> pd.DataFrame:
        """
        Rank the candidate items for the user using the fitted classifier.

        The candidates are the items in filter_list when it is specified (they may also be items already
        seen by the user), otherwise ALL the unrated items. Candidates that could not be loaded (None) are ignored.

        The score of each item is the second value of the row returned for it by the classifier's
        predict_proba(). Items are sorted by descending score and only the first recs_number are returned
        (all of them if recs_number is None).

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored
            recs_number (int): number of the top items that will be present in the ranking, if None
                all the candidate items are returned
            filter_list (list): list of the items to rank, if None all unrated items will be ranked
        Returns:
            pd.DataFrame: DataFrame with a 'to_id' column (item ids) and a 'score' column,
                sorted in descending order by the 'score' column
        """
        # Choose the candidate items
        if filter_list is not None:
            candidates = get_chosen_items(items_directory, filter_list)
        else:
            candidates = get_unrated_items(items_directory, user_ratings)

        # Pair the id of every available candidate with its extracted features
        loaded = [(item.content_id, self.extract_features_item(item))
                  for item in candidates if item is not None]
        candidate_ids = [content_id for content_id, _ in loaded]
        candidate_features = [features for _, features in loaded]

        recsys_logger.info("Calculating rank")
        probabilities = []
        if len(candidate_ids) > 0:
            # Fuse the input if there are dicts, multiple representation, etc.
            fused = self.fuse_representations(candidate_features, self.__embedding_combiner)
            probabilities = self.__classifier.predict_proba(fused)

        scored = list(zip(candidate_ids, probabilities))
        ranking = pd.DataFrame({'to_id': [content_id for content_id, _ in scored],
                                'score': [proba[1] for _, proba in scored]},
                               columns=['to_id', 'score'])

        ranking.sort_values(by=['score'], ascending=False, inplace=True)

        return ranking[:recs_number]
    def predict(self,
                user_ratings: pd.DataFrame,
                items_directory: str,
                filter_list: List[str] = None) -> pd.DataFrame:
        """
        Predict the score the user would give to the candidate items using the fitted regressor.

        The candidates are the items in filter_list when it is specified (they may also be items already
        seen by the user), otherwise ALL the unrated items. Candidates that could not be loaded (None) are ignored.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored
            filter_list (list): list of the items to predict, if None all unrated items will be predicted
        Returns:
            pd.DataFrame: DataFrame with a 'to_id' column (item ids) and a 'score' column (predicted scores),
                in the order in which the items were loaded
        """
        # Choose the candidate items
        if filter_list is not None:
            candidates = get_chosen_items(items_directory, filter_list)
        else:
            candidates = get_unrated_items(items_directory, user_ratings)

        # Pair the id of every available candidate with its extracted features
        loaded = [(item.content_id, self.extract_features_item(item))
                  for item in candidates if item is not None]
        candidate_ids = [content_id for content_id, _ in loaded]
        candidate_features = [features for _, features in loaded]

        recsys_logger.info("Calculating score predictions")
        predicted_scores = []
        if len(candidate_ids) > 0:
            # Fuse the input if there are dicts, multiple representation, etc.
            fused = self.fuse_representations(candidate_features, self.__embedding_combiner)
            predicted_scores = self.__regressor.predict(fused)

        # Build the frame to return: it is created empty and then filled column by column
        prediction_frame = pd.DataFrame(columns=["to_id", "score"])
        prediction_frame["to_id"] = candidate_ids
        prediction_frame["score"] = predicted_scores

        return prediction_frame
예제 #5
0
    def rank(self, user_ratings: pd.DataFrame, items_directory: str, recs_number: int = None,
             filter_list: List[str] = None) -> pd.DataFrame:
        """
        Rank the candidate items for the user by their similarity with the centroid vector.

        The candidates are the items in filter_list when it is specified (they may also be items already
        seen by the user), otherwise ALL the unrated items. Candidates that could not be loaded (None) are ignored.

        The score of each item is the value returned by the similarity object when comparing the centroid
        (computed by fit()) with the fused representation of the item. Items are sorted by descending score
        and only the first recs_number are returned (all of them if recs_number is None).

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored
            recs_number (int): number of the top items that will be present in the ranking, if None
                all the candidate items are returned
            filter_list (list): list of the items to rank, if None all unrated items will be ranked
        Returns:
            pd.DataFrame: DataFrame with a 'to_id' column (item ids) and a 'score' column,
                sorted in descending order by the 'score' column
        """
        # Choose the candidate items
        if filter_list is not None:
            candidates = get_chosen_items(items_directory, filter_list)
        else:
            candidates = get_unrated_items(items_directory, user_ratings)

        # Pair the id of every available candidate with its extracted features
        loaded = [(item.content_id, self.extract_features_item(item))
                  for item in candidates if item is not None]
        candidate_ids = [content_id for content_id, _ in loaded]
        candidate_features = [features for _, features in loaded]

        recsys_logger.info("Calculating rank")
        similarities = []
        if len(candidate_ids) > 0:
            # The prediction for an item is its similarity with the centroid vector
            fused = self.fuse_representations(candidate_features, self.__embedding_combiner)
            for item_vector in fused:
                similarities.append(self.__similarity.perform(self.__centroid, item_vector))

        ranking = pd.DataFrame({'to_id': candidate_ids, 'score': similarities}, columns=['to_id', 'score'])

        # Most similar items first
        ranking.sort_values(by=['score'], ascending=False, inplace=True)

        return ranking[:recs_number]
    def process_rated(self, user_ratings: pd.DataFrame, items_directory: str):
        """
        Extract the features of the rated items and label each of them as positive (1) or negative (0).

        An item is positive when the score given by the user is greater than or equal to the threshold.
        If no threshold is set on the instance, the one computed by _calc_mean_user_threshold() on the
        user ratings is used. Rated items that could not be loaded (None) are ignored.

        Features (a dictionary keyed by item) and labels are stored in private attributes of the class
        and are later used by fit() to train the classifier.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored

        Raises:
            EmptyUserRatings: if user_ratings is empty
            NoRatedItems: if none of the rated items is available locally
            OnlyPositiveItems: if all the available rated items are positive
            OnlyNegativeItems: if all the available rated items are negative
            TypeError: if the user doesn't have exactly one rating for an available item
        """
        # Fail fast: without ratings there is nothing to load and no meaningful threshold to compute
        if user_ratings.empty:
            raise EmptyUserRatings("The user selected doesn't have any ratings!")

        # Load rated items from the path
        rated_items = get_rated_items(items_directory, user_ratings)

        threshold = self.threshold
        if threshold is None:
            threshold = self._calc_mean_user_threshold(user_ratings)

        # Assign label and extract features from the rated items
        labels = []
        rated_dict = {}

        recsys_logger.info("Processing rated items")
        for item in rated_items:
            if item is None:
                continue

            rated_dict[item] = self.extract_features_item(item)

            # Exactly one rating per item is expected. The check is explicit because converting a
            # Series directly with float() is deprecated in recent pandas versions; the exception type
            # is the one float() raised for a Series whose length is not 1
            item_scores = user_ratings[user_ratings['to_id'] == item.content_id].score
            if len(item_scores) != 1:
                raise TypeError("Expected exactly one rating for item {}, found {}".format(
                    item.content_id, len(item_scores)))
            score_assigned = float(item_scores.iloc[0])

            labels.append(1 if score_assigned >= threshold else 0)

        user_id = user_ratings.from_id.iloc[0]
        if len(rated_dict) == 0:
            raise NoRatedItems("User {} - No rated item available locally!".format(user_id))
        if 0 not in labels:
            raise OnlyPositiveItems("User {} - There are only positive items available locally!".format(user_id))
        elif 1 not in labels:
            raise OnlyNegativeItems("User {} - There are only negative items available locally!".format(user_id))

        self.__labels = labels
        self.__rated_dict = rated_dict
    def fit(self):
        """
        Train the classifier on the features and labels stored by process_rated().

        The data is read from private attributes, that's why the method expects no parameter:
        process_rated() must be called before this method.
        """
        recsys_logger.info("Fitting {} classifier".format(self.__classifier))
        self._set_transformer()

        # Fuse the input if there are dicts, multiple representation, etc.
        training_features = self.fuse_representations(list(self.__rated_dict.values()),
                                                      self.__embedding_combiner)

        self.__classifier.fit(training_features, self.__labels)
예제 #8
0
    def fit(self):
        """
        Compute the centroid of the features of the positive items ONLY.

        The features are read from the private attribute filled by process_rated(), so that method
        must be called before this one. They are fused and then averaged along the first axis;
        the resulting centroid is stored in a private attribute.
        """
        recsys_logger.info("Calculating centroid vector")
        self._set_transformer()

        fused_positive_features = self.fuse_representations(list(self.__positive_rated_dict.values()),
                                                            self.__embedding_combiner)

        # The centroid is the element-wise mean of the fused representations
        self.__centroid = np.mean(np.array(fused_positive_features), axis=0)
    def process_rated(self, user_ratings: pd.DataFrame, items_directory: str):
        """
        Extract the features of the positively rated items ONLY.

        An item is positive when the score given by the user is greater than or equal to the threshold.
        If no threshold is set on the instance, the one computed by _calc_mean_user_threshold() on the
        user ratings is used. Rated items that could not be loaded (None) are ignored.

        The extracted features (a dictionary keyed by item) are stored in a private attribute of the class
        and are later used by fit().

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored

        Raises:
            EmptyUserRatings: if user_ratings is empty
            NoRatedItems: if none of the rated items is available locally
            OnlyNegativeItems: if none of the available rated items is positive
            TypeError: if the user doesn't have exactly one rating for an available item
        """
        # Fail fast: without ratings there is nothing to load and no meaningful threshold to compute
        if user_ratings.empty:
            raise EmptyUserRatings(
                "The user selected doesn't have any ratings!")

        # Load rated items from the path
        rated_items = get_rated_items(items_directory, user_ratings)

        recsys_logger.info("Processing rated items")
        # If threshold wasn't passed in the constructor, then we take the mean rating
        # given by the user as its threshold
        threshold = self.threshold
        if threshold is None:
            threshold = self._calc_mean_user_threshold(user_ratings)

        # Extract features from the positive rated items, counting how many items are really available
        positive_rated_dict = {}
        available_items = 0
        for item in rated_items:
            # The item must be checked BEFORE reading its content_id: items not available locally are None
            if item is None:
                continue
            available_items += 1

            # Exactly one rating per item is expected. The check is explicit because converting a
            # Series directly with float() is deprecated in recent pandas versions; the exception type
            # is the one float() raised for a Series whose length is not 1
            item_scores = user_ratings[user_ratings['to_id'] == item.content_id].score
            if len(item_scores) != 1:
                raise TypeError("Expected exactly one rating for item {}, found {}".format(
                    item.content_id, len(item_scores)))
            score_assigned = float(item_scores.iloc[0])

            if score_assigned >= threshold:
                positive_rated_dict[item] = self.extract_features_item(item)

        user_id = user_ratings.from_id.iloc[0]
        # Covers both "no rated items at all" and "all the rated items are None"
        if available_items == 0:
            raise NoRatedItems(
                "User {} - No rated items available locally!".format(user_id))
        if len(positive_rated_dict) == 0:
            raise OnlyNegativeItems(
                "User {} - There are only negative items available locally!".format(user_id))

        self.__positive_rated_dict = positive_rated_dict
예제 #10
0
    def rank(self,
             user_ratings: pd.DataFrame,
             index_directory: str,
             recs_number: int = None,
             filter_list: List[str] = None) -> pd.DataFrame:
        """
        Rank the items for the user by running the query built by fit() against the index.

        A mask list is built from the user ratings and the filter list with _build_mask_list() and is
        passed to the index together with the query, recs_number, filter_list and the similarity setting.
        Every document returned by the index becomes a row of the result, with the score the index
        associated to it.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            index_directory (str): path of the index folder
            recs_number (int): number of the top items that will be present in the ranking
            filter_list (list): list of the items to rank, if None all unrated items will be used to
                calculate the rank
        Returns:
            pd.DataFrame: DataFrame with a 'to_id' column (item ids) and a 'score' column,
                in the order in which the index returned the documents
        """
        recsys_logger.info("Calculating rank")

        mask_list = self._build_mask_list(user_ratings, filter_list)

        index = SearchIndex(index_directory)
        retrieved = index.query(self.__string_query, recs_number, mask_list,
                                filter_list, self.__classic_similarity)

        # Iterating the result of the query yields the ids of the documents found
        ranked_ids = list(retrieved)
        ranked_scores = [retrieved[doc_id]['score'] for doc_id in ranked_ids]

        return pd.DataFrame({'to_id': ranked_ids, 'score': ranked_scores})
    def rank(self,
             user_ratings: pd.DataFrame,
             items_directory: str,
             recs_number: int = None,
             filter_list: List[str] = None) -> pd.DataFrame:
        """
        Rank the top-n recommended items for the user. If the recs_number parameter isn't specified,
        all items will be ranked.

        The rank is obtained by calling predict() and sorting the predicted scores in descending order.
        While predict() runs, the level of the recsys logger is raised to WARNING; the level read
        before the call is always set again afterwards, even if the prediction raises an exception.

        One can specify which items must be ranked with the filter_list parameter,
        in this case ONLY items in the filter_list will be used to calculate the rank.
        One can also pass items already seen by the user with the filter_list parameter.
        Otherwise, ALL unrated items will be used to calculate the rank.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored
            recs_number (int): number of the top items that will be present in the ranking
            filter_list (list): list of the items to rank, if None all unrated items will be used to
                calculate the rank
        Returns:
            pd.DataFrame: DataFrame with a 'to_id' column (item ids) and a 'score' column,
                sorted in descending order by the 'score' column
        """
        recsys_logger.info("Calculating rank")
        # we get the precedent level of the logger, so we will re-enable it at that level
        precedent_level_recsys_logger = recsys_logger.getEffectiveLevel()
        recsys_logger.setLevel(logging.WARNING)

        try:
            # Predict the rating for the items and sort them in descending order
            result = self.predict(user_ratings, items_directory, filter_list)

            result.sort_values(by=['score'], ascending=False, inplace=True)

            return result.head(recs_number)
        finally:
            # restore the logger even when the prediction fails, otherwise it would stay silenced
            recsys_logger.setLevel(precedent_level_recsys_logger)
    def process_feature_selection_on_fullgraph(
            self, graph: FullGraph, user_target_nodes: List[object],
            item_target_nodes: List[object]) -> FullGraph:
        """
        Perform feature selection on a FullGraph, considering only the given user and item nodes.

        The feature selection algorithm is run once on the user target nodes and once on the item target nodes,
        and the properties it returns are the ones to keep. If the algorithm fails for one of the two groups
        (FeatureSelectionException), a warning is logged and the property labels of ALL the user (or item) nodes
        of the graph are kept instead. If it fails for both, the graph passed as argument is returned.

        Args:
            graph (FullGraph): original graph on which feature selection will be performed
            user_target_nodes (list): list of user nodes (or values of said nodes) to consider in the feature selection
                process
            item_target_nodes (list): list of item nodes (or values of said nodes) to consider in the feature selection
                process

        Returns:
            The result of the deletion of the property nodes not to keep, or the graph passed as argument
            if feature selection failed both for users and items

        Raises:
            FeatureSelectionException: if a target node isn't a user (item) node of the graph
        """
        if not all(graph.is_user_node(node) for node in user_target_nodes):
            raise FeatureSelectionException(
                'All nodes in user_target_nodes list must be user nodes')

        if not all(graph.is_item_node(node) for node in item_target_nodes):
            raise FeatureSelectionException(
                'All nodes in item_target_nodes list must be item nodes')

        # Raw values are wrapped into node objects; a list already made of nodes is left untouched
        if not all(isinstance(node, UserNode) for node in user_target_nodes):
            user_target_nodes = [
                node if isinstance(node, UserNode) else UserNode(node)
                for node in user_target_nodes
            ]

        if not all(isinstance(node, ItemNode) for node in item_target_nodes):
            item_target_nodes = [
                node if isinstance(node, ItemNode) else ItemNode(node)
                for node in item_target_nodes
            ]

        # Users first, then items: (start message, target nodes, suffix of the warning in case of failure)
        stages = (
            ("Performing Feature Selection on users", user_target_nodes,
             "! Users original properties will be kept"),
            ("Performing Feature Selection on items", item_target_nodes,
             "! Items original properties will be kept"),
        )

        properties_to_keep = []
        failures = []
        for start_message, target_nodes, failure_suffix in stages:
            recsys_logger.info(start_message)
            try:
                properties_to_keep.extend(
                    self.__feature_selection_algorithm.perform(graph, target_nodes))
            except FeatureSelectionException as e:
                recsys_logger.warning(str(e) + failure_suffix)
                failures.append(True)
            else:
                failures.append(False)

        user_fs_failed, item_fs_failed = failures

        # Nothing was selected at all: the graph is returned as it is
        if user_fs_failed and item_fs_failed:
            recsys_logger.warning(
                "Since items and users original properties will be kept, "
                "the original graph will be returned")
            return graph

        # Only one of the two failed: its original properties are retrieved from the graph
        if user_fs_failed:
            properties_to_keep.extend(
                self._get_property_labels_info(graph, graph.user_nodes))
        elif item_fs_failed:
            properties_to_keep.extend(
                self._get_property_labels_info(graph, graph.item_nodes))

        return self.__delete_property_nodes(graph, properties_to_keep)
예제 #13
0
    def rank(self, user_id: str, graph: NXFullGraph, recs_number: int = None, filter_list: List[str] = None) -> pd.DataFrame:
        """
        Rank the top-n recommended items for the user with the PageRank algorithm. If the recs_number
        parameter isn't specified, all the scored items will be returned.

        One can specify which items must be ranked with the filter_list parameter,
        in this case ONLY items in the filter_list parameter (that exist in the graph) will be ranked.
        One can also pass items already seen by the user with the filter_list parameter.
        Otherwise, the PageRank results are cleaned with clean_result().

        If a feature selection algorithm is passed in the constructor, it is performed before calculating
        any prediction.

        If the algorithm is personalized, the personalization vector is formed by the nodes of the user
        profile with their weight, while every other node of the graph gets the minimum weight of the profile.

        Args:
            user_id (str): id of the user of which predictions will be calculated
            graph (FullGraph): a FullGraph containing users, items and eventually other categories of nodes
            recs_number (int): number of the top items that will be present in the ranking
            filter_list (list): list of the items to predict, if None all unrated items will be predicted
        Returns:
            pd.DataFrame: DataFrame with a 'to_id' column (item ids) and a 'score' column,
                sorted in descending order by the 'score' column. It is empty if graph is None
        """
        recsys_logger.info("Calculating rank")

        columns = ["to_id", "score"]
        score_frame = pd.DataFrame(columns=columns)

        if graph is None:
            return score_frame
        if self.feature_selection is not None:
            user_target_nodes = list(graph.user_nodes)
            # only items recommendable to the user will be considered in the feature selection process
            if filter_list is None:
                item_target_nodes = [node for node in graph.item_nodes if node not in graph.get_successors(user_id)]
            else:
                item_target_nodes = [node for node in graph.item_nodes if node not in graph.get_successors(user_id) and
                                     node in filter_list]
            graph = FeatureSelectionHandler(self.feature_selection).\
                process_feature_selection_on_fullgraph(graph, user_target_nodes, item_target_nodes)

        # pagerank_scipy was removed in networkx 3.0 (there pagerank is already the scipy implementation):
        # it is used when available, otherwise we fall back to pagerank
        pagerank = getattr(nx, "pagerank_scipy", nx.pagerank)

        # run the pageRank
        if self.personalized:
            # the personalization vector is formed by the nodes that the user voted with their weight
            # + all the other nodes in the graph with weight as the min weight given by the user
            # (This because if a node isn't specified in the personalization vector will have 0 score in page rank)
            profile = self.extract_profile(graph, user_id)
            graph_nodes = list(graph._graph.nodes)

            # the min weight is the same for every node, so it is computed once and only if it is needed
            # (it raises ValueError if it is needed but the profile is empty)
            min_weight = None
            if any(node not in profile for node in graph_nodes):
                min_weight = min(profile.values())

            pers = {node: profile[node] if node in profile else min_weight
                    for node in graph_nodes}

            scores = pagerank(graph._graph, personalization=pers)
        else:
            scores = pagerank(graph._graph)

        # clean the results removing user nodes, selected user profile and eventually properties
        if filter_list is not None:
            nodes = list(scores.keys())
            wrapped_index = [nodes.index(filter_str) for filter_str in filter_list if graph.node_exists(filter_str)]
            wrapped_filter = [nodes[i] for i in wrapped_index]
            scores = self.filter_result(scores, wrapped_filter)
        else:
            scores = self.clean_result(graph, scores, user_id)

        score_frame["to_id"] = [node.value for node in scores]
        # a list is assigned instead of a dict view, so that pandas receives a plain sequence
        score_frame["score"] = list(scores.values())

        score_frame.sort_values(by=["score"], ascending=False, inplace=True)

        rank = score_frame[:recs_number]

        return rank