def rank(self, user_ratings: pd.DataFrame, items_directory: str, recs_number: int = None,
             filter_list: List[str] = None) -> pd.DataFrame:
        """
        Build a ranking of items for a single user, highest score first.

        The candidate items are either the ones named in filter_list (these may
        include items the user has already rated) or, when filter_list is None,
        every item of items_directory the user has not rated yet.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored
            recs_number (int): how many rows of the ranking to keep; if None the
                whole ranking is returned
            filter_list (list): ids of the only items that must be ranked, if None
                all unrated items are ranked
        Returns:
            pd.DataFrame: frame with the columns 'to_id' (item id) and 'score',
                sorted in descending order by the 'score' column
        """
        # Choose the pool of candidate items
        if filter_list is not None:
            candidates = get_chosen_items(items_directory, filter_list)
        else:
            candidates = get_unrated_items(items_directory, user_ratings)

        # Collect id and features of every candidate that is actually available
        candidate_ids = []
        candidate_features = []
        for candidate in candidates:
            if candidate is None:
                continue
            candidate_ids.append(candidate.content_id)
            candidate_features.append(self.extract_features_item(candidate))

        recsys_logger.info("Calculating rank")
        probabilities = []
        if len(candidate_ids) > 0:
            # Merge dicts / multiple representations into a single input for the classifier
            fused = self.fuse_representations(candidate_features, self.__embedding_combiner)
            probabilities = self.__classifier.predict_proba(fused)

        # The score of an item is the entry at index 1 of its predict_proba row
        # (presumably the probability of the positive class - depends on the classifier)
        scored_pairs = list(zip(candidate_ids, probabilities))
        frame = pd.DataFrame(
            {'to_id': [item_id for item_id, _ in scored_pairs],
             'score': [proba[1] for _, proba in scored_pairs]},
            columns=['to_id', 'score'])

        ranking = frame.sort_values(by=['score'], ascending=False)

        # Slicing with None keeps every row
        return ranking[:recs_number]
    def predict(self,
                user_ratings: pd.DataFrame,
                items_directory: str,
                filter_list: List[str] = None) -> pd.DataFrame:
        """
        Compute a predicted score for each candidate item of a single user.

        The candidate items are either the ones named in filter_list (these may
        include items the user has already rated) or, when filter_list is None,
        every item of items_directory the user has not rated yet.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored
            filter_list (list): ids of the only items that must be predicted, if None
                all unrated items are predicted
        Returns:
            pd.DataFrame: frame with the columns 'to_id' (item id) and 'score'
                (value returned by the regressor); rows are NOT sorted
        """
        # Choose the pool of candidate items
        if filter_list is not None:
            candidates = get_chosen_items(items_directory, filter_list)
        else:
            candidates = get_unrated_items(items_directory, user_ratings)

        # Collect id and features of every candidate that is actually available
        candidate_ids = []
        candidate_features = []
        for candidate in candidates:
            if candidate is None:
                continue
            candidate_ids.append(candidate.content_id)
            candidate_features.append(self.extract_features_item(candidate))

        recsys_logger.info("Calculating score predictions")
        predicted_scores = []
        if len(candidate_ids) > 0:
            # Merge dicts / multiple representations into a single input for the regressor
            fused = self.fuse_representations(candidate_features, self.__embedding_combiner)
            predicted_scores = self.__regressor.predict(fused)

        # Assemble the frame to return, one row per candidate
        score_frame = pd.DataFrame(columns=["to_id", "score"])
        score_frame["to_id"] = candidate_ids
        score_frame["score"] = predicted_scores

        return score_frame
Пример #3
0
    def rank(self, user_ratings: pd.DataFrame, items_directory: str, recs_number: int = None,
             filter_list: List[str] = None) -> pd.DataFrame:
        """
        Build a ranking of items for a single user, most similar to the centroid first.

        The candidate items are either the ones named in filter_list (these may
        include items the user has already rated) or, when filter_list is None,
        every item of items_directory the user has not rated yet.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored
            recs_number (int): how many rows of the ranking to keep; if None the
                whole ranking is returned
            filter_list (list): ids of the only items that must be ranked, if None
                all unrated items are ranked
        Returns:
            pd.DataFrame: frame with the columns 'to_id' (item id) and 'score'
                (similarity with the centroid), sorted in descending order by
                the 'score' column
        """
        # Choose the pool of candidate items
        if filter_list is not None:
            candidates = get_chosen_items(items_directory, filter_list)
        else:
            candidates = get_unrated_items(items_directory, user_ratings)

        # Collect id and features of every candidate that is actually available
        candidate_ids = []
        candidate_features = []
        for candidate in candidates:
            if candidate is None:
                continue
            candidate_ids.append(candidate.content_id)
            candidate_features.append(self.extract_features_item(candidate))

        recsys_logger.info("Calculating rank")
        similarities = []
        if len(candidate_ids) > 0:
            # The score of an item is its similarity with the stored centroid vector
            fused_features = self.fuse_representations(candidate_features, self.__embedding_combiner)
            for fused_item in fused_features:
                similarities.append(self.__similarity.perform(self.__centroid, fused_item))

        frame = pd.DataFrame({'to_id': candidate_ids, 'score': similarities},
                             columns=['to_id', 'score'])

        ranking = frame.sort_values(by=['score'], ascending=False)

        # Slicing with None keeps every row
        return ranking[:recs_number]
Пример #4
0
    def __get_item_list(self, item_to_predict_id_list, user_ratings):
        """
        Load the items for which a prediction must be computed.

        Args:
            item_to_predict_id_list: ids of the items to load, or None to load
                every item the user has not rated
            user_ratings: ratings of the user, forwarded to get_unrated_items
                when no id list is given

        Returns:
            The result of get_unrated_items when no id list is given, otherwise
            a list with one load_content_instance result per requested id
            (in the same order as the ids).
        """
        if item_to_predict_id_list is None:
            # No explicit selection: fall back to everything without a rating
            return get_unrated_items(self.__config.items_directory, user_ratings)

        loaded_items = []
        for raw_id in item_to_predict_id_list:
            # Strip every character that is neither a word character nor whitespace
            clean_id = re.sub(r'[^\w\s]', '', raw_id)
            loaded_items.append(
                load_content_instance(self.__config.items_directory, clean_id))

        return loaded_items
Пример #5
0
    def preprocessing(self, items_directory: str, ratings: pd.DataFrame, candidate_item_id_list: list = None):
        """
        Retrieve the data needed to compute the ranking.

        It validates the threshold, replaces it with the mean rating when it is
        set to -1, loads the rated and unrated items and extracts the features
        of the unrated items.

        Args:
            items_directory (str): directory where the items are stored
            ratings (pd.DataFrame): dataframe which contains ratings given by the user;
                its "score" column is read when the threshold must be computed
            candidate_item_id_list (list): ids of the items that can be recommended,
                if None or empty all unrated items will be used

        Returns:
            rated_items (list): instances of the rated items
            unrated_items (list): instances of the unrated items
            unrated_features_baglist (list): features extracted from the unrated items

        Raises:
            ValueError: if the threshold is not in the range [-1, 1], or if there
                are no valid unrated items to consider (example: the candidate
                list is ['test', 'test2'] but neither of them can be loaded from
                the items directory)
        """
        if not -1 <= self.__threshold <= 1:
            raise ValueError("Threshold value must be in the range [-1, 1]")

        # -1 is the sentinel for "use the mean of the user's ratings"
        if self.__threshold == -1:
            # NOTE(review): the sentinel is overwritten on the instance, so later
            # calls (possibly for other users) reuse this mean instead of
            # recomputing it - confirm that this is intended.
            self.__threshold = pd.to_numeric(ratings["score"], downcast="float").mean()

        if candidate_item_id_list is None or len(candidate_item_id_list) == 0:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            # Load every candidate exactly once and keep only those that were found
            loaded_candidates = (load_content_instance(items_directory, item_id)
                                 for item_id in candidate_item_id_list)
            unrated_items = [item for item in loaded_candidates if item is not None]

        if len(unrated_items) == 0:
            raise ValueError("No valid unrated items found")

        rated_items = get_rated_items(items_directory, ratings)

        return rated_items, unrated_items, self.__calc_unrated_baglist(unrated_items)
Пример #6
0
    def predict(self, user_id: str, ratings: pd.DataFrame, recs_number: int, items_directory: str, candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        Rank candidate items for a user with a classifier trained on the user's rated items.

        1) For every rated item and every candidate item, takes the value of the configured
        representation of the configured item field (e.g. the "tf-idf" representation of the
        "Plot" field); these values are vectorized with a DictVectorizer;
        2) Labels each rated item 1 if its rating is >= the threshold, 0 otherwise. If the
        threshold is -1, the mean of the user's ratings is used instead;
        3) Fits the configured classifier and scores each candidate item with the entry at
        index 1 of its predict_proba row.

        Candidates that cannot be loaded (None) are ignored. If an item appears more than once
        in ratings, its first rating is used.

        Args:
            user_id (str): user for which recommendations will be computed (not used by
                this method)
            ratings (pd.DataFrame): ratings of the user, with the columns 'to_id' and 'score'
            recs_number (int): how long the ranking will be, if None all candidates are returned
            items_directory (str): name of the directory where the items are stored
            candidate_item_id_list (list): ids of the items that can be recommended, if None
                all unrated items will be used

        Returns:
            pd.DataFrame: frame with the columns 'to_id' and 'rating', sorted by 'rating' in
                descending order and cut to recs_number rows. It is empty when there are no
                usable rated items or no usable candidate items.

        Raises:
            ValueError: if the configured classifier name is not supported
        """
        columns = ["to_id", "rating"]

        # Factories, so that only the requested estimator is instantiated
        classifier_factories = {
            "random_forest": lambda: RandomForestClassifier(n_estimators=400, random_state=42),
            "svm": lambda: CalibratedClassifierCV(LinearSVC(random_state=42)),
            "log_regr": lambda: LogisticRegression(random_state=42),
            "knn": lambda: neighbors.KNeighborsClassifier(),
            "decision_tree": lambda: DecisionTreeClassifier(random_state=42),
            "gaussian_process": lambda: GaussianProcessClassifier(random_state=42),
        }
        classifier_name = self.__classifier.lower()
        if classifier_name not in classifier_factories:
            # Fail early and clearly instead of building a pipeline around None
            raise ValueError("Unsupported classifier '%s', expected one of: %s"
                             % (self.__classifier, ", ".join(sorted(classifier_factories))))

        if candidate_item_id_list is None:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            unrated_items = [load_content_instance(items_directory, item_id) for item_id in candidate_item_id_list]
        # Drop unavailable items BEFORE predicting: the i-th row returned by predict_proba
        # must belong to the i-th item of this list, otherwise scores get attached to the
        # wrong items (or valid items are lost) as soon as one candidate is None.
        unrated_items = [item for item in unrated_items if item is not None]

        logger.info("Retrieving rated items")
        rated_items = get_rated_items(items_directory, ratings)
        if self.__threshold == -1:
            threshold = pd.to_numeric(ratings["score"], downcast="float").mean()
        else:
            threshold = self.__threshold

        item_field = self.get_item_field()
        field_representation = self.get_item_field_representation()

        rated_features_bag_list = []
        labels = []
        for item in rated_items:
            if item is None:
                continue
            rated_features_bag_list.append(
                item.get_field(item_field).get_representation(field_representation).get_value())
            # .iloc[0] instead of float(Series): the latter is deprecated in recent pandas
            item_scores = ratings.loc[ratings['to_id'] == item.get_content_id(), 'score']
            labels.append(1 if float(item_scores.iloc[0]) >= threshold else 0)

        # Nothing to learn from or nothing to score: return an empty ranking
        if not rated_features_bag_list or not unrated_items:
            return pd.DataFrame(columns=columns)

        logger.info("Labeling examples")
        unrated_features_bag_list = [
            item.get_field(item_field).get_representation(field_representation).get_value()
            for item in unrated_items
        ]

        clf = classifier_factories[classifier_name]()

        logger.info("Fitting classifier")
        if classifier_name == "gaussian_process":
            # The vectorizer output is sparse; it is densified before reaching this estimator
            pipe = make_pipeline(DictVectorizer(sparse=True), FunctionTransformer(lambda x: x.todense(), accept_sparse=True), clf)
        else:
            pipe = make_pipeline(DictVectorizer(sparse=True), clf)

        pipe = pipe.fit(rated_features_bag_list, labels)

        logger.info("Predicting scores")
        score_labels = pipe.predict_proba(unrated_features_bag_list)

        # Build the frame in one shot instead of growing it with pd.concat row by row
        score_frame = pd.DataFrame(
            {"to_id": [item.get_content_id() for item in unrated_items],
             "rating": [score[1] for score in score_labels]},
            columns=columns)

        score_frame = score_frame.sort_values(['rating'], ascending=False).reset_index(drop=True)

        return score_frame[:recs_number]
Пример #7
0
    def predict(self,
                user_id: str,
                ratings: pd.DataFrame,
                recs_number: int,
                items_directory: str,
                candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        Rank candidate items by their similarity with a centroid computed from the rated items.

        Checks (performed on the first available rated item):
        1) The field item_field exists in the item
        2) The representation item_field_representation exists for that field
        3) The representation is either an EmbeddingField or a FeaturesBagField
        4) If it is an EmbeddingField, its value is one-dimensional (a document embedding)

        Then the centroid is computed (with a vectorizer when the representation is a
        FeaturesBagField, without it otherwise) and, for each candidate item, the similarity
        between the centroid and the item representation is computed.

        Candidates that cannot be loaded (None) are ignored.

        Args:
            user_id (str): user for which recommendations will be computed (not used by
                this method)
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            recs_number (int): how long the ranking will be, if None all candidates are returned
            items_directory (str): name of the directory where the items are stored
            candidate_item_id_list (list): ids of the items that can be recommended, if None
                all unrated items will be used

        Returns:
            scores (pd.DataFrame): frame whose columns are the ids of the items (to_id) and the
                similarities between the items and the centroid (rating), sorted by 'rating' in
                descending order and cut to recs_number rows. It is empty if no rated item
                is available.

        Raises:
            ValueError: if one of the checks listed above fails
        """
        columns = ["to_id", "rating"]

        logger.info("Retrieving candidate items")
        if candidate_item_id_list is None:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            unrated_items = [
                load_content_instance(items_directory, item_id)
                for item_id in candidate_item_id_list
            ]

        logger.info("Retrieving rated items")
        rated_items = get_rated_items(items_directory, ratings)

        # Validate against the first rated item that is actually available; an empty
        # list or a list holding only None yields an empty ranking.
        first_item = next((item for item in rated_items if item is not None), None)
        if first_item is None:
            return pd.DataFrame(columns=columns)

        if self.item_field not in first_item.field_dict:
            raise ValueError("The field name specified could not be found!")

        try:
            representation = first_item.get_field(
                self.item_field).get_representation(
                    self.item_field_representation)
        except KeyError as err:
            raise ValueError(
                "The given representation id wasn't found for the specified field"
            ) from err

        if not isinstance(representation, (EmbeddingField, FeaturesBagField)):
            raise ValueError(
                "The given representation must be an embedding or a tf-idf vector"
            )

        if isinstance(representation, EmbeddingField) and len(representation.value.shape) != 1:
            raise ValueError(
                "The specified representation is not a document embedding, so the centroid"
                " can not be calculated")

        need_vectorizer = isinstance(representation, FeaturesBagField)

        # Both branches below only score the candidates that could be loaded
        valid_unrated_items = [item for item in unrated_items if item is not None]

        # (item id, similarity) pairs; the frame is built once at the end instead of
        # being grown with pd.concat at every iteration
        records = []

        if not need_vectorizer:
            logger.info("Computing centroid")
            centroid = self.__get_centroid_without_vectorizer(
                ratings, rated_items)
            logger.info("Computing similarities")

            for item in valid_unrated_items:
                item_id = item.content_id
                item_field_representation = item.get_field(
                    self.item_field).get_representation(
                        self.item_field_representation).value
                logger.info("Computing similarity with %s", item_id)
                similarity = self.__similarity.perform(
                    DenseVector(centroid),
                    DenseVector(item_field_representation))
                records.append((item_id, similarity))
        else:
            logger.info("Computing centroid")
            # The helper receives the unfiltered candidates, exactly as before.
            # NOTE(review): the rows of unrated_matrix are assumed to match the
            # non-None candidates in order - confirm against the helper.
            centroid, unrated_matrix = self.__get_centroid_with_vectorizer(
                ratings, rated_items, unrated_items)

            logger.info("Computing similarities")

            for item, item_array in zip(valid_unrated_items, unrated_matrix):
                item_id = item.content_id
                logger.info("Computing similarity with %s", item_id)
                similarity = self.__similarity.perform(
                    SparseVector(centroid), SparseVector(item_array))
                records.append((item_id, similarity))

        scores = pd.DataFrame.from_records(records, columns=columns)
        scores = scores.sort_values(['rating'],
                                    ascending=False).reset_index(drop=True)

        return scores[:recs_number]