예제 #1
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            interactions_feature_weighting="none",
            **similarity_args):
        """
        Optionally re-weight the interaction matrix, then delegate to the parent ``fit``.

        :param topK: forwarded unchanged to the parent ``fit``
        :param shrink: forwarded unchanged to the parent ``fit``
        :param similarity: forwarded unchanged to the parent ``fit``
        :param normalize: forwarded unchanged to the parent ``fit``
        :param feature_weighting: forwarded unchanged to the parent ``fit``
        :param interactions_feature_weighting: one of ``FEATURE_WEIGHTING_VALUES``; "BM25" and
            "TF-IDF" replace ``self.URM_train`` with its weighted version, "none" leaves it as is
        :param similarity_args: extra keyword arguments forwarded to the parent ``fit``
        :raises ValueError: if ``interactions_feature_weighting`` is not an accepted value
        """

        if interactions_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            # Report the parameter that was actually rejected; the parent fit validates
            # 'feature_weighting' on its own.
            raise ValueError(
                "Value for 'interactions_feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES,
                        interactions_feature_weighting))

        # The weighting function is applied to the transposed matrix and the result is
        # transposed back, so URM_train keeps its original orientation.
        if interactions_feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif interactions_feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        super().fit(topK=topK,
                    shrink=shrink,
                    similarity=similarity,
                    normalize=normalize,
                    feature_weighting=feature_weighting,
                    **similarity_args)
예제 #2
0
    def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True, feature_weighting="none",
            **similarity_args):
        """
        Compute ``self.W_sparse`` as a similarity matrix built from the transposed ``self.URM_train``.

        :param topK: stored on the instance and passed to ``Compute_Similarity``
        :param shrink: stored on the instance and passed to ``Compute_Similarity``
        :param similarity: similarity type passed to ``Compute_Similarity``
        :param normalize: passed to ``Compute_Similarity``
        :param feature_weighting: one of ``FEATURE_WEIGHTING_VALUES``; "BM25" and "TF-IDF"
            replace ``self.URM_train`` with its weighted version
        :param similarity_args: extra keyword arguments passed to ``Compute_Similarity``
        :raises ValueError: if ``feature_weighting`` is not an accepted value
        """
        self.topK, self.shrink = topK, shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            message = "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            raise ValueError(message.format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        # Select the weighting function first, then apply the shared cast/weight/convert steps once
        weighting_function = None
        if feature_weighting == "BM25":
            weighting_function = okapi_BM_25
        elif feature_weighting == "TF-IDF":
            weighting_function = TF_IDF

        if weighting_function is not None:
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = check_matrix(weighting_function(self.URM_train), 'csr')

        similarity_computer = Compute_Similarity(self.URM_train.T,
                                                 shrink=shrink,
                                                 topK=topK,
                                                 normalize=normalize,
                                                 similarity=similarity,
                                                 **similarity_args)

        self.W_sparse = check_matrix(similarity_computer.compute_similarity(), format='csr')
예제 #3
0
 def get_model(cls, URM_train, ICM_train):
     """
     Build and fit a PureSVDRecommender on the TF-IDF weighted stack of URM_train and ICM_train.T.

     :param URM_train: sparse matrix stacked on top
     :param ICM_train: sparse matrix whose transpose is stacked below URM_train
     :return: the recommender fitted with ``cls.get_best_parameters()``
     """
     from course_lib.MatrixFactorization.PureSVDRecommender import PureSVDRecommender
     from course_lib.Base.IR_feature_weighting import TF_IDF

     stacked_matrix = sps.vstack([URM_train, ICM_train.T])
     weighted_matrix = TF_IDF(stacked_matrix).tocsr()

     recommender = PureSVDRecommender(weighted_matrix)
     best_parameters = cls.get_best_parameters()
     recommender.fit(**best_parameters)
     return recommender
예제 #4
0
 def get_model(cls, URM_train, ICM_train, apply_tf_idf=True):
     """
     Build and fit a NewPureSVDRecommender on URM_train stacked with ICM_train.T.

     :param URM_train: sparse matrix stacked on top
     :param ICM_train: sparse matrix whose transpose is stacked below URM_train
     :param apply_tf_idf: when true, the stacked matrix is passed through TF_IDF first
     :return: the recommender fitted with ``cls.get_best_parameters()``
     """
     from src.model.MatrixFactorization.NewPureSVDRecommender import NewPureSVDRecommender
     from course_lib.Base.IR_feature_weighting import TF_IDF

     # Stack once; the weighting is the only step that differs between the two modes
     side_info_matrix = sps.vstack([URM_train, ICM_train.T])
     if apply_tf_idf:
         side_info_matrix = TF_IDF(side_info_matrix)
     side_info_matrix = side_info_matrix.tocsr()

     recommender = NewPureSVDRecommender(side_info_matrix)
     best_parameters = cls.get_best_parameters()
     recommender.fit(**best_parameters)
     return recommender
예제 #5
0
    def get_model(cls, URM_train, ICM_train, apply_tf_idf=True):
        """
        Build and fit an RP3betaRecommender on URM_train stacked with ICM_train.T.

        :param URM_train: sparse matrix stacked on top
        :param ICM_train: sparse matrix whose transpose is stacked below URM_train
        :param apply_tf_idf: when true, the stacked matrix is passed through TF_IDF first
        :return: the recommender fitted with ``cls.get_best_parameters()``
        """
        from course_lib.Base.IR_feature_weighting import TF_IDF

        # Stack once; the weighting is the only step that differs between the two modes
        side_info_matrix = sps.vstack([URM_train, ICM_train.T])
        if apply_tf_idf:
            side_info_matrix = TF_IDF(side_info_matrix)
        side_info_matrix = side_info_matrix.tocsr()

        from course_lib.GraphBased.RP3betaRecommender import RP3betaRecommender
        recommender = RP3betaRecommender(side_info_matrix)
        best_parameters = cls.get_best_parameters()
        recommender.fit(**best_parameters)
        return recommender
예제 #6
0
    def fit(self,
            topK=50,
            shrink=100,
            normalize=True,
            feature_weighting="none"):
        """
        Compute ``self.W_sparse`` as the scaled product ``URM_train * URM_train.T``.

        :param topK: stored on the instance; when >= 0 the product is reduced with
            ``userSimilarityMatrixTopK``
        :param shrink: stored on the instance; the product is divided by it (by 1 when it is 0)
        :param normalize: when true, the rows of the result are L1-normalized via ``normalize_sk``
        :param feature_weighting: one of ``FEATURE_WEIGHTING_VALUES``; "BM25" and "TF-IDF"
            replace ``self.URM_train`` with its weighted version
        :raises ValueError: if ``feature_weighting`` is not an accepted value
        """
        self.topK, self.shrink = topK, shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            message = "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            raise ValueError(message.format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        # Select the weighting function first, then apply the shared cast/weight/convert steps once
        weighting_function = None
        if feature_weighting == "BM25":
            weighting_function = okapi_BM_25
        elif feature_weighting == "TF-IDF":
            weighting_function = TF_IDF

        if weighting_function is not None:
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = check_matrix(weighting_function(self.URM_train), 'csr')

        # A shrink of zero means "no scaling" instead of a division by zero
        divisor = shrink if shrink != 0 else 1
        dot_product = self.URM_train.dot(self.URM_train.T)
        self.W_sparse = dot_product * (1 / divisor)

        if self.topK >= 0:
            top_k_matrix = userSimilarityMatrixTopK(self.W_sparse, k=self.topK)
            self.W_sparse = top_k_matrix.tocsr()

        if normalize:
            self.W_sparse = normalize_sk(self.W_sparse, norm="l1", axis=1)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')
예제 #7
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            **similarity_args):
        """
        Compute ``self.W_sparse`` from ``self.UCM_train`` with the columns of cold users removed.

        :param topK: number of entries kept by the final ``similarityMatrixTopK`` call
        :param shrink: stored on the instance and passed to ``Compute_Similarity``
        :param similarity: similarity type passed to ``Compute_Similarity``
        :param normalize: passed to ``Compute_Similarity``
        :param feature_weighting: one of ``FEATURE_WEIGHTING_VALUES``; "BM25" and "TF-IDF"
            replace ``self.UCM_train`` with its weighted version
        :param similarity_args: extra keyword arguments passed to ``Compute_Similarity``
        :raises ValueError: if ``feature_weighting`` is not an accepted value
        """
        self.topK = topK
        # The similarity is computed with len(cold_users) extra entries, presumably so that
        # topK of them can still remain once the cold-user columns are dropped.
        self.topComputeK = topK + len(self.cold_users)
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            message = "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            raise ValueError(message.format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        weighting_function = None
        if feature_weighting == "BM25":
            weighting_function = okapi_BM_25
        elif feature_weighting == "TF-IDF":
            weighting_function = TF_IDF

        if weighting_function is not None:
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = weighting_function(self.UCM_train)

        similarity_computer = Compute_Similarity(self.UCM_train.T,
                                                 shrink=shrink,
                                                 topK=self.topComputeK,
                                                 normalize=normalize,
                                                 similarity=similarity,
                                                 **similarity_args)

        self.W_sparse = similarity_computer.compute_similarity()
        self.W_sparse = self.W_sparse.tocsc()

        # In CSC format indptr delimits columns: zero every stored value in a cold user's column
        for cold_user in self.cold_users:
            column_start = self.W_sparse.indptr[cold_user]
            column_end = self.W_sparse.indptr[cold_user + 1]
            self.W_sparse.data[column_start:column_end] = 0

        self.W_sparse.eliminate_zeros()
        self.W_sparse = self.W_sparse.tocsr()

        reduced_matrix = similarityMatrixTopK(self.W_sparse, k=self.topK)
        self.W_sparse = check_matrix(reduced_matrix.tocsr(), format='csr')

        # Add identity matrix to the recommender
        wrapped_similarity = self.recommender.W_sparse
        identity = sps.identity(wrapped_similarity.shape[0], format="csr")
        self.recommender.W_sparse = wrapped_similarity + identity
예제 #8
0
class UserKNNDotCFRecommender(BaseUserSimilarityMatrixRecommender):
    """User KNN recommender whose similarity is the scaled dot product of the URM rows."""

    RECOMMENDER_NAME = "UserKNNDotCFRecommender"

    FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]

    def __init__(self, URM_train, verbose=True):
        """
        :param URM_train: interaction matrix handed to the base class
        :param verbose: handed to the base class
        """
        super().__init__(URM_train, verbose=verbose)

    def fit(self,
            topK=50,
            shrink=100,
            normalize=True,
            feature_weighting="none"):
        """
        Compute ``self.W_sparse`` as the scaled product ``URM_train * URM_train.T``.

        :param topK: stored on the instance; when >= 0 the product is reduced with
            ``userSimilarityMatrixTopK``
        :param shrink: stored on the instance; the product is divided by it (by 1 when it is 0)
        :param normalize: when true, the rows of the result are L1-normalized via ``normalize_sk``
        :param feature_weighting: one of ``FEATURE_WEIGHTING_VALUES``; "BM25" and "TF-IDF"
            replace ``self.URM_train`` with its weighted version
        :raises ValueError: if ``feature_weighting`` is not an accepted value
        """
        self.topK, self.shrink = topK, shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            message = "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            raise ValueError(message.format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        # Select the weighting function first, then apply the shared cast/weight/convert steps once
        weighting_function = None
        if feature_weighting == "BM25":
            weighting_function = okapi_BM_25
        elif feature_weighting == "TF-IDF":
            weighting_function = TF_IDF

        if weighting_function is not None:
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = check_matrix(weighting_function(self.URM_train), 'csr')

        # A shrink of zero means "no scaling" instead of a division by zero
        divisor = shrink if shrink != 0 else 1
        dot_product = self.URM_train.dot(self.URM_train.T)
        self.W_sparse = dot_product * (1 / divisor)

        if self.topK >= 0:
            top_k_matrix = userSimilarityMatrixTopK(self.W_sparse, k=self.topK)
            self.W_sparse = top_k_matrix.tocsr()

        if normalize:
            self.W_sparse = normalize_sk(self.W_sparse, norm="l1", axis=1)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')
예제 #9
0
    def precompute_best_item_indices(self, URM: sps.csr_matrix):
        """
        Store in ``self.sorted_indices`` the per-row ordering of a cosine similarity matrix.

        The similarity is computed from a copy of ``URM`` (weighted according to
        ``self.feature_weighting``), densified, and every row is argsorted by decreasing value.

        :param URM: sparse matrix the similarity is computed from; it is not modified
        """
        weighted_URM = URM.copy()

        weighting_function = None
        if self.feature_weighting == "BM25":
            weighting_function = okapi_BM_25
        elif self.feature_weighting == "TF-IDF":
            weighting_function = TF_IDF

        if weighting_function is not None:
            weighted_URM = weighted_URM.astype(np.float32)
            weighted_URM = check_matrix(weighting_function(weighted_URM), 'csr')

        similarity_computer = Compute_Similarity(weighted_URM,
                                                 shrink=self.shrink,
                                                 topK=self.topK,
                                                 normalize=self.normalize,
                                                 similarity="cosine")
        dense_similarity = similarity_computer.compute_similarity().todense()

        # Negating the values makes the ascending argsort return the largest similarities first
        descending_order = np.argsort(-dense_similarity, axis=1)
        self.sorted_indices = np.array(descending_order)
예제 #10
0
def apply_feature_weighting(matrix, feature_weighting="none"):
    """
    Return ``matrix`` weighted with the requested scheme.

    :param matrix: matrix to weight; returned unchanged when ``feature_weighting`` is "none"
    :param feature_weighting: "BM25", "TF-IDF" or "none"
    :return: for "BM25" and "TF-IDF" the float32 cast of ``matrix`` passed through the
        weighting function and ``check_matrix(..., 'csr')``; otherwise ``matrix`` itself
    :raises ValueError: if ``feature_weighting`` is not an accepted value
    """
    from course_lib.Base.IR_feature_weighting import okapi_BM_25, TF_IDF
    from course_lib.Base.Recommender_utils import check_matrix

    FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]

    if feature_weighting not in FEATURE_WEIGHTING_VALUES:
        message = "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
        raise ValueError(message.format(FEATURE_WEIGHTING_VALUES, feature_weighting))

    weighting_by_name = {"BM25": okapi_BM_25, "TF-IDF": TF_IDF}
    weighting_function = weighting_by_name.get(feature_weighting)

    # "none" has no entry in the table: hand the input back untouched
    if weighting_function is None:
        return matrix

    float_matrix = matrix.astype(np.float32)
    return check_matrix(weighting_function(float_matrix), 'csr')
예제 #11
0
    def get_model(cls,
                  URM_train,
                  ICM_train,
                  load_model=False,
                  save_model=False):
        """
        Return an RP3betaRecommender built on the TF-IDF weighted stack of URM_train and ICM_train.T.

        :param URM_train: sparse matrix stacked on top
        :param ICM_train: sparse matrix whose transpose is stacked below URM_train
        :param load_model: when true, try ``cls._load_model`` first and return its result;
            a FileNotFoundError only prints a warning and falls back to fitting
        :param save_model: when true, the freshly fitted model is passed to ``cls._save_model``
        :return: the loaded or fitted recommender
        """
        from course_lib.Base.IR_feature_weighting import TF_IDF
        from course_lib.GraphBased.RP3betaRecommender import RP3betaRecommender

        stacked_matrix = sps.vstack([URM_train, ICM_train.T])
        recommender = RP3betaRecommender(TF_IDF(stacked_matrix).tocsr())

        if load_model:
            try:
                return cls._load_model(recommender)
            except FileNotFoundError:
                print("WARNING: Cannot find model to be loaded")

        best_parameters = cls.get_best_parameters()
        recommender.fit(**best_parameters)
        if save_model:
            cls._save_model(recommender)
        return recommender
예제 #12
0
def main():
    """
    Run the hyper-parameter search for the recommender selected on the command line.

    Steps: load a leave-3-out split, adapt URM_train to the selected recommender (stacking
    the transposed ICM and/or applying TF-IDF), build the list of users ignored by the
    evaluator, then call ``runParameterSearch_Collaborative`` optimizing MAP at cutoff 10.
    """
    args = get_arguments()

    # Data loading
    raw_reader = RecSys2019Reader(args.reader_path)
    data_reader = New_DataSplitter_leave_k_out(raw_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=args.seed)
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Recommenders with side information train on the URM stacked with the transposed ICM
    recommender_name = args.recommender_name
    if recommender_name in ("sslim_bpr", "rp3beta_side", "pure_svd_side"):
        ICM_all = get_ICM_train(data_reader)
        URM_train = sps.vstack([URM_train, ICM_all.T], format="csr")
    if recommender_name in ("rp3beta_side", "pure_svd"):
        URM_train = TF_IDF(URM_train).tocsr()

    # Setting evaluator
    def interactions_per_row():
        # Number of stored entries in every row of the training matrix
        return np.ediff1d(URM_train.tocsr().indptr)

    def rows_where(mask):
        # Row indices selected by a boolean mask
        return np.arange(URM_train.shape[0])[mask]

    exclude_cold_users = args.exclude_users
    h = int(args.focus_on_high)
    fol = int(args.focus_on_low)
    if h != 0:
        print("Excluding users with less than {} interactions".format(h))
        ignore_users = rows_where(interactions_per_row() < h)
    elif fol != 0:
        print("Excluding users with more than {} interactions".format(fol))
        ignore_users = rows_where(interactions_per_row() > fol)
        if exclude_cold_users:
            cold_users = rows_where(interactions_per_row() == 0)
            ignore_users = np.unique(np.concatenate((cold_users, ignore_users)))
    elif exclude_cold_users:
        print("Excluding cold users...")
        ignore_users = rows_where(interactions_per_row() == 0)
    else:
        ignore_users = None

    evaluator = EvaluatorHoldout(URM_test, cutoff_list=[10], ignore_users=ignore_users)

    # HP tuning
    print("Start tuning...")
    report_root = "../../report/hp_tuning/{}/".format(args.recommender_name)
    run_folder = datetime.now().strftime('%b%d_%H-%M-%S') + "_k_out_value_3/"
    version_path = report_root + "/" + run_folder

    runParameterSearch_Collaborative(URM_train=URM_train,
                                     recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                     evaluator_validation=evaluator,
                                     metric_to_optimize="MAP",
                                     output_folder_path=version_path,
                                     n_cases=int(args.n_cases),
                                     n_random_starts=int(args.n_random_starts))
    print("...tuning ended")
    def fit(self,
            user_topK=50,
            user_shrink=100,
            user_similarity_type='cosine',
            user_normalize=True,
            user_feature_weighting="none",
            user_asymmetric_alpha=0.5,
            item_topK=50,
            item_shrink=100,
            item_similarity_type='cosine',
            item_normalize=True,
            item_feature_weighting="none",
            item_asymmetric_alpha=0.5,
            interactions_feature_weighting="none"):
        """
        Compute ``self.user_W_sparse`` from ``self.UCM_train`` and ``self.item_W_sparse``
        from ``self.ICM_train``.

        :param user_topK: stored on the instance and passed to the user ``Compute_Similarity``
        :param user_shrink: stored on the instance and passed to the user ``Compute_Similarity``
        :param user_similarity_type: similarity type of the user similarity
        :param user_normalize: passed to the user ``Compute_Similarity``
        :param user_feature_weighting: weighting applied to ``self.UCM_train``
        :param user_asymmetric_alpha: passed to the user ``Compute_Similarity`` as ``asymmetric_alpha``
        :param item_topK: stored on the instance and passed to the item ``Compute_Similarity``
        :param item_shrink: stored on the instance and passed to the item ``Compute_Similarity``
        :param item_similarity_type: similarity type of the item similarity
        :param item_normalize: passed to the item ``Compute_Similarity``
        :param item_feature_weighting: weighting applied to ``self.ICM_train``
        :param item_asymmetric_alpha: passed to the item ``Compute_Similarity`` as ``asymmetric_alpha``
        :param interactions_feature_weighting: weighting applied to ``self.URM_train``
        :raises ValueError: if any of the three weighting options is not in
            ``FEATURE_WEIGHTING_VALUES``; the message names the offending parameter
        """

        # Validate every weighting option before any matrix is modified or any similarity is
        # computed, so a bad value neither wastes a similarity computation nor leaves the
        # instance with only some of its matrices re-weighted.
        requested_weightings = (
            ("interactions_feature_weighting", interactions_feature_weighting),
            ("user_feature_weighting", user_feature_weighting),
            ("item_feature_weighting", item_feature_weighting),
        )
        for parameter_name, requested_value in requested_weightings:
            if requested_value not in self.FEATURE_WEIGHTING_VALUES:
                raise ValueError(
                    "Value for '{}' not recognized. Acceptable values are {}, provided was '{}'"
                    .format(parameter_name, self.FEATURE_WEIGHTING_VALUES,
                            requested_value))

        if interactions_feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif interactions_feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        # User Similarity Computation
        self.user_topK = user_topK
        self.user_shrink = user_shrink

        if user_feature_weighting == "BM25":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = okapi_BM_25(self.UCM_train)

        elif user_feature_weighting == "TF-IDF":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = TF_IDF(self.UCM_train)

        user_similarity_compute = Compute_Similarity(
            self.UCM_train.T,
            shrink=user_shrink,
            topK=user_topK,
            normalize=user_normalize,
            similarity=user_similarity_type,
            asymmetric_alpha=user_asymmetric_alpha)

        self.user_W_sparse = user_similarity_compute.compute_similarity()
        self.user_W_sparse = check_matrix(self.user_W_sparse, format='csr')

        # Item Similarity Computation
        self.item_topK = item_topK
        self.item_shrink = item_shrink

        if item_feature_weighting == "BM25":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = okapi_BM_25(self.ICM_train)

        elif item_feature_weighting == "TF-IDF":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = TF_IDF(self.ICM_train)

        item_similarity_compute = Compute_Similarity(
            self.ICM_train.T,
            shrink=item_shrink,
            topK=item_topK,
            normalize=item_normalize,
            similarity=item_similarity_type,
            asymmetric_alpha=item_asymmetric_alpha)

        self.item_W_sparse = item_similarity_compute.compute_similarity()
        self.item_W_sparse = check_matrix(self.item_W_sparse, format='csr')
class UserItemCBFCFDemographicRecommender(BaseRecommender):
    """
    UserItem KNN CBF & CF & Demographic Recommender.

    The user similarity is computed on ``[UCM_train | URM_train]`` and the item similarity on
    ``[ICM_train | URM_train.T]``. Scores are computed as
    ``(user_W_sparse[u] * URM_train + URM_train[u]) * item_W_sparse``.
    """

    RECOMMENDER_NAME = "UserItemCBFCFDemographicRecommender"

    FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]

    def __init__(self, URM_train, UCM_train, ICM_train, verbose=True):
        """
        :param URM_train: interaction matrix handed to the base class
        :param UCM_train: matrix stacked horizontally with URM_train to form ``self.UCM_train``
        :param ICM_train: matrix stacked horizontally with URM_train.T to form ``self.ICM_train``
        :param verbose: handed to the base class
        """
        super(UserItemCBFCFDemographicRecommender,
              self).__init__(URM_train, verbose=verbose)

        # Each format alert is printed at most once per instance, see _check_format
        self._URM_train_format_checked = False
        self._user_W_sparse_format_checked = False
        self._item_W_sparse_format_checked = False

        self.UCM_train = sps.hstack([UCM_train, URM_train], format="csr")
        self.ICM_train = sps.hstack([ICM_train, URM_train.T], format="csr")

    def fit(self,
            user_topK=50,
            user_shrink=100,
            user_similarity_type='cosine',
            user_normalize=True,
            user_feature_weighting="none",
            user_asymmetric_alpha=0.5,
            item_topK=50,
            item_shrink=100,
            item_similarity_type='cosine',
            item_normalize=True,
            item_feature_weighting="none",
            item_asymmetric_alpha=0.5,
            interactions_feature_weighting="none"):
        """
        Compute ``self.user_W_sparse`` from ``self.UCM_train`` and ``self.item_W_sparse``
        from ``self.ICM_train``.

        :param user_topK: stored on the instance and passed to the user ``Compute_Similarity``
        :param user_shrink: stored on the instance and passed to the user ``Compute_Similarity``
        :param user_similarity_type: similarity type of the user similarity
        :param user_normalize: passed to the user ``Compute_Similarity``
        :param user_feature_weighting: weighting applied to ``self.UCM_train``
        :param user_asymmetric_alpha: passed to the user ``Compute_Similarity`` as ``asymmetric_alpha``
        :param item_topK: stored on the instance and passed to the item ``Compute_Similarity``
        :param item_shrink: stored on the instance and passed to the item ``Compute_Similarity``
        :param item_similarity_type: similarity type of the item similarity
        :param item_normalize: passed to the item ``Compute_Similarity``
        :param item_feature_weighting: weighting applied to ``self.ICM_train``
        :param item_asymmetric_alpha: passed to the item ``Compute_Similarity`` as ``asymmetric_alpha``
        :param interactions_feature_weighting: weighting applied to ``self.URM_train``
        :raises ValueError: if any of the three weighting options is not in
            ``FEATURE_WEIGHTING_VALUES``; the message names the offending parameter
        """

        # Validate every weighting option before any matrix is modified or any similarity is
        # computed, so a bad value neither wastes a similarity computation nor leaves the
        # instance with only some of its matrices re-weighted.
        requested_weightings = (
            ("interactions_feature_weighting", interactions_feature_weighting),
            ("user_feature_weighting", user_feature_weighting),
            ("item_feature_weighting", item_feature_weighting),
        )
        for parameter_name, requested_value in requested_weightings:
            if requested_value not in self.FEATURE_WEIGHTING_VALUES:
                raise ValueError(
                    "Value for '{}' not recognized. Acceptable values are {}, provided was '{}'"
                    .format(parameter_name, self.FEATURE_WEIGHTING_VALUES,
                            requested_value))

        if interactions_feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif interactions_feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        # User Similarity Computation
        self.user_topK = user_topK
        self.user_shrink = user_shrink

        if user_feature_weighting == "BM25":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = okapi_BM_25(self.UCM_train)

        elif user_feature_weighting == "TF-IDF":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = TF_IDF(self.UCM_train)

        user_similarity_compute = Compute_Similarity(
            self.UCM_train.T,
            shrink=user_shrink,
            topK=user_topK,
            normalize=user_normalize,
            similarity=user_similarity_type,
            asymmetric_alpha=user_asymmetric_alpha)

        self.user_W_sparse = user_similarity_compute.compute_similarity()
        self.user_W_sparse = check_matrix(self.user_W_sparse, format='csr')

        # Item Similarity Computation
        self.item_topK = item_topK
        self.item_shrink = item_shrink

        if item_feature_weighting == "BM25":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = okapi_BM_25(self.ICM_train)

        elif item_feature_weighting == "TF-IDF":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = TF_IDF(self.ICM_train)

        item_similarity_compute = Compute_Similarity(
            self.ICM_train.T,
            shrink=item_shrink,
            topK=item_topK,
            normalize=item_normalize,
            similarity=item_similarity_type,
            asymmetric_alpha=item_asymmetric_alpha)

        self.item_W_sparse = item_similarity_compute.compute_similarity()
        self.item_W_sparse = check_matrix(self.item_W_sparse, format='csr')

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """
        URM_train and W_sparse must have the same format, CSR
        :param user_id_array: indices of the users to score
        :param items_to_compute: optional item indices; when given, only these columns hold
            the computed scores and every other column is -inf
        :return: dense array of scores with one row per requested user and one column per item
        """

        self._check_format()

        user_weights_array = self.user_W_sparse[user_id_array]

        # Both products are computed on sparse operands for all items. The -inf mask is
        # applied only to the final dense scores: putting -inf into the intermediate profile
        # would make the second product yield NaN (-inf * 0) instead of scores.
        item_scores_user_similarity = user_weights_array.dot(self.URM_train)
        user_profile_array = item_scores_user_similarity + self.URM_train[
            user_id_array]
        item_scores_all = user_profile_array.dot(self.item_W_sparse).toarray()

        if items_to_compute is None:
            return item_scores_all

        item_scores_item_similarity = -np.ones(
            (len(user_id_array), self.URM_train.shape[1]),
            dtype=np.float32) * np.inf
        item_scores_item_similarity[:, items_to_compute] = item_scores_all[:, items_to_compute]

        return item_scores_item_similarity

    def _check_format(self):
        """Print a one-time performance alert for each of the three matrices that is not CSR."""
        if not self._URM_train_format_checked:

            if self.URM_train.getformat() != "csr":
                self._print(
                    "PERFORMANCE ALERT compute_item_score: {} is not {}, this will significantly slow down the computation."
                    .format("URM_train", "csr"))

            self._URM_train_format_checked = True

        if not self._item_W_sparse_format_checked:

            if self.item_W_sparse.getformat() != "csr":
                self._print(
                    "PERFORMANCE ALERT compute_item_score: {} is not {}, this will significantly slow down "
                    "the computation.".format("item_W_sparse", "csr"))

            self._item_W_sparse_format_checked = True

        if not self._user_W_sparse_format_checked:

            if self.user_W_sparse.getformat() != "csr":
                # The alert names the attribute that is actually being checked
                self._print(
                    "PERFORMANCE ALERT compute_item_score: {} is not {}, this will significantly slow down "
                    "the computation.".format("user_W_sparse", "csr"))

            self._user_W_sparse_format_checked = True

    def save_model(self, folder_path, file_name=None):
        """
        Save both similarity matrices through ``DataIO``.

        :param folder_path: folder handed to ``DataIO``
        :param file_name: name of the saved file; defaults to ``RECOMMENDER_NAME``
        """
        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        self._print("Saving model in file '{}'".format(folder_path +
                                                       file_name))

        data_dict_to_save = {
            "user_W_sparse": self.user_W_sparse,
            "item_W_sparse": self.item_W_sparse
        }

        dataIO = DataIO(folder_path=folder_path)
        dataIO.save_data(file_name=file_name,
                         data_dict_to_save=data_dict_to_save)

        self._print("Saving complete")
예제 #15
0
    def fit(self,
            alpha=1.,
            beta=0.6,
            min_rating=0,
            topK=100,
            implicit=False,
            normalize_similarity=True):
        """
        Compute ``self.W_sparse`` from the horizontal stack of ``self.URM_train`` and
        ``self.UCM_train``.

        The result is a square CSR matrix with one row and one column per column of the
        stacked matrix, built block by block as ``Piu * Pui`` with each row scaled by the
        popularity penalty ``degree``.

        :param alpha: element-wise power applied to Pui and Piu (skipped when equal to 1)
        :param beta: exponent of the penalty; each column count is raised to ``-beta``
        :param min_rating: when > 0, URM_train values below it are set to zero and removed
        :param topK: number of largest non-zero values kept per row
        :param implicit: when true, all URM_train values are replaced by 1.
            NOTE(review): this only happens inside the ``min_rating > 0`` branch - confirm
            that this is intended
        :param normalize_similarity: when true, the rows of the result are L1-normalized
        """

        self.alpha = alpha
        self.beta = beta
        self.min_rating = min_rating
        self.topK = topK
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        # if X.dtype != np.float32:
        #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size,
                                              dtype=np.float32)

        # Pui is the row-normalized (L1) TF-IDF weighting of the stacked [URM | UCM] matrix
        Pui_raw = sps.hstack([self.URM_train, self.UCM_train], format="csr")
        Pui_raw = TF_IDF(Pui_raw).tocsr()
        Pui = normalize(Pui_raw, norm='l1', axis=1)

        # Piu is built from the transposed stacked matrix with every stored value set to 1
        # X_bool = self.URM_train.transpose(copy=True)
        X_bool = Pui_raw.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)

        # Taking the degree of each item to penalize top popular
        # Some rows might be zero, make sure their degree remains zero
        X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

        degree = np.zeros(Pui_raw.shape[1])

        nonZeroMask = X_bool_sum != 0.0

        degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

        # ATTENTION: axis is still 1 because the matrix was transposed before the normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del (X_bool)

        # Alpha power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        # block_dim is the number of Piu rows multiplied and densified at a time
        block_dim = 200
        d_t = Piu

        # Use array as it reduces memory requirements compared to lists
        # dataBlock is the initial buffer size and the amount the buffers grow by when full
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        # Number of buffer cells filled so far
        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):

            # The last block may be shorter than block_dim
            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[
                current_block_start_row:current_block_start_row +
                block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                # Apply the penalty to every column, then zero the diagonal entry
                row_data = np.multiply(similarity_block[row_in_block, :],
                                       degree)
                row_data[current_block_start_row + row_in_block] = 0

                # Column indices of the topK largest values, in decreasing order
                best = row_data.argsort()[::-1][:self.topK]

                notZerosMask = row_data[best] != 0.0

                values_to_add = row_data[best][notZerosMask]
                cols_to_add = best[notZerosMask]

                for index in range(len(values_to_add)):

                    # Buffers are full: extend each of them by another dataBlock cells
                    if numCells == len(rows):
                        rows = np.concatenate(
                            (rows, np.zeros(dataBlock, dtype=np.int32)))
                        cols = np.concatenate(
                            (cols, np.zeros(dataBlock, dtype=np.int32)))
                        values = np.concatenate(
                            (values, np.zeros(dataBlock, dtype=np.float32)))

                    rows[numCells] = current_block_start_row + row_in_block
                    cols[numCells] = cols_to_add[index]
                    values[numCells] = values_to_add[index]

                    numCells += 1

            # Print progress at most once every 60 seconds
            if time.time() - start_time_printBatch > 60:
                self._print(
                    "Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}"
                    .format(
                        current_block_start_row,
                        100.0 * float(current_block_start_row) / Pui.shape[1],
                        (time.time() - start_time) / 60,
                        float(current_block_start_row) /
                        (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # Only the filled part of the buffers is used to build the matrix
        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(Pui.shape[1], Pui.shape[1]))

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

        # NOTE(review): topK == 0 also compares equal to False and skips this step - confirm
        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')