Пример #1
0
    def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True, feature_weighting = "none", **similarity_args):

        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError("Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'".format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))


        if feature_weighting == "BM25":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = okapi_BM_25(self.ICM)

        elif feature_weighting == "TF-IDF":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = TF_IDF(self.ICM)


        similarity = Compute_Similarity(self.ICM.T, shrink=shrink, topK=topK, normalize=normalize, similarity = similarity, **similarity_args)


        if self.sparse_weights:
            self.W_sparse = similarity.compute_similarity()
        else:
            self.W = similarity.compute_similarity()
            self.W = self.W.toarray()
Пример #2
0
    def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True, **similarity_args):

        self.topK = topK
        self.shrink = shrink

        similarity = Compute_Similarity(self.URM_train.T, shrink=shrink, topK=topK, normalize=normalize, similarity = similarity, **similarity_args)

        if self.sparse_weights:
            self.W_sparse = similarity.compute_similarity()
        else:
            self.W = similarity.compute_similarity()
            self.W = self.W.toarray()
    def fit(self, topK=350, shrink=10, similarity='cosine', normalize=True, force_compute_sim=True, tfidf=True,
            **similarity_args):

        self.topK = topK
        self.shrink = shrink
        self.tfidf = tfidf

        if not force_compute_sim:
            found = True
            try:
                with open(os.path.join("IntermediateComputations", "ItemCF",
                                       "tot={}_tokK={}_shrink={}_tfidf={}.pkl".format(str(len(self.URM_train.data)),
                                                                                      str(self.topK), str(self.shrink),
                                                                                      str(self.tfidf))),
                          'rb') as handle:
                    (topK_new, shrink_new, W_sparse_new) = pickle.load(handle)
            except FileNotFoundError:
                print("File {} not found".format(
                    os.path.join("IntermediateComputations", "ItemCF",
                                 "tot={}_tokK={}_shrink={}_tfidf={}.pkl".format(str(len(self.URM_train.data)),
                                                                                str(self.topK), str(self.shrink),
                                                                                str(self.tfidf)))))
                found = False

            if found and self.topK == topK_new and self.shrink == shrink_new:
                self.W_sparse = W_sparse_new
                print("Saved Item CF Similarity Matrix Used!")
                return

        if tfidf:
            sim_matrix_pre = get_tfidf(self.URM_train)
        else:
            sim_matrix_pre = self.URM_train

        similarity = Compute_Similarity(sim_matrix_pre, shrink=shrink, topK=topK, normalize=normalize,
                                        similarity=similarity, **similarity_args)

        if self.sparse_weights:
            self.W_sparse = similarity.compute_similarity()
            print('Similarity item based CF computed')
            with open(os.path.join("IntermediateComputations", "ItemCF",
                                   "tot={}_tokK={}_shrink={}_tfidf={}.pkl".format(str(len(self.URM_train.data)),
                                                                                  str(self.topK), str(self.shrink),
                                                                                  str(self.tfidf))),
                      'wb') as handle:
                pickle.dump((self.topK, self.shrink, self.W_sparse), handle, protocol=pickle.HIGHEST_PROTOCOL)
                print("Item CF similarity matrix saved")
        else:
            self.W = similarity.compute_similarity()
            self.W = self.W.toarray()
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            **similarity_args):

        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        if feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        similarity = Compute_Similarity(self.URM_train,
                                        shrink=shrink,
                                        topK=topK,
                                        normalize=normalize,
                                        similarity=similarity,
                                        **similarity_args)

        self.W_sparse = similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Пример #5
0
    def fit(self,
            topK=None,
            l2_norm=1e3,
            normalize_matrix=False,
            verbose=True):

        self.verbose = verbose

        start_time = time.time()
        self._print("Fitting model... ")

        if normalize_matrix:
            # Normalize rows and then columns
            self.URM_train = normalize(self.URM_train, norm="l2", axis=1)
            self.URM_train = normalize(self.URM_train, norm="l2", axis=0)
            self.URM_train = sps.csr_matrix(self.URM_train)

        # Grahm matrix is X X^t, compute dot product
        similarity = Compute_Similarity(
            self.URM_train,
            shrink=0,
            topK=self.URM_train.shape[1],
            normalize=False,
            similarity="cosine",
        )
        grahm_matrix = similarity.compute_similarity().toarray()

        diag_indices = np.diag_indices(grahm_matrix.shape[0])

        grahm_matrix[diag_indices] += l2_norm

        P = np.linalg.inv(grahm_matrix)

        B = P / (-np.diag(P))

        B[diag_indices] = 0.0

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Fitting model... done in {:.2f} {}".format(
            new_time_value, new_time_unit))

        # Check if the matrix should be saved in a sparse or dense format
        # The matrix is sparse, regardless of the presence of the topK, if nonzero cells are less than sparse_threshold_quota %
        if topK is not None:
            B = similarityMatrixTopK(B, k=topK, verbose=False)

        if self._is_content_sparse_check(B):
            self._print("Detected model matrix to be sparse, changing format.")
            self.W_sparse = check_matrix(B, format="csr", dtype=np.float32)

        else:
            self.W_sparse = check_matrix(B, format="npy", dtype=np.float32)
            self._W_sparse_format_checked = True
            self._compute_item_score = self._compute_score_W_dense
Пример #6
0
    def fit(self,
            topK=None,
            l2_norm=1e3,
            normalize_matrix=False,
            verbose=True):

        self.verbose = verbose

        start_time = time.time()
        self._print("Fitting model... ")

        if normalize_matrix:
            # Normalize rows and then columns
            self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
            self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
            self.URM_train = sps.csr_matrix(self.URM_train)

        # Grahm matrix is X^t X, compute dot product
        similarity = Compute_Similarity(self.URM_train,
                                        shrink=0,
                                        topK=self.URM_train.shape[1],
                                        normalize=False,
                                        similarity="cosine")
        grahm_matrix = similarity.compute_similarity().toarray()

        diag_indices = np.diag_indices(grahm_matrix.shape[0])

        # The Compute_Similarity object ensures the diagonal of the similarity matrix is zero
        # in this case we need the diagonal as well, which is just the item popularity
        item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)
        grahm_matrix[diag_indices] = item_popularity + l2_norm

        P = np.linalg.inv(grahm_matrix)

        B = P / (-np.diag(P))

        B[diag_indices] = 0.0

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Fitting model... done in {:.2f} {}".format(
            new_time_value, new_time_unit))

        if topK is None:
            self.W_sparse = B
            self._W_sparse_format_checked = True
            self._compute_item_score = self._compute_score_W_dense

        else:
            self.W_sparse = similarityMatrixTopK(B, k=topK, verbose=False)
            self.W_sparse = sps.csr_matrix(self.W_sparse)
Пример #7
0
class CFW_D_Similarity_Linalg(BaseItemSimilarityMatrixRecommender):

    RECOMMENDER_NAME = "CFW_D_Similarity_Linalg"

    def __init__(self, URM_train, ICM, S_matrix_target):

        super(CFW_D_Similarity_Linalg, self).__init__(URM_train)

        if URM_train.shape[1] != ICM.shape[0]:
            raise ValueError(
                "Number of items not consistent. URM contains {} but ICM contains {}"
                .format(URM_train.shape[1], ICM.shape[0]))

        if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
            raise ValueError(
                "Items imilarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if S_matrix_target.shape[0] != ICM.shape[0]:
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], ICM.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, "csr")
        self.ICM = check_matrix(ICM, "csr")

        self.n_items = self.URM_train.shape[1]
        self.n_users = self.URM_train.shape[0]
        self.n_features = self.ICM.shape[1]

        self.sparse_weights = True

    def _writeLog(self, string):

        print(string)
        sys.stdout.flush()
        sys.stderr.flush()

        if self.logFile is not None:
            self.logFile.write(string + "\n")
            self.logFile.flush()

    def _generateTrainData_low_ram(self):

        print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Here is important only the structure
        self.similarity = Compute_Similarity(self.ICM.T,
                                             shrink=0,
                                             topK=self.topK,
                                             normalize=False)
        S_matrix_contentKNN = self.similarity.compute_similarity()
        S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

        self._writeLog(
            self.RECOMMENDER_NAME +
            ": Collaborative S density: {:.2E}, nonzero cells {}".format(
                self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
                self.S_matrix_target.nnz,
            ))

        self._writeLog(self.RECOMMENDER_NAME +
                       ": Content S density: {:.2E}, nonzero cells {}".format(
                           S_matrix_contentKNN.nnz /
                           S_matrix_contentKNN.shape[0]**2,
                           S_matrix_contentKNN.nnz,
                       ))

        if self.normalize_similarity:

            # Compute sum of squared
            sum_of_squared_features = np.array(
                self.ICM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz *
                                  (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[
                start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[
                start_pos_target:end_pos_target]

            # Chech whether the content coordinate is associated to a non zero target value
            # If true, the content coordinate has a collaborative non-zero value
            # if false, the content coordinate has a collaborative zero value
            is_common = np.in1d(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate(
                        (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate(
                        (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate(
                        (self.data_list, np.zeros(dataBlock,
                                                  dtype=np.float64)))

                if is_common[index]:
                    # If cell exists in target matrix, add its value
                    # Otherwise it will remain zero with a certain probability

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index, col_index]

                    if self.normalize_similarity:
                        new_data_value *= (sum_of_squared_features[row_index] *
                                           sum_of_squared_features[col_index])

                    self.data_list[num_samples] = new_data_value

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

            if (time.time() - start_time_batch > 30
                    or num_samples == S_matrix_contentKNN.nnz *
                (1 + self.add_zeros_quota)):

                print(self.RECOMMENDER_NAME +
                      ": Generating train data. Sample {} ( {:.2f} %) ".format(
                          num_samples,
                          num_samples / S_matrix_contentKNN.nnz *
                          (1 + self.add_zeros_quota) * 100,
                      ))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_batch = time.time()

        self._writeLog(
            self.RECOMMENDER_NAME +
            ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells"
            .format(
                num_common_coordinates,
                S_matrix_contentKNN.nnz,
                num_common_coordinates / S_matrix_contentKNN.nnz * 100,
            ))

        # Discard extra cells at the left of the array
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = sum(np.array(self.data_list) != 0)
        data_sum = sum(self.data_list)

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = sum(self.S_matrix_target.data)

        self._writeLog(
            self.RECOMMENDER_NAME +
            ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
            "average over all collaborative data is {:.2E}".format(
                data_sum, data_sum / data_nnz, collaborative_sum /
                collaborative_nnz))

    def fit(
        self,
        show_max_performance=False,
        logFile=None,
        loss_tolerance=1e-6,
        iteration_limit=50000,
        damp_coeff=0.0,
        topK=300,
        add_zeros_quota=0.0,
        normalize_similarity=False,
    ):

        self.logFile = logFile
        self.normalize_similarity = normalize_similarity

        self.add_zeros_quota = add_zeros_quota
        self.topK = topK

        self._generateTrainData_low_ram()

        commonFeatures = self.ICM[self.row_list].multiply(
            self.ICM[self.col_list])

        linalg_result = linalg.lsqr(
            commonFeatures,
            self.data_list,
            show=False,
            atol=loss_tolerance,
            btol=loss_tolerance,
            iter_lim=iteration_limit,
            damp=damp_coeff,
        )

        # res = linalg.lsmr(commonFeatures, self.data_list, show = False, atol=loss_tolerance, btol=loss_tolerance,
        #                   maxiter = iteration_limit, damp=damp_coeff)

        self.D_incremental = linalg_result[0].copy()
        self.D_best = linalg_result[0].copy()
        self.epochs_best = 0

        self.loss = linalg_result[3]

        self._compute_W_sparse()

    def _compute_W_sparse(self, use_incremental=False):

        if use_incremental:
            feature_weights = self.D_incremental
        else:
            feature_weights = self.D_best

        self.similarity = Compute_Similarity(
            self.ICM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=feature_weights,
        )

        self.W_sparse = self.similarity.compute_similarity()
        self.sparse_weights = True

    def save_model(self, folder_path, file_name=None):

        import pickle

        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        print("{}: Saving model in file '{}'".format(self.RECOMMENDER_NAME,
                                                     folder_path + file_name))

        dictionary_to_save = {
            "D_best": self.D_best,
            "topK": self.topK,
            "sparse_weights": self.sparse_weights,
            "W_sparse": self.W_sparse,
            "normalize_similarity": self.normalize_similarity,
        }

        pickle.dump(
            dictionary_to_save,
            open(folder_path + file_name, "wb"),
            protocol=pickle.HIGHEST_PROTOCOL,
        )

        print("{}: Saving complete".format(self.RECOMMENDER_NAME))
Пример #8
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            force_compute_sim=True,
            feature_weighting="none",
            feature_weighting_index=0,
            **similarity_args):

        self.feature_weighting_index = feature_weighting_index
        feature_weighting = self.FEATURE_WEIGHTING_VALUES[
            feature_weighting_index]
        self.topK = topK
        self.shrink = shrink

        if not force_compute_sim:
            found = True
            try:
                with open(
                        os.path.join(
                            "IntermediateComputations", "ICB",
                            "tot={}_topK={}_shrink={}_featureweight={}.pkl".
                            format(str(len(self.URM_train.data)),
                                   str(self.topK), str(self.shrink),
                                   str(self.feature_weighting_index))),
                        'rb') as handle:
                    (topK_new, shrink_new, W_sparse_new) = pickle.load(handle)
            except FileNotFoundError:
                print("File {} not found".format(
                    os.path.join("IntermediateComputations",
                                 "ContentBFMatrix.pkl")))
                found = False

            if found and self.topK == topK_new and self.shrink == shrink_new:
                self.W_sparse = W_sparse_new
                print("Saved CBF Similarity Matrix Used!")
                return

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        if feature_weighting == "BM25":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = okapi_BM_25(self.ICM)

        elif feature_weighting == "TF-IDF":
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = TF_IDF(self.ICM)

        similarity = Compute_Similarity(self.ICM.T,
                                        shrink=shrink,
                                        topK=topK,
                                        normalize=normalize,
                                        similarity=similarity,
                                        **similarity_args)

        if self.sparse_weights:
            self.W_sparse = similarity.compute_similarity()

            with open(
                    os.path.join(
                        "IntermediateComputations", "ICB",
                        "tot={}_topK={}_shrink={}_featureweight={}.pkl".format(
                            str(len(self.URM_train.data)), str(self.topK),
                            str(self.shrink),
                            str(self.feature_weighting_index))),
                    'wb') as handle:
                pickle.dump((self.topK, self.shrink, self.W_sparse),
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
                print("CBF similarity matrix saved")
        else:
            self.W = similarity.compute_similarity()
            self.W = self.W.toarray()
Пример #9
0
class CFW_D_Similarity_Cython(Incremental_Training_Early_Stopping,
                              SimilarityMatrixRecommender, Recommender):

    RECOMMENDER_NAME = "CFW_D_Similarity_Cython"

    INIT_TYPE_VALUES = ["random", "one", "zero", "BM25", "TF-IDF"]

    def __init__(self,
                 URM_train,
                 ICM,
                 S_matrix_target,
                 recompile_cython=False):

        super(CFW_D_Similarity_Cython, self).__init__()

        if (URM_train.shape[1] != ICM.shape[0]):
            raise ValueError(
                "Number of items not consistent. URM contains {} but ICM contains {}"
                .format(URM_train.shape[1], ICM.shape[0]))

        if (S_matrix_target.shape[0] != S_matrix_target.shape[1]):
            raise ValueError(
                "Items imilarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if (S_matrix_target.shape[0] != ICM.shape[0]):
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], ICM.shape[0]))

        self.URM_train = check_matrix(URM_train, 'csr')
        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
        self.ICM = check_matrix(ICM, 'csr')

        self.n_items = self.URM_train.shape[1]
        self.n_users = self.URM_train.shape[0]
        self.n_features = self.ICM.shape[1]

        self.sparse_weights = True

        if recompile_cython:
            print("Compiling in Cython")
            self.runCompilationScript()
            print("Compilation Complete")

    def runCompilationScript(self):

        # Run compile script setting the working directory to ensure the compiled file are contained in the
        # appropriate subfolder and not the project root

        compiledModuleSubfolder = "/FW_Similarity/Cython"
        fileToCompile = 'CFW_D_Similarity_Cython_SGD.pyx'

        command = [
            'python', 'compileCython.py', fileToCompile, 'build_ext',
            '--inplace'
        ]

        output = subprocess.check_output(' '.join(command),
                                         shell=True,
                                         cwd=os.getcwd() +
                                         compiledModuleSubfolder)

        try:

            command = ['cython', fileToCompile, '-a']

            output = subprocess.check_output(' '.join(command),
                                             shell=True,
                                             cwd=os.getcwd() +
                                             compiledModuleSubfolder)

        except:
            pass

        print("Compiled module saved in subfolder: {}".format(
            compiledModuleSubfolder))

        # Command to run compilation script
        # python compileCython.py CFW_D_Similarity_Cython_SGD.pyx build_ext --inplace

        # Command to generate html report
        # cython -a CFW_D_Similarity_Cython_SGD.pyx

    def generateTrainData_low_ram(self):

        print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Here is important only the structure
        self.similarity = Compute_Similarity(self.ICM.T,
                                             shrink=0,
                                             topK=self.topK,
                                             normalize=False)
        S_matrix_contentKNN = self.similarity.compute_similarity()
        S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

        self.writeLog(
            self.RECOMMENDER_NAME +
            ": Collaborative S density: {:.2E}, nonzero cells {}".format(
                self.S_matrix_target.nnz /
                self.S_matrix_target.shape[0]**2, self.S_matrix_target.nnz))

        self.writeLog(
            self.RECOMMENDER_NAME +
            ": Content S density: {:.2E}, nonzero cells {}".format(
                S_matrix_contentKNN.nnz /
                S_matrix_contentKNN.shape[0]**2, S_matrix_contentKNN.nnz))

        if self.normalize_similarity:

            # Compute sum of squared
            sum_of_squared_features = np.array(
                self.ICM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz *
                                  (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[
                start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[
                start_pos_target:end_pos_target]

            # Chech whether the content coordinate is associated to a non zero target value
            # If true, the content coordinate has a collaborative non-zero value
            # if false, the content coordinate has a collaborative zero value
            is_common = np.in1d(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate(
                        (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate(
                        (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate(
                        (self.data_list, np.zeros(dataBlock,
                                                  dtype=np.float64)))

                if is_common[index]:
                    # If cell exists in target matrix, add its value
                    # Otherwise it will remain zero with a certain probability

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index, col_index]

                    if self.normalize_similarity:
                        new_data_value *= sum_of_squared_features[
                            row_index] * sum_of_squared_features[col_index]

                    self.data_list[num_samples] = new_data_value

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

            if time.time(
            ) - start_time_batch > 30 or num_samples == S_matrix_contentKNN.nnz * (
                    1 + self.add_zeros_quota):

                print(self.RECOMMENDER_NAME +
                      ": Generating train data. Sample {} ( {:.2f} %) ".format(
                          num_samples, num_samples / S_matrix_contentKNN.nnz *
                          (1 + self.add_zeros_quota) * 100))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_batch = time.time()

        self.writeLog(
            self.RECOMMENDER_NAME +
            ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells"
            .format(num_common_coordinates, S_matrix_contentKNN.nnz,
                    num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard extra cells at the left of the array
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = sum(np.array(self.data_list) != 0)
        data_sum = sum(self.data_list)

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = sum(self.S_matrix_target.data)

        self.writeLog(
            self.RECOMMENDER_NAME +
            ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
            "average over all collaborative data is {:.2E}".format(
                data_sum, data_sum / data_nnz, collaborative_sum /
                collaborative_nnz))

        if self.evaluator_object is not None and self.show_max_performance:
            self.computeMaxTheoreticalPerformance()

    def computeMaxTheoreticalPerformance(self):

        # Max performance would be if we were able to learn the content matrix having for each non-zero cell exactly
        # the value that appears in the collaborative similarity

        print(self.RECOMMENDER_NAME + ": Computing collaborative performance")

        recommender = ItemKNNCustomSimilarityRecommender()
        recommender.fit(self.S_matrix_target, self.URM_train)

        results_run = self.evaluator_object(recommender)

        self.writeLog(self.RECOMMENDER_NAME +
                      ": Collaborative performance is: {}".format(results_run))

        print(self.RECOMMENDER_NAME + ": Computing top structural performance")

        n_items = self.ICM.shape[0]

        S_optimal = sps.csr_matrix(
            (self.data_list, (self.row_list, self.col_list)),
            shape=(n_items, n_items))
        S_optimal.eliminate_zeros()

        recommender = ItemKNNCustomSimilarityRecommender()
        recommender.fit(S_optimal, self.URM_train)

        results_run = self.evaluator_object(recommender)

        self.writeLog(
            self.RECOMMENDER_NAME +
            ": Top structural performance is: {}".format(results_run))

    def writeLog(self, string):

        print(string)
        sys.stdout.flush()
        sys.stderr.flush()

        if self.logFile is not None:
            self.logFile.write(string + "\n")
            self.logFile.flush()

    def compute_W_sparse(self, use_incremental=False):

        if use_incremental:
            feature_weights = self.D_incremental
        else:
            feature_weights = self.D_best

        self.similarity = Compute_Similarity(
            self.ICM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=feature_weights)

        self.W_sparse = self.similarity.compute_similarity()
        self.sparse_weights = True

    def set_ICM_and_recompute_W(self, ICM_new, recompute_w=True):

        self.ICM = ICM_new.copy()

        if recompute_w:
            self.compute_W_sparse(use_incremental=False)

    def fit(self,
            validation_every_n=5,
            show_max_performance=False,
            logFile=None,
            precompute_common_features=True,
            learning_rate=0.01,
            positive_only_weights=True,
            init_type="zero",
            normalize_similarity=False,
            use_dropout=True,
            dropout_perc=0.3,
            l1_reg=0.0,
            l2_reg=0.0,
            epochs=50,
            topK=300,
            add_zeros_quota=0.0,
            sgd_mode='adagrad',
            gamma=0.9,
            beta_1=0.9,
            beta_2=0.999,
            stop_on_validation=False,
            lower_validatons_allowed=5,
            validation_metric="MAP",
            evaluator_object=None):

        if init_type not in self.INIT_TYPE_VALUES:
            raise ValueError(
                "Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.INIT_TYPE_VALUES, init_type))

        # Import compiled module
        from FW_Similarity.Cython.CFW_D_Similarity_Cython_SGD import CFW_D_Similarity_Cython_SGD

        self.logFile = logFile

        if validation_every_n is not None:
            self.validation_every_n = validation_every_n
        else:
            self.validation_every_n = np.inf

        self.evaluator_object = evaluator_object

        self.show_max_performance = show_max_performance
        self.positive_only_weights = positive_only_weights
        self.normalize_similarity = normalize_similarity
        self.learning_rate = learning_rate
        self.add_zeros_quota = add_zeros_quota
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.epochs = epochs
        self.topK = topK

        self.generateTrainData_low_ram()

        weights_initialization = None

        if init_type == "random":
            weights_initialization = np.random.normal(
                0.001, 0.1, self.n_features).astype(np.float64)
        elif init_type == "one":
            weights_initialization = np.ones(self.n_features, dtype=np.float64)
        elif init_type == "zero":
            weights_initialization = np.zeros(self.n_features,
                                              dtype=np.float64)
        elif init_type == "BM25":
            weights_initialization = np.ones(self.n_features, dtype=np.float64)
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = okapi_BM_25(self.ICM)

        elif init_type == "TF-IDF":
            weights_initialization = np.ones(self.n_features, dtype=np.float64)
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = TF_IDF(self.ICM)

        else:
            raise ValueError(
                "CFW_D_Similarity_Cython: 'init_type' not recognized")

        # Instantiate fast Cython implementation
        self.FW_D_Similarity = CFW_D_Similarity_Cython_SGD(
            self.row_list,
            self.col_list,
            self.data_list,
            self.n_features,
            self.ICM,
            precompute_common_features=precompute_common_features,
            non_negative_weights=self.positive_only_weights,
            weights_initialization=weights_initialization,
            use_dropout=use_dropout,
            dropout_perc=dropout_perc,
            learning_rate=learning_rate,
            l1_reg=l1_reg,
            l2_reg=l2_reg,
            sgd_mode=sgd_mode,
            gamma=gamma,
            beta_1=beta_1,
            beta_2=beta_2)

        print(self.RECOMMENDER_NAME + ": Initialization completed")

        self._train_with_early_stopping(epochs,
                                        validation_every_n,
                                        stop_on_validation,
                                        validation_metric,
                                        lower_validatons_allowed,
                                        evaluator_object,
                                        algorithm_name=self.RECOMMENDER_NAME)

        self.compute_W_sparse()

        sys.stdout.flush()

    def _initialize_incremental_model(self):
        self.D_incremental = self.FW_D_Similarity.get_weights()
        self.D_best = self.D_incremental.copy()

    def _update_incremental_model(self):
        self.D_incremental = self.FW_D_Similarity.get_weights()

    def _update_best_model(self):
        self.D_best = self.D_incremental.copy()

    def _run_epoch(self, num_epoch):
        self.loss = self.FW_D_Similarity.fit()
        self.D_incremental = self.FW_D_Similarity.get_weights()

    def saveModel(self, folder_path, file_name=None):

        import pickle

        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        print("{}: Saving model in file '{}'".format(self.RECOMMENDER_NAME,
                                                     folder_path + file_name))

        dictionary_to_save = {
            "D_best": self.D_best,
            "topK": self.topK,
            "sparse_weights": self.sparse_weights,
            "W_sparse": self.W_sparse,
            "normalize_similarity": self.normalize_similarity
        }

        pickle.dump(dictionary_to_save,
                    open(folder_path + file_name, "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

        print("{}: Saving complete".format(self.RECOMMENDER_NAME))
class CFW_D_Similarity_Cython(BaseSimilarityMatrixRecommender,
                              Incremental_Training_Early_Stopping):

    RECOMMENDER_NAME = "CFW_D_Similarity_Cython"

    INIT_TYPE_VALUES = ["random", "one", "BM25", "TF-IDF"]

    def __init__(self,
                 URM_train,
                 ICM,
                 S_matrix_target,
                 recompile_cython=False):

        super(CFW_D_Similarity_Cython, self).__init__(URM_train)

        if (URM_train.shape[1] != ICM.shape[0]):
            raise ValueError(
                "Number of items not consistent. URM contains {} but ICM contains {}"
                .format(URM_train.shape[1], ICM.shape[0]))

        if (S_matrix_target.shape[0] != S_matrix_target.shape[1]):
            raise ValueError(
                "Items imilarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if (S_matrix_target.shape[0] != ICM.shape[0]):
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], ICM.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')

        self.ICM = check_matrix(ICM, 'csr')
        self.n_features = self.ICM.shape[1]

        if recompile_cython:
            print("Compiling in Cython")
            self.runCompilationScript()
            print("Compilation Complete")

    def fit(self,
            show_max_performance=False,
            precompute_common_features=False,
            learning_rate=0.1,
            positive_only_D=True,
            initialization_mode_D="random",
            normalize_similarity=False,
            use_dropout=True,
            dropout_perc=0.3,
            l1_reg=0.0,
            l2_reg=0.0,
            epochs=50,
            topK=300,
            add_zeros_quota=0.0,
            log_file=None,
            verbose=False,
            sgd_mode='adagrad',
            gamma=0.9,
            beta_1=0.9,
            beta_2=0.999,
            **earlystopping_kwargs):

        if initialization_mode_D not in self.INIT_TYPE_VALUES:
            raise ValueError(
                "Value for 'initialization_mode_D' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.INIT_TYPE_VALUES, initialization_mode_D))

        # Import compiled module
        from FeatureWeighting.Cython.CFW_D_Similarity_Cython_SGD import CFW_D_Similarity_Cython_SGD

        self.show_max_performance = show_max_performance
        self.normalize_similarity = normalize_similarity
        self.learning_rate = learning_rate
        self.add_zeros_quota = add_zeros_quota
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.epochs = epochs
        self.topK = topK
        self.log_file = log_file
        self.verbose = verbose

        self._generate_train_data()

        weights_initialization_D = None

        if initialization_mode_D == "random":
            weights_initialization_D = np.random.normal(
                0.001, 0.1, self.n_features).astype(np.float64)
        elif initialization_mode_D == "one":
            weights_initialization_D = np.ones(self.n_features,
                                               dtype=np.float64)
        elif initialization_mode_D == "zero":
            weights_initialization_D = np.zeros(self.n_features,
                                                dtype=np.float64)
        elif initialization_mode_D == "BM25":
            weights_initialization_D = np.ones(self.n_features,
                                               dtype=np.float64)
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = okapi_BM_25(self.ICM)

        elif initialization_mode_D == "TF-IDF":
            weights_initialization_D = np.ones(self.n_features,
                                               dtype=np.float64)
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = TF_IDF(self.ICM)

        else:
            raise ValueError(
                "CFW_D_Similarity_Cython: 'init_type' not recognized")

        # Instantiate fast Cython implementation
        self.FW_D_Similarity = CFW_D_Similarity_Cython_SGD(
            self.row_list,
            self.col_list,
            self.data_list,
            self.n_features,
            self.ICM,
            precompute_common_features=precompute_common_features,
            positive_only_D=positive_only_D,
            weights_initialization_D=weights_initialization_D,
            use_dropout=use_dropout,
            dropout_perc=dropout_perc,
            learning_rate=learning_rate,
            l1_reg=l1_reg,
            l2_reg=l2_reg,
            sgd_mode=sgd_mode,
            verbose=self.verbose,
            gamma=gamma,
            beta_1=beta_1,
            beta_2=beta_2)

        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Initialization completed")

        self.D_incremental = self.FW_D_Similarity.get_weights()
        self.D_best = self.D_incremental.copy()

        self._train_with_early_stopping(epochs,
                                        algorithm_name=self.RECOMMENDER_NAME,
                                        **earlystopping_kwargs)

        self.compute_W_sparse(model_to_use="best")

        sys.stdout.flush()

    def _prepare_model_for_validation(self):
        self.D_incremental = self.FW_D_Similarity.get_weights()
        self.compute_W_sparse(model_to_use="last")

    def _update_best_model(self):
        self.D_best = self.D_incremental.copy()

    def _run_epoch(self, num_epoch):
        self.loss = self.FW_D_Similarity.fit()

    def _generate_train_data(self):

        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Here is important only the structure
        self.similarity = Compute_Similarity(self.ICM.T,
                                             shrink=0,
                                             topK=self.topK,
                                             normalize=False)
        S_matrix_contentKNN = self.similarity.compute_similarity()
        S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

        self.write_log(
            "Collaborative S density: {:.2E}, nonzero cells {}".format(
                self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
                self.S_matrix_target.nnz))

        self.write_log("Content S density: {:.2E}, nonzero cells {}".format(
            S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
            S_matrix_contentKNN.nnz))

        if self.normalize_similarity:

            # Compute sum of squared
            sum_of_squared_features = np.array(
                self.ICM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz *
                                  (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[
                start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[
                start_pos_target:end_pos_target]

            # Chech whether the content coordinate is associated to a non zero target value
            # If true, the content coordinate has a collaborative non-zero value
            # if false, the content coordinate has a collaborative zero value
            is_common = np.in1d(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate(
                        (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate(
                        (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate(
                        (self.data_list, np.zeros(dataBlock,
                                                  dtype=np.float64)))

                if is_common[index]:
                    # If cell exists in target matrix, add its value
                    # Otherwise it will remain zero with a certain probability

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index, col_index]

                    if self.normalize_similarity:
                        new_data_value *= sum_of_squared_features[
                            row_index] * sum_of_squared_features[col_index]

                    self.data_list[num_samples] = new_data_value

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

            if self.verbose and (time.time() - start_time_batch > 30
                                 or num_samples == S_matrix_contentKNN.nnz *
                                 (1 + self.add_zeros_quota)):

                print(self.RECOMMENDER_NAME +
                      ": Generating train data. Sample {} ( {:.2f} %) ".format(
                          num_samples, num_samples / S_matrix_contentKNN.nnz *
                          (1 + self.add_zeros_quota) * 100))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_batch = time.time()

        self.write_log(
            "Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells"
            .format(num_common_coordinates, S_matrix_contentKNN.nnz,
                    num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard extra cells at the left of the array
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = sum(np.array(self.data_list) != 0)
        data_sum = sum(self.data_list)

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = sum(self.S_matrix_target.data)

        self.write_log(
            "Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
            "average over all collaborative data is {:.2E}".format(
                data_sum, data_sum / data_nnz,
                collaborative_sum / collaborative_nnz))

    def write_log(self, string):

        string = self.RECOMMENDER_NAME + ": " + string

        if self.verbose:
            print(string)
            sys.stdout.flush()
            sys.stderr.flush()

        if self.log_file is not None:
            self.log_file.write(string + "\n")
            self.log_file.flush()

    def compute_W_sparse(self, model_to_use="best"):

        if model_to_use == "last":
            feature_weights = self.D_incremental
        elif model_to_use == "best":
            feature_weights = self.D_best
        else:
            assert False, "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(
                self.RECOMMENDER_NAME)

        self.similarity = Compute_Similarity(
            self.ICM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=feature_weights)

        self.W_sparse = self.similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')

    def set_ICM_and_recompute_W(self, ICM_new, recompute_w=True):

        self.ICM = ICM_new.copy()

        if recompute_w:
            self.compute_W_sparse(model_to_use="best")

    def runCompilationScript(self):

        # Run compile script setting the working directory to ensure the compiled file are contained in the
        # appropriate subfolder and not the project root

        file_subfolder = "FeatureWeighting/Cython"
        file_to_compile_list = ['CFW_D_Similarity_Cython_SGD.pyx']

        run_compile_subprocess(file_subfolder, file_to_compile_list)

        print("{}: Compiled module {} in subfolder: {}".format(
            self.RECOMMENDER_NAME, file_to_compile_list, file_subfolder))

        # Command to run compilation script
        # python compile_script.py CFW_D_Similarity_Cython_SGD.pyx build_ext --inplace

        # Command to generate html report
        # cython -a CFW_D_Similarity_Cython_SGD.pyx

    def saveModel(self, folder_path, file_name=None):

        import pickle

        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        print("{}: Saving model in file '{}'".format(self.RECOMMENDER_NAME,
                                                     folder_path + file_name))

        dictionary_to_save = {
            "D_best": self.D_best,
            "topK": self.topK,
            "W_sparse": self.W_sparse,
            "normalize_similarity": self.normalize_similarity
        }

        pickle.dump(dictionary_to_save,
                    open(folder_path + file_name, "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

        print("{}: Saving complete".format(self.RECOMMENDER_NAME))