def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):
    """Train the KNN recommender: optionally reweight the URM, then build
    the top-K similarity matrix over URM_train columns.

    :param topK: number of neighbours kept per entity
    :param shrink: shrink term forwarded to the similarity computation
    :param similarity: similarity measure name (e.g. 'cosine')
    :param normalize: whether the similarity normalizes vectors
    :param feature_weighting: one of FEATURE_WEIGHTING_VALUES
        ("BM25", "TF-IDF" or "none"), applied to URM_train beforehand
    :param similarity_args: forwarded verbatim to Compute_Similarity
    :raises ValueError: if feature_weighting is not a recognized value
    """
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'".format(
                self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    # Optionally reweight the interaction matrix before the similarity pass.
    reweight = {"BM25": okapi_BM_25, "TF-IDF": TF_IDF}.get(feature_weighting)
    if reweight is not None:
        self.URM_train = check_matrix(
            reweight(self.URM_train.astype(np.float32)), 'csr')

    similarity_builder = Compute_Similarity(self.URM_train.T,
                                            shrink=shrink,
                                            topK=topK,
                                            normalize=normalize,
                                            similarity=similarity,
                                            **similarity_args)

    self.W_sparse = check_matrix(similarity_builder.compute_similarity(),
                                 format='csr')
def _compute_W_sparse(self, use_incremental=False):
    """Recompute the similarity matrix from the learned feature weights.

    :param use_incremental: if True use D_incremental as row weights,
        otherwise use D_best
    """
    # Choose which learned feature-weight vector drives the similarity.
    feature_weights = self.D_incremental if use_incremental else self.D_best

    self.similarity = Compute_Similarity(self.ICM.T,
                                         shrink=0,
                                         topK=self.topK,
                                         normalize=self.normalize_similarity,
                                         row_weights=feature_weights)

    self.W_sparse = self.similarity.compute_similarity()
    self.sparse_weights = True
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):
    """Fit a user-content KNN similarity, remove cold users from it, and
    augment the wrapped recommender's similarity with the identity.

    :param topK: neighbours kept per user after cold-user removal
    :param shrink: shrink term forwarded to the similarity computation
    :param similarity: similarity measure name (e.g. 'cosine')
    :param normalize: whether the similarity normalizes vectors
    :param feature_weighting: one of FEATURE_WEIGHTING_VALUES applied to
        UCM_train before the similarity is computed
    :param similarity_args: forwarded verbatim to Compute_Similarity
    :raises ValueError: if feature_weighting is not a recognized value
    """
    self.topK = topK
    # Compute extra neighbours so that, after zeroing cold users, each
    # user can still retain topK valid neighbours.
    self.topComputeK = topK + len(self.cold_users)
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.UCM_train = okapi_BM_25(self.UCM_train.astype(np.float32))
    elif feature_weighting == "TF-IDF":
        self.UCM_train = TF_IDF(self.UCM_train.astype(np.float32))

    similarity_builder = Compute_Similarity(self.UCM_train.T,
                                            shrink=shrink,
                                            topK=self.topComputeK,
                                            normalize=normalize,
                                            similarity=similarity,
                                            **similarity_args)

    # Zero out every column belonging to a cold user. In CSC format
    # indptr delimits columns, so each slice below is one user's column.
    self.W_sparse = similarity_builder.compute_similarity().tocsc()
    for cold_user in self.cold_users:
        start = self.W_sparse.indptr[cold_user]
        stop = self.W_sparse.indptr[cold_user + 1]
        self.W_sparse.data[start:stop] = 0
    self.W_sparse.eliminate_zeros()

    # Back to CSR, prune to the requested topK, normalize the format.
    self.W_sparse = self.W_sparse.tocsr()
    self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK).tocsr()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')

    # Add identity matrix to the recommender
    self.recommender.W_sparse = self.recommender.W_sparse + sps.identity(
        self.recommender.W_sparse.shape[0], format="csr")
def precompute_best_item_indices(self, URM: sps.csr_matrix):
    """Precompute, for each row of the similarity, the column indices
    sorted by decreasing cosine similarity; stored in self.sorted_indices.

    :param URM: interaction matrix; a copy is taken, the argument is not
        modified
    """
    weighted = URM.copy()

    # Apply the configured feature weighting on the working copy.
    if self.feature_weighting == "BM25":
        weighted = check_matrix(okapi_BM_25(weighted.astype(np.float32)),
                                'csr')
    elif self.feature_weighting == "TF-IDF":
        weighted = check_matrix(TF_IDF(weighted.astype(np.float32)), 'csr')

    similarity_builder = Compute_Similarity(weighted,
                                            shrink=self.shrink,
                                            topK=self.topK,
                                            normalize=self.normalize,
                                            similarity="cosine")
    similarity_matrix = similarity_builder.compute_similarity()

    # argsort of the negated dense matrix yields indices ordered by
    # decreasing similarity, row by row.
    self.sorted_indices = np.array(
        np.argsort(-similarity_matrix.todense(), axis=1))
def fit(self, user_topK=50, user_shrink=100, user_similarity_type='cosine',
        user_normalize=True, user_feature_weighting="none",
        user_asymmetric_alpha=0.5, item_topK=50, item_shrink=100,
        item_similarity_type='cosine', item_normalize=True,
        item_feature_weighting="none", item_asymmetric_alpha=0.5,
        interactions_feature_weighting="none"):
    """Fit both a user-user similarity (from UCM_train) and an item-item
    similarity (from ICM_train), optionally reweighting the interaction
    matrix and each content matrix beforehand.

    :raises ValueError: if any of the three feature_weighting arguments
        is not in FEATURE_WEIGHTING_VALUES
    """
    # Shared template: the resulting messages are identical to the ones
    # raised by the sibling fit() implementations.
    bad_weighting_msg = ("Value for 'feature_weighting' not recognized. "
                         "Acceptable values are {}, provided was '{}'")

    if interactions_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            bad_weighting_msg.format(self.FEATURE_WEIGHTING_VALUES,
                                     interactions_feature_weighting))

    # Reweight the interaction matrix first.
    if interactions_feature_weighting == "BM25":
        self.URM_train = check_matrix(
            okapi_BM_25(self.URM_train.astype(np.float32)), 'csr')
    elif interactions_feature_weighting == "TF-IDF":
        self.URM_train = check_matrix(
            TF_IDF(self.URM_train.astype(np.float32)), 'csr')

    # ------------------- User Similarity Computation -------------------
    self.user_topK = user_topK
    self.user_shrink = user_shrink

    if user_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            bad_weighting_msg.format(self.FEATURE_WEIGHTING_VALUES,
                                     user_feature_weighting))

    if user_feature_weighting == "BM25":
        self.UCM_train = okapi_BM_25(self.UCM_train.astype(np.float32))
    elif user_feature_weighting == "TF-IDF":
        self.UCM_train = TF_IDF(self.UCM_train.astype(np.float32))

    user_similarity_compute = Compute_Similarity(
        self.UCM_train.T,
        shrink=user_shrink,
        topK=user_topK,
        normalize=user_normalize,
        similarity=user_similarity_type,
        asymmetric_alpha=user_asymmetric_alpha)
    self.user_W_sparse = check_matrix(
        user_similarity_compute.compute_similarity(), format='csr')

    # ------------------- Item Similarity Computation -------------------
    self.item_topK = item_topK
    self.item_shrink = item_shrink

    if item_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            bad_weighting_msg.format(self.FEATURE_WEIGHTING_VALUES,
                                     item_feature_weighting))

    if item_feature_weighting == "BM25":
        self.ICM_train = okapi_BM_25(self.ICM_train.astype(np.float32))
    elif item_feature_weighting == "TF-IDF":
        self.ICM_train = TF_IDF(self.ICM_train.astype(np.float32))

    item_similarity_compute = Compute_Similarity(
        self.ICM_train.T,
        shrink=item_shrink,
        topK=item_topK,
        normalize=item_normalize,
        similarity=item_similarity_type,
        asymmetric_alpha=item_asymmetric_alpha)
    self.item_W_sparse = check_matrix(
        item_similarity_compute.compute_similarity(), format='csr')
"ICM_sub_class") subclass_content_dict = get_sub_class_content( ICM_subclass, subclass_feature_to_id_mapper, binned=True) subclass_content = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=False) SRM = get_subclass_rating_matrix( URM=URM_train, subclass_content_dict=subclass_content_dict) SRM_implicit = get_subclass_rating_matrix( URM=URM_train, subclass_content_dict=subclass_content_dict, implicit=True) # Compute similarity similarity_SRM = Compute_Similarity(SRM, similarity="cosine") similarity_SRM_implicit = Compute_Similarity(SRM_implicit, similarity=SIMILARITY_TYPE) W_sparse_SRM = similarity_SRM.compute_similarity() W_sparse_SRM_implicit = similarity_SRM_implicit.compute_similarity() W_sparse_dense_SRM = W_sparse_SRM.todense() W_sparse_dense_SRM_implicit = W_sparse_SRM_implicit.todense() print(W_sparse_SRM) # Plots if PLOT_SIMILARITY_MATRIX: plt.title("Plot of {} similarity".format(SIMILARITY_TYPE)) plt.imshow(W_sparse_dense_SRM, interpolation='none', origin="lower")
def _generateTrainData_low_ram(self):
    """Build the (row, col, value) training triples used to fit the
    feature weights, growing the buffers in blocks to bound memory.

    For every neighbour pair in the content-based KNN similarity:
      - if the pair is also nonzero in the collaborative target matrix,
        its target value is sampled (rescaled by the UCM row norms when
        ``normalize_similarity`` is set);
      - otherwise a zero sample is kept with probability
        ``add_zeros_quota``.
    Results are left in ``self.row_list``/``self.col_list``/``self.data_list``.
    """
    print(self.RECOMMENDER_NAME + ": Generating train data")

    start_time_batch = time.time()

    # Here is important only the structure
    self.similarity = Compute_Similarity(self.UCM.T,
                                         shrink=0,
                                         topK=self.topK,
                                         normalize=False)
    S_matrix_contentKNN = self.similarity.compute_similarity()
    S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

    self._writeLog(
        self.RECOMMENDER_NAME +
        ": Collaborative S density: {:.2E}, nonzero cells {}".format(
            self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
            self.S_matrix_target.nnz))

    self._writeLog(
        self.RECOMMENDER_NAME +
        ": Content S density: {:.2E}, nonzero cells {}".format(
            S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
            S_matrix_contentKNN.nnz))

    if self.normalize_similarity:
        # Compute sum of squared feature values per UCM row, then sqrt:
        # the L2 norm of each row, used to de-normalize target values.
        sum_of_squared_features = np.array(
            self.UCM.T.power(2).sum(axis=0)).ravel()
        sum_of_squared_features = np.sqrt(sum_of_squared_features)

    num_common_coordinates = 0

    # Over-allocate: expected sample count plus 20% headroom.
    estimated_n_samples = int(S_matrix_contentKNN.nnz *
                              (1 + self.add_zeros_quota) * 1.2)

    self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

    num_samples = 0

    for row_index in range(self.n_users):

        start_pos_content = S_matrix_contentKNN.indptr[row_index]
        end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

        content_coordinates = S_matrix_contentKNN.indices[
            start_pos_content:end_pos_content]

        start_pos_target = self.S_matrix_target.indptr[row_index]
        end_pos_target = self.S_matrix_target.indptr[row_index + 1]

        target_coordinates = self.S_matrix_target.indices[
            start_pos_target:end_pos_target]

        # Check whether the content coordinate is associated to a non zero target value
        # If true, the content coordinate has a collaborative non-zero value
        # if false, the content coordinate has a collaborative zero value
        is_common = np.in1d(content_coordinates, target_coordinates)

        num_common_in_current_row = is_common.sum()
        num_common_coordinates += num_common_in_current_row

        for index in range(len(is_common)):

            # Grow the buffers by a fixed block when headroom runs out.
            if num_samples == estimated_n_samples:
                dataBlock = 1000000
                self.row_list = np.concatenate(
                    (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                self.col_list = np.concatenate(
                    (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                self.data_list = np.concatenate(
                    (self.data_list, np.zeros(dataBlock, dtype=np.float64)))

            if is_common[index]:
                # If cell exists in target matrix, add its value
                # Otherwise it will remain zero with a certain probability
                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index

                new_data_value = self.S_matrix_target[row_index, col_index]

                if self.normalize_similarity:
                    new_data_value *= sum_of_squared_features[
                        row_index] * sum_of_squared_features[col_index]

                self.data_list[num_samples] = new_data_value
                num_samples += 1

            elif np.random.rand() <= self.add_zeros_quota:
                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index
                self.data_list[num_samples] = 0.0
                num_samples += 1

        # Periodic progress report (every 30s or on exact completion).
        # NOTE(review): the percentage divides by nnz and THEN multiplies
        # by (1 + add_zeros_quota); the denominator is presumably meant to
        # be nnz * (1 + add_zeros_quota) -- confirm before changing.
        if time.time(
        ) - start_time_batch > 30 or num_samples == S_matrix_contentKNN.nnz * (
                1 + self.add_zeros_quota):
            print(self.RECOMMENDER_NAME +
                  ": Generating train data. Sample {} ( {:.2f} %) ".format(
                      num_samples, num_samples / S_matrix_contentKNN.nnz *
                      (1 + self.add_zeros_quota) * 100))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_batch = time.time()

    self._writeLog(
        self.RECOMMENDER_NAME +
        ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells"
        .format(num_common_coordinates, S_matrix_contentKNN.nnz,
                num_common_coordinates / S_matrix_contentKNN.nnz * 100))

    # Discard the unused tail of the pre-allocated arrays
    self.row_list = self.row_list[:num_samples]
    self.col_list = self.col_list[:num_samples]
    self.data_list = self.data_list[:num_samples]

    data_nnz = sum(np.array(self.data_list) != 0)
    data_sum = sum(self.data_list)

    collaborative_nnz = self.S_matrix_target.nnz
    collaborative_sum = sum(self.S_matrix_target.data)

    self._writeLog(
        self.RECOMMENDER_NAME +
        ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
        "average over all collaborative data is {:.2E}".format(
            data_sum, data_sum / data_nnz,
            collaborative_sum / collaborative_nnz))
class User_CFW_D_Similarity_Linalg(BaseUserSimilarityMatrixRecommender):
    """CFW-D for users: learn a per-feature weight vector D such that the
    D-weighted user-content similarity approximates a target collaborative
    user-user similarity, via scipy's sparse LSQR least-squares solver.
    """

    RECOMMENDER_NAME = "User_CFW_D_Similarity_Linalg"

    def __init__(self, URM_train, UCM, S_matrix_target):
        """
        :param URM_train: user-item interaction matrix (n_users x n_items)
        :param UCM: user-content matrix (n_users x n_features)
        :param S_matrix_target: target user-user similarity, square with
            n_users rows
        :raises ValueError: on any shape inconsistency between the inputs
        """
        super(User_CFW_D_Similarity_Linalg, self).__init__(URM_train)

        if (URM_train.shape[0] != UCM.shape[0]):
            # BUGFIX: the message previously reported URM_train.shape[1]
            # (items) although the check compares shape[0] (users).
            raise ValueError(
                "Number of users not consistent. URM contains {} but UCM contains {}"
                .format(URM_train.shape[0], UCM.shape[0]))

        if (S_matrix_target.shape[0] != S_matrix_target.shape[1]):
            raise ValueError(
                "User similarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if (S_matrix_target.shape[0] != UCM.shape[0]):
            # BUGFIX: message said "items ... ICM" (copied from the item
            # variant) while this class compares user counts against UCM.
            raise ValueError(
                "Number of users not consistent. S_matrix contains {} but UCM contains {}"
                .format(S_matrix_target.shape[0], UCM.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
        self.UCM = check_matrix(UCM, 'csr')

        self.n_items = self.URM_train.shape[1]
        self.n_users = self.URM_train.shape[0]
        self.n_features = self.UCM.shape[1]

        self.sparse_weights = True

    def _writeLog(self, string):
        # Echo to stdout and, when a log file is configured, append there too.
        print(string)
        sys.stdout.flush()
        sys.stderr.flush()

        if self.logFile is not None:
            self.logFile.write(string + "\n")
            self.logFile.flush()

    def _generateTrainData_low_ram(self):
        """Build the (row, col, value) training triples, growing the
        buffers in blocks to bound memory.

        For every neighbour pair in the content-based KNN similarity:
          - if the pair is also nonzero in the collaborative target, its
            target value is sampled (rescaled by the UCM row norms when
            ``normalize_similarity`` is set);
          - otherwise a zero sample is kept with probability
            ``add_zeros_quota``.
        """
        print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Here only the structure (which cells are nonzero) matters.
        self.similarity = Compute_Similarity(self.UCM.T,
                                             shrink=0,
                                             topK=self.topK,
                                             normalize=False)
        S_matrix_contentKNN = self.similarity.compute_similarity()
        S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

        self._writeLog(
            self.RECOMMENDER_NAME +
            ": Collaborative S density: {:.2E}, nonzero cells {}".format(
                self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
                self.S_matrix_target.nnz))

        self._writeLog(
            self.RECOMMENDER_NAME +
            ": Content S density: {:.2E}, nonzero cells {}".format(
                S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
                S_matrix_contentKNN.nnz))

        if self.normalize_similarity:
            # L2 norm of each UCM row, used to de-normalize target values.
            sum_of_squared_features = np.array(
                self.UCM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        # Over-allocate: expected sample count plus 20% headroom.
        estimated_n_samples = int(S_matrix_contentKNN.nnz *
                                  (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_users):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[
                start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[
                start_pos_target:end_pos_target]

            # Check whether each content coordinate has a nonzero value in
            # the collaborative target matrix.
            is_common = np.in1d(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                # Grow the buffers by a fixed block when headroom runs out.
                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate(
                        (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate(
                        (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate(
                        (self.data_list, np.zeros(dataBlock,
                                                  dtype=np.float64)))

                if is_common[index]:
                    # Cell exists in the target matrix: sample its value.
                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index,
                                                          col_index]

                    if self.normalize_similarity:
                        new_data_value *= sum_of_squared_features[
                            row_index] * sum_of_squared_features[col_index]

                    self.data_list[num_samples] = new_data_value
                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:
                    # Keep a zero sample with probability add_zeros_quota.
                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0
                    num_samples += 1

            # Periodic progress report (every 30s or on exact completion).
            if time.time() - start_time_batch > 30 or \
                    num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota):
                # BUGFIX: the denominator is the expected total number of
                # samples, nnz * (1 + add_zeros_quota); the previous code
                # divided by nnz and then MULTIPLIED by (1 + quota).
                print(self.RECOMMENDER_NAME +
                      ": Generating train data. Sample {} ( {:.2f} %) ".format(
                          num_samples,
                          num_samples /
                          (S_matrix_contentKNN.nnz *
                           (1 + self.add_zeros_quota)) * 100))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_batch = time.time()

        self._writeLog(
            self.RECOMMENDER_NAME +
            ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells"
            .format(num_common_coordinates, S_matrix_contentKNN.nnz,
                    num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard the unused tail of the pre-allocated arrays
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = np.count_nonzero(self.data_list)
        data_sum = self.data_list.sum()

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = self.S_matrix_target.data.sum()

        self._writeLog(
            self.RECOMMENDER_NAME +
            ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
            "average over all collaborative data is {:.2E}".format(
                data_sum, data_sum / data_nnz,
                collaborative_sum / collaborative_nnz))

    def fit(self,
            show_max_performance=False,
            logFile=None,
            loss_tolerance=1e-6,
            iteration_limit=50000,
            damp_coeff=0.0,
            topK=300,
            add_zeros_quota=0.0,
            normalize_similarity=False):
        """Generate the training triples and solve for the feature weights.

        :param show_max_performance: unused; kept for interface compatibility
        :param logFile: optional open file object for _writeLog output
        :param loss_tolerance: atol/btol passed to LSQR
        :param iteration_limit: LSQR iteration limit
        :param damp_coeff: LSQR damping (Tikhonov) coefficient
        :param topK: neighbours per user in the content KNN and final matrix
        :param add_zeros_quota: probability of keeping a zero-target sample
        :param normalize_similarity: de-normalize targets by UCM row norms
            and normalize the recomputed similarity
        """
        self.logFile = logFile
        self.normalize_similarity = normalize_similarity
        self.add_zeros_quota = add_zeros_quota
        self.topK = topK

        self._generateTrainData_low_ram()

        # Each training row is the element-wise product of the two users'
        # feature vectors; LSQR fits D so that (UCM_u * UCM_v) @ D ~ target.
        commonFeatures = self.UCM[self.row_list].multiply(
            self.UCM[self.col_list])

        linalg_result = linalg.lsqr(commonFeatures,
                                    self.data_list,
                                    show=False,
                                    atol=loss_tolerance,
                                    btol=loss_tolerance,
                                    iter_lim=iteration_limit,
                                    damp=damp_coeff)

        self.D_incremental = linalg_result[0].copy()
        self.D_best = linalg_result[0].copy()
        self.epochs_best = 0

        # lsqr result index 3 is r1norm, the residual norm of the solution.
        self.loss = linalg_result[3]

        self._compute_W_sparse()

    def _compute_W_sparse(self, use_incremental=False):
        """Recompute the user-user similarity from the learned weights.

        :param use_incremental: use D_incremental instead of D_best
        """
        if use_incremental:
            feature_weights = self.D_incremental
        else:
            feature_weights = self.D_best

        self.similarity = Compute_Similarity(
            self.UCM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=feature_weights)

        self.W_sparse = self.similarity.compute_similarity()
        self.sparse_weights = True

    def save_model(self, folder_path, file_name=None):
        """Pickle the learned weights and similarity to folder_path/file_name.

        :param folder_path: destination directory (must include trailing
            separator, it is concatenated directly with file_name)
        :param file_name: defaults to RECOMMENDER_NAME
        """
        import pickle

        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        print("{}: Saving model in file '{}'".format(self.RECOMMENDER_NAME,
                                                     folder_path + file_name))

        dictionary_to_save = {
            "D_best": self.D_best,
            "topK": self.topK,
            "sparse_weights": self.sparse_weights,
            "W_sparse": self.W_sparse,
            "normalize_similarity": self.normalize_similarity
        }

        pickle.dump(dictionary_to_save,
                    open(folder_path + file_name, "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

        print("{}: Saving complete".format(self.RECOMMENDER_NAME))