def applyPearsonCorrelation(self):
    """
    Remove from every data point the average of the corresponding column
    :return:
    """

    self.dataMatrix = cm.check_matrix(self.dataMatrix, 'csc')

    interactionsPerCol = np.diff(self.dataMatrix.indptr)
    nonzeroCols = interactionsPerCol > 0
    sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    colAverage = np.zeros_like(sumPerCol)
    colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

    # Subtract the column average in blocks to avoid duplicating the whole data structure
    start_col = 0
    end_col = 0
    blockSize = 1000

    while end_col < self.n_columns:
        end_col = min(self.n_columns, end_col + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
            np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

        start_col += blockSize
def applyAdjustedCosine(self):
    """
    Remove from every data point the average of the corresponding row
    :return:
    """

    self.dataMatrix = cm.check_matrix(self.dataMatrix, 'csr')

    interactionsPerRow = np.diff(self.dataMatrix.indptr)
    nonzeroRows = interactionsPerRow > 0
    sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    rowAverage = np.zeros_like(sumPerRow)
    rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

    # Subtract the row average in blocks to avoid duplicating the whole data structure
    start_row = 0
    end_row = 0
    blockSize = 1000

    while end_row < self.n_rows:
        end_row = min(self.n_rows, end_row + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
            np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

        start_row += blockSize
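# The two methods above center the stored values in place, without densifying the
# matrix. The sketch below is hypothetical (not part of any class here; assumes
# only numpy and scipy.sparse) and reproduces the row-centering step of
# applyAdjustedCosine on a toy matrix, to make the np.repeat alignment explicit.
def _demo_row_centering():
    import numpy as np
    import scipy.sparse as sps

    R = sps.csr_matrix(np.array([[5.0, 0.0, 3.0],
                                 [0.0, 2.0, 0.0]]))

    interactionsPerRow = np.diff(R.indptr)                               # [2, 1]
    rowAverage = np.asarray(R.sum(axis=1)).ravel() / interactionsPerRow  # [4.0, 2.0]

    # np.repeat aligns each stored value with the average of its own row
    R.data -= np.repeat(rowAverage, interactionsPerRow)

    # Stored values are now [1.0, -1.0, 0.0]; absent entries stay zero
    print(R.toarray())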
def __init__(self, URM_train, sparse_weights=True):
    super(ItemKNNCFRecommender, self).__init__()

    # CSR is faster during evaluation
    self.URM_train = cm.check_matrix(URM_train, 'csr')
    self.dataset = None
    self.sparse_weights = sparse_weights
def __init__(self, ICM, URM_train, sparse_weights=True):
    super(ItemKNNCBFRecommender, self).__init__()

    self.ICM = ICM.copy()

    # CSR is faster during evaluation
    self.URM_train = cm.check_matrix(URM_train.copy(), 'csr')
    self.sparse_weights = sparse_weights
def fit(self, R):
    self.dataset = R
    R = cm.check_matrix(R, 'csr', dtype=np.float32)
    self.X, self.Y = AsySVD_sgd(R, self.num_factors, self.lrate, self.reg,
                                self.iters, self.init_mean, self.init_std,
                                self.lrate_decay, self.rnd_seed)

    # Precompute the user factors, one user (row) at a time
    M = R.shape[0]
    self.U = np.vstack([AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)])
def __init__(self, URM_train, sparse_weights=True):
    super(UserKNNCFRecommender, self).__init__()

    self.name = 'UserKNN'

    # Not sure if CSR here is faster
    self.URM_train = cm.check_matrix(URM_train, 'csr')
    self.dataset = None
    self.sparse_weights = sparse_weights
    self.compute_item_score = self.compute_score_user_based
def __init__(self, urm_filter_tracks):
    self.urm_filter_tracks = urm_filter_tracks

    # Convert every matrix in the array to CSR format
    for index, r in enumerate(self.matrices_array):
        self.matrices_array[index] = cm.check_matrix(r, 'csr')

    self._normalization(normalization_mode=self.normalization_mode)
    print('matrices normalized')
def save_r_hat(self, evaluation):
    r_hat = self.W_sparse
    r_hat = check_matrix(r_hat, format='csr')

    # Create the directory if it does not exist
    if evaluation:
        filename = 'raw_data/saved_r_hat_evaluation/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
    else:
        filename = 'raw_data/saved_r_hat/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    sps.save_npz(filename, r_hat)
    log.success('R_hat successfully saved in: {}.npz'.format(filename))
def save_r_hat(self, evaluation=False):
    r_hat = self.get_r_hat()
    r_hat = check_matrix(r_hat, format='csr')

    # Create the directory if it does not exist
    if evaluation:
        filename = 'raw_data/saved_r_hat_evaluation/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
    else:
        filename = 'raw_data/saved_r_hat/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    sps.save_npz(filename, r_hat)
def apply_top_k(matrix, k):
    start = time.time()
    matrix = cm.check_matrix(matrix, format='csr')

    # Use zeros (not np.empty) so the discarded entries are actually 0
    filtered_matrix = np.zeros(shape=(matrix.shape[0], matrix.shape[1]))

    for i in range(matrix.shape[0]):
        # Densify one row at a time and keep only its k largest entries
        row = np.asarray(matrix.getrow(i).todense()).ravel()
        relevant_items_row_indices = (-row).argpartition(k)[0:k]
        filtered_matrix[i, relevant_items_row_indices] = row[relevant_items_row_indices]

    # Convert the matrix back to a sparse format
    sp_filtered_matrix = sps.csr_matrix(filtered_matrix)

    print('topK applied in {:.2f} s'.format(time.time() - start))
    return sp_filtered_matrix
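# A minimal usage sketch for apply_top_k, under the assumption that the module
# imports numpy as np and scipy.sparse as sps. Only the k largest entries of
# every row survive; everything else is zeroed.
def _demo_apply_top_k():
    m = sps.csr_matrix(np.array([[0.1, 0.9, 0.5],
                                 [0.7, 0.2, 0.4]]))
    top2 = apply_top_k(m, k=2)

    # Row 0 keeps 0.9 and 0.5; row 1 keeps 0.7 and 0.4
    print(top2.toarray())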
def __init__(self, name, cluster, mode, matrices_array, normalization_mode, weights_array):
    super(Hybrid, self).__init__(name=name, cluster=cluster, mode=mode)

    # Load handle and dictionary based on mode; they will be used during recommend_batch
    self.dict_col = data.dictionary_col(mode=self.mode)
    self.df_handle = data.handle_df(mode=self.mode)
    self.targetids = data.target_urm_rows(self.mode)

    self.r_hat = None

    # Will be set if the hybrid is done via similarity matrices
    self.urm_name = None

    self.weights_array = weights_array

    # Store the array of matrices in the hybrid recommender
    self.matrices_array = matrices_array

    # Check the shapes of the matrices
    self._check_matrices_array_shapes()

    # Normalize the matrices
    self.normalization_mode = normalization_mode

    # Will be filled when the _normalization method is called
    self.normalized_matrices_array = None

    print('checking that all the matrices in matrices_array are in CSR format...')
    for index in range(len(self.matrices_array)):
        self.matrices_array[index] = cm.check_matrix(self.matrices_array[index], 'csr')
    print('done')

    print('normalizing the matrices in matrices_array...')
    self._normalization(normalization_mode=self.normalization_mode)
    print('matrices normalized')
def fit(self, R):
    '''
    Train the model with SGD on the given URM. The hyperparameters are
    hardcoded in the call below:
    - num_factors: number of latent factors
    - lrate: initial learning rate used in SGD
    - user_reg: regularization for the user factors
    - pos_reg: regularization for the factors of the positive sampled items
    - neg_reg: regularization for the factors of the negative sampled items
    - iters: number of iterations in training the model with SGD
    - sampling_type: type of sampling. Supported types are 'user_uniform_item_uniform' and 'user_uniform_item_pop'
    - sample_with_replacement: `True` to sample positive items with replacement (doesn't work with 'user_uniform_item_pop')
    - use_resampling: `True` to resample at each iteration during training
    - sampling_pop_alpha: float smoothing factor for popularity-based samplers (e.g. 'user_uniform_item_pop')
    - init_mean: mean used to initialize the latent factors
    - init_std: standard deviation used to initialize the latent factors
    - lrate_decay: learning rate decay
    - rnd_seed: random seed
    - verbose: controls verbosity in output
    '''
    self.dataset = R
    R = cm.check_matrix(R, 'csr', dtype=np.float32)
    self.X, self.Y = BPRMF_sgd(R,
                               num_factors=100,
                               lrate=0.1,
                               user_reg=0.0015,
                               pos_reg=0.0015,
                               neg_reg=0.0015,
                               iters=10,
                               sampling_type='user_uniform_item_uniform',
                               sample_with_replacement=True,
                               use_resampling=True,
                               sampling_pop_alpha=1.0,
                               init_mean=0.0,
                               init_std=0.1,
                               lrate_decay=1.0,
                               rnd_seed=42,
                               verbose=True)
def fit(self):
    print('hybrid matrix creation...')
    start = time.time()

    # Weighted sum of the normalized matrices
    hybrid_matrix = sps.csr_matrix(self.normalized_matrices_array[0].shape)
    for m, w in zip(self.normalized_matrices_array, self.weights_array):
        hybrid_matrix += m * w

    if self.name == 'HybridSimilarity':
        # Compute the r_hat if we have the similarity
        urm = data.urm(self.mode, self.urm_name)

        # Check that urm is in CSR format
        urm = cm.check_matrix(urm, 'csr')

        # Check whether the similarity is item-item or user-user
        if hybrid_matrix.shape[0] == urm.shape[1]:
            # item-item similarity: project the target users' interactions onto it
            hybrid_matrix = urm[self.targetids].dot(hybrid_matrix)
        else:
            # user-user similarity: combine the interactions of similar users
            hybrid_matrix = hybrid_matrix[self.targetids].dot(urm)

    print('hybrid matrix created in {:.2f} s'.format(time.time() - start))
    self.r_hat = hybrid_matrix
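# A toy sketch of the weighted-sum step in fit above (hypothetical, assumes
# numpy as np and scipy.sparse as sps): two matrices blended with weights
# 0.7 and 0.3.
def _demo_weighted_sum():
    m1 = sps.csr_matrix(np.array([[0.0, 1.0], [1.0, 0.0]]))
    m2 = sps.csr_matrix(np.array([[0.0, 2.0], [0.0, 2.0]]))
    weights = [0.7, 0.3]

    hybrid = sps.csr_matrix(m1.shape)
    for m, w in zip([m1, m2], weights):
        hybrid += m * w

    # [[0.0, 1.3], [0.7, 0.6]]
    print(hybrid.toarray())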
def compute_similarity(self, start_col=None, end_col=None, block_size=100):
    """
    Compute the Similarity_MFD for the given dataset
    :param start_col: column to begin with
    :param end_col: column to stop before; end_col is excluded
    :return:
    """
    values = []
    rows = []
    cols = []

    start_time = time.time()
    start_time_print_batch = start_time
    processedItems = 0

    if self.adjusted_cosine:
        self.applyAdjustedCosine()
    elif self.pearson_correlation:
        self.applyPearsonCorrelation()
    elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = cm.check_matrix(self.dataMatrix, 'csc')

    # Compute the sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
        sumOfSquared = np.sqrt(sumOfSquared)

    if self.asymmetric_cosine:
        sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
        sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

    start_col_local = 0
    end_col_local = self.n_columns

    if start_col is not None and start_col > 0 and start_col < self.n_columns:
        start_col_local = start_col

    if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
        end_col_local = end_col

    start_col_block = start_col_local
    this_block_size = 0

    # Compute all similarities for each item using vectorization
    while start_col_block < end_col_local:

        # Add previous block size
        processedItems += this_block_size

        end_col_block = min(start_col_block + block_size, end_col_local)
        this_block_size = end_col_block - start_col_block

        if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
            columnPerSec = processedItems / (time.time() - start_time)

            print("Similarity_MFD column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                processedItems,
                processedItems / (end_col_local - start_col_local) * 100,
                columnPerSec,
                (time.time() - start_time) / 60))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print_batch = time.time()

        # All data points for a given item
        item_data = self.dataMatrix[:, start_col_block:end_col_block]
        item_data = item_data.toarray().squeeze()

        if self.use_row_weights:
            this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
        else:
            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

        for col_index_in_block in range(this_block_size):

            if this_block_size == 1:
                this_column_weights = this_block_weights
            else:
                this_column_weights = this_block_weights[:, col_index_in_block]

            columnIndex = col_index_in_block + start_col_block
            this_column_weights[columnIndex] = 0.0

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:
                if self.asymmetric_cosine:
                    denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                else:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # Apply the specific denominator for Tanimoto
            elif self.tanimoto_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.dice_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.tversky_coefficient:
                denominator = this_column_weights + \
                    (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                    (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # If no normalization or tanimoto is selected, apply only shrink
            elif self.shrink != 0:
                this_column_weights = this_column_weights / self.shrink

            if self.TopK == 0:
                self.W_dense[:, columnIndex] = this_column_weights

            else:
                # Sort indices and select TopK.
                # Sorting is done in three steps. Faster than plain np.argsort for a high number of items:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build the sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

        start_col_block += block_size

    # End while on columns

    if self.TopK == 0:
        return self.W_dense
    else:
        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)
        return W_sparse
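# The partition-then-sort trick used above for TopK selection, isolated in a
# standalone sketch (assumes numpy as np). It returns the indices of the k
# largest values in descending order without fully sorting the array.
def _demo_top_k_selection(scores, k):
    # 1) Partition: the k largest values end up in the first k positions (unordered)
    relevant_items_partition = (-scores).argpartition(k - 1)[0:k]

    # 2) Sort only those k candidates
    relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition])

    # 3) Map back to the original indices
    return relevant_items_partition[relevant_items_partition_sorting]

# _demo_top_k_selection(np.array([0.2, 0.9, 0.1, 0.5]), 2) -> array([1, 3])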
def __init__(self, URM_train):
    super(RP3betaRecommender, self).__init__()

    self.URM_train = cm.check_matrix(URM_train, format='csr', dtype=np.float32)
    self.sparse_weights = True