def fit(self, topK=350, shrink=10, sim_type='splus'):
    m = similarity(self.icm.T, k=topK, sim_type=sim_type, shrink=shrink)
    m = self._check_matrix(m, format='csr')
    self.sim_matrix = normalize(m, norm='l2', axis=0)
    self.r_hat = self.urm.dot(self.sim_matrix)
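# Every snippet in this section calls a shared `similarity` helper whose
# implementation is not shown. The sketch below is a minimal stand-in for the
# assumed semantics -- similarity between the *columns* of the input, with a
# shrink term in the denominator and top-k pruning per row -- so the fit
# methods can be read self-contained. Only 'cosine' is sketched; the
# 'jaccard' and 'splus' variants used elsewhere are omitted, and the helper
# actually used in this repo may differ.
import numpy as np
import scipy.sparse as sps

def similarity(matrix, k=100, shrink=0, sim_type='cosine'):
    matrix = sps.csc_matrix(matrix)
    # Shrunk cosine: sim_ij = (m_i . m_j) / (||m_i|| * ||m_j|| + shrink)
    col_norms = np.sqrt(np.asarray(matrix.power(2).sum(axis=0)).ravel())
    numerator = matrix.T.dot(matrix).toarray()  # dense for brevity
    sim = numerator / (np.outer(col_norms, col_norms) + shrink + 1e-6)
    np.fill_diagonal(sim, 0.0)  # an item is not its own neighbour
    for row in sim:
        if k < row.size:
            row[np.argsort(row)[:-k]] = 0.0  # keep the k largest per row
    return sps.csr_matrix(sim)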
def _compute_W_sparse(self, use_incremental=False):
    if use_incremental:
        feature_weights = self.D_incremental
    else:
        feature_weights = self.D_best

    # Apply the learned feature weights before computing the similarity:
    # scaling feature f by sqrt(d_f) makes each item-item dot product weight
    # feature f by d_f. (This assumes the `similarity` helper has no native
    # feature-weight argument; previously feature_weights was computed but
    # never used.)
    weighted_icm = sps.diags(np.sqrt(feature_weights)).dot(self.ICM.T)
    self.W_sparse = similarity(weighted_icm, shrink=0, k=self.topK)
    self.sparse_weights = True
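# A quick numeric check of the sqrt-weighting trick used above: scaling
# feature f by sqrt(d_f) makes a plain dot product between two item vectors
# compute the weighted numerator sum_f d_f * x_if * x_jf. Toy data,
# illustration only.
import numpy as np

d = np.array([0.5, 2.0, 1.0])   # learned feature weights
x = np.array([1.0, 0.0, 1.0])   # item i's feature vector
y = np.array([1.0, 1.0, 0.0])   # item j's feature vector

weighted_dot = np.sum(d * x * y)                     # direct weighted numerator
scaled_dot = np.dot(np.sqrt(d) * x, np.sqrt(d) * y)  # dot of sqrt-scaled vectors
assert np.isclose(weighted_dot, scaled_dot)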
def fit(self, topK=210, shrink=460, sim_type='jaccard'):
    self.topK = topK
    self.shrink = shrink
    m = similarity(self.urm, k=topK, sim_type=sim_type, shrink=shrink)
    m = self._check_matrix(m, format='csr')
    self.sim_matrix = normalize(m, norm='l2', axis=1)
    self.r_hat = self.urm.dot(self.sim_matrix)
def fit(self, topK=110, shrink=350, sim_type='cosine'):
    self.topK = topK
    self.shrink = shrink
    m = similarity(self.urm.T, k=topK, sim_type=sim_type, shrink=shrink)
    m = self._check_matrix(m, format='csr')
    self.sim_matrix = normalize(m, norm='l2', axis=1)
    self.r_hat = self.sim_matrix.dot(self.urm)
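# Usage sketch contrasting the two collaborative fits above: the item-based
# variant builds an item-item matrix and right-multiplies the URM, while the
# user-based variant builds a user-user matrix and left-multiplies it. Both
# yield a users-x-items score matrix. Toy data; `similarity` as sketched
# earlier in this section.
import scipy.sparse as sps

urm = sps.random(50, 80, density=0.05, format='csr', random_state=42)

item_sim = similarity(urm, k=10)    # (80, 80): similarity between item columns
user_sim = similarity(urm.T, k=10)  # (50, 50): similarity between user columns
assert urm.dot(item_sim).shape == (50, 80)
assert user_sim.dot(urm).shape == (50, 80)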
def fit(self, topK=None, l2_norm=1e3, normalize_matrix=False):
    start_time = time.time()
    print("|{}| Fitting model... |".format(self.NAME))

    if normalize_matrix:
        # Normalize rows and then columns
        self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
        self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
        self.URM_train = sps.csr_matrix(self.URM_train)

    # The Gram matrix is X^T X: compute it as an unshrunk, unpruned cosine
    gram_matrix = similarity(self.URM_train, shrink=0,
                             k=self.URM_train.shape[1],
                             sim_type="cosine").toarray()

    diag_indices = np.diag_indices(gram_matrix.shape[0])
    gram_matrix[diag_indices] += l2_norm

    P = np.linalg.inv(gram_matrix)
    B = P / (-np.diag(P))
    B[diag_indices] = 0.0

    print("Fitting model... done in {:.2f}".format(time.time() - start_time))

    # Decide whether to store the model in sparse or dense format: regardless
    # of topK, the matrix is kept sparse if the share of nonzero cells is
    # below sparse_threshold_quota.
    if topK is not None:
        B = self._similarity_matrix_topk(B, k=topK, verbose=False)

    if self._is_content_sparse_check(B):
        self._print("Detected model matrix to be sparse, changing format.")
        self.W_sparse = self._check_matrix(B, format='csr', dtype=np.float32)
    else:
        self.W_sparse = self._check_matrix(B, format='npy', dtype=np.float32)

    self._W_sparse_format_checked = True
    self.r_hat = self.URM_train.dot(self.W_sparse)
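# The fit above is the EASE^R closed form (Steck, "Embarrassingly Shallow
# Autoencoders for Sparse Data", 2019): with G = X^T X + lambda*I and
# P = G^{-1}, the item weight matrix is B = P / (-diag(P)) with a zeroed
# diagonal. A minimal dense re-derivation on toy data:
import numpy as np

rng = np.random.default_rng(0)
X = (rng.random((40, 15)) < 0.2).astype(np.float64)  # toy binary URM
l2_norm = 1e1

G = X.T @ X
G[np.diag_indices_from(G)] += l2_norm
P = np.linalg.inv(G)
B = P / (-np.diag(P))           # divide each column j by -P_jj
B[np.diag_indices_from(B)] = 0.0

scores = X @ B                  # users-x-items predicted scores
assert np.allclose(np.diag(B), 0.0)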
               scoring=SCORE, n_FAQs=NFAQS, pre=PRECLASSIFIER)
print(cv_scores)
print(scores)

SIM_THRESH = .965

MODEL = "tfidf_w2v_top5w"
# embedding(model=MODEL, data_prefix=DATA_PREFIX)
# similarity(model=MODEL, thresh=SIM_THRESH)
# classifier(model=MODEL, scoring=SCORE, n_FAQs=NFAQS, pre=PRECLASSIFIER)
scores[0, :] = test(MODEL, data_prefix=DATA_PREFIX, scoring=SCORE,
                    n_FAQs=NFAQS, pre=PRECLASSIFIER)

MODEL = "tfidf_w2v_top5a"
embedding(model=MODEL, data_prefix=DATA_PREFIX)
similarity(model=MODEL, thresh=SIM_THRESH)
classifier(model=MODEL, scoring=SCORE, n_FAQs=NFAQS, pre=PRECLASSIFIER)
scores[1, :] = test(MODEL, data_prefix=DATA_PREFIX, scoring=SCORE,
                    n_FAQs=NFAQS, pre=PRECLASSIFIER)

print(scores)

# TODO: mirror test with validate
def _generateTrainData_low_ram(self):

    print(self.NAME + ": Generating train data")

    start_time_batch = time.time()

    # Only the structure of the content similarity matters here
    S_matrix_contentKNN = similarity(self.ICM.T, shrink=0, k=self.topK)
    S_matrix_contentKNN = self._check_matrix(S_matrix_contentKNN, "csr")

    self._writeLog(self.NAME + ": Collaborative S density: {:.2E}, nonzero cells {}".format(
        self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
        self.S_matrix_target.nnz))

    self._writeLog(self.NAME + ": Content S density: {:.2E}, nonzero cells {}".format(
        S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
        S_matrix_contentKNN.nnz))

    if self.normalize_similarity:
        # Compute the L2 norm of each item's feature vector
        sum_of_squared_features = np.array(self.ICM.T.power(2).sum(axis=0)).ravel()
        sum_of_squared_features = np.sqrt(sum_of_squared_features)

    num_common_coordinates = 0

    estimated_n_samples = int(S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota) * 1.2)

    self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

    num_samples = 0

    for row_index in range(self.n_items):

        start_pos_content = S_matrix_contentKNN.indptr[row_index]
        end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]
        content_coordinates = S_matrix_contentKNN.indices[start_pos_content:end_pos_content]

        start_pos_target = self.S_matrix_target.indptr[row_index]
        end_pos_target = self.S_matrix_target.indptr[row_index + 1]
        target_coordinates = self.S_matrix_target.indices[start_pos_target:end_pos_target]

        # Check whether each content coordinate is associated with a nonzero
        # target value: if true, the content coordinate has a collaborative
        # nonzero value; if false, its collaborative value is zero.
        is_common = np.in1d(content_coordinates, target_coordinates)

        num_common_in_current_row = is_common.sum()
        num_common_coordinates += num_common_in_current_row

        for index in range(len(is_common)):

            if num_samples == estimated_n_samples:
                # Preallocated arrays are full: grow them by a fixed block
                dataBlock = 1000000
                self.row_list = np.concatenate((self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                self.col_list = np.concatenate((self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                self.data_list = np.concatenate((self.data_list, np.zeros(dataBlock, dtype=np.float64)))

            if is_common[index]:
                # If the cell exists in the target matrix, add its value
                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index

                new_data_value = self.S_matrix_target[row_index, col_index]

                if self.normalize_similarity:
                    new_data_value *= sum_of_squared_features[row_index] * sum_of_squared_features[col_index]

                self.data_list[num_samples] = new_data_value
                num_samples += 1

            elif np.random.rand() <= self.add_zeros_quota:
                # Otherwise keep it as an explicit zero with a certain probability
                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index
                self.data_list[num_samples] = 0.0
                num_samples += 1

        if time.time() - start_time_batch > 30 or num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota):
            # Progress relative to the estimated total number of samples
            # (parentheses added: the total is nnz * (1 + add_zeros_quota))
            print(self.NAME + ": Generating train data. Sample {} ( {:.2f} %) ".format(
                num_samples,
                num_samples / (S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)) * 100))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_batch = time.time()

    self._writeLog(self.NAME + ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells".format(
        num_common_coordinates, S_matrix_contentKNN.nnz,
        num_common_coordinates / S_matrix_contentKNN.nnz * 100))

    # Discard the unused preallocated cells at the end of the arrays
    self.row_list = self.row_list[:num_samples]
    self.col_list = self.col_list[:num_samples]
    self.data_list = self.data_list[:num_samples]

    data_nnz = sum(np.array(self.data_list) != 0)
    data_sum = sum(self.data_list)

    collaborative_nnz = self.S_matrix_target.nnz
    collaborative_sum = sum(self.S_matrix_target.data)

    self._writeLog(self.NAME + ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
                   "average over all collaborative data is {:.2E}".format(
                       data_sum, data_sum / data_nnz, collaborative_sum / collaborative_nnz))
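# The triplets produced above (row_list, col_list, data_list) mark which cells
# of the collaborative similarity the learned feature weights should
# reproduce, plus a quota of explicit zeros as negative samples. A minimal
# sketch of how they can be assembled into a sparse training target; the
# actual solver in this repo may consume the triplets differently.
import numpy as np
import scipy.sparse as sps

def build_train_target(row_list, col_list, data_list, n_items):
    # COO triplets -> CSR; duplicate (row, col) pairs, if any, are summed
    return sps.csr_matrix((data_list, (row_list, col_list)),
                          shape=(n_items, n_items))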