def RMSE(prediction: np.ndarray, ground_truth: csr_matrix) -> float:
    """
    Calculate the Root Mean Square Error over the observed (nonzero) entries.

    Params:
        prediction: predicted matrix (dense)
        ground_truth: real matrix (sparse)
    """
    logger = logging.getLogger(__name__)
    logger.debug('RMSE calculating...')
    # Keep only the entries that are actually observed in the ground truth.
    prediction = prediction[ground_truth.nonzero()].flatten()
    logger.debug("Predict: " + str(prediction) + " length: " + str(len(prediction)))
    ground_truth = ground_truth[ground_truth.nonzero()].A.flatten()
    logger.debug("Test: " + str(ground_truth) + " length: " + str(len(ground_truth)))
    ret = sqrt(mean_squared_error(prediction, ground_truth))
    logger.info('RMSE: ' + str(ret))
    return ret
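# Minimal usage sketch for RMSE above, with made-up ratings; it assumes the module-level
# imports the function relies on (numpy, scipy.sparse, sklearn.metrics, math.sqrt, logging).
import numpy as np
from scipy.sparse import csr_matrix

toy_predicted = np.array([[4.1, 0.0, 2.9],
                          [0.0, 3.2, 5.1]])
toy_actual = csr_matrix(np.array([[4.0, 0.0, 3.0],
                                  [0.0, 3.0, 5.0]]))
# The error is computed only over the four observed (nonzero) ratings.
print(RMSE(toy_predicted, toy_actual))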
def make_sparse_tensor(x: sparse.csr_matrix):
    # Convert a scipy CSR matrix into a torch sparse FloatTensor (COO layout).
    rows, cols = x.nonzero()
    data = x.data
    i = torch.LongTensor([rows, cols])
    v = torch.FloatTensor(data)
    res = torch.sparse.FloatTensor(i, v, torch.Size(x.shape))
    return res
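# Usage sketch for make_sparse_tensor above (assumes `torch` and `scipy.sparse as sparse`
# are imported at module level, as the function requires; uses the legacy torch.sparse API).
import numpy as np
from scipy import sparse

toy_dense = np.array([[0.0, 1.5, 0.0],
                      [2.0, 0.0, 0.0]], dtype=np.float32)
toy_sparse_tensor = make_sparse_tensor(sparse.csr_matrix(toy_dense))
# Densifying the sparse tensor reproduces the original values.
print(toy_sparse_tensor.to_dense())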
def _csr_swap_zero_nonzero_in_row(row: sparse.csr_matrix, rng: np.random.Generator,
                                  p: float = 0.1) -> sparse.csr_matrix:
    """
    Swap zero and nonzero values within a single row.
    Typically, most values are 0, so we do this for p * num_nonzero entries.
    This is exact, meaning we never swap fewer or more values.
    """
    assert row.shape[0] == 1
    nonzero_idx = row.nonzero()[1]
    arr = row.toarray().squeeze()
    zero_idx = np.where(arr == 0)[0]
    # Because # nonzero << # zero, we use # nonzero to determine the number of swaps
    n = int(round(len(nonzero_idx) * p))
    # Choose indices to swap
    zero_idx_swap = rng.choice(zero_idx, n, replace=False)
    nonzero_idx_swap = rng.choice(nonzero_idx, n, replace=False)
    # Transfer nonzero values to the selected zero indices
    arr[zero_idx_swap] = arr[nonzero_idx_swap]
    # Zero out the original values at the nonzero indices
    arr[nonzero_idx_swap] = 0
    retval = sparse.csr_matrix(arr)
    assert retval.shape == row.shape
    assert len(retval.nonzero()[1]) == len(nonzero_idx)
    return retval
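# Usage sketch for _csr_swap_zero_nonzero_in_row above, applied to one row of a random
# sparse matrix (illustrative data only).
import numpy as np
from scipy import sparse

swap_rng = np.random.default_rng(0)
random_mat = sparse.random(4, 50, density=0.2, format="csr", random_state=0)
first_row = random_mat.getrow(0)
augmented_row = _csr_swap_zero_nonzero_in_row(first_row, swap_rng, p=0.5)
# The number of stored values is preserved; only their positions change.
print(first_row.nnz, augmented_row.nnz)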
def calculate_sparse_tf_idf_matrix(
        term_frequencies_sparse_matrix: csr_matrix,
        documents_frequencies: Dict[int, int]) -> csr_matrix:
    """
    Computes the TF-IDF matrix from the TF matrix and the DF vector.

    :param term_frequencies_sparse_matrix: sparse TF matrix of shape
        (number of documents, vocabulary size) containing term frequencies per document
    :param documents_frequencies: dictionary (vector) of document frequencies per term id
    :return: sparse TF-IDF matrix
    """
    # Get the total number of documents and the vocabulary size
    num_documents, vocab_size = term_frequencies_sparse_matrix.shape
    # Create an empty sparse matrix
    tf_idf_sparse_matrix = csr_matrix((num_documents, vocab_size), dtype=float)
    non_empty_row_ids, non_empty_col_ids = term_frequencies_sparse_matrix.nonzero()
    # Iterate over the nonzero elements of the sparse TF matrix
    for doc_id, token_id in zip(non_empty_row_ids, non_empty_col_ids):
        # Take the frequency of the term in the document from the TF matrix
        tf = term_frequencies_sparse_matrix[doc_id, token_id]
        # Compute the inverse document frequency of the term (IDF)
        idf = num_documents / documents_frequencies[token_id]
        # TF-IDF = TF * log(IDF)
        tf_log_idf_value = tf * math.log2(idf)
        tf_idf_sparse_matrix[doc_id, token_id] = tf_log_idf_value
    return tf_idf_sparse_matrix
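# Usage sketch for calculate_sparse_tf_idf_matrix above, with a toy 2-document,
# 3-term corpus (assumes `math`, `csr_matrix` and `Dict` are already imported).
from scipy.sparse import csr_matrix

toy_tf = csr_matrix([[2, 1, 0],
                     [0, 3, 1]])
# Document frequency per token id: the number of documents containing the term.
toy_df = {0: 1, 1: 2, 2: 1}
print(calculate_sparse_tf_idf_matrix(toy_tf, toy_df).toarray())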
def augment_with_user_similarity_best_scores(urm: sps.csr_matrix, similarity, topK, value=0.5,
                                             remove_seen=True, users=None):
    # Create a copy of the urm
    augmented_urm = urm.tolil(copy=True).astype(np.float64)
    # Compute the score matrix
    score_matrix = similarity.dot(urm).astype(np.float64)
    # Remove items that have already been seen
    if remove_seen:
        indices_seen = urm.nonzero()
        score_matrix[indices_seen] = float("-inf")
    # Filter out the data that are not in the users list
    if users is not None:
        score_matrix = score_matrix[users]
    # Find the topK generated interactions
    top_indices = score_matrix.data.argpartition(-topK)[-topK:]
    max_k = score_matrix.data[top_indices].min()
    x = sps.find(score_matrix)
    user_item_data = zip(x[0], x[1], x[2])
    user_item = [(user, item) for user, item, data in user_item_data if data >= max_k]
    # Insert the best items into the urm matrix
    for user, item in user_item:
        augmented_urm[user, item] += value
    # Return the augmented urm
    return augmented_urm.tocsr()
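# Usage sketch for augment_with_user_similarity_best_scores above, with a tiny URM and
# a hypothetical user-user co-occurrence similarity (assumes `numpy as np` and
# `scipy.sparse as sps` at module level).
import numpy as np
import scipy.sparse as sps

toy_urm = sps.csr_matrix(np.array([[1, 0, 1, 0],
                                   [0, 1, 0, 0],
                                   [1, 1, 0, 0]], dtype=np.float64))
toy_similarity = sps.csr_matrix(toy_urm.dot(toy_urm.T))
toy_augmented = augment_with_user_similarity_best_scores(toy_urm, toy_similarity, topK=2, value=0.5)
print(toy_augmented.toarray())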
def _get_igraph_from_adjacency(adj: csr_matrix, simplify=True):
    """Get an undirected igraph graph from an adjacency matrix.

    Better than Graph.Adjacency for sparse matrices.

    Parameters
    ----------
    adj
        sparse, weighted, symmetrical adjacency matrix.
    """
    sources, targets = adj.nonzero()
    weights = adj[sources, targets]
    if isinstance(weights, np.matrix):
        weights = weights.A1
    if isinstance(weights, csr_matrix):
        # this is the case when len(sources) == len(targets) == 0, see #236
        weights = weights.toarray()
    g = ig.Graph(directed=not simplify)
    g.add_vertices(adj.shape[0])  # this adds adj.shape[0] vertices
    g.add_edges(list(zip(sources, targets)))
    g.es["weight"] = weights
    if g.vcount() != adj.shape[0]:
        logging.warning(
            f"The constructed graph has only {g.vcount()} nodes. "
            "Your adjacency matrix contained redundant nodes.")  # type: ignore
    if simplify:
        # since we start from a symmetrical matrix, and the graph is undirected,
        # it is fine to take either of the two edges when simplifying.
        g.simplify(combine_edges="first")
    return g
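# Usage sketch for _get_igraph_from_adjacency above, on a small symmetric weighted
# adjacency matrix (assumes `igraph as ig`, `numpy as np` and `logging` at module level).
import numpy as np
from scipy.sparse import csr_matrix

toy_adj = csr_matrix(np.array([[0.0, 1.0, 0.0],
                               [1.0, 0.0, 2.0],
                               [0.0, 2.0, 0.0]]))
toy_graph = _get_igraph_from_adjacency(toy_adj)
# Three vertices and two undirected edges after simplification.
print(toy_graph.vcount(), toy_graph.ecount(), toy_graph.es["weight"])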
def construct_graph(W: csr_matrix, directed: bool = False,
                    adjust_weights: bool = True) -> "igraph.Graph":
    assert issparse(W)
    s, t = W.nonzero()
    w = W.data
    if not directed:
        # Keep only the upper triangle so each undirected edge appears once.
        idx = s < t
        s = s[idx]
        t = t[idx]
        w = w[idx]
    if adjust_weights:
        # Rescale by the median weight and round to 2 decimal places.
        w = ((w / np.median(w)) * 100.0 + 0.5).astype(int) / 100.0
        idx = w > 0.0
        if idx.sum() < w.size:
            s = s[idx]
            t = t[idx]
            w = w[idx]
    G = igraph.Graph(directed=directed)
    G.add_vertices(W.shape[0])
    G.add_edges(zip(s, t))
    G.es["weight"] = w
    return G
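# Usage sketch for construct_graph above (assumes `igraph`, `numpy as np` and scipy's
# `issparse` are imported at module level, as the function requires).
import numpy as np
from scipy.sparse import csr_matrix

toy_W = csr_matrix(np.array([[0.0, 0.8, 0.0],
                             [0.8, 0.0, 0.4],
                             [0.0, 0.4, 0.0]]))
toy_G = construct_graph(toy_W, directed=False, adjust_weights=True)
# Two undirected edges with median-rescaled weights.
print(toy_G.ecount(), list(toy_G.es["weight"]))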
def _X_to_df(self, X: sps.csr_matrix, user_ids: List[Any]) -> pd.DataFrame:
    if self.item_ids is None:
        raise RuntimeError("Setting item_ids is required to use this method.")
    # Ensure a canonical (sorted) index order before extracting row/col/data.
    X.sort_indices()
    row, col = X.nonzero()
    data = X.data
    return pd.DataFrame(
        dict(
            user_id=[user_ids[r] for r in row],
            item_id=[self.item_ids[c] for c in col],
            rating=data,
        ))
def _compute_sparse_gradient(hat_vect_matrix: sparse.csr_matrix, X: sparse.csr_matrix,
                             z: np.ndarray, y: np.ndarray) -> np.ndarray:
    # grad_z = (hat_vect_matrix.multiply(z @ y.T - X)).sum(axis=1) <-- compressed but
    # memory-inefficient implementation (`A` = z @ y.T is dense)
    # return grad_z.A
    # Sparse matrices are represented by data, row and column indices
    sparse_X_tuple = (X.data, *X.nonzero())
    # Create the sparse representation of the difference matrix to avoid passing
    # `hat_vect_matrix` as a full dense matrix
    diff_matrix = sparse.csr_matrix(
        (_compute_sparse_difference_matrix(sparse_X_tuple, z, y)),
        shape=X.shape,
        dtype=z.dtype)
    # print("Norm-check", np.linalg.norm(hat_vect_matrix.multiply(diff_matrix).toarray()
    #       - hat_vect_matrix.multiply(z @ y.T - X).toarray()))
    # Sum over rows (axis=1)
    return hat_vect_matrix.multiply(diff_matrix).sum(axis=1)
def row_normalize_csr_matrix(matrix: csr_matrix) -> csr_matrix:
    """
    Row-normalize a csr matrix without mutating the input

    :param matrix: scipy.sparse.csr_matrix instance
    """
    if not isinstance(matrix, csr_matrix):
        raise TypeError('expected input to be a scipy csr_matrix')
    if any(matrix.data == 0):
        raise ValueError(
            'input must be scipy.sparse.csr_matrix and must not store zeros')
    # get the row index of every nonzero element in the matrix
    row_idx, col_idx = matrix.nonzero()
    # compute unraveled row sums
    row_sums = matrix.sum(axis=1).A1
    # divide the data by the (broadcast) row sums
    normalized = matrix.data / row_sums[row_idx]
    return csr_matrix((normalized, (row_idx, col_idx)), shape=matrix.shape)
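# Usage sketch for row_normalize_csr_matrix above (hypothetical data with no stored zeros).
import numpy as np
from scipy.sparse import csr_matrix

toy_matrix = csr_matrix(np.array([[1.0, 3.0, 0.0],
                                  [0.0, 2.0, 2.0]]))
row_normalized = row_normalize_csr_matrix(toy_matrix)
print(row_normalized.toarray())       # first row becomes 0.25/0.75, second row 0.5/0.5
print(row_normalized.sum(axis=1).A1)  # every row sums to 1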
def extreme_multilabel_classification_report(
        y_true: csr_matrix, y_score: csr_matrix,
        k_range: Iterable = range(1, 11)) -> dict:
    """
    Unused helper that gives an overview of prediction results:

    1. Precision at k
    2. DCG at k
    3. nDCG at k
    4. F1 (macro) score

    :param y_true: ground-truth label matrix
    :param y_score: predicted score matrix
    :param k_range: values of k to evaluate
    :return: dict of metrics
    """
    # TODO use sklearn function to check dimensions
    if y_true.shape != y_score.shape:
        raise ValueError('y_true and y_score must have the same dimensions')
    # init dict
    result = dict()
    result['precision@k'] = {}
    result['dcg@k'] = {}
    # precision at k
    for k in k_range:
        result['precision@k'][str(k)] = sparse_average_precision_at_k(y_true, y_score, k=k)
        result['dcg@k'][str(k)] = average_discounted_cumulative_gain_at_k(y_true, y_score, k=k)
    # TODO nDCG
    # F1 macro average: cast the scores to a binary matrix
    binary_pred = lil_matrix(y_score.shape, dtype='int8')
    binary_pred[y_score.nonzero()] = 1
    # binary_pred = binary_pred.tocsr()
    result['f1_macro'] = f1_score(y_true, binary_pred, average='macro')
    result['label_ranking_average_precision_score'] = label_ranking_average_precision_score(
        y_true.toarray(), y_score.toarray())
    return result
def rank_nodes(network: sparse.csr_matrix, num_walks=1024, max_walk_length=10):
    # Split the walk budget across buckets of increasing walk lengths.
    samples = np.random.uniform(0, 1, num_walks)
    distribution = np.histogram(samples, max_walk_length)[0]
    sparse_pointers = network.indptr
    sparse_neighbors = network.indices
    hashes = []
    # Out-degree of every node (0 for nodes with no stored entries).
    degree = Counter(network.nonzero()[0])
    degree = [degree[i] if i in degree else 0 for i in range(network.shape[0])]
    for i in range(network.shape[0]):
        generated_walks = []
        # Generate walks
        for j, num in enumerate(distribution):
            walk_matrix = -np.ones((num, (j + 2)), dtype=np.uint32, order='C')
            walk_matrix = np.reshape(walk_matrix, (walk_matrix.size,), order='C')
            numba_walk_kernel(walk_matrix, i, sparse_pointers, sparse_neighbors,
                              num_steps=j + 1, num_walks=num)
            wm = walk_matrix.tolist()
            generated_walks += [np.mean([degree[node] for node in wm[k:k + num]])
                                for k in range(0, len(wm), num)]
        hashes.append(np.mean(generated_walks))
    return hashes
def _csr_swap_in_row(row: sparse.csr_matrix, rng: np.random.Generator,
                     p: float = 0.1) -> sparse.csr_matrix:
    """
    Helper function for swapping nonzero values within a given row
    """
    assert row.shape[0] == 1, "Did not get a row!"
    nonzero_idx = row.nonzero()[1]
    shuffle_idx = np.arange(len(nonzero_idx))
    # Randomly choose a proportion of the nonzero indices to shuffle
    n = int(round(len(shuffle_idx) * p))
    swap_idx = nonzero_idx[rng.choice(shuffle_idx, size=n, replace=False)]
    # Shuffle the indices we chose above
    dest_idx = rng.choice(swap_idx, size=len(swap_idx), replace=False)
    assert swap_idx.shape == dest_idx.shape
    arr = row.toarray().squeeze()
    assert np.all(arr[swap_idx] != 0)
    arr[dest_idx] = arr[swap_idx]
    retval = sparse.csr_matrix(arr)
    return retval
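# Usage sketch for _csr_swap_in_row above: shuffle the positions of roughly half of the
# nonzero values within a single row (illustrative data only).
import numpy as np
from scipy import sparse

shuffle_rng = np.random.default_rng(42)
single_row = sparse.csr_matrix(np.array([[0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 4.0, 0.0]]))
shuffled_row = _csr_swap_in_row(single_row, shuffle_rng, p=0.5)
print(shuffled_row.toarray())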
def sparse_mat_get_rmse(u_mat: ss.csr_matrix, v_mat: ss.csr_matrix,
                        user_preference: ss.csr_matrix,
                        show_process: bool = True) -> np.float64:
    """
    Compute the RMSE in the sparse-matrix case.

    :param u_mat: U
    :param v_mat: V
    :param user_preference: user preference matrix
    :param show_process: whether to print computation progress
    :return: RMSE
    """
    non_zero = user_preference.nonzero()
    residue = 0
    total = non_zero[0].size
    for i in range(non_zero[0].size):
        if show_process:
            print('step', i, 'of', total)
        conducted = u_mat[non_zero[0][i], :].dot(v_mat[:, non_zero[1][i]])
        user_conducted = user_preference[non_zero[0][i], non_zero[1][i]]
        # print("user_conducted", user_conducted, "conducted", conducted)
        residue_each_element = user_conducted - conducted[0, 0]
        residue += residue_each_element ** 2
    # np.size on a sparse matrix returns the number of stored values,
    # i.e. the number of observed preferences.
    return np.sqrt(residue / np.size(user_preference))
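# Usage sketch for sparse_mat_get_rmse above, comparing a low-rank reconstruction U @ V
# against a sparse preference matrix (random toy factors; assumes `numpy as np` and
# `scipy.sparse as ss` at module level).
import numpy as np
import scipy.sparse as ss

factor_rng = np.random.default_rng(0)
toy_u = ss.csr_matrix(factor_rng.random((3, 2)))
toy_v = ss.csr_matrix(factor_rng.random((2, 4)))
toy_preference = ss.csr_matrix((np.array([4.0, 3.0, 5.0]),
                                (np.array([0, 1, 2]), np.array([1, 2, 0]))),
                               shape=(3, 4))
print(sparse_mat_get_rmse(toy_u, toy_v, toy_preference, show_process=False))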
def to_csv(self, filename: str, X: sparse.csr_matrix):
    """
    Dump a csr sparse matrix to a csv file, restoring the original MovieLens format.

    Args:
        filename (str): Path of the csv file to write.
        X (scipy.sparse.csr_matrix): Matrix of ratings to dump.
    """
    data, rows, cols = X.data, *X.nonzero()
    with open(filename, mode='w') as file:
        file_matrix = csv.writer(file, delimiter=',', quotechar='"',
                                 quoting=csv.QUOTE_MINIMAL)
        file_matrix.writerow(['UserId', 'MovieId', 'Rating'])
        for rating, user_id, movie_id in zip(data, rows, cols):
            # user ids start from 1 in MovieLens
            user_id += 1
            # restore ratings to their original scale
            rating = self._rescale_back_rating(rating)
            # restore the movie id to the MovieLens numbering
            movie_id = self.inverse_movie_map[movie_id]
            file_matrix.writerow([user_id, movie_id, rating])
def _iter_meta(ids: ndarray, meta: csr_matrix, n_dim: int) -> Iterator[List[int]]:
    """
    Lazily evaluate metadata in the provided CSR matrix.

    Parameters
    ----------
    ids: ndarray
        An array of IDs. For items, this will correspond to individual item IDs.
        For users, this will correspond to individual user IDs.
    meta: csr_matrix
        A sparse matrix of (NxM) dimensions, where N corresponds to the number of
        user/item IDs (above) and M corresponds to the number of user/item metadata
        features (vocab) in the dataset.
    n_dim: int
        The length of the output vectors. Make sure this is large enough to actually
        append some metadata to your output vectors (i.e. > 1).

    Returns
    -------
    output: Iterator
        An iterator, where each ID in the list is mapped to its corresponding metadata.
        The output shape of each element is then a list of 'n_dim' length.
    """
    groups = defaultdict(list)
    _ids, tags = meta.nonzero()
    for _id, _tag in zip(_ids, tags):
        groups[_id].append(_tag)
    for _id in ids:
        group = groups[_id]
        padding = [0] * max(0, n_dim - len(group))
        features = [_id, *group, *padding][:n_dim]
        yield features
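# Usage sketch for _iter_meta above: map two ids to fixed-length, zero-padded feature
# lists (assumes `defaultdict` and the numpy/typing imports used by the function).
import numpy as np
from scipy.sparse import csr_matrix

toy_ids = np.array([0, 1])
toy_meta = csr_matrix(np.array([[1, 0, 1],
                                [0, 1, 0]]))
for meta_features in _iter_meta(toy_ids, toy_meta, n_dim=4):
    # Each list holds the id itself, its tag ids, then zero padding up to n_dim.
    print(meta_features)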
def normalize_vectors(mx: sparse.csr_matrix, axis: int) -> sparse.csr_matrix:
    """Performs normalization of vectors (i.e. divides each vector by its
    corresponding Euclidean norm).

    Parameter `axis` can be 0 (column-vectors) or 1 (row-vectors).

    :param mx: sparse matrix
    :param axis: 0 or 1
    :return: sparse matrix
    """
    if axis not in {0, 1}:
        raise ValueError('Axis must be either 0 or 1.')
    mx = mx.copy().astype(np.float64)
    # Squared values summed along `axis` give the squared norms.
    mx_norms = mx.copy()
    mx_norms.data **= 2
    mx_norms = mx_norms.sum(axis=axis).A.flatten() ** 0.5
    # Broadcast each norm to the nonzero entries of the corresponding vector.
    mx_norms = mx_norms[mx.nonzero()[1 - axis]]
    mx.data /= mx_norms
    return mx
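# Usage sketch for normalize_vectors above: L2-normalize the row vectors of a small
# sparse matrix (assumes `numpy as np` and `scipy.sparse as sparse` at module level).
import numpy as np
from scipy import sparse

toy_vectors = sparse.csr_matrix(np.array([[3.0, 4.0, 0.0],
                                          [0.0, 0.0, 2.0]]))
unit_rows = normalize_vectors(toy_vectors, axis=1)
print(unit_rows.toarray())  # rows become 0.6/0.8 and 1.0, each with unit Euclidean norm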
def __save_to_docword_file(self, bag_of_words: csr_matrix, issues: List[TokenizedIssue],
                           target_dir: str) -> None:
    """
    Save words to a docword file in the following format:

    D (number of documents)
    W (number of words)
    NNZ (total number of data rows that follow)
    docID wordID count
    docID wordID count
    .....

    :param bag_of_words: Matrix where each cell represents the number of occurrences
        of a word in a document
    :param issues: Tokenized issues
    :param target_dir: Target directory where the docword file will be created
    :return: None
    """
    target_path = os.path.join(target_dir, "docword.issues.txt")
    with open(target_path, "w") as docword_file:
        docword_file.write(str(len(issues)) + "\n")
        docword_file.write(str(len(self.count_vectorizer.get_feature_names())) + "\n")
        docword_file.write(str(bag_of_words.nnz) + "\n")
        nnz_x, nnz_y = bag_of_words.nonzero()
        for x, y in zip(nnz_x, nnz_y):
            docword_file.write(
                "%s %s %s\n" % (str(issues[x].id), str(y + 1), str(bag_of_words[x, y])))
def csr_to_dicts(x: csr_matrix, dim_names=None):
    # Default column names are simply the column indices.
    if dim_names is None:
        dim_names = [i for i in range(x.shape[1])]
    vert_idx, horiz_idx = x.nonzero()
    # For every row, pair the column names of its nonzero entries with their values.
    return [{dim_names[k]: v
             for k, v in zip(horiz_idx[np.where(vert_idx == row_idx)],
                             x.data[np.where(vert_idx == row_idx)])}
            for row_idx in range(x.shape[0])]
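# Usage sketch for csr_to_dicts above: each row becomes a {column_name: value} dict
# containing only its nonzero entries.
import numpy as np
from scipy.sparse import csr_matrix

toy_x = csr_matrix(np.array([[1.0, 0.0, 2.0],
                             [0.0, 3.0, 0.0]]))
# One dict per row, e.g. {'a': 1.0, 'c': 2.0} for the first row.
print(csr_to_dicts(toy_x, dim_names=["a", "b", "c"]))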
def _mutual_proximity_empiric_sparse(S: csr_matrix,
                                     test_set_ind: np.ndarray = None,
                                     min_nnz=0,
                                     verbose: int = 0,
                                     log=None,
                                     n_jobs=None):
    """MP empiric for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_empiric()
    """
    if verbose and log:
        log.message("Starting MP empiric for sparse matrices.")
    self_value = 1.  # similarity matrix
    n = S.shape[0]
    if not n_jobs:
        n_jobs = 1
    elif n_jobs == -1:
        n_jobs = cpu_count()
    else:
        pass
    # This will become S_mp.data
    shared_data = Array(ctypes.c_double, S.data.size)
    shared_data_np = np.ctypeslib.as_array(shared_data.get_obj())
    if verbose and log:
        log.message("Spawning processes and starting MP computation.")
    with Pool(processes=n_jobs,
              initializer=_mpes_init,
              initargs=(S, shared_data)) as pool:
        # Only consider the upper triangle (i <= j) of the symmetric matrix.
        S_nonzero = filterfalse(lambda ij: ij[0] > ij[1], zip(*S.nonzero()))
        for _ in pool.imap(func=partial(_mpes_sec_dist, args=(verbose, log, n, min_nnz)),
                           iterable=S_nonzero,
                           chunksize=int(1e5)):
            pass  # output stored by the worker function in the shared array
        pool.join()
    if verbose and log:
        log.message("Assemble upper-triangular MP matrix.")
    S_mp = csr_matrix((shared_data_np, S.indices, S.indptr),
                      shape=S.shape, copy=False).tolil()
    del shared_data, shared_data_np
    if verbose and log:
        log.message("Symmetrizing matrix.")
    S_mp += S_mp.T
    # Retain original distances for objects with too few neighbors.
    # That is, keep distances FROM these objects to others (rows), but
    # set distances of other objects TO them to NaN (columns).
    # Returned matrix is thus NOT SYMMETRIC.
    if verbose and log:
        log.message(("Retain original similarities for objects with too few "
                     "neighbors. If there are any, the output matrix will "
                     "not be symmetric anymore! (Rows corresponding to these "
                     "objects will be in original space; corresponding "
                     "columns will contain NaN)."))
    for row in np.argwhere(S.getnnz(axis=1) <= min_nnz):
        row = row[0]  # use a scalar for indexing instead of an array
        S_mp[row, :] = S.getrow(row)
    if verbose and log:
        log.message("Setting self similarities.")
    for i in range(n):
        S_mp[i, i] = self_value  # need to set self values
    if verbose and log:
        log.message("Converting to CSR matrix and returning.")
    return S_mp.tocsr()
def _getPossiblePositiveEdgeIdxs(self, mtx: sp.csr_matrix) -> np.ndarray:
    # Stack the (row, col) indices of the nonzero entries into an (nnz, 2) array.
    nonzeroTpl = mtx.nonzero()
    return np.dstack([nonzeroTpl[0], nonzeroTpl[1]]).reshape(-1, 2)
def X_to_df(X: sps.csr_matrix, uids: np.ndarray) -> pd.DataFrame:
    rows, cols = X.nonzero()
    return pd.DataFrame(
        dict(user_id=[uids[row] for row in rows],
             item_id=unique_item_ids[cols])
    )
def X_to_df(X: sps.csr_matrix, uids: List[Any]) -> pd.DataFrame:
    rows, cols = X.nonzero()
    return pd.DataFrame(
        dict(user_id=[uids[row] for row in rows],
             item_id=item_id_reprod[cols])
    )