def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply two sparse matrices, keeping the best ``ntop`` entries per row.

    The product itself is computed by ``ct.sparse_dot_topn``, which writes
    the selected entries of ``A * B`` into pre-allocated CSR buffers. Only
    products greater than ``lower_bound`` are expected to be kept.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; ``A.shape[1]`` must equal ``B.shape[0]``.
        Both are converted to CSR if necessary.
    ntop : int
        Maximum number of entries kept for each row of the result.
    lower_bound : number, optional
        Threshold forwarded unchanged to the kernel.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` with dtype ``A.dtype``.

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` do not match.
    """
    # Force A and B into CSR form; this is free when they already are CSR.
    A = A.tocsr()
    B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # The kernel uses A's column ids to index B's row pointer, so a shape
    # mismatch must be rejected before any native code runs.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # With no stored entries on either side the product is empty; return it
    # directly rather than handing zero-length buffers to the kernel.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # Each of the M result rows holds at most ntop entries.
    nnz_max = M * ntop

    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    # The kernel fills indptr / indices / data in place.
    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Return the sparse product ``A * B`` limited to ``ntop`` entries per row.

    ``A`` and ``B`` are converted to CSR and passed to
    ``ct.sparse_dot_topn`` together with output buffers sized for at most
    ``ntop`` entries per row. ``lower_bound`` is forwarded to the kernel as
    the threshold a product must exceed to be kept.

    :param A: left sparse operand, shape ``(M, K)``.
    :param B: right sparse operand, shape ``(K, N)``.
    :param ntop: maximum number of entries kept per result row.
    :param lower_bound: threshold forwarded to the kernel (default ``0``).
    :return: ``csr_matrix`` of shape ``(M, N)`` with dtype ``A.dtype``.
    :raises ValueError: if ``A.shape[1] != B.shape[0]``.
    """
    A = A.tocsr()
    B = B.tocsr()

    n_rows, inner_a = A.shape
    inner_b, n_cols = B.shape

    # Reject incompatible operands up front: the native routine does no
    # bounds checking of A's column ids against B's row count.
    if inner_a != inner_b:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # An operand without stored entries yields an all-zero product, so the
    # kernel (and its zero-length input buffers) can be skipped entirely.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((n_rows, n_cols), dtype=A.dtype)

    idx_dtype = np.int32
    nnz_max = n_rows * ntop  # upper bound on stored entries in the result

    indptr = np.zeros(n_rows + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(n_rows, n_cols))
def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1):
    """
    This function will return a matrix C in CSR format, where
    C = [sorted top n results and results > lower_bound for each row of A * B]

    Input:
        A and B: two CSR matrices; A.shape[1] must equal B.shape[0]
        ntop: n top results
        lower_bound: a threshold that the elements of A*B must be greater than
        use_threads: use the multi-threaded kernel or not
        n_jobs: number of threads, must be >= 1 (only checked when
                use_threads is true)

    Output:
        C: result matrix, shape (A.shape[0], B.shape[1]), dtype A.dtype

    Raises:
        ValueError: if use_threads is true and n_jobs < 1, or if the inner
                    dimensions of A and B differ.

    N.B. if A and B are not CSR format, they will be converted to CSR
    """
    if not isspmatrix_csr(A):
        A = A.tocsr()
    if not isspmatrix_csr(B):
        B = B.tocsr()

    # Validate the thread count before anything else so that a bad n_jobs is
    # always reported, even when the result would be trivially empty.
    if use_threads and n_jobs < 1:
        err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!'
        raise ValueError(err_str)

    M, K1 = A.shape
    K2, N = B.shape

    # The kernels index B's row pointer with A's column ids; mismatched inner
    # dimensions must never reach native code.
    if K1 != K2:
        err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!'
        raise ValueError(err_str)

    # No stored entries on either side means an empty product; skip the
    # kernel instead of passing it zero-length buffers.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # At most ntop entries are kept for each of the M rows.
    nnz_max = M * ntop

    # Output buffers, filled in place by the kernel.
    indptr = np.empty(M + 1, dtype=idx_dtype)
    indices = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    a_indptr = np.asarray(A.indptr, dtype=idx_dtype)
    a_indices = np.asarray(A.indices, dtype=idx_dtype)
    b_indptr = np.asarray(B.indptr, dtype=idx_dtype)
    b_indices = np.asarray(B.indices, dtype=idx_dtype)

    if not use_threads:
        ct.sparse_dot_topn(
            M, N,
            a_indptr, a_indices, A.data,
            b_indptr, b_indices, B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    else:
        ct_thread.sparse_dot_topn_threaded(
            M, N,
            a_indptr, a_indices, A.data,
            b_indptr, b_indices, B.data,
            ntop,
            lower_bound,
            indptr, indices, data, n_jobs)

    return csr_matrix((data, indices, indptr), shape=(M, N))
def cossim_top(A, B, ntop, lower_bound=0):
    """Return the top-``ntop`` entries per row of the sparse product ``A * B``.

    The ``sparse_dot_topn`` extension is imported lazily; when it is not
    installed an installation hint is printed and the process exits with
    status 1.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Operands of the product; ``A.shape[1]`` must equal ``B.shape[0]``.
        Both are converted to CSR before their index arrays are read.
    ntop : int
        Maximum number of entries kept per result row.
    lower_bound : number, optional
        Threshold forwarded unchanged to ``sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` with dtype ``A.dtype``.

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` do not match.
    """
    try:
        import sparse_dot_topn.sparse_dot_topn as ct
    except ModuleNotFoundError:
        # Adjacent string literals keep the message free of the indentation
        # whitespace that backslash continuations inside a string embed.
        print("This module requires the sparse_dot_topn library for "
              "accelerated sparse matrix multiplication, which can be found "
              "at https://github.com/ing-bank/sparse_dot_topn.")
        import sys
        sys.exit(1)

    # Both operands are read through their CSR index arrays below, so both
    # (not only B) have to be in CSR layout.
    A = A.tocsr()
    B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # The kernel indexes B's row pointer with A's column ids.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # Empty operand -> empty product; avoid calling the kernel on
    # zero-length buffers.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32
    nnz_max = M * ntop  # at most ntop entries per result row

    indptr = np.empty(M + 1, dtype=idx_dtype)
    indices = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))
def _awesome_cossim_top(self, ntop, lower_bound):
    '''
    Build the top-``ntop`` similarity matrix between ``self.source_names``
    and ``self.target_names`` and store it in ``self.sprse_mtx``.

    Adapted from
    https://gist.github.com/ymwdalex/5c363ddc1af447a9ff0b58ba14828fd6#file-awesome_sparse_dot_top-py

    The vectorizer is fitted once on both name collections so that the
    columns of the source matrix and the rows of the (transposed) target
    matrix refer to the same features. Fitting it separately on each
    collection would put the two operands in different feature spaces.

    :param ntop: maximum number of entries kept per source row.
    :param lower_bound: threshold forwarded to ``ct.sparse_dot_topn``.
    :raises ValueError: if the two vectorized matrices do not share their
        inner dimension.
    '''
    from itertools import chain

    # NOTE(review): assumes self.tfidf_vect follows the scikit-learn
    # vectorizer API (fit / transform) and that both name collections are
    # iterables of documents; confirm against the class definition.
    self.tfidf_vect.fit(list(chain(self.source_names, self.target_names)))

    # To CSR matrix, if needed.
    A = self.tfidf_vect.transform(self.source_names).tocsr()
    B = self.tfidf_vect.transform(self.target_names).transpose().tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # Defensive: the kernel indexes B's row pointer with A's column ids.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # Nothing stored on one side means an empty result; skip the kernel.
    if A.nnz == 0 or B.nnz == 0:
        self.sprse_mtx = csr_matrix((M, N), dtype=A.dtype)
        return

    idx_dtype = np.int32
    nnz_max = M * ntop  # at most ntop entries per source row

    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    self.sprse_mtx = csr_matrix((data, indices, indptr), shape=(M, N))
def awesome_cossim_top(A, B, ntop, pFromDir, pToDir, lower_bound=0):
    """Compute the top-``ntop`` sparse product ``A * B`` with error handling.

    On any exception raised while preparing or running the multiplication,
    the error and traceback are printed, ``utils.movefile(pFromDir, pToDir)``
    is called and ``-1`` is returned instead of a matrix.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Operands of the product; ``A.shape[1]`` must equal ``B.shape[0]``.
        Both are converted to CSR if necessary.
    ntop : int
        Maximum number of entries kept per result row.
    pFromDir, pToDir
        Arguments handed to ``utils.movefile`` when the computation fails.
    lower_bound : number, optional
        Threshold forwarded unchanged to ``ct.sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix or int
        The ``(A.shape[0], B.shape[1])`` result, or ``-1`` on failure
        (including incompatible operand shapes).
    """
    try:
        # Force A and B into CSR form; free when they already are CSR.
        A = A.tocsr()
        B = B.tocsr()

        M, K1 = A.shape
        K2, N = B.shape

        # Raise here so a shape mismatch takes the regular error path below
        # instead of reaching the native kernel, which indexes B's row
        # pointer with A's column ids.
        if K1 != K2:
            raise ValueError(
                'A matrix multiplication will be operated. '
                'A.shape[1] must be equal to B.shape[0]!')

        # Empty operand -> empty product; no need to call the kernel with
        # zero-length buffers.
        if A.nnz == 0 or B.nnz == 0:
            return csr_matrix((M, N), dtype=A.dtype)

        idx_dtype = np.int32
        nnz_max = M * ntop  # at most ntop entries per result row

        indptr = np.zeros(M + 1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N,
            np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    except Exception as e:
        print('*** ERROR[001]: Error in similarity calculating matrix: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return(-1)

    return csr_matrix((data, indices, indptr), shape=(M, N))
def _awesome_cossim_top(self, A, B, ntop):
    """
    Sparse top-n matrix product, used as a faster alternative to
    scikit-learn's cosine_similarity. Credit: ING Bank (sparse_dot_topn).

    Returns a matrix C in CSR format, where
    C = [sorted top n results and results > lower_bound for each row of A * B]

    Input:
        A and B: two sparse matrices (converted to CSR when they are not)
        ntop: n top results
        self.lowest_similarity: threshold the elements of A*B must exceed

    Output:
        C: result matrix of shape (A.shape[0], B.shape[1])

    Raises:
        ValueError: when A.shape[1] differs from B.shape[0]
    """
    left = A if isspmatrix_csr(A) else A.tocsr()
    right = B if isspmatrix_csr(B) else B.tocsr()

    n_rows, inner_left = left.shape
    inner_right, n_cols = right.shape
    if inner_left != inner_right:
        raise ValueError('A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!')

    int_type = np.int32
    capacity = n_rows * ntop  # room for ntop entries in every row

    # Zero-initialised output buffers; they already describe an all-zero
    # result, which is what is returned when the kernel is skipped.
    out_indptr = np.zeros(n_rows + 1, dtype=int_type)
    out_indices = np.zeros(capacity, dtype=int_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    # Basic check: only run the kernel when both operands store something.
    both_populated = len(left.indices) != 0 and len(right.indices) != 0
    if both_populated:
        ct.sparse_dot_topn(
            n_rows, n_cols,
            np.asarray(left.indptr, dtype=int_type),
            np.asarray(left.indices, dtype=int_type),
            left.data,
            np.asarray(right.indptr, dtype=int_type),
            np.asarray(right.indices, dtype=int_type),
            right.data,
            ntop,
            self.lowest_similarity,
            out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))
def awesome_cossim_topn(A, B, ntop, lower_bound=0):
    """
    This function will return a matrix C in CSR format, where
    C = [sorted top n results and results > lower_bound for each row of A * B]

    Input:
        A and B: two CSR matrices; A.shape[1] must equal B.shape[0]
        ntop: n top results
        lower_bound: a threshold that the elements of A*B must be greater than

    Output:
        C: result matrix, shape (A.shape[0], B.shape[1]), dtype A.dtype

    Raises:
        ValueError: if the inner dimensions of A and B differ

    N.B. if A and B are not CSR format, they will be converted to CSR
    """
    if not isspmatrix_csr(A):
        A = A.tocsr()
    if not isspmatrix_csr(B):
        B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # The kernel looks up rows of B by the column ids stored in A, so the
    # inner dimensions have to agree before native code is entered.
    if K1 != K2:
        err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!'
        raise ValueError(err_str)

    # If either operand stores nothing the product is all zeros; return it
    # directly instead of calling the kernel with zero-length buffers.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # At most ntop entries per row of the result.
    nnz_max = M * ntop

    # Uninitialised output buffers; the kernel fills them in place.
    indptr = np.empty(M + 1, dtype=idx_dtype)
    indices = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """
    Evaluates the similarity between two groups of strings (given as
    matrices) through the sparse product ``A * B`` and returns the ``ntop``
    highest values per row.

    Parameters
    ----------
    A, B : matrix
        Sparse matrix representations of the strings to compare;
        ``A.shape[1]`` must equal ``B.shape[0]``. Both are converted to CSR.
    ntop : int
        Number of best matches kept for each row of ``A``.
    lower_bound : number, optional
        Threshold forwarded unchanged to ``ct.sparse_dot_topn``.

    Returns
    -------
    csr matrix
        A sparse matrix of shape ``(A.shape[0], B.shape[1])`` holding at most
        ``ntop`` entries per row.

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` do not match.
    """
    # Force A and B into CSR form.
    A = A.tocsr()
    B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # The kernel indexes B's row pointer with A's column ids, so mismatched
    # inner dimensions must be rejected before it runs.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # Nothing stored in A or B means there is nothing to match; return the
    # empty result without calling the kernel on zero-length buffers.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32
    nnz_max = M * ntop  # at most ntop entries per row

    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))
def cosimtop(A, B, ntop, lower_bound=0):
    '''
    Optimized cosine similarity computation via a sparse top-n product.

    :param A: First sparse matrix, shape ``(M, K)``; converted to CSR.
    :param B: Second sparse matrix, shape ``(K, N)``; converted to CSR.
    :param ntop: Maximum number of entries kept for each row.
    :param lower_bound: Threshold forwarded unchanged to
        ``ct.sparse_dot_topn``.
    :return: ``csr_matrix`` of shape ``(M, N)`` with dtype ``A.dtype``.
    :raises ValueError: If ``A.shape[1]`` differs from ``B.shape[0]``.
    '''
    A = A.tocsr()
    B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # The kernel indexes B's row pointer with A's column ids; refuse
    # operands whose inner dimensions disagree.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!'
        )

    # An operand without stored entries gives an empty product; return it
    # without calling the kernel on zero-length buffers.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32
    nnz_max = M * ntop  # at most ntop entries per row

    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data
    )

    return csr_matrix((data, indices, indptr), shape=(M, N))