def awesome_cossim_topn_wrapper(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1,
                                return_best_ntop=False, test_nnz_max=-1, expect_best_ntop=None):
    """
    Runs awesome_cossim_topn() both with and without return_best_ntop, checks
    that we get the expected best_ntop, and checks that both results are identical.
    It has the same signature, but takes one extra parameter: expect_best_ntop.
    """
    result1, best_ntop = awesome_cossim_topn(A, B, ntop, lower_bound, use_threads, n_jobs, True, test_nnz_max)
    assert expect_best_ntop == best_ntop

    result2 = awesome_cossim_topn(A, B, ntop, lower_bound, use_threads, n_jobs, False, test_nnz_max)
    assert (result1 != result2).nnz == 0  # the two CSR matrices are identical

    return result1
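# Hypothetical usage sketch (not part of the original tests): exercise the wrapper
# on small random CSR matrices; expect_best_ntop is computed once up front.
from scipy.sparse import rand
from sparse_dot_topn import awesome_cossim_topn

a = rand(20, 50, density=0.1, format='csr')
b = rand(50, 30, density=0.1, format='csr')
_, best_ntop = awesome_cossim_topn(a, b, 5, 0.0, return_best_ntop=True)
result = awesome_cossim_topn_wrapper(a, b, 5, expect_best_ntop=best_ntop)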
def helper_awesome_cossim_topn_dense(a_dense, b_dense):
    dense_result = np.dot(a_dense, np.transpose(b_dense))  # dot product
    sparse_result = csr_matrix(dense_result)
    sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES)
                          for row in sparse_result]  # get ntop using the old method

    pruned_dense_result = dense_result.copy()
    pruned_dense_result[pruned_dense_result < PRUNE_THRESHOLD] = 0  # prune low similarity
    pruned_sparse_result = csr_matrix(pruned_dense_result)
    pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES)
                                 for row in pruned_sparse_result]

    a_csr = csr_matrix(a_dense)
    b_csr_t = csr_matrix(b_dense).T

    awesome_result = awesome_cossim_topn(a_csr, b_csr_t, len(b_dense), 0.0)
    awesome_result_top3 = awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0)
    awesome_result_top3 = [list(zip(row.indices, row.data)) if len(row.data) > 0 else None
                           for row in awesome_result_top3]  # make comparable, normally not needed

    pruned_awesome_result = awesome_cossim_topn(a_csr, b_csr_t, len(b_dense), PRUNE_THRESHOLD)
    pruned_awesome_result_top3 = awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD)
    pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len(row.data) > 0 else None
                                  for row in pruned_awesome_result_top3]

    # no candidate selection, no pruning
    assert awesome_result.nnz == sparse_result.nnz
    # no candidate selection, below PRUNE_THRESHOLD similarity pruned
    assert pruned_awesome_result.nnz == pruned_sparse_result.nnz

    all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3))
    all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3))

    # top NUM_CANDIDATES candidates selected, no pruning
    if not all_none1:
        np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3)
    else:
        assert len(awesome_result_top3) == len(sparse_result_top3)

    # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned
    if not all_none2:
        np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3)
    else:
        assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3)
def run(self, indices):
    if len(indices) <= 5:
        return super().run(indices)

    df = self.df.iloc[indices]
    try:
        # name vectorization
        tfidf_vectorizer = TfidfVectorizer(
            ngram_range=self.ngram_range,
            max_df=self.max_name_vectorization_word_frequency,
            min_df=self.min_name_vectorization_word_frequency,
            token_pattern=r'(\S+)')
        tf_idf_matrix = tfidf_vectorizer.fit_transform(df.standardized_name)
    except ValueError:
        # relax constraints and try again
        tfidf_vectorizer = TfidfVectorizer(ngram_range=self.ngram_range,
                                           max_df=1.0,
                                           min_df=0,
                                           token_pattern=r'(\S+)')
        tf_idf_matrix = tfidf_vectorizer.fit_transform(df.standardized_name)

    sparse_matrix = awesome_cossim_topn(tf_idf_matrix,
                                        tf_idf_matrix.transpose(),
                                        self.topn_by_cosine_similarity,
                                        self.min_cosine_similarity)

    non_zeros = sparse_matrix.nonzero()
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]

    top_n_rows = min(self.topn_matches_to_apply_model_to, sparsecols.size)
    left = list(itemgetter(*sparserows[:top_n_rows])(indices))
    right = list(itemgetter(*sparsecols[:top_n_rows])(indices))
    return super().filter_bydistance(np.dstack((left, right))[0])
def _get_cosine_matrix(self, vals):
    tf_idf_matrix = self._get_tf_idf_matrix(vals)
    topn = vals.size if self._topn is None else self._topn
    return awesome_cossim_topn(tf_idf_matrix,
                               tf_idf_matrix.transpose(),
                               topn,
                               self._match_threshold)
def cosine_similarity_on_rows_numpy_and_csr_2(matrix, ntop=10):
    """
    Cosine similarity for numpy arrays and sparse matrices.
    It uses Cython (via sparse_dot_topn) under the hood, making it faster than the alternatives.

    Input:
        - matrix (np.array/sparse): matrix whose rows to compute the cosine similarity over
    Output:
        - matrix (scipy.sparse.csr_matrix): cosine similarity matrix
    """
    mat = matrix.astype(np.float64, copy=True)
    return awesome_cossim_topn(A=mat, B=mat.transpose(), ntop=ntop)
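# Hypothetical usage sketch: top-10 cosine products between the rows of a small
# random sparse matrix (the helper converts to float64 internally).
from scipy.sparse import rand

mat = rand(50, 200, density=0.05, format='csr')
top10 = cosine_similarity_on_rows_numpy_and_csr_2(mat, ntop=10)
print(top10.shape)  # (50, 50), with at most 10 stored values per row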
def get_similarity(matrix1, matrix2, n_top, min_similarity, zero_diagonal=False):
    matrix1 = normalize(matrix1)
    matrix2 = normalize(matrix2)
    matrix1 = csr_matrix(matrix1).astype(float)
    matrix2 = csr_matrix(matrix2).astype(float)

    similarity_matrix = awesome_cossim_topn(matrix1, matrix2.T,
                                            ntop=n_top,
                                            lower_bound=min_similarity)
    # set diagonal to zero
    if zero_diagonal:
        similarity_matrix.setdiag(0)
    return similarity_matrix
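# Hypothetical usage sketch: top-3 most similar rows of m2 for each row of m1,
# keeping only scores of at least 0.1.
import numpy as np

m1 = np.random.rand(5, 16)
m2 = np.random.rand(8, 16)
sims = get_similarity(m1, m2, n_top=3, min_similarity=0.1)
print(sims.shape)  # (5, 8) CSR matrix with at most 3 stored values per row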
def train_knn(args):
    df = pd.read_csv(args.input, names=['query', 'category'])
    logger.info("Lemmatizing and preparing data for KNN model")
    df['query_lem'] = df['query'].apply(lemmatize)
    df_train, df_test = train_test_split(df, test_size=0.2,
                                         stratify=df.category.values,
                                         random_state=42)

    logger.info("Fitting vectorizer")
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(df_train.query_lem.values)

    logger.info("Scoring 20% holdout sample")
    test_features = vectorizer.transform(df_test.query_lem.values)
    matches = awesome_cossim_topn(test_features, train_features.transpose(), 20, 0.01)

    ind = np.argwhere(matches)
    i1 = ind[:, 0]
    i2 = ind[:, 1]

    df_test.reset_index(inplace=True)
    df_test = df_test.rename(columns={'index': 'index1'})
    index1 = np.take(df_test['index1'].values, i1)
    categories = np.take(df_train.category.values, i2)
    df2 = pd.DataFrame(data={'index1': index1, 'cat': categories})

    # most frequent category
    pred = df2[['index1', 'cat']].groupby(['index1']).agg(lambda x: scipy.stats.mode(x)[0])
    pred = pred.reset_index()
    pred = pred.sort_values('index1')

    pred1 = pred.merge(df_test[['index1', 'category']].rename(columns={'category': 'cat_true'}),
                       on='index1', how='left')
    pred1['match'] = (pred1.cat == pred1.cat_true).astype(int)

    logger.info("Hold-out sample accuracy %s", np.round(pred1['match'].mean(), 2))
    logger.info("Hold-out sample f1 score %s",
                np.round(f1_score(pred1.cat_true.values, pred1.cat.values, average='weighted'), 2))
    logger.info("Hold-out sample sklearn accuracy %s",
                np.round(accuracy_score(pred1.cat_true.values, pred1.cat.values), 2))
def subgroup_match(subdf):
    if df2groups.indices.get(subdf.name) is not None:
        sub_tfidf1 = tfidf1[subdf.index.values]
        sub_tfidf2 = tfidf2[df2groups.indices[subdf.name]]

        co = awesome_cossim_topn(sub_tfidf1,
                                 sub_tfidf2.transpose(),
                                 ntop=ntop,
                                 lower_bound=cosine_lower_bound,
                                 use_threads=use_threads,
                                 n_jobs=n_jobs).tocoo()

        # 2) now use the Levenshtein distance to find the best match
        for row in set(co.row):
            rowcol = co.col[co.row == row]
            argmatch, lev_dist = levenshtein_best_match(
                subdf.iloc[row][column2match_approx],
                df2.iloc[df2groups.indices[subdf.name][rowcol]][column2match_approx])
            if lev_dist >= lev_lower_bound:
                matches[subdf.index.values[row]] = df2groups.indices[subdf.name][rowcol[argmatch]]
def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix:
    """Builds the cosine similarity matrix of two csr matrices"""
    tf_idf_matrix_1 = master_matrix
    tf_idf_matrix_2 = duplicate_matrix.transpose()

    optional_kwargs = dict()
    if self._config.number_of_processes > 1:
        optional_kwargs = {
            'use_threads': True,
            'n_jobs': self._config.number_of_processes
        }

    return awesome_cossim_topn(tf_idf_matrix_1,
                               tf_idf_matrix_2,
                               self._config.max_n_matches,
                               self._config.min_similarity,
                               **optional_kwargs)
def identify_anisong_tf(title, artist=None):
    # fuzzy match title with song titles from database using cosine similarity over TF-IDF matrix
    if artist is None:
        tf_idf_matrix_test = vectorizer.transform([title])
        matches = awesome_cossim_topn(tf_idf_matrix_test,
                                      tf_idf_matrix_songs.transpose(), 1, 0)
        song2 = songs[matches.nonzero()[1][0]]
        confidence = int(matches.data[0] * 100)
        # print(confidence)
        # print(song2)
        return conn.execute(
            f"select anime,type,start_ep,end_ep from anisong where rowid = {matches.nonzero()[1][0]}"
        ).fetchone() + (confidence, )
def get_fuzzy_matches(words, targets, n=2, lower_bound=0.85):
    """
    Fuzzy matching of single-token lexicon entries using TF-IDF matrices
    of character-level n-grams.

    :param words: list of tokens to find fuzzy matches in
    :param targets: list of targets to match to
    :param n: number of characters in the n-grams
    :param lower_bound: lower bound for cosine similarity
    :return: list of (fuzzy match, matched target, cosine similarity) tuples
    """
    vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(n, n), use_idf=False)
    vectorizer.fit(words + targets)
    tf_idf_words = vectorizer.transform(words)
    tf_idf_lexicon = vectorizer.transform(targets).transpose()
    matches = awesome_cossim_topn(tf_idf_words, tf_idf_lexicon,
                                  ntop=1, lower_bound=lower_bound).tocoo()
    return [(words[row], targets[col], val)
            for row, col, val in zip(matches.row, matches.col, matches.data)]
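# Hypothetical usage sketch: fuzzy-match a few tokens against a small target lexicon.
words = ["colour", "theatre", "centre"]
targets = ["color", "theater", "center"]
for word, target, score in get_fuzzy_matches(words, targets, n=2, lower_bound=0.5):
    print(f"{word} -> {target} ({score:.2f})")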
def preprocess_text(self, df_clean):
    self.text_to_preprocess = self.remove_unneeded_text()
    df_dirty = {"name": self.text_to_preprocess.split()}

    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix_clean = vectorizer.fit_transform(df_clean['name'])
    tf_idf_matrix_dirty = vectorizer.transform(df_dirty['name'])

    matches = awesome_cossim_topn(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0)
    matches_df = get_matches_df(matches, df_dirty['name'], df_clean['name'], top=0)
    matches_df = matches_df.loc[matches_df['similarity'] >= 0.85]

    for index, row in matches_df.iterrows():
        if row["wrong_word"] != row["right_word"] and self.word_filter(row["right_word"]):
            self.text_to_preprocess = self.text_to_preprocess.replace(row["wrong_word"],
                                                                      row["right_word"])
    return self.self_written_preprocess_rules()
def score_knn(args):
    df_test = pd.read_csv(args.input, names=['query'])
    df_train = pd.read_csv('./data/trainSet.csv', names=['query', 'category'])
    df_test['query_lem'] = df_test['query'].apply(lemmatize)
    df_train['query_lem'] = df_train['query'].apply(lemmatize)

    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(df_train.query_lem.values)
    test_features = vectorizer.transform(df_test.query_lem.values)
    matches = awesome_cossim_topn(test_features, train_features.transpose(), 20, 0.01)

    ind = np.argwhere(matches)
    i1 = ind[:, 0]
    i2 = ind[:, 1]

    df_test.reset_index(inplace=True)
    df_test = df_test.rename(columns={'index': 'index1'})
    index1 = np.take(df_test['index1'].values, i1)
    query = np.take(df_test['query'].values, i1)
    categories = np.take(df_train.category.values, i2)
    df2 = pd.DataFrame(data={'query': query, 'index1': index1, 'cat': categories})

    # most frequent category
    pred = df2[['query', 'index1', 'cat']].groupby(['query', 'index1']).agg(
        lambda x: scipy.stats.mode(x)[0])
    pred = pred.reset_index()
    pred = pred.sort_values('index1')
    pred[['query', 'cat']].to_csv('./data/pred_knn.csv', index=None, header=False)
    logger.info("Finished scoring test sample with KNN")
def match_to_fda(dest_path: str, to_compare: str):
    print("Loading FDA dictionary vectorized data...")
    fda = load_npz("../output/fda_dict_vectorized.npz")

    print("Generating CSR matrix with matches...")
    # cossim_matrix = load_npz(to_compare)
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    with open(file="../data/s1_drug_name_list_unique.csv") as csvfile:
        for name in csvfile:
            if "drug_name" in name:
                continue
            v_name = vectorizer.fit_transform(list(name))
            best_match = awesome_cossim_topn(v_name.reshape(fda.shape), fda, 1, 0.85,
                                             use_threads=True, n_jobs=8)
            print(best_match)
            break
def generate_csr_matrix(src_path: str, dest_path: str, topn=10, lower_bound=0.85):
    print("Reading data...")
    name_list: pd.DataFrame = pd.read_csv(src_path)
    name_list: pd.Series = name_list.iloc[:, 0]
    name_list: list = name_list.tolist()

    print("Vectorizing...")
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(name_list)

    print("Generating CSR matrix with matches...")
    matches: csr_matrix = awesome_cossim_topn(tf_idf_matrix,
                                              tf_idf_matrix.transpose(),
                                              topn, lower_bound,
                                              use_threads=True, n_jobs=8)

    print("Saving CSR matrix...")
    save_npz(file=dest_path, matrix=matches)
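# Hypothetical usage sketch: the CSV source path and output file below are placeholders.
generate_csr_matrix(src_path="names.csv", dest_path="matches.npz", topn=10, lower_bound=0.85)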
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import rand
from sparse_dot_topn import awesome_cossim_topn

N = 10
a = rand(100, 1000000, density=0.005, format='csr')
b = rand(1000000, 200, density=0.005, format='csr')

# Use standard implementation
c = awesome_cossim_topn(a, b, 5, 0.01)

# Use parallel implementation with 4 threads
d = awesome_cossim_topn(a, b, 5, 0.01, use_threads=True, n_jobs=4)
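# The same call can also report the largest number of matches found in any row,
# which the wrapper functions below rely on (see awesome_cossim_topn_wrapper above):
e, best_ntop = awesome_cossim_topn(a, b, 5, 0.01, return_best_ntop=True)
print(best_ntop)  # the maximum number of stored matches in any row of e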
def awesome_cossim_topn_7_threads(a, b, N, thresh):
    return awesome_cossim_topn(a, b, N, thresh, True, 7, True)


def awesome_cossim_topn_1_thread(a, b, N, thresh):
    return awesome_cossim_topn(a, b, N, thresh, True, 1, True)


def awesome_cossim_topn_0_threads(a, b, N, thresh):
    return awesome_cossim_topn(a, b, N, thresh, False, 1, True)
a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab))
a = a_sparse.tocsr()
# a = a.astype(np.float32)

row = rng1.randint(n_duplicates, size=nnz_b)
cols = rng1.randint(nr_vocab, size=nnz_b)
data = rng1.rand(nnz_b)
b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab))
b = b_sparse.T.tocsr()
# b = b.astype(np.float32)

# first run without profiling to bring the memory up to the same level
# for all subsequent profiled runs:
C, C_ntop = awesome_cossim_topn(a, b, N, thresh, True, 7, True)

print("Sampling non-parallelized sparse_dot_topn function ... ", end='', flush=True)
C, C_ntop = awesome_cossim_topn_0_threads(a, b, N, thresh)
print("Finished.")

print("Sampling threaded function with 1 thread ... ", end='', flush=True)
C, C_ntop = awesome_cossim_topn_1_thread(a, b, N, thresh)
print("Finished.")

print("Sampling threaded function with 2 threads ... ", end='', flush=True)
C, C_ntop = awesome_cossim_topn_2_threads(a, b, N, thresh)
print("Finished.")
def align_publications(df1, df2=None, columns2match_exact=['Year'], column2match_approx='Title',
                       ntop=1, cosine_lower_bound=0.75, use_threads=False, n_jobs=2,
                       lev_lower_bound=0.9, show_progress=False):
    """
    Fast way to match publications between two datasets.

    We first match subsets of exact values between the two DataFrames, as specified
    by `columns2match_exact`. We then use fast approximate string matching to match
    values in `column2match_approx` to within a threshold.

    Parameters
    ----------
    :param df1 : DataFrame
        A DataFrame with the publication information.

    :param df2 : DataFrame, Optional
        Another DataFrame with the publication information. If None, then df1 is used again.

    :param columns2match_exact : list, Default: ['Year']
        The columns to match exactly between DataFrames.

    :param column2match_approx : str, Default: 'Title'
        The column to match approximately between DataFrames.

    :param ntop : int, Default 1
        The number of best matches from df2 to return for each row of df1.

    :param cosine_lower_bound : float, Default 0.75
        The lower bound for cosine similarity when doing a fuzzy string match.

    :param use_threads : bool, Default False
        Use multithreading when calculating cosine similarity for fuzzy string matching.

    :param n_jobs : int, Optional, Default 2
        If use_threads is True, the number of threads to use in the parallel calculation.

    :param lev_lower_bound : float, Default 0.9
        The lower bound for the Levenshtein similarity when accepting a fuzzy match.

    :param show_progress : bool, Default False
        If True, show a progress bar tracking the calculation.
    """
    if df2 is None:
        df2 = df1

    # we can do an exact match from merge
    if (columns2match_exact is not None and len(columns2match_exact) > 0) and \
            (column2match_approx is None or len(column2match_approx) == 0):
        # get the index name and reset the index to force it as a column
        indexcol = df2.index.name
        df2 = df2.reset_index(drop=False)

        # now merge the dataframes and drop duplicates giving an exact match
        mdf = df1.merge(df2[columns2match_exact + [indexcol]], how='left', on=columns2match_exact)
        mdf.drop_duplicates(subset=columns2match_exact, keep='first', inplace=True)
        return mdf[indexcol]

    # otherwise, if there is a column to match approximately then we need to prep for fuzzy matching
    elif len(column2match_approx) > 0:
        # we take a two-step approach to fuzzy matching
        # 1) first we employ a super fast but not very accurate cosine-similarity
        #    matching to narrow down the possible pair-wise matches

        # for each string, we create feature vectors from 3-char counts
        tfidf = TfidfVectorizer(min_df=1, ngram_range=(3, 3), analyzer='char', lowercase=False)
        tfidf1 = tfidf.fit_transform(df1[column2match_approx])
        tfidf2 = tfidf.transform(df2[column2match_approx])

        matches = np.empty(tfidf1.shape[0])
        matches[:] = np.NaN

        # if there are no columns to match exactly
        if columns2match_exact is None or len(columns2match_exact) == 0:
            # 1) first do the all-to-all cosine similarity and extract up to the ntop best possible matches
            co = awesome_cossim_topn(tfidf1, tfidf2.T,
                                     ntop=ntop,
                                     lower_bound=cosine_lower_bound,
                                     use_threads=use_threads,
                                     n_jobs=n_jobs).tocoo()

            # 2) now use the Levenshtein distance to find the best match
            for row in tqdm(set(co.row), desc="Align Publications", disable=not show_progress):
                rowcol = co.col[co.row == row]
                argmatch, lev_dist = levenshtein_best_match(df1.loc[row, column2match_approx],
                                                            df2.iloc[rowcol][column2match_approx])
                if lev_dist >= lev_lower_bound:
                    matches[row] = rowcol[argmatch]

        else:
            df2groups = df2.groupby(columns2match_exact)

            def subgroup_match(subdf):
                if df2groups.indices.get(subdf.name) is not None:
                    sub_tfidf1 = tfidf1[subdf.index.values]
                    sub_tfidf2 = tfidf2[df2groups.indices[subdf.name]]

                    co = awesome_cossim_topn(sub_tfidf1, sub_tfidf2.transpose(),
                                             ntop=ntop,
                                             lower_bound=cosine_lower_bound,
                                             use_threads=use_threads,
                                             n_jobs=n_jobs).tocoo()

                    # 2) now use the Levenshtein distance to find the best match
                    for row in set(co.row):
                        rowcol = co.col[co.row == row]
                        argmatch, lev_dist = levenshtein_best_match(
                            subdf.iloc[row][column2match_approx],
                            df2.iloc[df2groups.indices[subdf.name][rowcol]][column2match_approx])
                        if lev_dist >= lev_lower_bound:
                            matches[subdf.index.values[row]] = df2groups.indices[subdf.name][rowcol[argmatch]]

            # register our pandas apply with tqdm for a progress bar
            tqdm.pandas(desc='Publication Matches', disable=not show_progress)
            df1.groupby(columns2match_exact, group_keys=True).progress_apply(subgroup_match)

        return matches
        add_vals_to_lookup(group, row, col)
    else:
        # if we get here, we need to add a new group.
        # The name is arbitrary, so just make it the row
        add_vals_to_lookup(row, row, col)


# Grab the column you'd like to group, filter out duplicate values
# and make sure the values are Unicode
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
vals = df['NewName'].unique().astype('U')

# Build the matrix!!!
tfidf_matrix = vectorizer.fit_transform(vals)
cosine_matrix = awesome_cossim_topn(tfidf_matrix, tfidf_matrix.transpose(), vals.size, 0.8)

# Build a coordinate matrix
coo_matrix = cosine_matrix.tocoo()

# for each row and column in coo_matrix
# if they're not the same string add them to the group lookup
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        add_pair_to_lookup(vals[row], vals[col])

df['Group'] = df['NewName'].map(group_lookup)  # .fillna(df['NewName'])
print(df['Group'].isna().sum())
# df.to_csv('./dol-data-grouped.csv')
    # sublinear_tf=True
)

features = Pipeline(steps=[
    ('vect', vect),
    ('tfidf', tfidf),
])
tfidf_matrix = features.fit_transform(tmp)

cosine_matrix = awesome_cossim_topn(
    tfidf_matrix,
    tfidf_matrix.transpose(),
    # vals.size,
    10,
    0.1,
    use_threads=True,
    n_jobs=3
)

pprint(cosine_matrix)
pprint(get_csr_ntop_idx_data(cosine_matrix[2], 5))
pprint(Article.query.get(3).summary)
pprint(Article.query.get(11).summary)
time_lev_2 = total / 10
print(f"Time saved over Levenshtein distance database: "
      f"{time_lev_1 - time_lev_2}s ({time_lev_1/time_lev_2}x speedup)")

print("TF-IDF matching")
conn.row_factory = lambda cursor, row: row[0]
songs = conn.execute("select title_en from anisong").fetchall()
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_songs = vectorizer.fit_transform(songs)

total = 0
for i in range(1, 11):
    start = time.time()
    tf_idf_matrix_test = vectorizer.transform([test_song])
    matches = awesome_cossim_topn(tf_idf_matrix_test, tf_idf_matrix_songs.transpose(), 1, 0)
    print(matches)
    end = time.time()
    duration = end - start
    total += duration
    print(f"Iteration {i}: {duration}")

song2 = songs[matches.nonzero()[1][0]]
print(f"Matched {song2} expected {expected_song} certainty {matches.data[0]}")
print(f"Avg duration: {total/10}")
time_tf = total / 10
# print(f"Time saved over Levenshtein distance: {time_lev_2 - time_tf}s ({time_lev_2/time_tf}x speedup)")
def cosine_similarity(from_vector: np.ndarray,
                      to_vector: np.ndarray,
                      from_list: List[str],
                      to_list: List[str],
                      nbest,
                      min_similarity: float = 0,
                      method: str = "sparse") -> pd.DataFrame:
    """ Calculate similarity between two matrices/vectors and return best matches

    Arguments:
        from_vector: the matrix or vector representing the embedded strings to map from
        to_vector: the matrix or vector representing the embedded strings to map to
        from_list: The list from which you want mappings
        to_list: The list where you want to map to
        nbest: The number of best matches to return for each string in from_list
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        method: The method/package for calculating the cosine similarity.
                Options: "sparse", "sklearn", "knn".
                Sparse is the fastest and most memory efficient but requires a
                package that might be difficult to install.
                Sklearn is a bit slower than sparse and requires significantly more
                memory as the distance matrix is not sparse.
                Knn uses 1-nearest neighbor to extract the most similar strings;
                it is significantly slower than both methods but requires little memory.

    Returns:
        matches: The best matches between the lists of strings

    Usage:

    Make sure to fill the `to_vector` and `from_vector` with vector representations
    of `to_list` and `from_list` respectively:

    ```python
    from polyfuzz.models import extract_best_matches
    indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse")
    ```
    """
    if nbest is not None:
        if int(nbest) > len(to_list):
            raise ValueError('nbest must not exceed the length of to_list')
    else:
        nbest = int(1)

    # Slower but uses less memory
    if method == "knn":
        if from_list == to_list:
            knn = NearestNeighbors(n_neighbors=2, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)
            distances = distances[:, 1]
            indices = indices[:, 1]
        else:
            knn = NearestNeighbors(n_neighbors=1, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)
        similarity = [round(1 - distance, 3) for distance in distances.flatten()]

    # Fast, but does have some installation issues
    elif _HAVE_SPARSE_DOT and method == "sparse":
        if isinstance(to_vector, np.ndarray):
            to_vector = csr_matrix(to_vector)
        if isinstance(from_vector, np.ndarray):
            from_vector = csr_matrix(from_vector)

        # There is a bug in awesome_cossim_topn: when to_vector and from_vector
        # have the same shape, setting topn to 1 does not work. Apparently, you
        # need to set it to at least 2 for it to work
        if int(nbest) <= 1:
            similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, 2, min_similarity)
        elif int(nbest) > 1:
            similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, nbest, min_similarity)

        if from_list == to_list:
            similarity_matrix = similarity_matrix.tolil()
            similarity_matrix.setdiag(0.)
            similarity_matrix = similarity_matrix.tocsr()

        if int(nbest) <= 1 and method == "sparse":
            indices = np.array(similarity_matrix.argmax(axis=1).T).flatten()
            similarity = similarity_matrix.max(axis=1).toarray().T.flatten()
        elif int(nbest) > 1 and method == "sparse":
            similarity = np.flip(np.take_along_axis(similarity_matrix.toarray(),
                                                    np.argsort(similarity_matrix.toarray(), axis=1),
                                                    axis=1)[:, -nbest:], axis=1)
            indices = np.flip(np.argsort(np.array(similarity_matrix.toarray()), axis=1)[:, -nbest:], axis=1)

    # Faster than knn and slower than sparse but uses more memory
    else:
        similarity_matrix = scikit_cosine_similarity(from_vector, to_vector)
        if from_list == to_list:
            np.fill_diagonal(similarity_matrix, 0)

        indices = similarity_matrix.argmax(axis=1)
        similarity = similarity_matrix.max(axis=1)

    # Convert results to df
    if int(nbest) <= 1:
        matches = [to_list[idx] for idx in indices.flatten()]
        matches = pd.DataFrame(np.vstack((from_list, matches, similarity)).T,
                               columns=["From", "To", "Similarity"])
        matches.Similarity = matches.Similarity.astype(float)
        matches.loc[matches.Similarity < 0.001, "To"] = None
    else:
        matches = [np.array([to_list[idx] for idx in l]) for l in indices]

        # in progress
        column = ["To"]
        for i in range(nbest - 1):
            column.append("BestMatch" + "__" + str(i + 1))
        column.append("Similarity")
        for j in range(nbest - 1):
            column.append("Similarity" + "__" + str(j + 1))

        matches = pd.concat([pd.DataFrame({'From': from_list}),
                             pd.DataFrame(np.hstack((matches, similarity)), columns=column)], axis=1)
        matches.Similarity = matches.Similarity.astype(float)
        matches.loc[matches.Similarity < 0.001, "To"] = None
        for i in range(nbest - 1):
            matches.loc[matches.Similarity < 0.001, "BestMatch" + "__" + str(i + 1)] = None

    return matches
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import rand
from sparse_dot_topn import awesome_cossim_topn

N = 10
a = rand(100, 1000000, density=0.005, format='csr')
b = rand(1000000, 200, density=0.005, format='csr')

c = awesome_cossim_topn(a, b, 5, 0.01)
def sparse_dot_product(A, B, ntop, lower_bound):
    '''dot product of two sparse matrices'''
    return awesome_cossim_topn(A, B, ntop=ntop, lower_bound=lower_bound)
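# Hypothetical usage sketch for sparse_dot_product on random CSR inputs.
from scipy.sparse import rand

A = rand(10, 100, density=0.1, format='csr')
B = rand(100, 20, density=0.1, format='csr')
top5 = sparse_dot_product(A, B, ntop=5, lower_bound=0.0)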
def cosine_similarity(from_vector: np.ndarray,
                      to_vector: np.ndarray,
                      from_list: List[str],
                      to_list: List[str],
                      min_similarity: float = 0.75,
                      top_n: int = 1,
                      method: str = "sparse") -> pd.DataFrame:
    """ Calculate similarity between two matrices/vectors and return best matches

    Arguments:
        from_vector: the matrix or vector representing the embedded strings to map from
        to_vector: the matrix or vector representing the embedded strings to map to
        from_list: The list from which you want mappings
        to_list: The list where you want to map to
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        top_n: The number of best matches you want returned
        method: The method/package for calculating the cosine similarity.
                Options: "sparse", "sklearn", "knn".
                Sparse is the fastest and most memory efficient but requires a
                package that might be difficult to install.
                Sklearn is a bit slower than sparse and requires significantly more
                memory as the distance matrix is not sparse.
                Knn uses 1-nearest neighbor to extract the most similar strings;
                it is significantly slower than both methods but requires little memory.

    Returns:
        matches: The best matches between the lists of strings

    Usage:

    Make sure to fill the `to_vector` and `from_vector` with vector representations
    of `to_list` and `from_list` respectively:

    ```python
    from polyfuzz.models import extract_best_matches
    indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse")
    ```
    """
    if to_list is not None:
        if top_n > len(set(to_list)):
            top_n = len(set(to_list))

    # Slower but uses less memory
    if method == "knn":
        if to_list is None:
            knn = NearestNeighbors(n_neighbors=top_n + 1, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)
            distances = distances[:, 1:]
            indices = indices[:, 1:]
        else:
            knn = NearestNeighbors(n_neighbors=top_n, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)
        similarities = [np.round(1 - distances[:, i], 3) for i in range(distances.shape[1])]

    # Fast, but does have some installation issues
    elif _HAVE_SPARSE_DOT and method == "sparse":
        if isinstance(to_vector, np.ndarray):
            to_vector = csr_matrix(to_vector)
        if isinstance(from_vector, np.ndarray):
            from_vector = csr_matrix(from_vector)

        # There is a bug in awesome_cossim_topn: when to_vector and from_vector
        # have the same shape, setting topn to 1 does not work. Apparently, you
        # need to set it to at least 2 for it to work
        similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n + 1, min_similarity)

        if to_list is None:
            similarity_matrix = similarity_matrix.tolil()
            similarity_matrix.setdiag(0.)
            similarity_matrix = similarity_matrix.tocsr()

        indices = _top_n_idx_sparse(similarity_matrix, top_n)
        similarities = _top_n_similarities_sparse(similarity_matrix, indices)
        indices = np.array(np.nan_to_num(np.array(indices, dtype=float), nan=0), dtype=int)

    # Faster than knn and slower than sparse but uses more memory
    else:
        similarity_matrix = scikit_cosine_similarity(from_vector, to_vector)
        if to_list is None:
            np.fill_diagonal(similarity_matrix, 0)

        indices = np.flip(np.argsort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
        similarities = np.flip(np.sort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
        similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]

    # Convert results to df
    if to_list is None:
        to_list = from_list.copy()

    columns = (["From"] +
               ["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
               ["Similarity" if i == 0 else f"Similarity_{i+1}" for i in range(top_n)])
    matches = [[to_list[idx] for idx in indices[:, i]] for i in range(indices.shape[1])]
    matches = pd.DataFrame(np.vstack(([from_list], matches, similarities)).T, columns=columns)

    # Update column order
    columns = [["From", "To", "Similarity"]] + [[f"To_{i+2}", f"Similarity_{i+2}"]
                                                for i in range(top_n - 1)]
    matches = matches.loc[:, [title for column in columns for title in column]]

    # Update types
    for column in matches.columns:
        if "Similarity" in column:
            matches[column] = matches[column].astype(float)
            matches.loc[matches[column] < 0.001, column] = float(0)
            matches.loc[matches[column] < 0.001, column.replace("Similarity", "To")] = None

    return matches
def _get_cosine_matrix(self, vals):
    tf_idf_matrix = self._get_tf_idf_matrix(vals)
    return awesome_cossim_topn(tf_idf_matrix,
                               tf_idf_matrix.transpose(),
                               vals.size,
                               self._match_threshold)
def cos_sim_query(query_vector, query_space,
                  n_neighbors=50, lower_bound=0.0,
                  beta=1, gamma=1, n_jobs=None, n_batches=100):
    '''
    Make a cosine-similarity query of query_vector on query_space.

    beta is a weighting factor such that query_space = normalize(query_space^beta);
    beta greater than one ensures higher-magnitude components receive more
    importance when querying.

    Returns idx, sim.
    '''
    query_vector, query_space = copy.deepcopy(query_vector), copy.deepcopy(query_space)
    query_vector, query_space = transform_similarity_weights(query_vector, query_space, beta, gamma)

    print(f'Querying {n_neighbors} nearest neighbors, this can take a while...')
    if not scipy.sparse.issparse(query_vector):
        query_vector = scipy.sparse.csr_matrix(query_vector)
    if not scipy.sparse.issparse(query_space):
        query_space = scipy.sparse.csr_matrix(query_space)

    try:
        query_space = query_space.T
        batches = make_batches(query_vector,
                               batch_size=np.ceil(query_vector.shape[0] / n_batches).astype(int))
        if n_jobs is None:
            sim_matrix = [awesome_cossim_topn(qv, query_space, ntop=n_neighbors, lower_bound=lower_bound)
                          for qv in tqdm(batches)]
        else:
            sim_matrix = Parallel(n_jobs=n_jobs, verbose=1, **_joblib_parallel_args(prefer="threads"))(
                delayed(awesome_cossim_topn)(qv, query_space, ntop=n_neighbors, lower_bound=lower_bound)
                for qv in batches)
        sim_matrix = scipy.sparse.vstack(sim_matrix)
        sim_matrix = scipy.sparse.csr_matrix(sim_matrix)

        print('Postprocessing query results...')
        idx = []
        sim = []
        arr_sizes = []
        for d in sim_matrix:
            s = d.data
            i = d.nonzero()[1]
            sim.append(s)
            idx.append(i)
            arr_sizes.append(len(s))

        max_size = max(arr_sizes)
        idx = np.array([pad_to_shape(i, max_size) for i in idx]).astype(int)
        sim = np.array([pad_to_shape(s, max_size) for s in sim])

        if idx.shape[1] == 0:
            raise ValueError('No similarity greater than lower_bound found. Choose a lower threshold.')
        return idx, sim

    except NameError:
        # in case sparse_dot_topn is not installed
        print('''sparse_dot_topn not installed. Neighbors query will use sklearn NearestNeighbors,
which may take a while for a sparse matrix query''')
        dist, idx = (NearestNeighbors(n_neighbors=n_neighbors,
                                      radius=1 - lower_bound,
                                      metric='cosine',
                                      n_jobs=-1)
                     .fit(query_space)
                     .kneighbors(query_vector))
        return idx, 1 - dist  # <- cos_sim = 1 - cos_dist
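# Hypothetical usage sketch, assuming the helpers used above
# (transform_similarity_weights, make_batches, pad_to_shape) are in scope.
import scipy.sparse

space = scipy.sparse.rand(1000, 64, density=0.1, format='csr')
queries = scipy.sparse.rand(10, 64, density=0.1, format='csr')
idx, sim = cos_sim_query(queries, space, n_neighbors=5)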