import pandas as pd
from string_grouper import match_strings


def get_matches(df, col, input, option):
    if option == 'Starts with':
        cond = df[col].astype(str).str.lower().str.startswith(input.lower())
        dfn = df[cond].copy()
        dfn.sort_values(by=col, inplace=True, ignore_index=True)
    elif option == 'Contains':
        cond = df[col].astype(str).str.lower().str.contains(input.lower())
        dfn = df[cond].copy()
        dfn.sort_values(by=col, inplace=True, ignore_index=True)
    elif option == 'Most similar':
        dfn = match_strings(df[col].astype(str).drop_duplicates(),
                            pd.Series(input), min_similarity=0.4)
        cols = dfn.columns
        dfn.sort_values(by='similarity', ascending=False,
                        inplace=True, ignore_index=True)
        # Join the fuzzy matches back to the original rows, then drop the
        # match-bookkeeping columns so only the original columns remain.
        dfn = pd.merge(dfn, df, left_on='left_side', right_on=col)
        dfn.drop(cols, axis=1, inplace=True)
    else:
        # Without this branch, dfn would be undefined for an unknown option.
        raise ValueError(f"Unknown option: {option!r}")
    return dfn
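# A minimal usage sketch for get_matches, assuming a toy DataFrame; the
# 'company' column and the query strings are illustrative, not from the
# original code. The fuzzy branch may return an empty frame if nothing
# clears the 0.4 similarity cutoff.
demo = pd.DataFrame({'company': ['Acme Corp', 'ACME Inc', 'Beta LLC']})
print(get_matches(demo, 'company', 'acme', 'Starts with'))               # prefix filter
print(get_matches(demo, 'company', 'Acme Corporation', 'Most similar'))  # fuzzy branch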
from typing import Union

import pandas as pd
from string_grouper import match_strings


def calculate_fuzzymatches_for_min_similarity(
    left: Union[pd.DataFrame, pd.Series],
    right: Union[pd.DataFrame, pd.Series],
    on: str = None,
    left_on: str = None,
    right_on: str = None,
    min_similarity: float = None,
) -> pd.DataFrame:
    # clean_fuzzy_column is defined elsewhere in this codebase.
    if isinstance(left, pd.Series) and isinstance(right, pd.Series):
        left_clean = left.drop_duplicates().pipe(clean_fuzzy_column)
        right_clean = right.drop_duplicates().pipe(clean_fuzzy_column)
    elif on is not None:
        left_clean = left[on].drop_duplicates().pipe(clean_fuzzy_column)
        right_clean = right[on].drop_duplicates().pipe(clean_fuzzy_column)
    elif (left_on is not None) and (right_on is not None):
        left_clean = left[left_on].drop_duplicates().pipe(clean_fuzzy_column)
        right_clean = right[right_on].drop_duplicates().pipe(clean_fuzzy_column)
    else:
        raise ValueError(
            "Pass two Series, or set `on`, or set both `left_on` and `right_on`."
        )
    # Only forward min_similarity when set, so string_grouper's own default
    # applies instead of an explicit None.
    kwargs = {} if min_similarity is None else {"min_similarity": min_similarity}
    return match_strings(left_clean, right_clean, **kwargs)
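# Illustrative calls only, left commented out because clean_fuzzy_column is
# defined elsewhere; they show the three accepted argument shapes. The names
# series_a, df_a, 'name_a' etc. are placeholders.
# calculate_fuzzymatches_for_min_similarity(series_a, series_b, min_similarity=0.6)
# calculate_fuzzymatches_for_min_similarity(df_a, df_b, on='name', min_similarity=0.6)
# calculate_fuzzymatches_for_min_similarity(df_a, df_b, left_on='name_a',
#                                           right_on='name_b', min_similarity=0.6)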
import pandas as pd
from string_grouper import match_strings


def dupes_from_as(cfg):
    column_to_match = 'name_concat'
    dataset = pd.read_csv(cfg['input_csv'])
    matches = match_strings(dataset[column_to_match])
    # Exclude matches with the same index, which are by definition the same row.
    match_subset = matches[matches['left_index'] != matches['right_index']]
    match_subset.to_csv(cfg['output_csv'], index=False)
    joined_subset = regroup_data(dataset, match_subset)  # helper defined elsewhere
    joined_subset.to_csv(cfg['output_csv_2'], index=False)
    dropped_dupes = filter_mirrors(joined_subset)  # helper defined elsewhere
    dropped_dupes.to_csv(cfg['output_csv_3'], index=False)
    dropped_subfields = remove_subfields(dropped_dupes)  # helper defined elsewhere
    dropped_subfields.to_csv(cfg['output_csv_4'], index=False)
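# The shape of cfg is not shown in this snippet; a plausible mapping, for
# illustration only (file names are hypothetical):
# cfg = {
#     'input_csv': 'agents.csv',
#     'output_csv': 'raw_matches.csv',
#     'output_csv_2': 'matches_joined.csv',
#     'output_csv_3': 'matches_no_mirrors.csv',
#     'output_csv_4': 'matches_final.csv',
# }
# dupes_from_as(cfg)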
import os

import pandas as pd
from string_grouper import match_strings


def _get_lookup_matches(self, requested_words):
    # Deduplicate the requested words and the lookup vocabulary.
    search = pd.Series(pd.Series(requested_words).unique())
    lookups = pd.Series(self.lookup["Word"].unique())
    # Create all matches above the configured similarity threshold.
    matches = match_strings(
        lookups, search, ngram_size=3,
        min_similarity=float(os.environ.get("LOOKUP_THRES_PRETAGGING")))
    return matches
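# Note: float(os.environ.get(...)) raises a TypeError when the variable is
# unset. A hypothetical guard, for illustration only:
# os.environ.setdefault("LOOKUP_THRES_PRETAGGING", "0.4")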
import pandas as pd
from string_grouper import match_strings


def dupes_from_wikidata(cfg):
    wikidata_sublist = prep_wikidata_sublist(cfg)  # helper defined elsewhere
    for key, value in wikidata_sublist.items():
        master_match = pd.DataFrame([key], columns=['agent_uri', 'name_concat'])
        master_match.set_index('agent_uri', inplace=True)
        dupe_match = pd.DataFrame(
            value,
            columns=[
                'index', 'num_matches', 'agent_uri', 'aay_url', 'name_concat',
                'sort_name', 'dates', 'resources', 'archival_objects',
                'accessions', 'authority_id', 'source', 'create_time',
                'wikidata_uri', 'wikidata_name', 'wikidata_begin', 'wikidata_end'
            ])
        dupe_match.set_index('index', inplace=True)
        matches = match_strings(master_match['name_concat'],
                                dupe_match['wikidata_name'])
        matches.to_csv(cfg.get('output_csv'), mode='a', header=False, index=False)
# Fragment: doc, nlp, shape_matches, entities_detected, clean_text and result
# are defined earlier in the script (not shown).
words_detected = []
for m_id, start, end in shape_matches:
    entity = doc[start:end]
    words_detected.append(clean_text(entity.text))
    entities_detected.append((entity.text, entity.start_char, entity.end_char,
                              nlp.vocab.strings[m_id], ""))
    print(entity.text)

req_words = []
for t in doc:
    if (len(t.text) > 2 and t.text != "break" and not t.is_stop
            and clean_text(t.text) not in words_detected):
        print(t.text)
        req_words.append(t.text.lower())

# Deduplicate the remaining words and the lookup vocabulary.
duplicates = pd.Series(pd.Series(req_words).unique())
string_words = pd.Series(result["Word"].unique())
# Create all matches:
matches = match_strings(string_words, duplicates, ngram_size=3, min_similarity=0.4)
print(matches)
def isarticle(x):
    # An "article" encodes a year as its second pipe-delimited field.
    sp = x.split("|")
    if len(sp) < 2:
        return False
    try:
        int(sp[1])
        return True
    except ValueError:
        return False

strings = [x for x in strings if '[no title captured]' not in x]
articles = [x for x in strings if isarticle(x)]
books = [x for x in strings if not isarticle(x)]
print("%s articles, %s books to group" % (len(articles), len(books)))

# Grouping books.
# This cell may take quite a while to run:
# on an Intel i7-9700F it runs in about a minute on 185k names.
books_grouped = string_grouper.match_strings(
    pd.Series(books), number_of_processes=8, min_similarity=0.7
)

from collections import defaultdict

books_grouped[(books_grouped.similarity < 1 - 1e-8)].sort_values("similarity")

# For books, we require that the authors are no more than 1 edit from each other.
# Even after limiting the comparisons necessary, this takes about 20s on an Intel i7-9700F.
ft = defaultdict(set)
for i, r in books_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side
import json

import pandas as pd
from string_grouper import match_strings

# tma_assignor_name, tma_assignor_id and ciq_name are loaded earlier (not shown).
with open(
        '/Users/haoranliu/match/Trademark/Clean_name/Clean/cleaned/ciq_id.json'
) as f:
    ciq_id = json.load(f)

tma_assignor = pd.DataFrame()
tma_assignor['name'] = tma_assignor_name
tma_assignor['id'] = tma_assignor_id

ciq = pd.DataFrame()
ciq['name'] = ciq_name
ciq['id'] = ciq_id

# num = 0.9
# matches = match_strings(master=tma_assignor['name'], master_id=tma_assignor['id'],
#                         duplicates=ciq['name'], duplicates_id=ciq['id'], min_similarity=num)
# matches.to_stata(f'assignor_ciq{num}.dta', version=117)

# num = 0.8
# matches = match_strings(master=tma_assignor['name'], master_id=tma_assignor['id'],
#                         duplicates=ciq['name'], duplicates_id=ciq['id'], min_similarity=num)
# matches.to_stata(f'assignor_ciq{num}.dta', version=117)

num = 0.7
matches = match_strings(master=tma_assignor['name'], master_id=tma_assignor['id'],
                        duplicates=ciq['name'], duplicates_id=ciq['id'],
                        min_similarity=num)
matches.to_stata(f'assignor_ciq{num}.dta', version=117)

# num = 0.6
# matches = match_strings(master=tma_assignor['name'], master_id=tma_assignor['id'],
#                         duplicates=ciq['name'], duplicates_id=ciq['id'], min_similarity=num)
# matches.to_stata(f'assignor_ciq{num}.dta', version=117)

# string_grouper.match_strings() with (master, master_id, duplicates, duplicates_id, min_similarity)
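# When master_id/duplicates_id are supplied, string_grouper carries the ids
# through into the output alongside the matched strings, so each fuzzy pair
# can be traced back to both source tables; a quick sanity check might be:
# print(matches.head())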
def run(self):
    from knowknow import pd, Counter, VariableNotFound
    from collections import defaultdict
    import string_grouper
    import editdistance

    # Tracks the last group-id assigned.
    new_gid = 0

    print(len(self.strings), 'strings total...')

    def isarticle(x):
        sp = x.split("|")
        if len(sp) < 2:
            return False
        try:
            int(sp[1])
            return True
        except ValueError:
            return False

    strings = [x for x in self.strings if '[no title captured]' not in x]
    articles = [x for x in strings if isarticle(x)]
    books = [x for x in strings if not isarticle(x)]

    print('sample articles:', articles[:10])
    print('sample books:', books[:10])
    print("%s articles, %s books to group" % (len(articles), len(books)))

    # Grouping books.
    # This cell may take quite a while to run:
    # on an Intel i7-9700F it runs in about a minute on 185k names.
    self.books_grouped = string_grouper.match_strings(
        pd.Series(books), number_of_processes=8, min_similarity=0.7)

    # For books, we require that the authors are no more than 1 edit from each other.
    # Even after limiting the comparisons necessary, this takes about 20s on an Intel i7-9700F.
    self.ft = defaultdict(set)
    for i, r in self.books_grouped.iterrows():
        ls = r.left_side
        rs = r.right_side
        if ls == rs:
            continue
        la = ls.split("|")[0]
        ra = rs.split("|")[0]
        if editdistance.eval(la, ra) > 1:
            continue
        self.ft[ls].add(rs)
        self.ft[rs].add(ls)

    print("%s books have some connection to others in a group" % len(self.ft))

    # Assigns group-ids based on the relational structure derived thus far.
    # The code propagates ids through the network, assuming transitivity of equality.
    for i, k in enumerate(books):
        if k in self.groups:
            continue
        self.traverse(k, new_gid)
        new_gid += 1

    print(len(set(self.groups.values())), 'groups total')
    print(Counter(gid for x, gid in self.groups.items()
                  if len(x.split("|")) == 2).most_common(10))

    # Grouping articles.
    # This cell may take quite a while to run:
    # on an Intel i7-9700F it runs in five minutes on 234k entries.
    self.articles_grouped = string_grouper.match_strings(
        pd.Series(articles),
        # Decrease this number to 1 or 2 for slower computers or laptops
        # (the fan might start screaming).
        number_of_processes=8,
        # The similarity cutoff is tighter for articles than for books.
        min_similarity=0.8
    )
    self.articles_grouped[(self.articles_grouped.similarity < 1 - 1e-8)].sort_values("similarity")

    # For articles, we require that the entire citation strings are no more
    # than 2 edits apart, matching the editdistance check below.
    # Even after limiting the comparisons necessary, this takes about 20s on an Intel i7-9700F.
    # This produces the `ft` variable, which maps each term to the set of
    # equivalent terms, i.e. `ft[A] = {B1, B2, B3}`.
    self.ft = defaultdict(set)
    for i, r in self.articles_grouped.iterrows():
        ls = r.left_side
        rs = r.right_side
        if ls == rs:
            continue
        if editdistance.eval(ls, rs) > 2:
            continue
        self.ft[ls].add(rs)
        self.ft[rs].add(ls)
        # print(ls, "|||", rs)

    print("%s articles have some connection to others in a group" % len(self.ft))

    # Assigns group-ids based on the relational structure derived thus far.
    # The code propagates ids through the network, assuming transitivity of equality.
    for i, k in enumerate(articles):
        if k in self.groups:
            continue
        self.traverse(k, new_gid)
        new_gid += 1

    # Break execution if there aren't as many groups assigned as we have
    # articles and books.
    assert (len(articles) + len(books) == len(self.groups))

    print("%s books and %s articles total" % (len(books), len(articles)))

    # Saving the variables for later.
    self.dataset.save_variable("groups", self.groups)
    self.dataset.save_variable("group_reps", self.get_reps())
def string_matcher(col1, col2, sim_thresh=0.95):
    from string_grouper import match_strings
    matches = match_strings(col1, col2, min_similarity=sim_thresh)
    return matches
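# A minimal sketch of calling string_matcher on two toy name columns; the
# data and threshold are illustrative only.
# left = pd.Series(['Acme Corp', 'Beta LLC'])
# right = pd.Series(['ACME CORP.', 'Gamma Ltd'])
# string_matcher(left, right, sim_thresh=0.8)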
punti_search = func.test_search_functions(search_functions_to_test, possibilities,
                                          src.list_of_searches, src.categories,
                                          saveresults=True, savelog=True)

# %% codecell
# Plot the result charts.
folder_log = os.path.join(folder, "log_ricerca")
func.plot_figure(folder_log, src.categories)

# %%
# Test for StringGrouper.
import pandas as pd
import numpy as np
from string_grouper import match_strings, match_most_similar, group_similar_strings, StringGrouper

company_names = os.path.join(os.getcwd(), 'sec_edgar_company_info.csv')
# We only look at the first 50k as an example.
companies = pd.read_csv(company_names)[0:50000]
c = companies['Company Name']

# Create all matches:
matches = match_strings(companies['Company Name'])
# Look at only the non-exact matches:
matches[matches.left_side != matches.right_side].head()

# Create a small set of artificial company names:
duplicates = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar',
                        'B4UTRADE COM CORP'])
# Create all matches:
matches = match_strings(companies['Company Name'], duplicates)
matches

# Create a small set of artificial company names:
new_companies = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar',
                           'B4UTRADE COM CORP'])
# Create all matches:
matches = match_most_similar(companies['Company Name'], new_companies)
# Display the results:
pd.DataFrame({'new_companies': new_companies, 'duplicates': matches})

# %%
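# %%
# group_similar_strings is imported above but never exercised; a short sketch,
# left commented out (the exact output shape varies a little across
# string_grouper versions):
# groups = group_similar_strings(companies['Company Name'][0:1000])
# groups.head()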