from collections import defaultdict

import pandas as pd

# token_sort_ratio is assumed to be imported from a string-similarity library
# that returns scores normalized to [0, 1] (e.g. textacy.similarity); the 0.8
# threshold below only makes sense on that scale.


def main(filepath):
    linkset = defaultdict(set)

    df = pd.read_csv(filepath)
    for r in df.to_dict(orient='records'):
        dataset = r['dataset']

        a = r['owner_a']
        b = r['owner_b']

        # flag owner names that look like the same person
        p = token_sort_ratio(a, b)
        if p > .8:
            print(round(p, 2), a, b, sep='\t')

        # only keep records for the act types of interest
        if r['actType'] not in [
                "Boedelinventaris", "Boedelscheiding", "Testament", "Overig",
                "Huwelijkse voorwaarden", "Kwitantie"
        ]:
            continue

        linkset[r['inventory']].add(
            (r['dataset'], r['record'], r['actType']))

    return linkset
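# Minimal usage sketch (not from the original project): assumes a CSV with the
# columns the loop above reads (dataset, owner_a, owner_b, actType, inventory,
# record); the file name 'records.csv' is hypothetical.
if __name__ == '__main__':
    linkset = main('records.csv')
    for inventory, acts in linkset.items():
        print(inventory, len(acts))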
def are_nouns_similar(noun1, noun2):
    # jaccard, jaro_winkler, levenshtein, hamming, token_sort_ratio and
    # dice_coefficient are assumed to be imported from a string-similarity
    # library that returns scores normalized to [0, 1]
    # (e.g. textacy.similarity); only the Levenshtein score actually drives
    # the decision below, the other scores are computed but unused.
    jaccardD = jaccard(noun1, noun2)
    jaro = jaro_winkler(noun1, noun2)
    lev = levenshtein(noun1, noun2)
    hammingD = hamming(noun1, noun2)
    tsr = token_sort_ratio(noun1, noun2)
    dice = dice_coefficient(noun1, noun2)

    if lev > 0.42:
        return True
    return False
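# Illustrative sketch (not from the original code): assuming levenshtein
# returns a normalized similarity in [0, 1], a near-identical pair should
# clear the 0.42 threshold while an unrelated pair should not.
print(are_nouns_similar('colour', 'color'))   # expected: True
print(are_nouns_similar('house', 'bicycle'))  # expected: False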
def aggregate_term_variants(terms, acro_defs=None, fuzzy_dedupe=True):
    """
    Take a set of unique terms and aggregate terms that are symbolic, lexical,
    and ordering variants of each other, as well as acronyms and fuzzy string matches.

    Args:
        terms (Set[str]): set of unique terms with potential duplicates
        acro_defs (dict): if not None, terms that are acronyms will be
            aggregated with their definitions and terms that are definitions
            will be aggregated with their acronyms
        fuzzy_dedupe (bool): if True, fuzzy string matching will be used
            to aggregate similar terms of a sufficient length

    Returns:
        List[Set[str]]: each item is a set of aggregated terms

    Notes:
        Partly inspired by aggregation of variants discussed in
        Park, Youngja, Roy J. Byrd, and Branimir K. Boguraev.
        "Automatic glossary extraction: beyond terminology identification."
        Proceedings of the 19th international conference on Computational
        linguistics - Volume 1. Association for Computational Linguistics, 2002.
    """
    agg_terms = []
    seen_terms = set()
    for term in sorted(terms, key=len, reverse=True):

        if term in seen_terms:
            continue

        variants = set([term])
        seen_terms.add(term)

        # symbolic variations
        if '-' in term:
            variant = term.replace('-', ' ').strip()
            if variant in terms.difference(seen_terms):
                variants.add(variant)
                seen_terms.add(variant)
        if '/' in term:
            variant = term.replace('/', ' ').strip()
            if variant in terms.difference(seen_terms):
                variants.add(variant)
                seen_terms.add(variant)

        # lexical variations
        term_words = term.split()
        # last_word = term_words[-1]
        # # assume last word is a noun
        # last_word_lemmatized = lemmatizer.lemmatize(last_word, 'n')
        # # if the same, either already a lemmatized noun OR a verb; try verb
        # if last_word_lemmatized == last_word:
        #     last_word_lemmatized = lemmatizer.lemmatize(last_word, 'v')
        # # if at least we have a new term... add it
        # if last_word_lemmatized != last_word:
        #     term_lemmatized = ' '.join(term_words[:-1] + [last_word_lemmatized])
        #     if term_lemmatized in terms.difference(seen_terms):
        #         variants.add(term_lemmatized)
        #         seen_terms.add(term_lemmatized)

        # if term is an acronym, add its definition
        # if term is a definition, add its acronym
        if acro_defs:
            for acro, def_ in acro_defs.items():
                if acro.lower() == term.lower():
                    variants.add(def_.lower())
                    seen_terms.add(def_.lower())
                    break
                elif def_.lower() == term.lower():
                    variants.add(acro.lower())
                    seen_terms.add(acro.lower())
                    break

        # if 3+ -word term differs by one word at the start or the end
        # of a longer phrase, aggregate
        if len(term_words) > 2:
            term_minus_first_word = ' '.join(term_words[1:])
            term_minus_last_word = ' '.join(term_words[:-1])
            if term_minus_first_word in terms.difference(seen_terms):
                variants.add(term_minus_first_word)
                seen_terms.add(term_minus_first_word)
            if term_minus_last_word in terms.difference(seen_terms):
                variants.add(term_minus_last_word)
                seen_terms.add(term_minus_last_word)

        # check for "X of Y" <=> "Y X" term variants
        if ' of ' in term:
            split_term = term.split(' of ')
            variant = split_term[1] + ' ' + split_term[0]
            if variant in terms.difference(seen_terms):
                variants.add(variant)
                seen_terms.add(variant)

        # intense de-duping for sufficiently long terms
        if fuzzy_dedupe is True and len(term) >= 13:
            for other_term in sorted(terms.difference(seen_terms), key=len, reverse=True):
                if len(other_term) < 13:
                    break
                tsr = token_sort_ratio(term, other_term)
                if tsr > 0.93:
                    variants.add(other_term)
                    seen_terms.add(other_term)
                    break

        agg_terms.append(variants)

    return agg_terms
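# Hedged usage sketch (the terms and acronym mapping below are illustrative
# only): shows a sub-phrase and an acronym definition being grouped with the
# full term, per the docstring above; each aggregated variant set is printed.
terms = {'natural language processing', 'language processing', 'NLP', 'machine learning'}
acro_defs = {'NLP': 'natural language processing'}
for group in aggregate_term_variants(terms, acro_defs=acro_defs):
    print(group)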
def test_empty(self, text_pairs):
    # these tests assume `similarity` is a module exposing token_sort_ratio
    # normalized to [0, 1] (e.g. textacy's similarity module) and that pytest
    # is imported at module level
    for text1, text2 in text_pairs:
        assert similarity.token_sort_ratio(text1, "") == 0.0
def test_identity(self, text_pairs):
    for text1, text2 in text_pairs:
        assert similarity.token_sort_ratio(text1, text1) == pytest.approx(1.0, rel=1e-3)
        assert similarity.token_sort_ratio(text2, text2) == pytest.approx(1.0, rel=1e-3)
def test_default(self, text_pairs):
    for text1, text2 in text_pairs:
        assert 0.0 <= similarity.token_sort_ratio(text1, text2) <= 1.0
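# Hypothetical fixture sketch: the tests above presumably receive text_pairs
# from a pytest fixture like this (normally defined in conftest.py or above
# the test class); the sample pairs are illustrative only.
@pytest.fixture
def text_pairs():
    return [
        ('the quick brown fox', 'the brown quick fox'),
        ('token sort ratio', 'ratio sort token'),
    ]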