def get_best_matches(text, to_compare, top_n=5, case_sensitive=True, include_percentage=False):
    """Return the ``top_n`` entries of ``to_compare`` most similar to ``text``.

    Similarity is ``fuzz.ratio``. When ``case_sensitive`` is false both
    strings are lower-cased before scoring. Results are ordered best-first;
    entries with equal scores keep their input order.

    Args:
        text: The string to compare against.
        to_compare: Iterable of candidate strings.
        top_n: Maximum number of results to return.
        case_sensitive: Score the strings as given instead of lower-cased.
        include_percentage: Return ``(score, candidate)`` tuples instead of
            bare candidates.

    Returns:
        A list of candidates, or of ``(score, candidate)`` tuples.
    """
    def score(candidate):
        if case_sensitive:
            return fuzz.ratio(text, candidate)
        return fuzz.ratio(text.lower(), candidate.lower())

    ranked = sorted(
        ((score(candidate), candidate) for candidate in to_compare),
        key=lambda pair: pair[0],
        reverse=True,
    )
    best = ranked[0:top_n]
    if include_percentage:
        return best
    return [candidate for _, candidate in best]
def get_candidate_lines(keylines, lines, meta):
    """Pick, for each key line, the most similar line of the document.

    Args:
        keylines: Sequence of dicts; each is read for ``'line'`` (reference
            text) and ``'helper_line'`` (truthy for lines skipped on short
            documents).
        lines: Sequence of document lines to search.
        meta: Dict read for ``'page_count'``.

    Returns:
        One dict per considered key line with the keys ``'keyline'``,
        ``'line'``, ``'similarity'`` and ``'position'`` (index of the chosen
        line within the searched scope). Empty when there are no key lines
        or no document lines.
    """
    # Without reference lines or document lines there is nothing to match;
    # max() below would otherwise raise on the empty input.
    if not keylines or not lines:
        return []

    # Heuristic 1: limit the search scope with the best candidate for the
    # LAST reference line. Ties on similarity are resolved by max() on the
    # remaining tuple fields, i.e. in favour of the later position.
    last_reference = keylines[-1]['line']
    cand_score, cand_pos, cand_line = max(
        (fuzz.ratio(last_reference, line), position, line)
        for position, line in enumerate(lines)
    )

    # Only trust the candidate if it is not a random match (more than 55%)
    # and it contains the digit "3" (in rare cases sections 2 and 3 are
    # swapped).
    if cand_score > 55 and '3' in cand_line:
        # +1 because the candidate line itself is very valuable for matching
        search_scope = lines[:cand_pos + 1]
    else:
        search_scope = lines

    # Heuristic 2: very short documents (page count 2 to 4, per the condition
    # below) often contain only brief sections without addresses and
    # detailed descriptions, so helper lines are not taken into account.
    if 1 < meta['page_count'] < 5:
        keyline_scope = [kl for kl in keylines if not kl['helper_line']]
    else:
        keyline_scope = keylines

    # Now pick the best candidate for each reference line; on equal
    # similarity the earliest line wins.
    results = []
    for keyline in keyline_scope:
        candidates = [
            {
                'line': line,
                'similarity': fuzz.ratio(keyline['line'], line),
                'position': position,
            }
            for position, line in enumerate(search_scope)
        ]
        best_match = max(candidates, key=lambda c: c['similarity'])
        results.append({'keyline': keyline, **best_match})
    return results
def invalid_embed_generate(self, pride_leader: str) -> discord.Embed:
    """
    Build the embed shown when the requested pride leader is unknown.

    The input is title-cased and compared with every name in
    `PRIDE_RESOURCE`. Names scoring at least `MINIMUM_FUZZ_RATIO` are listed
    one per line under "Did you mean?". If none qualifies, every available
    name is listed, comma separated. The footer invites contributions.
    """
    wanted = pride_leader.title()
    close_matches = [
        name for name in PRIDE_RESOURCE
        if fuzz.ratio(wanted, name) >= MINIMUM_FUZZ_RATIO
    ]

    if close_matches:
        valid_names = "\n".join(close_matches)
        error_msg = "Did you mean?"
    else:
        valid_names = ", ".join(PRIDE_RESOURCE)
        error_msg = "Sorry your input didn't match any stored names, here is a list of available names:"

    embed = discord.Embed(color=constants.Colours.soft_red)
    embed.description = f"{error_msg}\n```\n{valid_names}\n```"
    embed.set_footer(
        text="To add more pride leaders, feel free to open a pull request!"
    )
    return embed
def init_notebooks(self) -> None:
    """Collect the notebooks of every assignment directory matching the glob.

    Fills ``self.assignments`` with ``{assignment_dir: [notebook paths]}`` and
    resets ``self.notebooks``. Assignment directories without a matching
    notebook are logged as warnings and skipped.

    Raises:
        NbGraderException: If no assignment yielded any notebook. Before
            raising, the closest existing path (by ``fuzz.ratio`` against the
            assignment glob) is logged as a suggestion when one exists.
    """
    self.assignments = {}
    self.notebooks = []
    assignment_glob = self._format_source(self.coursedir.assignment_id, self.coursedir.student_id)

    for assignment in glob.glob(assignment_glob):
        notebook_glob = os.path.join(assignment, self.coursedir.notebook_id + ".ipynb")
        found = glob.glob(notebook_glob)
        if found:
            self.assignments[assignment] = found
        else:
            self.log.warning("No notebooks were matched by '%s'", notebook_glob)

    if self.assignments:
        return

    msg = "No notebooks were matched by '%s'" % assignment_glob
    self.log.error(msg)

    # Look at every assignment of this student to offer a likely correction.
    assignment_glob2 = self._format_source("*", self.coursedir.student_id)
    found = glob.glob(assignment_glob2)
    if found:
        _, closest = max((fuzz.ratio(assignment_glob, x), x) for x in found)
        self.log.error("Did you mean: %s", closest)

    raise NbGraderException(msg)
def has_wake_word(self, phrase):
    """Tell whether ``phrase`` starts with the wake word.

    A single-word phrase is compared with the wake word directly. For longer
    phrases the first word is compared with the prefixes "ok" and "hey"; on a
    prefix score above 80 the second word is tested, otherwise the first.
    The tested word matches when its ``fuzz.ratio`` against
    ``self.wake_word`` (both lower-cased) is at least 80.

    On a match ``self.heard`` is set to the words following the tested word
    (for a single-word phrase, to that word itself).

    Returns:
        bool: True if the wake word was recognised.
    """
    words = phrase.split()
    candidate = False
    start_index = 0

    if len(words) == 1:
        candidate = words[0]
        self.heard = ""
    elif len(words) > 1:
        first_word, second_word = words[0:2]
        scored_prefixes = process.extract(first_word, ["ok", "hey"])
        if any(scored[1] > 80 for scored in scored_prefixes):
            candidate, start_index = second_word, 2
        else:
            candidate, start_index = first_word, 1

    if not (candidate and isinstance(candidate, str)):
        return False

    recognised = fuzz.ratio(candidate.lower(), self.wake_word.lower()) >= 80
    if recognised:
        self.heard = " ".join(words[start_index:])
    return recognised
def get_matched_entries(s, field_values, m_theta=0.85, s_theta=0.85):
    """Find field values that approximately occur in ``s``.

    Args:
        s: The source text, or an already tokenised sequence.
            NOTE(review): when ``s`` is not a ``str`` the slice
            ``s[...]`` below yields a sequence, and ``.lower()`` is then
            called on it -- confirm non-string input is really supported.
        field_values: Iterable of candidate values; non-string entries are
            skipped.
        m_theta: Minimum ``match_score`` for a value to be kept.
        s_theta: Minimum ``s_match_score`` for a value to be kept.

    Returns:
        ``None`` when ``field_values`` is empty or nothing qualifies.
        Otherwise a list of ``(match_str, (field_value, source_match_str,
        match_score, s_match_score, match_size))`` pairs sorted best-first,
        ranking by ``match_score``, then ``s_match_score``, then match size.
    """
    if not field_values:
        return None

    if isinstance(s, str):
        n_grams = split(s)
    else:
        n_grams = s

    # Keyed by the matched substring, so a later field value producing the
    # same substring overwrites the earlier entry.
    matched = dict()
    for field_value in field_values:
        if not isinstance(field_value, string_types):
            continue
        fv_tokens = split(field_value)
        # Longest contiguous run of tokens shared by the source and the value.
        sm = difflib.SequenceMatcher(None, n_grams, fv_tokens)
        match = sm.find_longest_match(0, len(n_grams), 0, len(fv_tokens))
        if match.size > 0:
            source_match = get_effecitve_match_source(n_grams, match.a, match.a + match.size)
            if source_match and source_match.size > 1:
                # NOTE(review): token offsets are used as character offsets
                # here, which presumes `split` tokenises per character --
                # confirm against its definition.
                match_str = field_value[match.b:match.b + match.size]
                source_match_str = s[source_match.start:source_match.start + source_match.size]
                # Normalised copies used for every comparison below.
                c_match_str = match_str.lower().strip()
                c_source_match_str = source_match_str.lower().strip()
                c_field_value = field_value.lower().strip()
                if c_match_str and not utils.is_number(
                        c_match_str) and not utils.is_common_db_term(
                        c_match_str):
                    if utils.is_stopword(c_match_str) or utils.is_stopword(c_source_match_str) or \
                            utils.is_stopword(c_field_value):
                        continue
                    # A source span ending in "<match>'s" counts as a perfect
                    # match.
                    if c_source_match_str.endswith(c_match_str + '\'s'):
                        match_score = 1.0
                    else:
                        if prefix_match(c_field_value, c_source_match_str):
                            match_score = fuzz.ratio(c_field_value, c_source_match_str) / 100
                        else:
                            match_score = 0
                    # Common words are only accepted on a perfect match.
                    if (utils.is_commonword(c_match_str) or utils.is_commonword(c_source_match_str) or
                            utils.is_commonword(c_field_value)
                            ) and match_score < 1:
                        continue
                    # The source-side score is currently just a copy of the
                    # match score, so s_theta acts as a second threshold on
                    # the same number.
                    s_match_score = match_score
                    if match_score >= m_theta and s_match_score >= s_theta:
                        # All-uppercase field values are only accepted on a
                        # perfect match.
                        if field_value.isupper(
                        ) and match_score * s_match_score < 1:
                            continue
                        matched[match_str] = (field_value, source_match_str, match_score, s_match_score, match.size)

    if not matched:
        return None
    else:
        # The large weights make this a lexicographic ordering on
        # (match_score, s_match_score, match size), best first.
        return sorted(matched.items(), key=lambda x: (1e16 * x[1][2] + 1e8 * x[1][3] + x[1][4]), reverse=True)
def get_matches(self, file, files):
    """Rank the notebook entries of ``files`` by similarity to ``file``.

    Args:
        file: Sequence whose first element is the name to look for.
        files: Iterable of sequences; an entry is considered when its last
            element equals ``'ipynb'``, and its element at index 1 is the
            name compared against ``file[0]``.

    Returns:
        tuple: ``(matches, sims)`` -- the considered entries and their
        ``fuzz.ratio`` scores, both ordered from most to least similar
        (entries with equal scores keep their input order).
    """
    notebooks = [f for f in files if f[-1] == 'ipynb']
    sims = [fuzz.ratio(file[0], m[1]) for m in notebooks]
    # Sort by each entry's OWN score. The previous key indexed the reversed
    # score list, which ordered entries by the score of their mirror
    # position and only looked sorted for symmetric inputs.
    order = sorted(range(len(sims)), key=sims.__getitem__, reverse=True)
    matches = [notebooks[i] for i in order]
    sims = [sims[i] for i in order]
    return matches, sims
def binary_fuzzy_match(pat, txt, threshold, local=1):
    """
    Decide whether a pattern fuzzily occurs in a longer text.

    The pattern does not need to match character for character; the
    tolerance for mismatches is controlled by ``threshold``.

    Args:
        pat (str): The shorter text to search for.
        txt (str): The larger text to search within.
        threshold (float): Fraction between 0 and 1; the similarity score
            (0-100) must reach ``threshold * 100`` to count as a match.
        local (int, optional): 1 scores with ``fuzz.partial_ratio`` (local
            alignment), any other value with ``fuzz.ratio`` (global).

    Returns:
        bool: True if the pattern was found, False if it was not or if the
        pattern is longer than the text.
    """
    # A pattern longer than the text can never be contained in it.
    if len(pat) > len(txt):
        return False

    scorer = fuzz.partial_ratio if local == 1 else fuzz.ratio
    return bool(scorer(pat, txt) >= threshold * 100)
def get_combined_fuzz_score(self, a, b, mode='geom_mean'):
    """Score two names by combining their full and partial fuzzy ratios.

    Both names are passed through ``clean_name`` first. The ``fuzz.ratio``
    and ``fuzz.partial_ratio`` results are multiplied by
    ``self.weight['simple']`` and ``self.weight['partial']`` respectively and
    handed to ``self.combine_scores`` together with ``mode``.

    Returns:
        Whatever ``self.combine_scores`` returns for the two weighted scores.
    """
    left = clean_name(a)
    right = clean_name(b)

    weighted_simple = fuzz.ratio(left, right) * self.weight['simple']
    weighted_partial = fuzz.partial_ratio(left, right) * self.weight['partial']

    return self.combine_scores(float(weighted_simple), float(weighted_partial), mode=mode)
def find_similar(search_for, dataset):
    """Generator yielding the best match for ``search_for`` and its score.

    Yields exactly two items: first the entry of ``dataset`` with the highest
    ``fuzz.ratio`` against ``search_for`` (the earliest one on ties), then
    that score. This allows ``entry, score = find_similar(...)``.

    Raises:
        ValueError: On first iteration, if ``dataset`` is empty.
    """
    scores = [fuzz.ratio(search_for, data) for data in dataset]
    best_index = max(range(len(scores)), key=scores.__getitem__)
    yield dataset[best_index]
    yield scores[best_index]
def subseq_matcher(seq1, seq2):
    """Pair up similar lines of two sequences.

    Every line of ``seq1`` is scored against every line of ``seq2`` with
    ``ratio``; scores of 30 or less are discarded (this presumes ``ratio``
    reports on a 0-100 scale -- TODO confirm against its import). Pairs are
    then assigned greedily, best score first, each line being used at most
    once. On equal scores the pair closest to the diagonal is preferred.

    Args:
        seq1: Sequence of lines (left side).
        seq2: Sequence of lines (right side).

    Returns:
        list: ``[left, right]`` pairs. Unmatched lines appear paired with an
        empty string.
    """
    ls_grid = np.zeros((len(seq1), len(seq2)))
    for subseq1_index, subseq1 in enumerate(seq1):
        for subseq2_index, subseq2 in enumerate(seq2):
            ra = ratio(subseq1, subseq2)
            ls_grid[subseq1_index][subseq2_index] = ra if ra > 30 else 0

    # Greedy assignment. Stop as soon as no positive score is left: testing
    # for "> 0" (rather than "!= 0") also terminates when the grid holds
    # nothing but -1 markers (a matched 1x1 grid), and the size check skips
    # np.amax on an empty grid, where it would raise.
    while ls_grid.size and np.amax(ls_grid) > 0:
        max_val = np.argwhere(ls_grid == np.amax(ls_grid))
        if len(max_val) != 1:
            # Several cells share the best score: take the one whose row and
            # column indices are closest to each other.
            max_val = [max_val[np.argmin([np.abs(x - y) for x, y in max_val])]]
        row, col = max_val[0][0], max_val[0][1]
        # Retire the matched row and column, then mark the pair with -1.
        ls_grid[:, col] = 0
        ls_grid[row, :] = 0
        ls_grid[row][col] = -1

    matched_seq = []
    if len(seq1) <= len(seq2):
        # Walk seq2 in order, interleaving unmatched seq1 lines by index.
        for col_id, col in enumerate(ls_grid.T):
            match = np.argwhere(col == -1)
            if len(match) == 0:
                matched_seq.append(["", seq2[col_id]])
            else:
                matched_seq.append([seq1[match[0][0]], seq2[col_id]])
            # A row summing to -1 holds a match marker; any other sum means
            # the seq1 line with this index stayed unmatched.
            if col_id < len(seq1) and np.sum(ls_grid[col_id][:]) != -1:
                matched_seq.append([seq1[col_id], ""])
    else:
        # Walk seq1 in order, interleaving unmatched seq2 lines by index.
        for row_id, col in enumerate(ls_grid):
            match = np.argwhere(col == -1)
            if len(match) == 0:
                matched_seq.append([seq1[row_id], ""])
            else:
                matched_seq.append([seq1[row_id], seq2[match[0][0]]])
            if row_id < len(seq2) and np.sum(ls_grid.T[row_id, :]) != -1:
                matched_seq.append(["", seq2[row_id]])
    return matched_seq
def find_similar_pairs(tags, *, required_similarity=80):
    """
    Yield every pair of similar-looking tags from the collection ``tags``.

    Tags are sorted first, so each pair is yielded once as ``(t1, t2)`` in
    sorted order. A pair qualifies when its ``fuzz.ratio`` is strictly
    greater than ``required_similarity``; raise that value for stricter
    matching (=> fewer results).
    """
    ordered_tags = sorted(tags)
    for pair in itertools.combinations(ordered_tags, 2):
        if fuzz.ratio(*pair) > required_similarity:
            yield pair
def search(self, name, threshold=80):
    """Look up manufacturers whose name variants resemble ``name``.

    Every variant of every entry in ``self.manufacturers`` is compared with
    ``name`` (both lower-cased). A variant scoring strictly above
    ``threshold`` contributes one ``(manufacturer[0], score)`` tuple, so a
    manufacturer can appear once per matching variant.

    Returns:
        list: ``(first variant, score)`` tuples, highest score first.
    """
    scored_variants = (
        (manufacturer[0], fuzz.ratio(variant.lower(), name.lower()))
        for manufacturer in self.manufacturers
        for variant in manufacturer
    )
    hits = [pair for pair in scored_variants if pair[1] > threshold]
    hits.sort(key=lambda pair: pair[1], reverse=True)
    return hits
def _assignment_not_found(self, src_path, other_path):
    """Report a missing assignment and abort.

    Logs a fatal message naming ``src_path``. If anything matches the glob
    ``other_path``, the match most similar to ``src_path`` is logged as a
    suggestion.

    Args:
        src_path: The path that was expected to hold the assignment.
        other_path: Glob pattern of places where the assignment might be.

    Raises:
        ExchangeError: Always, carrying the "not found" message.
    """
    msg = "Assignment not found at: {}".format(src_path)
    self.log.fatal(msg)
    found = glob.glob(other_path)
    if found:
        # Score against the path that was actually reported missing (the
        # argument), not self.src_path, so the suggestion and the message
        # always refer to the same path. Ties fall back to comparing paths.
        closest = max(found, key=lambda x: (fuzz.ratio(src_path, x), x))
        self.log.error("Did you mean: %s", closest)
    raise ExchangeError(msg)
def find_similarity(col1, col2):
    """Score ``col1`` against ``col2`` with the algorithm selected by ``algo``.

    ``algo`` is read from the enclosing scope. ``"rapidfuzz"`` uses
    ``fuzz.ratio``, ``"editdistance"`` uses ``editdistance.eval``, and any
    other value falls back to ``SequenceMatcher(...).ratio()``.

    NOTE(review): the three back-ends presumably report on different scales
    (a 0-100 ratio, an edit distance, a 0-1 ratio) -- confirm callers
    interpret the result per algorithm.
    """
    if algo == "rapidfuzz":
        return fuzz.ratio(col1, col2)
    if algo == "editdistance":
        return editdistance.eval(col1, col2)
    # Default: difflib, with no junk heuristic.
    return SequenceMatcher(None, col1, col2).ratio()
def calc_order_score(document_keylines, ordered):
    '''
    Compare the order of two lists of key lines.

    Each distinct element of ``document_keylines`` is encoded as a single
    character (``chr(index + 100)``), both lists are turned into strings with
    that encoding, and the ``fuzz.ratio`` of the two strings is returned.
    Every element of ``ordered`` must occur in ``document_keylines``.
    '''
    symbol_for = {}
    for index, line in enumerate(document_keylines):
        symbol_for[line] = chr(index + 100)

    def encode(sequence):
        return ''.join(symbol_for[line] for line in sequence)

    return fuzz.ratio(encode(document_keylines), encode(ordered))
def get_fuzzy_list(utterances: list, intent_ids: list):
    """List, for every utterance, the other utterances that nearly match it.

    Each pair of utterances is compared once with ``fuzz.ratio`` using
    ``score_cutoff=90``; a truthy result records the match on both sides as
    ``"Row <n>: [<intent id>] <utterance>"``, where the row number is the
    utterance's index plus 2.

    Returns:
        list: One list of match descriptions per utterance, in input order.
    """
    match_lists = [[] for _ in utterances]

    for i, (utterance, _) in enumerate(zip(utterances, intent_ids)):
        logger.info(f"Processing utterance {i + 1} of {len(utterances)}.")
        # Only look ahead: earlier pairs were already handled symmetrically.
        for j, choice in enumerate(utterances[i + 1:]):
            if not fuzz.ratio(utterance, choice, score_cutoff=90):
                continue
            match_lists[i].append(f"Row {j + i + 3}: [{intent_ids[j + i + 1]}] {choice}")
            match_lists[j + i + 1].append(f"Row {i + 2}: [{intent_ids[i]}] {utterance}")

    return match_lists
def _get_caption(self, imgname: str) -> List[str]: max_similarity = 0 match = None for imgpath in self.img2caption.keys(): if imgname in imgpath: similarity = ratio(imgname, imgpath) if similarity > max_similarity: match = imgpath max_similarity = similarity # Lookup best match return self.img2caption[match]
def match(self, from_list, to_list):
    """Map every string of ``from_list`` to its most similar ``to_list`` entry.

    Similarity is ``fuzz.ratio`` divided by 100. For each source string the
    first best-scoring target is chosen.

    Returns:
        pandas.DataFrame: Columns ``From``, ``To`` and ``Similarity``, one
        row per entry of ``from_list``.
    """
    # One row per source string, one column per target string.
    score_matrix = np.array([
        [fuzz.ratio(from_string, to_string) / 100 for to_string in to_list]
        for from_string in from_list
    ])

    best_columns = np.argmax(score_matrix, axis=1)
    mappings = [to_list[column] for column in best_columns]
    scores = np.max(score_matrix, axis=1)

    return pd.DataFrame({'From': from_list, 'To': mappings, 'Similarity': scores})
def is_words_similar(string, model):
    """
    Tell whether two strings are similar enough

    The strings are compared with ``fuzz.ratio`` using ``score_cutoff=75``;
    a truthy score counts as similar.

    :param string: user input
    :param model: model string
    :return: True if the words are similar
    :rtype: bool
    """
    return bool(fuzz.ratio(string, model, score_cutoff=75))
def search_by_url(data: dict, url: str, topn: int = 5) -> List[tuple]:
    """Find the items of ``data["list"]`` whose resolved URL resembles ``url``.

    Comparison is case-insensitive and uses ``fuzz.ratio``; items without a
    ``"resolved_url"`` are ignored.

    Returns:
        Up to ``topn`` ``(score, item_id, vals)`` tuples, best first. If the
        best score exceeds 95, only that single item is returned.
    """
    assert url
    logger.info(f"Searching for url={url}")

    needle = url.lower()
    scored = []
    for item_id, vals in data["list"].items():
        dest_url = vals.get("resolved_url", "").lower()
        if not dest_url:
            continue
        scored.append((fuzz.ratio(needle, dest_url), item_id, vals))

    scored.sort(key=lambda x: x[0], reverse=True)
    top = scored[:topn]

    # A near-exact hit makes the remaining candidates irrelevant.
    if top and top[0][0] > 95:
        return top[:1]
    return top
def get_best_result(title, configfile, dbfile):
    """Return the payload of the search result that best matches ``title``.

    The second element of ``get(title, configfile, dbfile, sj_only=True)`` is
    read as a mapping with keys ``"result1000"``, ``"result1001"``, ... whose
    values carry a ``'title'`` and a ``'payload'``. Titles are compared with
    ``title`` via ``fuzz.ratio`` after removing a trailing parenthesised
    part. The winner must additionally start with ``title`` (spaces matching
    any character, case-insensitive).

    Returns:
        The payload of the best result, or ``False`` when the search failed
        or nothing acceptable was found. In the latter case ``title`` is
        stored in the two search lists if it is not there yet.
    """
    try:
        sj_results = get(title, configfile, dbfile, sj_only=True)[1]
    except Exception:
        return False

    # Keep every title together with its key, so the winner is looked up
    # under the key it really came from. The previous index bookkeeping
    # advanced by 1001 per title ("i += 1 + 1000") and pointed at keys that
    # do not exist for every result but the first.
    candidates = []
    for offset in range(len(sj_results)):
        key = "result" + str(offset + 1000)
        try:
            candidate_title = sj_results.get(key).get('title')
        except (AttributeError, TypeError):
            # Key missing or entry malformed: skip it.
            continue
        if isinstance(candidate_title, str):
            candidates.append((key, candidate_title))

    best_score = 0
    best_key = None
    for key, candidate_title in candidates:
        cleaned = re.sub(r"\s\(.*\)", "", candidate_title)
        score = fuzz.ratio(title, cleaned)
        if score > best_score:
            best_score = score
            best_key = key

    best_title = False
    best_payload = None
    if best_key is not None:
        try:
            best_entry = sj_results.get(best_key)
            best_title = best_entry.get('title')
            if not re.match(r"^" + title.replace(" ", ".") + r".*$", best_title, re.IGNORECASE):
                best_title = False
            best_payload = best_entry.get('payload')
        except (AttributeError, TypeError, re.error):
            # re.error: the title contains characters that do not form a
            # valid pattern; treat that as "no match", as before.
            best_title = False

    if not best_title:
        logger.debug('Kein Treffer fuer die Suche nach ' + title + '! Suchliste ergänzt.')
        listen = ["List_ContentShows_Shows", "List_ContentAll_Seasons"]
        for liste in listen:
            cont = ListDb(dbfile, liste).retrieve()
            if not cont:
                cont = ""
            if title not in cont:
                ListDb(dbfile, liste).store(title)
        return False

    logger.debug('Bester Treffer fuer die Suche nach ' + title + ' ist ' + best_title)
    return best_payload
def findItemName(self, itemDictionary, messageItem):
    """Return the item name whose label best matches ``messageItem``.

    Labels are scored with ``fuzz.ratio``, passing the best score so far as
    ``score_cutoff``. Only a strictly better score replaces the current
    best, so the first of several equally good labels wins.

    Returns:
        The key of the best-matching label, or ``None`` if the dictionary is
        empty, no label scores above 0, or a ``KeyError`` interrupts the scan
        before any match.
    """
    bestItem = None
    bestScore = 0
    try:
        for itemName, itemLabel in itemDictionary.items():
            candidateScore = fuzz.ratio(messageItem, itemLabel, score_cutoff=bestScore)
            if candidateScore > bestScore:
                bestScore, bestItem = candidateScore, itemName
    except KeyError:
        pass
    return bestItem
def match(
    left: pd.Series,
    right: pd.Series,
    preprocess: bool = False,
    fuzzy: bool = False,
    threshold: float = 0.8,
) -> pd.Series:
    """
    Compares values between two different Series to check if they match.

    Parameters
    ----------
    left : Series
        Left Series.
    right : Series
        Right Series.
    preprocess : bool
        Whether to clean and standardize values before comparing them.
    fuzzy : bool
        Whether to compare values using fuzzy logic.
    threshold : float
        Threshold to define equal values using fuzzy logic; the fuzzy score
        divided by 100 must reach it.

    Returns
    -------
    Series
        Nullable-boolean Series indicating whether the values match. Rows
        where ``right`` is missing are ``<NA>``.
    """
    if preprocess:
        left = standardize_text(left)
        right = standardize_text(right)

    if fuzzy:
        values = pd.DataFrame({"left": left, "right": right})
        values = values.fillna("")
        # Score row by row into an explicit float Series. DataFrame.apply
        # with axis=1 hands back a DataFrame for empty input, which breaks
        # the comparison below.
        score = pd.Series(
            [fuzz.ratio(lhs, rhs) for lhs, rhs in zip(values["left"], values["right"])],
            index=values.index,
            dtype=float,
        )
        result = (score / 100) >= threshold
    else:
        result = left == right

    nanmask = right.isna()
    # Convert to the nullable dtype first, then blank the rows: writing NaN
    # into a plain bool Series relies on implicit upcasting, which pandas
    # has deprecated.
    result = result.astype("boolean")
    result.loc[nanmask] = pd.NA

    return result
def get_potential(iterable: Iterable, *, threshold: int = 80) -> list[str]: nonlocal name potential = [] for item in iterable: original, item = item, item.lower() if name == item: return [original] a, b = fuzz.ratio(name, item), fuzz.partial_ratio(name, item) if a >= threshold or b >= threshold: potential.append(original) return potential
def fuzzy_matcher(features, document, match=None):
    """Find phrases in ``document`` that fuzzily match one of ``features``.

    The document is tokenised with ``nltk.word_tokenize``. For every feature
    a window as many tokens wide as the feature has space-separated words is
    slid over the tokens. A window is cut short at the first token
    containing one of ``, ! ? { } [ ]``. The lower-cased window text is then
    compared with the lower-cased feature using ``fuzz.ratio``.

    Args:
        features: Iterable of feature strings.
        document: Text to search.
        match: Score a window must exceed to be reported. ``None`` (the
            default) reports every non-empty window.

    Returns:
        list: ``[matched_phrase, feature, i, j]`` entries, where ``i`` is the
        index of the window's first token and ``j`` the index of the last
        token examined.
    """
    punctuation = re.compile(r'[,!?{}\[\]]')
    matches = []
    tokens = nltk.word_tokenize(document)
    for feature in features:
        feature_length = len(feature.split(" "))
        feature_lower = feature.lower()
        for i in range(len(tokens) - feature_length + 1):
            words = []
            j = 0
            for j in range(i, i + feature_length):
                if punctuation.search(tokens[j]):
                    break
                words.append(tokens[j].lower())
            # Joining the collected words yields the phrase without the
            # leading space that the previous string concatenation left in
            # place (its strip() result was discarded), which lowered every
            # score.
            matched_phrase = " ".join(words)
            if not matched_phrase:
                continue
            score = fuzz.ratio(matched_phrase, feature_lower)
            # Comparing a number with None raises TypeError; treat a missing
            # threshold as "accept everything".
            if match is None or score > match:
                matches.append([matched_phrase, feature, i, j])
    return matches
def get_possible_sds_count(final_score, meta, last_section_candidates, section3_anchor):
    '''
    Main decision function: SDS / non-SDS, plus an attempt to count
    concatenated SDSs inside big files.

    A document scoring below 45 is not an SDS (count 0). A sufficiently
    scored document with fewer than MULTI_SDS_MIN_PAGE_COUNT pages counts as
    one SDS. Bigger documents are assumed to be concatenated multi-SDS files:
    every line containing ``section3_anchor`` is compared with the candidate
    lines of ``last_section_candidates`` (mapping candidate line -> minimum
    similarity) and counted once if it is similar enough to any of them.

    Args:
        final_score: Overall document score.
        meta: Dict read for ``'page_count'`` and, for big documents,
            ``'all_lines'``.
        last_section_candidates: Mapping of candidate line to the similarity
            a document line must exceed.
        section3_anchor: Substring a line must contain to be examined.

    Returns:
        int: The estimated number of SDSs in the document.
    '''
    if final_score < 45:
        # Basic NON SDS case
        sds_count = 0
    elif meta['page_count'] < MULTI_SDS_MIN_PAGE_COUNT:
        sds_count = 1
    else:
        sds_count = 0  # Because we count all SDSs here
        for line in meta['all_lines']:
            if section3_anchor not in line:
                continue
            for candidate_line, min_similarity in last_section_candidates.items():
                if fuzz.ratio(candidate_line, line) > min_similarity:
                    # Special cases for bad (but very similar) lines:
                    # - lines containing a double quote, e.g.
                    #   5.1.3 sds section 3 "composition/information on ingredients"
                    # - lines starting with "(" or "1" (because "11" can be
                    #   a bad OCR of a double quote), e.g.
                    #   (composition/information on ingredients) .
                    bad_line = line.startswith(('(', '1')) or '"' in line
                    if not bad_line:
                        sds_count += 1
                    # Do not test a line any more once it has matched. The
                    # previous `continue` only moved on to the next
                    # candidate, so one line could be counted several times.
                    break

    # Edge case: the ratio between page count and SDS count cannot be very
    # low. One or two pages per SDS indicates a layout problem, e.g. all
    # sections repeated as an agenda on each page.
    if sds_count and meta['page_count'] / sds_count <= 2:
        sds_count = 1
    return sds_count
def sort_found_entities(
    self,
    candidate_entities: List[Tuple[int, str, int]],
    candidate_names: List[List[str]],
    entity: str,
    context: str = None
) -> Tuple[List[str], List[float], List[Tuple[str, str, int, int]]]:
    """Rank candidate entities for the mention ``entity``.

    Each candidate is unpacked as ``(entity_num, entity_id, num_rels,
    tokens_matched)`` and scored with the best ``fuzz.ratio`` between
    ``entity`` and its lower-cased names. Candidates are sorted descending by
    ``(tokens_matched, fuzz ratio, num_rels)``.

    With ``self.use_descriptions`` the top 30 candidates are re-ranked by
    ``self.entity_ranker.rank_rels`` against ``context``. Only candidates
    whose rank score exceeds ``self.descr_rank_score_thres`` or whose fuzz
    ratio is exactly 100 are kept.

    NOTE(review): the annotation declares 3-tuples although four values are
    unpacked per candidate.
    NOTE(review): without descriptions the confidence is ``ent[2] * 0.01``,
    i.e. derived from ``tokens_matched`` rather than the fuzz ratio --
    confirm this is intended.

    Returns:
        ``(entity_ids, confidences, srtd_with_ratios)``.
    """
    entities_ratios = []
    for (entity_num, entity_id, num_rels, tokens_matched), entity_names in zip(
            candidate_entities, candidate_names):
        fuzz_ratio = max(fuzz.ratio(name.lower(), entity) for name in entity_names)
        entities_ratios.append((entity_num, entity_id, tokens_matched, fuzz_ratio, num_rels))

    srtd_with_ratios = sorted(entities_ratios, key=lambda x: (x[2], x[3], x[4]), reverse=True)

    if not self.use_descriptions:
        entity_ids = [ent[1] for ent in srtd_with_ratios]
        confidences = [float(ent[2]) * 0.01 for ent in srtd_with_ratios]
        return entity_ids, confidences, srtd_with_ratios

    log.debug(f"context {context}")
    shortlist = srtd_with_ratios[:30]
    id_to_score = {
        entity_id: (tokens_matched, score)
        for _, entity_id, tokens_matched, score, _ in shortlist
    }
    entity_ids = [entity_id for _, entity_id, _, _, _ in shortlist]
    scores = self.entity_ranker.rank_rels(context, entity_ids)

    entities_with_scores = sorted(
        ((entity_id, id_to_score[entity_id][0], id_to_score[entity_id][1], score)
         for entity_id, score in scores),
        key=lambda x: (x[1], x[2], x[3]),
        reverse=True,
    )
    entities_with_scores = [
        ranked for ranked in entities_with_scores
        if ranked[3] > self.descr_rank_score_thres or ranked[2] == 100.0
    ]
    log.debug(f"entities_with_scores {entities_with_scores[:10]}")

    entity_ids = [ranked[0] for ranked in entities_with_scores]
    confidences = [ranked[3] for ranked in entities_with_scores]
    return entity_ids, confidences, srtd_with_ratios
def search_by_title(data: dict, title: str, topn: int = 5) -> List[tuple]:
    """Find the items of ``data["list"]`` whose resolved title resembles ``title``.

    Comparison is case-insensitive and uses ``fuzz.ratio``; items without a
    ``"resolved_title"`` are ignored.

    Returns:
        Up to ``topn`` ``(score, item_id, vals)`` tuples, best first. If the
        best score exceeds 95, only that single item is returned.
    """
    assert title
    logger.info(f"Searching for title={title}")

    needle = title.lower()
    scored = []
    for item_id, vals in data["list"].items():
        dest_title = vals.get("resolved_title", "").lower()
        if not dest_title:
            continue
        scored.append((fuzz.ratio(needle, dest_title), item_id, vals))

    scored.sort(key=lambda x: x[0], reverse=True)
    top = scored[:topn]

    # A near-exact hit makes the remaining candidates irrelevant.
    if top and top[0][0] > 95:
        return top[:1]
    return top
def handle_presence_intent(self, message):
    """Answer where the roommate named in the utterance currently is.

    The wanted name is taken from ``message.data["entity"]`` and compared
    (lower-cased, via ``fuzz.ratio``) with the real name of every ROOMMATE
    device in the allowed room. The best match scoring above 66 wins and its
    ``presence`` reading is spoken, translated through the
    ``presence.value`` named values. Setup failures, connection errors and
    missing matches are answered with the corresponding error dialog.
    """
    self._setup()
    if self.fhem is None:
        self.speak_dialog('fhem.error.setup')
        return

    wanted = message.data["entity"]
    LOG.debug("wanted: %s" % wanted)

    try:
        roommates = self.fhem.get(room=self.allowed_devices_room, device_type='ROOMMATE')
    except ConnectionError:
        self.speak_dialog('fhem.error.offline')
        return

    if len(roommates) < 1:
        self.speak_dialog('fhem.presence.error')
        return

    presence = None
    bestRatio = 66
    for rm in roommates:
        attributes = rm['Attributes']
        if 'rr_realname' not in attributes:
            continue
        # 'rr_realname' holds the NAME of the attribute carrying the real name.
        realname = attributes[attributes['rr_realname']]
        LOG.debug("realname: %s" % realname)
        ratio = fuzz.ratio(wanted.lower(), realname.lower(), score_cutoff=bestRatio)
        LOG.debug("ratio: %s" % ratio)
        if ratio > bestRatio:
            presence = rm['Readings']['presence']['Value']
            bestName = realname
            bestRatio = ratio

    presence_values = self.translate_namedvalues('presence.value')
    if not presence:
        self.speak_dialog('fhem.presence.error')
        return

    location = presence_values[presence]
    self.speak_dialog('fhem.presence.found', data={'wanted': bestName, 'location': location})