def miss_out_logo_in_corner(self, table, logo_string, cell_coords=(-1, -1)):
    """Cut a trailing logo text off one cell of ``table``.

    The bottom-right cell often ends with the site logo (e.g.
    'differencebetween.net') together with a wrongly detected table border.
    The end of the cell is fuzzy-matched against ``logo_string`` and, when it
    resembles the logo, that tail is removed.

    Args:
        table: Indexable rows of cell strings; the selected cell is
            overwritten in place.
        logo_string: Text the unwanted tail is expected to resemble.
        cell_coords: ``(row, column)`` of the cell to clean. Defaults to the
            last cell of the last row.

    Returns:
        The same ``table`` object, cleaned or untouched.
    """
    row, col = cell_coords[0], cell_coords[1]
    cell = table[row][col]

    # Pre-check on the last 30 characters: leave the table alone unless the
    # tail looks like the logo at all.
    if not (ratio(cell[-30:], logo_string) > 0.5):
        return table

    # Score tails of length 0..34. Index 0 is special: ``cell[-0:]`` is the
    # whole cell, so a winning index of 0 means the entire cell is the logo.
    scores = [ratio(cell[-length:], logo_string) for length in range(35)]
    best_length = max(range(len(scores)), key=scores.__getitem__)

    if best_length == 0:
        cleaned = ""
    else:
        # Slice the tail off instead of ``str.replace``: replace would also
        # delete identical text occurring earlier in the cell.
        # NOTE(review): one character more than the best-matching tail is
        # removed, as before -- presumably the stray border character;
        # confirm this is intended and not an off-by-one.
        cleaned = cell[:-(best_length + 1)].strip()

    table[row][col] = cleaned
    return table
def clean_duplicates(df):
    """Return the index labels of rows that are not the best match for their ``title_x``.

    For every distinct ``title_x`` the row whose ``title_y`` scores highest
    under ``_levenshtein.ratio`` is the one to keep (the first such row wins
    ties). The labels of all remaining rows are returned as a list, in no
    particular order.
    """
    best_by_title = {}
    for label, record in df.iterrows():
        title = record.title_x
        score = _levenshtein.ratio(title, record.title_y)
        # Only a strictly better score replaces an earlier row.
        if title not in best_by_title or score > best_by_title[title][1]:
            best_by_title[title] = (label, score)

    winners = [entry[0] for entry in best_by_title.values()]
    all_labels = df.index.values.tolist()
    return list(set(all_labels) - set(winners))
def run_picktab(self, path, gold):
    """Run ``self.t.process`` on ``path`` and compare every cell with ``gold``.

    The processed result is pretty-printed first. Each cell must then have a
    case-insensitive ``ratio`` above 0.96 against the cell at the same
    position in ``gold``. On any failure the offending pair is printed and
    the error is re-raised.
    """
    res = self.t.process(path, logo_string='[Dap| DifferenceBetween.net')
    from pprint import pprint as pretty_print
    pretty_print(res)

    for row_no, row in enumerate(res):
        for col_no, _ in enumerate(row):
            try:
                similarity = ratio(
                    res[row_no][col_no].lower(),
                    gold[row_no][col_no].lower(),
                )
                self.assertGreater(similarity, 0.96)
            except BaseException:
                # Show which cells disagreed before propagating the failure.
                print(res[row_no][col_no], " <!=> ", gold[row_no][col_no])
                raise
def levenshtein_comparison(results_df: pd.DataFrame):
    """Keep only the rows whose lower-cased title matches a not-found title.

    A row is kept when ``_levenshtein.ratio`` between its lower-cased
    ``title`` and at least one entry of ``self.not_found['title']`` is
    exactly 1.0. The row count is printed before and after filtering.

    ``self`` is not a parameter of this function; it is taken from the
    enclosing scope.

    Args:
        results_df: Frame with a ``title`` column. It is not modified.

    Returns:
        A new DataFrame containing only the matching rows. If there are no
        not-found titles the result is empty.
    """
    print("Size of result_df: {}".format(results_df.shape[0]))
    cashtags_not_found = self.not_found['title'].tolist()

    def matches_not_found(title):
        lowered = title.lower()
        # any() is False for an empty list, so an empty not-found list simply
        # keeps nothing instead of failing on max() of an empty sequence.
        return any(
            _levenshtein.ratio(lowered, tag_not_found) == 1.0
            for tag_not_found in cashtags_not_found
        )

    # Build the mask separately so no helper column is written into the
    # caller's frame; sharing the frame's index keeps boolean selection
    # positional and well-defined for an empty frame.
    keep = pd.Series(
        [matches_not_found(title) for title in results_df['title']],
        index=results_df.index,
        dtype=bool,
    )
    results_df = results_df[keep]
    print("Size of result_df: {}".format(results_df.shape[0]))
    return results_df
def levenshtein_comparison(df):
    """Select, per ``key``, the row whose title best matches its ticker's name.

    Each row's ``symbol`` is looked up in ``./utils/secwiki_tickers.csv``
    (columns ``Ticker`` and ``Name``). When the first matching ticker has a
    name and ``_levenshtein.ratio(row.title, name)`` exceeds 0.55, the row is
    a match and is printed. For every ``key`` only the highest-scoring match
    is retained.

    Args:
        df: Frame whose rows expose ``key``, ``symbol``, ``title`` and
            ``dbpDescription``.

    Returns:
        A DataFrame with one row per matched ``key`` and a fresh integer
        index; empty when nothing matched.
    """
    tickers = pd.read_csv('./utils/secwiki_tickers.csv')
    best_by_key = {}

    for _, row in df.iterrows():
        # Hard-coded exclusion of this one key.
        if row.key == "NASDAQ|FBMS":
            continue

        match_col = tickers.loc[tickers['Ticker'] == row.symbol]
        if match_col.empty:
            continue
        ticker_name = match_col.Name.values[0]
        if pd.isna(ticker_name):
            continue

        lev = _levenshtein.ratio(row.title, ticker_name)
        if not (lev > 0.55):
            continue

        # Replace the stored row only when this one scores strictly higher,
        # so a later, weaker match cannot overwrite a better one.
        current = best_by_key.get(row.key)
        if current is None or lev > current[1]:
            best_by_key[row.key] = (row, lev)

        print(row.dbpDescription[:50], row.key, row.title, colored("MATCH", "blue"), ticker_name, match_col.Ticker.values[0], lev)

    # Build the frame in one go; DataFrame.append no longer exists in
    # pandas 2.x.
    kept_rows = [kept_row for kept_row, _ in best_by_key.values()]
    return pd.DataFrame(kept_rows).reset_index(drop=True)
def jaro(sentence, query):
    """Return the similarity score between ``sentence`` and ``query``.

    The score is whatever ``_levenshtein.ratio`` yields for the pair.
    NOTE(review): the function name suggests a Jaro metric, but ``ratio`` is
    what is called -- confirm which one is intended.
    """
    return _levenshtein.ratio(sentence, query)
def compare(term, name):
    """Return ``ratio`` of the two inputs after passing each through ``simplify``."""
    simplified_term = simplify(term)
    simplified_name = simplify(name)
    return ratio(simplified_term, simplified_name)