class TestSearcher(TestCase):
    """Exercise Searcher.search with a cosine measure over 2-gram features."""

    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        # Index every fixture string into a fresh in-memory database.
        database = DictDatabase(CharacterNgramFeatureExtractor(2))
        for text in self.strings:
            database.add(text)
        self.searcher = Searcher(database, CosineMeasure())

    def test_search(self):
        # (query, threshold, expected matches) — same cases as before,
        # expressed as a table instead of repeated assertions.
        cases = [
            ('a', 1.0, ['a']),
            ('ab', 1.0, ['ab']),
            ('ab', 0.9, ['ab']),
            ('ab', 0.5, ['ab', 'abc', 'abcd']),
        ]
        for query, threshold, expected in cases:
            self.assertEqual(self.searcher.search(query, threshold), expected)
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    """Given a string s and a DB name, returns whether at least one string
    in the associated simstring DB likely contains s as an (approximate)
    substring.

    :param s: query string
    :param dbname: name of the simstring database to search
    :param threshold: similarity threshold; 1.0 takes an optimized path
    :return: True if some DB string likely contains s, else False
    """
    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        __import_simstring()
        db = ssdb_open(dbname)
        try:
            if SIMSTRING_BINARY:
                __set_db_measure(db, 'overlap')
                db.threshold = threshold
                result = db.retrieve(s)
            else:
                searcher = Searcher(db, OverlapMeasure())
                result = searcher.search(s, threshold)
        finally:
            # Close even when retrieval raises so the DB handle never leaks.
            db.close()
        # Overlap at threshold 1.0 means every n-gram of s occurs in the
        # match; exact containment is what we actually want to report.
        return any(s in r for r in result)
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
def output_similar_strings_of_each_line(path, measure):
    """Index the lines of `path` into a 2-gram DB, persist and reload it,
    then profile a threshold-0.8 search for every line."""
    with open(path, "r") as lines:
        strings = [line.rstrip("\r\n") for line in lines]

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for text in strings:
        db.add(text)

    # Round-trip through disk so the search runs against a loaded DB.
    db.save("companies.db")
    reloaded = DictDatabase.load("companies.db")
    searcher = Searcher(reloaded, measure)

    profiler.start()
    for text in strings:
        result = searcher.search(text, 0.8)
    profiler.stop()

    profiler.print()
    profiler.open_in_browser()
def _(bm):
    """Benchmark: LeftOverlapMeasure search over the first
    SEARCH_COUNT_LIMIT lines of `path` at threshold 0.8."""
    searcher = Searcher(db, LeftOverlapMeasure(db))
    with open(path, "r") as handle:
        for index, raw_line in enumerate(handle):
            if index >= SEARCH_COUNT_LIMIT:
                break
            query = raw_line.rstrip("\r\n")
            result = searcher.search(query, 0.8)
def _(bm):
    """Benchmark: cosine-measure search over the first
    SEARCH_COUNT_LIMIT lines of `path` at threshold 0.8."""
    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as handle:
        for position, raw in enumerate(handle):
            if position >= SEARCH_COUNT_LIMIT:
                break
            query = raw.rstrip('\r\n')
            result = searcher.search(query, 0.8)
class TestSearcher(TestCase):
    """Tests for Searcher.search / ranked_search over a small fixed corpus."""

    strings = ["a", "ab", "abc", "abcd", "abcde"]

    def setUp(self):
        database = DictDatabase(CharacterNgramFeatureExtractor(2))
        for text in self.strings:
            database.add(text)
        self.searcher = Searcher(database, CosineMeasure())

    def test_search1(self):
        self.assertEqual(self.searcher.search("a", 1.0), ["a"])

    def test_search2(self):
        # Same three assertions as before, driven by a case table.
        cases = [
            (0.5, ["ab", "abc", "abcd"]),
            (1.0, ["ab"]),
            (0.9, ["ab"]),
        ]
        for threshold, expected in cases:
            self.assertEqual(self.searcher.search("ab", threshold), expected)

    def test_search3(self):
        for threshold in (1.0, 0.9):
            self.assertEqual(self.searcher.search("abc", threshold), ["abc"])

    def test_search4(self):
        for threshold in (1.0, 0.9):
            self.assertEqual(self.searcher.search("abcd", threshold), ["abcd"])

    def test_ranked_search(self):
        self.assertEqual(self.searcher.ranked_search("abcd", 1.0),
                         OrderedDict({"abcd": 1.0}))
        expected = OrderedDict({
            "ab": 1.0,
            "abc": 0.5773502691896258,
            "abcd": 0.5163977794943222,
            "abcde": 0.47140452079103173,
        })
        self.assertEqual(self.searcher.ranked_search("ab", 0.41), expected)
def similar_words_top_k(self, query, measure=None, initial_threshold=0.99, dec_step=0.01, k=3):
    """Search up to k words similar to `query`, relaxing the threshold.

    Starts at `initial_threshold` and lowers it by `dec_step` until at
    least k candidates are found or the threshold drops to 0.1. When more
    than k candidates are found, a random sample of k is returned.

    :param query: string to look up
    :param measure: similarity measure; defaults to a fresh CosineMeasure
    :param initial_threshold: starting similarity threshold
    :param dec_step: amount to lower the threshold each iteration
    :param k: maximum number of words to return
    :return: list of up to k similar words
    """
    # Build the default measure per call instead of sharing one instance
    # created at function-definition time (mutable-default pitfall).
    if measure is None:
        measure = CosineMeasure()
    searcher = Searcher(self.db, measure)
    threshold = initial_threshold
    similar_words = []
    while True:
        similar_words = searcher.search(query, threshold)
        # Stop once we have enough candidates or the threshold bottoms out.
        if len(similar_words) >= k or threshold <= 0.1:
            break
        threshold -= dec_step
    if len(similar_words) > k:  # was hard-coded `> 3`, ignoring the k parameter
        # Was `np.random.choice(42)` — a no-op draw that only advanced the
        # RNG; a reproducibility seed was clearly intended here.
        np.random.seed(42)
        return np.random.choice(similar_words, k, replace=False).tolist()
    return similar_words
def ssdb_supstring_lookup(s, dbname, threshold=DEFAULT_THRESHOLD, with_score=False):
    """Given a string s and a DB name, returns the strings in the
    associated simstring DB that likely contain s as an (approximate)
    substring. If with_score is True, returns pairs of (str,score) where
    score is the fraction of n-grams in s that are also found in the
    matched string.

    :param s: query string
    :param dbname: name of the simstring database to search
    :param threshold: minimum fraction of s's n-grams that must overlap
    :param with_score: when True, return (string, score) pairs
    :return: list of matching strings, or (string, score) pairs
    """
    db = ssdb_open(dbname)
    try:
        if SIMSTRING_BINARY:
            __set_db_measure(db, 'overlap')
            db.threshold = threshold
            result = db.retrieve(s)
        else:
            searcher = Searcher(db, OverlapMeasure())
            result = searcher.search(s, threshold)
    finally:
        # Close even when retrieval raises so the DB handle never leaks.
        db.close()

    # The simstring overlap measure is symmetric and thus does not
    # differentiate between substring and superstring matches.
    # Replicate a small bit of the simstring functionality (mostly the
    # ngrams() function) to filter to substrings only.
    s_ngrams = ngrams(s)
    filtered = []
    for r in result:
        if s in r:
            # avoid calculation: simple containment => score=1
            filtered.append((r, 1.0) if with_score else r)
        else:
            r_ngrams = ngrams(r)
            overlap = s_ngrams & r_ngrams
            if len(overlap) >= len(s_ngrams) * threshold:
                if with_score:
                    # `/` is true division in Python 3; the old `1.0 *`
                    # coercion is unnecessary.
                    filtered.append((r, len(overlap) / len(s_ngrams)))
                else:
                    filtered.append(r)
    return filtered
def ssdb_lookup(s, dbname, measure=DEFAULT_SIMILARITY_MEASURE, threshold=DEFAULT_THRESHOLD):
    """Given a string and a DB name, returns the strings matching in the
    associated simstring DB.

    :param s: query string
    :param dbname: name of the simstring database to search
    :param measure: similarity measure identifier
    :param threshold: similarity threshold for matches
    :return: list of matching strings
    """
    db = ssdb_open(dbname)
    try:
        if SIMSTRING_BINARY:
            __set_db_measure(db, measure)
            db.threshold = threshold
            result = db.retrieve(s)
        else:
            searcher = Searcher(db, __get_pure_measure(measure))
            result = searcher.search(s, threshold)
    finally:
        # Close even when retrieval raises so the DB handle never leaks.
        db.close()
    return result
class GESSimpleMatcher:
    """Simple matcher for GES (Chilean health-guarantee) pathologies.

    Only considers string similarity, nothing very sophisticated.
    Based on code by Fabián Villena (https://fabianvillena.cl).
    Currently uses a feature extractor that combines characters and
    words, with some GES-text-specific tweaks.

    TODO:
    - try slightly more sophisticated matching techniques
    - complete the documentation
    """

    def __init__(
            self,
            base_ges_data='ges_utils/data/ges-health-problems.json',
            no_ges_str='UNK',
            alpha=0.2,
            n_chars=4,
            n_words=None,
            special_words=None):
        # The original signature used mutable defaults (n_words=[2],
        # special_words=['vih']); build them per call instead.
        if n_words is None:
            n_words = [2]
        if special_words is None:
            special_words = ['vih']
        self.alpha = alpha
        with open(base_ges_data, 'r', encoding='utf-8') as f:
            self.__ges_dict = json.load(f)
        # feature extractor
        extractor = GESSyntacticFeatureExtractor(
            n_chars=n_chars,
            n_words=n_words,
            special_words=special_words
        )
        self.__db = DictDatabase(extractor)
        # cache of raw query string -> resolved problem id
        self.__cache = {}
        self.__problems_from_disease = defaultdict(set)
        self.__ids_from_disease = defaultdict(set)
        self.__problems = {}
        self.__ids = {}
        self.__problems[-1] = no_ges_str
        self.__ids[no_ges_str] = -1
        # For now, ids are just the order of the problems in the json.
        # TODO: decide whether ids should come from some standard source.
        for i, problem in enumerate(self.__ges_dict):
            problem_id = i + 1
            self.__problems[problem_id] = problem
            self.__ids[problem] = problem_id
            # register the problem itself as if it were a disease too
            self.__problems_from_disease[problem].add(problem)
            self.__ids_from_disease[problem].add(problem_id)
            # add to the DB
            self.__db.add(problem)
            for disease in self.__ges_dict[problem]:
                self.__problems_from_disease[disease].add(problem)
                self.__ids_from_disease[disease].add(problem_id)
                # add to the DB
                self.__db.add(disease)
        # TODO: add extra data for matching diseases and problems
        self.__searcher = Searcher(self.__db, CosineMeasure())

    def get_ranking_ges_diseases(self, raw_string):
        """Return ranked candidate matches for raw_string."""
        ranking = self.__searcher.ranked_search(raw_string, alpha=self.alpha)
        return ranking

    def get_ges_problem(self, raw_string):
        """Return the GES problem name best matching raw_string."""
        problem_id = self.get_ges_id(raw_string)
        problem = self.__problems[problem_id]
        return problem

    def get_ges_id(self, raw_string):
        """Return the GES problem id for raw_string (-1 when no match)."""
        # if we already computed it, return the cached value
        if raw_string in self.__cache:
            return self.__cache[raw_string]
        # otherwise compute it
        ranking = self.get_ranking_ges_diseases(raw_string)
        if ranking:
            # ranking entries are (score, disease) pairs; take the best
            (v, disease) = ranking[0]
            problem_ids = self.__ids_from_disease[disease]
            problem_id = list(problem_ids)[0]
            self.__cache[raw_string] = problem_id
            return problem_id
        else:
            self.__cache[raw_string] = -1
            return -1

    def get_possible_ges_ids(self, raw_string):
        """Return candidate problem ids ordered by match frequency."""
        to_search = raw_string
        problem_ids = []
        # look up the candidate diseases
        candidate_diseases = self.__searcher.search(to_search, alpha=self.alpha)
        for disease in candidate_diseases:
            problem_ids.extend(self.__ids_from_disease[disease])
        problem_ids_counter = Counter(problem_ids)
        ordered_ids = [i for i, _ in problem_ids_counter.most_common()]
        return ordered_ids

    def get_ges_id_prev(self, raw_string):
        """Previous id-lookup implementation, kept for reference."""
        # BUG FIX: the cache was probed with hash(raw_string) but written
        # with raw_string, so lookups could never hit; key consistently
        # on raw_string (matching get_ges_id).
        if raw_string in self.__cache:
            return self.__cache[raw_string]
        ids_list = self.get_possible_ges_ids(raw_string)
        if not ids_list:
            self.__cache[raw_string] = -1
            return -1
        else:
            self.__cache[raw_string] = ids_list[0]
            return ids_list[0]

    def problem_from_id(self, id_problem):
        """Return the problem name for a given id."""
        return self.__problems[id_problem]

    def id_from_problem(self, problem):
        """Return the id for a given problem name."""
        return self.__ids[problem]

    def clean_cache(self):
        """Discard all cached lookups."""
        self.__cache = {}
def make_change_image_dict(drink_names):
    """Match drink names against TheCocktailDB entries and return a dict
    mapping matched drink names to their thumbnail image URLs.

    Candidates are scored both with difflib sequence ratios and with
    simstring cosine similarity at decreasing thresholds; a running
    average per (TCD name, drink name) pair is kept, and pairs whose best
    score reaches 0.76 are accepted. A log is written to search_log.txt.

    :param drink_names: iterable of dicts with an "en" name field
    :return: dict of accepted drink name -> thumbnail URL
    """
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    # Load TheCocktailDB dump; `with` closes the handle even on errors
    # (the original used manual open/close).
    with open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig") as ff:
        json_data2 = json.load(ff)

    # Lists of strings whose similarity will be compared with each other
    # (ASCII punctuation is replaced with spaces before comparison).
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db = {re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]): d["drinks"][0]["strDrinkThumb"]
              for d in json_data2}
    TCD_name_db = list(TCD_db.keys())
    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)

    for str2 in TCD_name_db:
        result_dict[str2] = {}
        searcher = Searcher(db, CosineMeasure())
        i = 1.0
        # similarity is computed in the 0.0 - 1.0 range
        flag = False
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if str1 in result_dict[str2]:
                    d = result_dict[str2][str1]
                    # update the running average
                    d = [(d[0] * d[1] + s) / (d[1] + 1), d[1] + 1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s, 1])
        temp = []
        # Sweep the simstring threshold downward from 1.0 to 0.65,
        # recording each string the first threshold at which it appears.
        while i >= 0.65:
            result = searcher.search(str2, i)
            if len(result):
                flag = True
                for str1 in result:
                    if str1 in temp:
                        continue
                    temp += [str1]
                    if str1 in result_dict[str2]:
                        d = result_dict[str2][str1]
                        # update the running average
                        d = [(d[0] * d[1] + i) / (d[1] + 1), d[1] + 1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i, 1])
            i -= 0.001
        if flag:
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> " + str2, file=f)
            print(">> " + str2)
            M = 0.0
            name = ""
            # pick the best-scoring candidate for this TCD name
            for key, value_list in result_dict[str2].items():
                if M < value_list[0]:
                    name = key
                    M = value_list[0]
            print(" " + name + ": " + str(M), file=f)
            if M != 0:
                if M >= 0.76:
                    print(" " + name + ": " + str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print(" " + name + ": " + str(M))
                    print("out", file=f)
                    print("out")
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length))
    # BUG FIX: a stray exit() here terminated the interpreter and made the
    # return below unreachable; removed so callers actually get the dict.
    return change_image_dict
def similarity(word):
    """Return (as an ndarray) the DB strings whose cosine similarity to
    the NFKC-normalized word is at least 0.65."""
    matcher = Searcher(db, CosineMeasure())
    normalized = normalize('NFKC', word)
    matches = matcher.search(normalized, 0.65)
    return np.array(matches)
def search_term_sims(self, term: str) -> List[str]:
    """Return the DB strings cosine-similar to `term` at threshold 0.8."""
    cosine_searcher = Searcher(self.db, CosineMeasure())
    matches = cosine_searcher.search(term, 0.8)
    return matches