def get_umls_data():
    umls_data = pd.read_csv(umls_df_data_path)
    print(f"Got UMLS data at length {len(umls_data)}")
    acronyms_umls_df = pd.read_csv(acronyms_dir + os.sep + 'acronyms_terms.csv')
    umls_data = pd.concat([umls_data, acronyms_umls_df])
    cuiless_umls_df = pd.read_csv(cuiless_dir + os.sep + 'cuiless_terms.csv')
    umls_data = pd.concat([umls_data, cuiless_umls_df])
    umls_data.reset_index(inplace=True)

    heb_umls_list = list(umls_data['HEB'].values)
    eng_umls_list = list(umls_data[STRING_COLUMN].values)

    heb_db = DictDatabase(CharacterNgramFeatureExtractor(2))
    eng_db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for heb_w in heb_umls_list:
        heb_db.add(heb_w)
    for eng_w in eng_umls_list:
        lower_eng_w = eng_w.lower()
        eng_db.add(lower_eng_w)
    return heb_db, eng_db, umls_data
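# A minimal usage sketch (not part of the original source): querying the
# databases returned by get_umls_data() with a Searcher. The CosineMeasure,
# the 0.85 threshold, and the query term are illustrative assumptions; the
# query is lowercased because the English terms were lowercased on insert.
from simstring.measure.cosine import CosineMeasure
from simstring.searcher import Searcher

heb_db, eng_db, umls_data = get_umls_data()
eng_searcher = Searcher(eng_db, CosineMeasure())
matches = eng_searcher.search('diabetes mellitus'.lower(), 0.85)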
def test_features(self):
    self.assertEqual(
        CharacterNgramFeatureExtractor().features('abcde'),
        ['$a_1', 'ab_1', 'bc_1', 'cd_1', 'de_1', 'e$_1'])
    self.assertEqual(
        CharacterNgramFeatureExtractor(3).features('abcde'),
        ['$$a_1', '$ab_1', 'abc_1', 'bcd_1', 'cde_1', 'de$_1', 'e$$_1'])
    # Japanese input
    self.assertEqual(
        CharacterNgramFeatureExtractor().features(u'あいうえお'),
        ['$あ_1', 'あい_1', 'いう_1', 'うえ_1', 'えお_1', 'お$_1'])
def output_similar_strings_of_each_line(path, measure):
    strings = []
    with open(path, "r") as lines:
        for line in lines:
            strings.append(line.rstrip("\r\n"))

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in strings:
        db.add(string)
    db.save("companies.db")

    dbl = DictDatabase.load("companies.db")
    searcher = Searcher(dbl, measure)

    profiler.start()
    for string in strings:
        result = searcher.search(string, 0.8)
        # result = [str(np.round(x[0], 5)) + ' ' + x[1]
        #           for x in searcher.ranked_search(string, 0.8)]
        # print("\t".join([string, ",".join(result)]))
    profiler.stop()
    profiler.print()
    profiler.open_in_browser()
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    """Given a list of strings, a DB name, and simstring options,
    builds a simstring DB for the strings."""
    __import_simstring()
    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert include_marks == False, "Error: begin/end marks not supported"
        if SIMSTRING_BINARY:
            assert ngram_length == 3, "Error: unsupported n-gram length"
            db = simstring.writer(dbfn)
            for s in strs:
                db.insert(s)
            db.close()
        else:
            fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            db = SQLite3Database(fx)
            db.use(dbfn)
            for s in strs:
                db.add(s)
    except BaseException:
        print("Error building simstring DB", file=sys.stderr)
        raise
    return dbfn
def create_umls_ss_db(umls_kb, char_ngram_len=3, n_max_tokens=5):
    logging.info('Loading scispacy ...')
    import spacy
    # module-level function, so a local variable is used (the original
    # referenced an undefined self here)
    sci_nlp = spacy.load('en_core_sci_md', disable=['tagger', 'parser', 'ner'])

    simstring_db = DictDatabase(CharacterNgramFeatureExtractor(char_ngram_len))

    # preprocessing aliases and labels
    logging.info('Preprocessing aliases ...')
    alias_mapping = defaultdict(set)
    aliases = []
    for cui in umls_kb.get_all_cuis():
        cui_aliases = set(
            [a.lower() for a in umls_kb.get_aliases(cui, include_name=True)])
        for alias in cui_aliases:
            alias_chars = set(alias)
            if len(alias_chars.intersection(fb_punctuation)) > 0:
                continue
            elif alias in en_stopwords:
                continue
            elif alias.isnumeric():
                continue
            # use same tokenizer as when splitting medmentions
            alias_doc = sci_nlp(alias)
            if len(alias_doc) > n_max_tokens:  # gets too big without restrictions
                continue
            alias_mapping[alias].add(cui)
            aliases.append(alias)

    logging.info('Adding to DB ...')
    for alias_idx, alias in enumerate(aliases):
        simstring_db.add(alias)
        if alias_idx % 1000000 == 0:
            logging.info('At %d/%d ...' % (alias_idx, len(aliases)))

    # setting paths
    db_path = '%s.aliases.%dgram.%dtoks.db' % (umls_kb.umls_version,
                                               char_ngram_len, n_max_tokens)
    map_path = '%s.aliases.%dtoks.map' % (umls_kb.umls_version, n_max_tokens)

    logging.info('Storing DB ...')
    with open(db_path, 'wb') as f:
        pickle.dump(simstring_db, f)

    logging.info('Storing Alias Mapping ...')
    with open(map_path, 'wb') as f:
        alias_mapping = dict(alias_mapping)
        pickle.dump(alias_mapping, f)
def __init__(self, word2vec):
    self.w2v = word2vec
    self.embedding_dim = self.w2v.vector_size
    self.vocab = set(self.w2v.vocab.keys())
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for vocab_word in self.vocab:
        self.db.add(vocab_word)
def setUp(self) -> None:
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    db.add("foo")
    db.add("bar")
    db.add("fooo")
    db.add("food")
    db.add("fool")
    db.add("follow")
    self.searcher = Searcher(db, JaccardMeasure())
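# A minimal sketch (not from the original test suite) of a test that could
# follow the setUp above; the 0.6 threshold and the expected hits are
# illustrative assumptions.
def test_search(self) -> None:
    results = self.searcher.search("foo", 0.6)
    self.assertIn("foo", results)      # exact match always passes
    self.assertNotIn("bar", results)   # shares no character bigrams with "foo"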
class GESSyntacticFeatureExtractor(BaseFeatureExtractor):

    def __init__(self, n_chars, n_words, special_words):
        self.n_chars = n_chars
        self.special_words = special_words
        self.__char_feature_extractor = CharacterNgramFeatureExtractor(n_chars)
        if not isinstance(n_words, list):
            self.__word_feature_extractors = [WordNgramFeatureExtractor(n_words)]
        else:
            self.__word_feature_extractors = [
                WordNgramFeatureExtractor(n) for n in n_words
            ]

    def features(self, string):
        # lowercase and strip accents
        normalized_string = string.lower()
        normalized_string = unidecode(normalized_string)
        # remove punctuation symbols
        normalized_string = re.sub(r'[,;()/+ -]+', ' ', normalized_string)
        # drop age-related phrases that carry no information:
        # 'en personas de' ("in people of")
        normalized_string = re.sub(r'en +personas +(de|desde)?', ' ', normalized_string)
        # 'mayores de XX anos y mas' ("older than XX years and over")
        normalized_string = re.sub(r'(mayor(es)?( *de)?)? *\d+ +anos?( +y +mas)?', ' ', normalized_string)
        # 'desde XX anos y menores de XX anos' / 'menores de XX anos' ("under XX years")
        normalized_string = re.sub(r'(desde +\d+ +anos? +y)? *menores +de +\d+ +anos?', ' ', normalized_string)
        # 'de XX a XX anos' ("from XX to XX years")
        normalized_string = re.sub(r'de +\d+ a +\d+ anos?', ' ', normalized_string)
        # remove ' y ' ("and")
        normalized_string = re.sub(' y ', ' ', normalized_string)
        # collapse extra whitespace
        normalized_string = re.sub(' +', ' ', normalized_string)
        normalized_string = normalized_string.strip()

        # character n-gram features
        char_features = self.__char_feature_extractor.features(normalized_string)

        # word n-gram features
        word_features = []
        for extractor in self.__word_feature_extractors:
            word_features += extractor.features(normalized_string)

        # special-word features
        special_features = []
        for word in self.special_words:
            re_word = f'(^{word})|( +{word} +)|({word}$)'
            if re.search(re_word, normalized_string):
                special_features.append(word)

        return char_features + word_features + special_features
def simstring_database(umls, nchar_val):
    db = DictDatabase(CharacterNgramFeatureExtractor(nchar_val))
    term_to_cui = dict()
    for value in umls:
        try:
            data = value.split('\t')
            cui = data[0]
            term = data[1].lower()
            db.add(term)
            term_to_cui[term] = cui
        except Exception:
            # skip malformed lines that lack both a CUI and a term
            continue
    with open('db.pickle', 'wb') as f:
        pickle.dump(db, f)
    with open('term_to_cui.pickle', 'wb') as f:
        pickle.dump(term_to_cui, f)
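# A minimal usage sketch (not part of the original source): load the pickled
# database and mapping back and run a fuzzy lookup. The CosineMeasure, the
# 0.8 threshold, and the query string are illustrative assumptions.
import pickle

from simstring.measure.cosine import CosineMeasure
from simstring.searcher import Searcher

with open('db.pickle', 'rb') as f:
    db = pickle.load(f)
with open('term_to_cui.pickle', 'rb') as f:
    term_to_cui = pickle.load(f)

searcher = Searcher(db, CosineMeasure())
for term in searcher.search('myocardial infarction', 0.8):
    print(term, term_to_cui[term])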
def output_similar_strings_of_each_line(path):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as lines:
        for line in lines:
            string = line.rstrip('\r\n')
            db.add(string)

    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as lines:
        for line in lines:
            string = line.rstrip('\r\n')
            result = [
                str(round(x[0], 5)) + ' ' + x[1]
                for x in searcher.ranked_search(string, 0.8)
            ]
            print("\t".join([string, ",".join(result)]))
def ssdb_open(dbname):
    """Given a DB name, opens it as a simstring DB and returns the handle.

    The caller is responsible for invoking close() on the handle.
    """
    __import_simstring()
    try:
        if SIMSTRING_BINARY:
            return simstring.reader(__ssdb_path(dbname))
        else:
            fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            db = SQLite3Database(fx)
            return db.use(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
def construct_ontology(ontology_data):
    '''
    Create an n-char simstring database and term-to-code mapping
    to enable rapid ontology querying
    '''
    database = DictDatabase(CharacterNgramFeatureExtractor(2))

    term_to_cui = {}
    for entry in ontology_data:
        entry_values = entry.split('\t')
        if len(entry_values) == 2:
            term = clean_selected_term(entry_values[1])
            term_to_cui[term] = entry_values[0].strip()

    for term in term_to_cui.keys():
        term = clean_selected_term(term)
        database.add(term)

    return database, term_to_cui
def output_similar_strings_of_each_line(path, Database):
    with open(path) as f:
        number_of_lines = len(f.readlines())

    with Benchmarker(width=20) as bench:
        db = Database(CharacterNgramFeatureExtractor(2))

        @bench("initialize database({0} lines)".format(number_of_lines))
        def _(bm):
            with open(path, 'r') as lines:
                for line in lines:
                    string = line.rstrip('\r\n')
                    db.add(string)

        @bench("search text({0} times)".format(
            min(number_of_lines, SEARCH_COUNT_LIMIT)))
        def _(bm):
            searcher = Searcher(db, CosineMeasure())
            with open(path, 'r') as lines:
                for i, line in enumerate(lines):
                    if i >= SEARCH_COUNT_LIMIT:
                        break
                    string = line.rstrip('\r\n')
                    result = searcher.search(string, 0.8)
def __init__(self,
             from_dir: str = None,
             json_path: str = "mapper.json",
             umls_words: Iterable[str] = None):
    # self.db = DictDatabase(WordNgramFeatureExtractor(2))
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    if from_dir:
        json_path = os.path.join(from_dir, json_path)
        if os.path.exists(json_path):
            print(f"initialize {self.__class__.__name__}... Load json")
            self.umls_dict, self.umls_reverse_dict = self.load_from_json(json_path)
            self.add_words_to_db(self.umls_dict.keys())
        else:
            print(f"initialize {self.__class__.__name__}... Load dir")
            self.umls_dict, self.umls_reverse_dict = self.load_umls_dict(from_dir)
            self.add_words_to_db(self.umls_dict.keys())
            self.save_as_json(path=json_path)
    else:
        self.add_words_to_db(umls_words)
def setUp(self):
    self.db = MongoDatabase(CharacterNgramFeatureExtractor(2),
                            database='simstring-test')
    self.db.reset_collection()
    for string in self.strings:
        self.db.add(string)
def test_features(self):
    self.assertEqual(CharacterNgramFeatureExtractor().features('abcde'),
                     [' a', 'ab', 'bc', 'cd', 'de', 'e '])
    self.assertEqual(CharacterNgramFeatureExtractor(3).features('abcde'),
                     [' ab', 'abc', 'bcd', 'cde', 'de '])
    self.assertEqual(CharacterNgramFeatureExtractor(4).features('abcde'),
                     [' abc', 'abcd', 'bcde', 'cde '])
    self.assertEqual(CharacterNgramFeatureExtractor(5).features('abcde'),
                     [' abcd', 'abcde', 'bcde '])
    # Japanese input
    self.assertEqual(CharacterNgramFeatureExtractor().features(u'あいうえお'),
                     [' あ', 'あい', 'いう', 'うえ', 'えお', 'お '])
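# Illustrative sketch (not part of the original tests): the padded bigrams
# above are the inputs to simstring-style similarity measures. Computing the
# standard simstring cosine, |X ∩ Y| / sqrt(|X| * |Y|), by hand for 'abcde'
# vs. 'abcdf' shows how feature overlap maps to a score.
x = set(CharacterNgramFeatureExtractor().features('abcde'))  # {' a','ab','bc','cd','de','e '}
y = set(CharacterNgramFeatureExtractor().features('abcdf'))  # {' a','ab','bc','cd','df','f '}
cosine = len(x & y) / (len(x) ** 0.5 * len(y) ** 0.5)  # 4 / 6 ≈ 0.667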
def setUp(self):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in self.strings:
        db.add(string)
    self.searcher = Searcher(db, CosineMeasure())
def setUp(self):
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in self.strings:
        self.db.add(string)
#############################
# Perform database cleaning #
#############################

# Read in branded foods CSV and clean it
df = pd.read_csv('branded_food.csv')
all_ingredients_final = get_cleaned_ingredients_list(df)

# Count every ingredient; used by the Peter Norvig implementation below
ingredients_count = Counter(all_ingredients_final)

##############################################
# Peter Norvig SimString Implementation Code #
##############################################

# Populate database with all ingredients
db = DictDatabase(CharacterNgramFeatureExtractor(2))
for ingredient in all_ingredients_final:
    db.add(ingredient)

# Create searcher object to be used by the candidates function
searcher = Searcher(db, CosineMeasure())


# Functions
def probability(word, N=sum(ingredients_count.values())):
    """
    Return the probability of the word appearing in the text.

    Usually, correctly spelled words have a higher count (and therefore
    probability) than their misspellings.
    """
    return ingredients_count[word] / N
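# Hypothetical sketch of the candidates function referenced in the comment
# above; the original implementation is not shown, so the 0.8 threshold, the
# fallback to the word itself, and the correction helper are assumptions in
# the spirit of Norvig's spelling corrector.
def candidates(word, threshold=0.8):
    """Return ingredient strings whose character-bigram cosine similarity
    to word is at least threshold, falling back to the word itself."""
    return searcher.search(word, threshold) or [word]


def correction(word):
    """Pick the most probable candidate as the corrected spelling."""
    return max(candidates(word), key=probability)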
def make_change_image_dict(drink_names):
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    ff = open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig")
    json_data2 = json.load(ff)
    ff.close()

    # lists of strings whose similarity will be compared against each other
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db = {re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]):
              d["drinks"][0]["strDrinkThumb"] for d in json_data2}
    TCD_name_db = list(TCD_db.keys())

    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)

    for str2 in TCD_name_db:
        result_dict[str2] = {}
        searcher = Searcher(db, CosineMeasure())
        i = 1.0
        # compute similarity; results are returned in the range 0.0-1.0
        flag = False
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if str1 in result_dict[str2]:
                    d = result_dict[str2][str1]
                    # update the running average
                    d = [(d[0] * d[1] + s) / (d[1] + 1), d[1] + 1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s, 1])
        temp = []
        while i >= 0.65:
            result = searcher.search(str2, i)
            if len(result):
                flag = True
                for str1 in result:
                    if str1 in temp:
                        continue
                    temp += [str1]
                    if str1 in result_dict[str2]:
                        d = result_dict[str2][str1]
                        # update the running average
                        d = [(d[0] * d[1] + i) / (d[1] + 1), d[1] + 1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i, 1])
            i -= 0.001
        if flag:
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> " + str2, file=f)
            print(">> " + str2)
            M = 0.0
            name = ""
            for key, value_list in result_dict[str2].items():
                if M < value_list[0]:
                    name = key
                    M = value_list[0]
            print(" " + name + ": " + str(M), file=f)
            if M != 0:
                if M >= 0.76:
                    print(" " + name + ": " + str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print(" " + name + ": " + str(M))
                    print("out", file=f)
                    print("out")
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length))
    return change_image_dict
def main(argv):
    arg = argparser().parse_args(argv[1:])

    # only simstring library default supported at the moment (TODO)
    if SIMSTRING_BINARY:
        assert DEFAULT_NGRAM_LENGTH == 3, "Error: unsupported n-gram length"
        assert DEFAULT_INCLUDE_MARKS == False, "Error: begin/end marks not supported"

    infn = arg.file

    if arg.database is None:
        # default database file name
        bn = splitext(basename(infn))[0]
        sqldbfn = sqldb_filename(bn)
        ssdbfn = ssdb_filename(bn)
    else:
        sqldbfn = arg.database + '.' + SQL_DB_FILENAME_EXTENSION
        ssdbfn = arg.database + '.' + SS_DB_FILENAME_EXTENSION

    if arg.verbose:
        print("Storing SQL DB as %s and" % sqldbfn, file=sys.stderr)
        print(" simstring DB as %s" % ssdbfn, file=sys.stderr)
    start_time = datetime.now()

    import_count, duplicate_count, error_count, simstring_count = 0, 0, 0, 0

    with codecs.open(infn, 'rU', encoding=arg.encoding) as inf:

        # create SQL DB
        try:
            connection = sqlite.connect(sqldbfn)
        except sqlite.OperationalError as e:
            print("Error connecting to DB %s:" % sqldbfn, e, file=sys.stderr)
            return 1
        cursor = connection.cursor()

        # create SQL tables
        if arg.verbose:
            print("Creating tables ...", end=' ', file=sys.stderr)

        for command in CREATE_TABLE_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating %s:" % sqldbfn, e, "(DB exists?)", file=sys.stderr)
                return 1

        # import data
        if arg.verbose:
            print("done.", file=sys.stderr)
            print("Importing data ...", end=' ', file=sys.stderr)

        next_eid = 1
        label_id = {}
        next_lid = 1
        next_pid = dict([(t, 1) for t in TYPE_VALUES])

        for i, l in enumerate(inf):
            l = l.rstrip('\n')

            # parse line into ID and TYPE:LABEL:STRING triples
            try:
                id_, rest = l.split('\t', 1)
            except ValueError:
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated fields, got '%s'" %
                          (i + 1, l), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # parse TYPE:LABEL:STRING triples
            try:
                triples = []
                for triple in rest.split('\t'):
                    type_, label, string = triple.split(':', 2)
                    if type_ not in TYPE_VALUES:
                        print("Unknown TYPE %s" % type_, file=sys.stderr)
                    triples.append((type_, label, string))
            except ValueError:
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated TYPE:LABEL:STRING triples, got '%s'" %
                          (i + 1, rest), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # insert entity
            eid = next_eid
            next_eid += 1
            try:
                cursor.execute("INSERT into entities VALUES (?, ?)", (eid, id_))
            except sqlite.IntegrityError as e:
                if error_count < MAX_ERROR_LINES:
                    print("Error inserting %s (skipping): %s" % (id_, e), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # insert new labels (if any)
            labels = set([l for t, l, s in triples])
            new_labels = [l for l in labels if l not in label_id]
            for label in new_labels:
                lid = next_lid
                next_lid += 1
                cursor.execute("INSERT into labels VALUES (?, ?)", (lid, label))
                label_id[label] = lid

            # insert associated strings
            for type_, label, string in triples:
                table = TABLE_FOR_TYPE[type_]
                pid = next_pid[type_]
                next_pid[type_] += 1
                lid = label_id[label]
                # TODO
                if TABLE_HAS_NORMVALUE[table]:
                    normstring = string_norm_form(string)
                    cursor.execute("INSERT into %s VALUES (?, ?, ?, ?, ?)" % table,
                                   (pid, eid, lid, string, normstring))
                else:
                    cursor.execute("INSERT into %s VALUES (?, ?, ?, ?)" % table,
                                   (pid, eid, lid, string))

            import_count += 1

            if arg.verbose and (i + 1) % 10000 == 0:
                print('.', end=' ', file=sys.stderr)

        if arg.verbose:
            print("done.", file=sys.stderr)

        # create SQL indices
        if arg.verbose:
            print("Creating indices ...", end=' ', file=sys.stderr)

        for command in CREATE_INDEX_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating index", e, file=sys.stderr)
                return 1

        if arg.verbose:
            print("done.", file=sys.stderr)

        # wrap up SQL table creation
        connection.commit()

        # create simstring DB
        if arg.verbose:
            print("Creating simstring DB ...", end=' ', file=sys.stderr)

        try:
            if SIMSTRING_BINARY:
                ssdb = simstring.writer(ssdbfn)
                for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                    s = row[0]
                    ssdb.insert(s)
                    simstring_count += 1
                ssdb.close()
            else:
                sys.path.append(join(dirname(abspath(__file__)), '..', 'server', 'src'))
                from simstring_pure_sqlite3 import SQLite3Database
                from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
                from simstring.searcher import Searcher

                fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
                ssdb = SQLite3Database(fx)
                ssdb.use(connection)
                for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                    s = row[0]
                    ssdb.add(s)
                    simstring_count += 1
        except BaseException:
            print("Error building simstring DB", file=sys.stderr)
            raise

        if arg.verbose:
            print("done.", file=sys.stderr)

        cursor.close()

    # done
    delta = datetime.now() - start_time
    if arg.verbose:
        print(file=sys.stderr)
        print("Done in:", str(delta.seconds) + "." + str(delta.microseconds / 10000),
              "seconds", file=sys.stderr)
    print("Done, imported %d entries (%d strings), skipped %d duplicate keys, skipped %d invalid lines" %
          (import_count, simstring_count, duplicate_count, error_count))

    return 0