def init():
    """Open every SimString database and affix table used by this module.

    Fills the module globals ``orgDB``, ``symbDBs``, ``prefixDB``,
    ``suffixDB``, ``prefixCounts`` and ``suffixCounts``. Every reader is
    configured with the overlap measure and a threshold of 1.

    Returns:
        The pair ``(symbDBs, orgDB)``, where ``symbDBs`` is a list of
        ``(symbol type name, reader)`` tuples.
    """
    global orgDB, symbDBs, prefixDB, suffixDB, prefixCounts, suffixCounts

    def use_overlap(reader):
        # Shared configuration for all readers opened below.
        reader.measure = simstring.overlap
        reader.threshold = 1

    orgDB = simstring.reader("data/all_species_db")
    use_overlap(orgDB)

    symbol_types = (
        "default_symbol", "official_symbol", "official_name", "synonym",
        "homologene_id", "ensemblcluster_id", "ensembl_id", "locustag",
        "cd_antigen", "description", "locusname", "alergen",
    )
    symbDBs = []
    for symbol_type in symbol_types:
        try:
            symbol_db = simstring.reader("data/all_symbols_%s_db" % symbol_type)
        except IOError:
            # A symbol type whose database cannot be opened is skipped.
            continue
        use_overlap(symbol_db)
        symbDBs.append((symbol_type, symbol_db))

    prefixDB = simstring.reader("data/all_prefixes_db")
    use_overlap(prefixDB)

    suffixDB = simstring.reader("data/all_suffixes_rev_db")
    use_overlap(suffixDB)

    prefixCounts = readAffixFile("src_data/prefixes2.txt", False)
    suffixCounts = readAffixFile("src_data/suffixes2.txt", True)

    print >> sys.stderr, "INIT DONE"
    return symbDBs, orgDB
def match_dictionaries(doc, place_set, department_set, university_set,
                       person_set, matching='approx'):
    """Annotate every word of ``doc`` with four dictionary-match scores.

    Each word returned by ``doc.words()`` receives the attributes
    ``place_score``, ``department_score``, ``university_score`` and
    ``person_score``, each set to 1 (match) or 0 (no match). The lookup key
    is the word's ``text``, lower-cased and stripped.

    Args:
        doc: object exposing ``words()``; each word must have a ``text``
            string and accept new attributes.
        place_set, department_set, university_set, person_set: containers
            used for membership tests when ``matching == 'exact'``.
        matching: ``'exact'`` tests membership in the given sets;
            ``'approx'`` queries the SimString databases under ``dicts/``
            with cosine similarity at threshold 0.9. Any other value leaves
            the words untouched.

    Returns:
        None; the words are modified in place.
    """
    words = doc.words()

    if matching == 'exact':
        for word in words:
            tok = word.text.lower().strip()
            word.place_score = 1 if tok in place_set else 0
            word.department_score = 1 if tok in department_set else 0
            word.university_score = 1 if tok in university_set else 0
            word.person_score = 1 if tok in person_set else 0
        return

    if matching != 'approx':
        return

    # Imported lazily so that exact matching works without SimString.
    import simstring

    # Score attribute -> database file, in the original lookup order.
    targets = (
        ('place_score', 'places.db'),
        ('department_score', 'departments.db'),
        ('university_score', 'universities.db'),
        ('person_score', 'people.db'),
    )
    opened = []
    try:
        for attr, filename in targets:
            db = simstring.reader(os.path.join('dicts', filename))
            opened.append((attr, db))
            db.measure = simstring.cosine
            db.threshold = 0.9

        for word in words:
            # Non-ASCII characters are dropped before the lookup.
            tok = word.text.lower().strip().encode('ascii', 'ignore')
            for attr, db in opened:
                setattr(word, attr, 1 if len(db.retrieve(tok)) > 0 else 0)
    finally:
        # Release the database handles even if a lookup fails.
        for _, db in opened:
            db.close()
def __init__(self, dbfn, ngram_length=SimstringBase.DEFAULT_NGRAM_LENGTH, include_marks=SimstringBase.DEFAULT_INCLUDE_MARKS, threshold=SimstringBase.DEFAULT_THRESHOLD, similarity_measure=SimstringBase.DEFAULT_SIMILARITY_MEASURE, unicode=SimstringBase.DEFAULT_UNICODE, build=False):
    """Open the SimString database ``dbfn`` for writing or for reading.

    All arguments are forwarded to the base-class initializer. When
    ``build`` is true a ``simstring.writer`` is stored in ``self.db``;
    otherwise a ``simstring.reader`` is stored and configured.

    Args:
        dbfn: database file name, passed to the base class; the handle is
            opened on ``self.dbfn``.
        ngram_length: must equal 3 (checked by assertion).
        include_marks: must compare equal to False (checked by assertion).
        threshold: similarity threshold assigned to the reader.
        similarity_measure: key into ``SIMILARITY_MEASURES``.
        unicode: forwarded to the base class.
        build: open a writer instead of a reader.

    Raises:
        AssertionError: for unsupported ``include_marks`` / ``ngram_length``
            (only while assertions are enabled).
    """
    # This backend only handles 3-grams without begin/end marks.
    assert include_marks == False, "Error: begin/end marks not supported"
    assert ngram_length == 3, "Error: unsupported n-gram length"
    super().__init__(dbfn, ngram_length=ngram_length, include_marks=include_marks, threshold=threshold, similarity_measure=similarity_measure, unicode=unicode, build=build)
    if build:
        self.db = simstring.writer(self.dbfn)
    else:
        self.db = simstring.reader(self.dbfn)
        # NOTE(review): the measure/threshold lines are assumed to belong to
        # the reader branch only — confirm against the original layout.
        self.db.measure = SIMILARITY_MEASURES[similarity_measure]
        self.db.threshold = threshold
def check_dictionary_db(token, database):
    """Return 1 if ``token`` has a match in the SimString db, else 0.

    The database is opened for this single query using cosine similarity
    with a threshold of 0.9, and closed again before returning.

    Args:
        token: text to look up; it is UTF-8 encoded before the query.
        database: path of the SimString database file.

    Returns:
        1 when at least one entry is retrieved, otherwise 0.
    """
    dict_db = simstring.reader(database)
    try:
        dict_db.measure = simstring.cosine
        dict_db.threshold = 0.9
        return 1 if len(dict_db.retrieve(token.encode('utf-8'))) > 0 else 0
    finally:
        # Release the handle on every path, including a failed lookup.
        dict_db.close()
def __init__(self, db_file, measure=COSINE, threshold=.8):
    """Open ``db_file`` for reading and configure the similarity search.

    Args:
        db_file: path of the SimString database.
        measure: ``COSINE`` selects cosine similarity; any other value
            selects the overlap measure.
        threshold: similarity threshold assigned to the reader.
    """
    reader = simstring.reader(db_file)
    self.db = reader
    reader.measure = simstring.cosine if measure == COSINE else simstring.overlap
    reader.threshold = threshold
def modify_match_dis_normform(data_folder, annoDict, db_file, len_text, min_threshold=0.01): print data_folder + "simstring/" + db_file db = simstring.reader(data_folder + "simstring/" + db_file) db.measure = simstring.cosine predict_dict = dict() for doc_id, items in annoDict.iteritems(): j = 0 for item in items: db.threshold = 1.0 mention = getNormform_space(item[2]) match_concept = db.retrieve(mention) while match_concept == () and db.threshold > min_threshold: db.threshold = db.threshold - 0.01 match_concept += db.retrieve(mention) if len(match_concept) == 1 and len(mention) > len_text: predict_dict.setdefault(doc_id, []) for concept in set(match_concept): predict_dict[doc_id].append( [item[0], item[1], concept, '9606', db.threshold]) j += 1 return predict_dict
def cell_match_simstring(ss_folder, annoDict, ab3p_dict, ms, mapping_dict,
                         db_file, normform, min_threshold=0.01):
    """Match annotation mentions against a SimString database.

    Args:
        ss_folder: folder prefix; concatenated directly with ``db_file``.
        annoDict: dict mapping doc_id to a list of annotations; fields 0,
            1 and 2 of each annotation are used.
        ab3p_dict: ``ab3p_dict[doc_id][mention]`` supplies a replacement
            for the mention ``item[2]`` when present.
        ms: similarity measure assigned to the reader (the original carried
            an unused table naming 0 exact, 1 dice, 2 cosine, 3 jaccard,
            4 overlap).
        mapping_dict: maps a retrieved concept string to an iterable of
            concept ids.
        db_file: database file name.
        normform: when true, the mention is passed through ``getNormform``.
        min_threshold: lowest threshold tried while relaxing the search.

    Returns:
        dict mapping doc_id to a list with one entry per annotation; each
        entry is a list of ``[item[0], item[1], concept, ids, threshold]``.
        An annotation with no hit gets the single candidate
        ``[item[0], item[1], item[2], set(), 0]``.
    """
    db = simstring.reader(ss_folder + db_file)
    db.measure = ms

    predict_dict = dict()
    for doc_id, items in annoDict.iteritems():
        doc_predictions = predict_dict.setdefault(doc_id, [])
        for item in items:
            candidates = []
            doc_predictions.append(candidates)
            db.threshold = 1.0

            # Use the replacement from ab3p_dict when one is recorded; only
            # a failed lookup falls back to the raw mention.
            try:
                cell = ab3p_dict[doc_id][item[2]]
            except (LookupError, TypeError):
                cell = item[2]
            mention = getNormform(cell) if normform else cell

            match_concept = db.retrieve(mention)
            # Relax the threshold in 0.01 steps until something matches.
            while match_concept == () and db.threshold > min_threshold:
                db.threshold = db.threshold - 0.01
                match_concept = db.retrieve(mention)

            if len(match_concept) == 0:
                candidates.append([item[0], item[1], item[2], set([]), 0])
                continue

            for concept in set(match_concept):
                candidates.append([item[0], item[1], concept,
                                   set(mapping_dict[concept]), db.threshold])
    return predict_dict
def open_simstring_db(dbname):
    """Return a SimString reader for ``dbname``.

    Any failure while opening is reported on stderr and then re-raised
    unchanged.
    """
    try:
        return simstring.reader(dbname)
    except:
        # Report and propagate; the original exception is preserved.
        print >> sys.stderr, "Error opening simstring DBs for reading"
        raise
def __init__(self, path, similarity_name, threshold):
    """Open the ``umls-terms.simstring`` database stored under ``path``.

    Args:
        path: existing directory holding the database file.
        similarity_name: name of the attribute of the ``simstring`` module
            to use as similarity measure.
        threshold: similarity threshold assigned to the reader.

    Raises:
        IOError: if ``path`` does not exist or is not a directory.
    """
    is_directory = os.path.exists(path) and os.path.isdir(path)
    if not is_directory:
        raise IOError(
            '"{}" does not exists or it is not a directory.'.format(path))

    db_file = os.path.join(path, 'umls-terms.simstring')
    self.db = simstring.reader(prepare_string_for_db_input(db_file))
    self.db.measure = getattr(simstring, similarity_name)
    self.db.threshold = threshold
def __init__(self, words, measure=3, n=3, be=True, unicode=True, file="sample.db"):
    """Build a SimString database from ``words`` and reopen it for reading.

    The database is written to ``./db/<file>`` (the ``db`` directory is
    created if missing), then opened as a reader stored in ``self.db``.

    Args:
        words: iterable of strings to insert.
        measure: value assigned to the ``measure`` attribute of both handles.
        n: forwarded to ``simstring.writer`` as its second argument and kept
            in ``self.n``.
        be: forwarded to ``simstring.writer`` as its third argument.
        unicode: forwarded to ``simstring.writer`` as its fourth argument.
        file: database file name inside ``./db``.
    """
    # Local import: the visible part of the file does not show ``os``.
    import os

    self.n = n
    # Create the directory without spawning a shell.
    os.makedirs("db", exist_ok=True)

    db_path = f'./db/{file}'
    db = simstring.writer(db_path, n, be, unicode)
    # NOTE(review): kept from the original; unclear whether the writer
    # makes any use of ``measure``.
    db.measure = measure
    try:
        for w in words:
            db.insert(w)
    finally:
        # Close the writer even if an insert fails.
        db.close()

    db = simstring.reader(db_path)
    db.measure = measure
    # No threshold is assigned here; the reader keeps whatever it starts with.
    self.db = db
def test_matches():
    """Smoke-test reading from the four SimString databases.

    Each database under ``fe_settings.simstringdb_dir`` is opened, queried
    for ``'london'`` with cosine similarity at threshold 0.6, the result is
    printed, and the handle is closed.
    """
    for dbname in ('people.db', 'places.db', 'departments.db', 'universities.db'):
        db = simstring.reader(os.path.join(fe_settings.simstringdb_dir, dbname))
        try:
            db.measure = simstring.cosine
            db.threshold = 0.6
            print(db.retrieve(u'london'.encode('utf-8')))
        finally:
            # Close each reader before moving on to the next database.
            db.close()
def ssdb_open(dbname):
    '''
    Open the simstring DB named dbname and return its reader handle.

    The caller owns the handle and must call close() on it.
    '''
    __import_simstring()

    try:
        handle = simstring.reader(__ssdb_filename(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise  # TODO: raise specific exception
    return handle
def score_string(word_text, dbname):
    """Return 1 if ``word_text`` approximately matches an entry, else 0.

    Implements the SimString matching and assumes the SimString databases
    are present under ``fes.simstringdb_dir``. The lookup uses cosine
    similarity with a threshold of 0.6.

    Args:
        word_text: text to look up; it is UTF-8 encoded before the query.
        dbname: database file name inside ``fes.simstringdb_dir``.
    """
    db = simstring.reader(os.path.join(fes.simstringdb_dir, dbname))
    try:
        db.measure = simstring.cosine
        db.threshold = 0.6
        return 1 if db.retrieve(word_text.encode('utf-8')) else 0
    finally:
        # A reader is opened per call, so release it per call as well.
        db.close()
def ssdb_open(dbname):
    '''
    Open the simstring DB named dbname and return its reader handle.

    The caller owns the handle and must call close() on it.

    Raises NoSimStringError when the simstring module cannot be imported
    and ssdbNotFoundError when the DB cannot be opened.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    try:
        handle = simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
    return handle
def ssdb_open(dbname):
    """Open the simstring DB named dbname and return its reader handle.

    The caller owns the handle and must call close() on it.

    Raises NoSimStringError when the simstring module cannot be imported
    and ssdbNotFoundError when the DB cannot be opened.
    """
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    try:
        handle = simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
    return handle
def ssdb_open(dbname):
    """Open the simstring DB named dbname and return its handle.

    When SIMSTRING_BINARY is set the handle is a ``simstring.reader``;
    otherwise it is whatever ``SQLite3Database.use`` returns for a database
    built on a character n-gram feature extractor.

    The caller owns the handle and must call close() on it.

    Raises ssdbNotFoundError when opening fails with an IOError.
    """
    __import_simstring()

    try:
        if not SIMSTRING_BINARY:
            extractor = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            return SQLite3Database(extractor).use(__ssdb_path(dbname))
        return simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
def normalize_org(org_dict, abbr_dict): print 'load organism mapping files ...' data_folder = os.getcwd() + '/' ncbi_symbol_dict, id_rank_dict, tax_tree, all_under_spp, lower_rank_map, all_rank_count, model_org = load_normalize_org_mapping(data_folder) min_threshold = 0.6 ss_folder = data_folder + "simstring/" ncbi_db = 'new_all_org/new_all_org.db' db = simstring.reader(ss_folder + ncbi_db) db.measure = simstring.cosine pred_tests = match_org_simstring(ss_folder, org_dict, ncbi_symbol_dict, db, min_threshold, abbr_dict, all_under_spp) combined_tests = combine_prediction(ss_folder, org_dict, pred_tests, id_rank_dict, tax_tree, ncbi_symbol_dict, all_rank_count, model_org, all_under_spp, lower_rank_map, abbr_dict) final_dict = {} temp_dict = {} for k, v in combined_tests.iteritems(): for anno in v: temp_dict.setdefault(k, []).append([anno[0], anno[1], anno[2], anno[3], anno[5]]) if anno[3] == '': final_dict.setdefault(k, []).append([anno[0], anno[1], anno[2], 'org', 'organism:' + anno[2]]) else: final_dict.setdefault(k, []).append([anno[0], anno[1], anno[2], 'org', 'NCBI taxon:' + anno[3]]) return temp_dict, final_dict, all_rank_count # combined_tests
def __init__(self, directory, filename, measure=simstring.overlap, threshold=0.65, mode='write'):
    """Open ``directory/filename`` as a SimString writer or reader.

    Exactly one of ``self.writer`` / ``self.reader`` is set; the other
    stays ``None``.

    Args:
        directory: existing directory that holds the database.
        filename: database file name; must end with ``.db``.
        measure: similarity measure assigned to the reader.
        threshold: similarity threshold assigned to the reader.
        mode: ``'write'`` opens a writer; any other value opens a reader.

    Raises:
        ValueError: if ``filename`` does not end with ``.db`` or
            ``directory`` is not an existing directory.
    """
    if not filename.endswith('.db'):
        raise ValueError(
            "Incorrect file format for Database. Database must end with .db"
        )
    if not os.path.isdir(directory):
        # Reported separately so a bad directory is not blamed on the
        # file extension.
        raise ValueError(
            "Database directory does not exist or is not a directory: %r"
            % (directory,)
        )

    self.writer = None
    self.reader = None
    db_path = os.path.join(directory, filename)
    if mode == 'write':
        self.writer = simstring.writer(db_path)
    else:
        self.reader = simstring.reader(db_path)
        self.reader.measure = measure
        self.reader.threshold = threshold
def chem_match_simstring(ss_folder, annoDict, db_file, umls_chemical_symbol):
    """Attach SimString concept matches to a deep copy of ``annoDict``.

    For each annotation, every entity in ``item[4]`` is normalised with
    ``getNormform`` and looked up with cosine similarity, starting at
    threshold 1.0 and lowering it in 0.01 steps down to a floor that depends
    on the mention length (1.0 below 5 characters, 0.8 for 5-10, 0.6 above).
    Results are cached per ``item[2]`` in ``found_dict``.

    Args:
        ss_folder: folder prefix; concatenated directly with ``db_file``.
        annoDict: dict mapping doc_id to a list of annotation lists.
        db_file: SimString database file name.
        umls_chemical_symbol: maps a retrieved concept to its concept ids.

    Returns:
        The deep copy, where slot 5 of each annotation collects concepts,
        slot 6 concept ids and slot 7 the thresholds at which the concepts
        were found.
    """
    db = simstring.reader(ss_folder + db_file)
    db.measure = simstring.cosine
    pred_dict = copy.deepcopy(annoDict)
    # Cache keyed by item[2]: the [5:8] slice computed the first time.
    found_dict = dict()
    for doc_id, items in annoDict.iteritems():
        for j, item in enumerate(items):
            # Two result lists are appended; slots 5..7 are used below.
            # presumably each annotation has six fields on entry so the new
            # lists land at indices 6 and 7 — TODO confirm against callers.
            pred_dict[doc_id][j].append([])
            pred_dict[doc_id][j].append([])
            try:
                # Cache hit: reuse the lists stored for the same item[2].
                pred_dict[doc_id][j][5:8] = found_dict[item[2]]
            except:
                # NOTE(review): the bare except treats any error as a cache
                # miss, not just a missing key.
                for entity in item[4]:
                    db.threshold = 1.0
                    mention = getNormform(entity)
                    # Shorter mentions get a stricter similarity floor.
                    if len(mention) < 5:
                        threshold = 1.0
                    elif len(mention) in range(5, 11):
                        threshold = 0.8
                    else:
                        threshold = 0.6
                    match_concept = db.retrieve(mention)
                    # Relax the threshold until a hit or the floor is reached.
                    while match_concept == () and db.threshold > threshold:
                        db.threshold = db.threshold - 0.01
                        match_concept = db.retrieve(mention)
                    if len(match_concept) != 0:
                        for concept in match_concept:
                            # Each concept is recorded once per annotation.
                            if concept not in pred_dict[doc_id][j][5]:
                                pred_dict[doc_id][j][5].append(concept)
                                pred_dict[doc_id][j][7].append(db.threshold)
                                # concepts = socket_check_value(sock, 'umls_chemical_symbol', concept)
                                concepts = umls_chemical_symbol[concept]
                                for concept_id in concepts:
                                    pred_dict[doc_id][j][6].append(concept_id)
                # The stored slice is a new list holding the same inner lists.
                found_dict.setdefault(item[2], pred_dict[doc_id][j][5:8])
    return pred_dict
A Unicode sample. We assume that the source code is written in UTF-8 encoding (see the encoding declaration in line 2). We can use 8-bit strings as they are with SimString. """ import simstring # Open a SimString database for writing with Unicode mode. db = simstring.writer('sample_unicode.db', 3, False, True) # Write a string, and close the database. db.insert('スパゲティ') db.close() # Open the SimString database for reading. db = simstring.reader('sample_unicode.db') # Set a similarity measure and threshold. db.measure = simstring.cosine db.threshold = 0.6 # Use an 8-bit string encoded in UTF-8. print(' '.join(db.retrieve('スパゲティー'))) # Convert a Unicode object into an UTF-8 query string. print(' '.join(db.retrieve(u'スパゲティー'.encode('utf-8'))))
# -*- coding:utf-8 -*-
"""
A Unicode sample.

We assume that the source code is written in UTF-8 encoding (see the
encoding declaration at the top of the file). We can use 8-bit strings as
they are with SimString.
"""

import simstring

# Build a one-entry database; the writer is opened in Unicode mode.
db = simstring.writer('sample_unicode.db', 3, False, True)
db.insert('スパゲティ')
db.close()

# Reopen the same database for similarity search.
db = simstring.reader('sample_unicode.db')
db.measure = simstring.cosine
db.threshold = 0.6

# The first query is an 8-bit string already encoded in UTF-8; the second
# starts as a Unicode object and is converted to a UTF-8 query string.
for query in ('スパゲティー', u'スパゲティー'.encode('utf-8')):
    print(' '.join(db.retrieve(query)))
#!/usr/bin/env python # -*- coding:utf-8 -*- # # Copyright 2014 Martin J Chorley # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import simstring db = simstring.reader('names.db') db.measure = simstring.cosine db.threshold = 0.6 print ', '.join(db.retrieve('Ocho Lounge')) print('\n\n') db.measure = simstring.jaccard db.threshold = 0.6 print ', '.join(db.retrieve('Ocho Lounge'))
for word in f: db.insert(word.strip()) db.close() # create university database db = simstring.writer('dicts' + os.sep + 'universities.db') with open('dicts' + os.sep + 'university_full.txt') as f: for word in f: db.insert(word.strip()) with open('dicts' + os.sep + 'university_keywords.txt') as f: for word in f: db.insert(word.strip()) db.close() db = simstring.reader('dicts' + os.sep + 'people.db') print("testing person database") print(db.retrieve('aaron')) print(db.retrieve('abe')) db = simstring.reader('dicts' + os.sep + 'places.db') print("testing place database") print(db.retrieve('boston')) print(db.retrieve('china')) db = simstring.reader('dicts' + os.sep + 'departments.db') print("testing department database") print(db.retrieve('medical')) print(db.retrieve('association')) db = simstring.reader('dicts' + os.sep + 'universities.db')
#!/usr/bin/env python

import simstring

# Create a SimString database holding two person names.
db = simstring.writer('sample.db')
for full_name in ('Barack Hussein Obama II', 'James Gordon Brown'):
    db.insert(full_name)
db.close()

# Reopen the database for reading.
db = simstring.reader('sample.db')

# Cosine similarity, threshold 0.6. The original sample marks the first two
# queries as OK and 'Obama' as too dissimilar under this setting.
db.measure = simstring.cosine
db.threshold = 0.6
for query in ('Barack Obama', 'Gordon Brown', 'Obama'):
    print(db.retrieve(query))

# Overlap coefficient, threshold 1.0. The original sample marks 'Obama' as
# OK under this setting.
db.measure = simstring.overlap
db.threshold = 1.
print(db.retrieve('Obama'))
out8=codecs.getwriter("utf-8")(sys.stdout)


def match(pwd,vocabdb):
    """Print ``pwd`` and the vocabulary entries found inside it.

    The lower-cased password is looked up in ``vocabdb``; retrieved entries
    are decoded from UTF-8, de-duplicated, and kept only when they occur as
    a substring of the lower-cased password. One line is written to
    ``out8``: the password, a tab, and the kept entries as a JSON list.
    Nothing is returned.
    """
    pwd_lower = pwd.lower()
    retrieved = vocabdb.retrieve(pwd_lower.encode("utf-8"))
    # Quick first try - only use exact hits
    unique_hits = set(unicode(hit, "utf-8") for hit in retrieved)
    contained = [hit for hit in unique_hits if hit in pwd_lower]
    print >> out8, pwd+u"\t"+json.dumps(contained)


if __name__=="__main__":
    parser = argparse.ArgumentParser(
        description='Look up using a ready-made web-crawled vocabulary. Pipe the passwords in on stdin.')
    parser.add_argument(
        '--db',
        default="simdb/pb34_wf_exc.simdb",
        help='SimString DB built using index_vocab.sh Default: %(default)s')
    args = parser.parse_args()

    vocabdb = simstring.reader(args.db)
    vocabdb.measure = simstring.overlap
    vocabdb.threshold = 1.0

    # Count of input lines that were not valid UTF-8.
    decode_errors = 0
    for line in sys.stdin:
        pwd = line.strip()
        if not pwd:
            continue
        try:
            # there's some broken utf there?
            pwd_u = unicode(pwd, "utf-8")
        except UnicodeDecodeError:
            decode_errors += 1
            continue
        match(pwd_u, vocabdb)
def read_simstring():
    """Return a reader for ``simstring_law/law.db``.

    The reader is configured with cosine similarity and a threshold of 0.9.
    """
    reader = simstring.reader('simstring_law/law.db')
    reader.measure = simstring.cosine
    reader.threshold = 0.9
    return reader