def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    """Build a simstring DB containing the given strings.

    Args:
        strs: iterable of strings to insert into the DB.
        dbname: DB name; the on-disk location is obtained from
            ``__ssdb_path(dbname)``.
        ngram_length: n-gram length; only 3 is accepted at the moment.
        include_marks: begin/end marks flag; only False is accepted.

    Returns:
        The DB file name returned by ``__ssdb_path``.

    Raises:
        NoSimStringError: if the ``simstring`` module cannot be imported.
        AssertionError: if an unsupported option value is passed.
    """
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        db = simstring.writer(dbfn)
        try:
            for s in strs:
                db.insert(s)
        finally:
            # Close even when an insert fails so the writer is not leaked.
            db.close()
    except BaseException:
        # sys.stderr.write works identically on Python 2 and 3, unlike the
        # ``print >> sys.stderr`` statement form.
        sys.stderr.write("Error building simstring DB\n")
        raise
    return dbfn
def write_simstring(dkt):
    """Insert every key of ``dkt`` into the simstring DB 'simstring_law/law.db'.

    Keys that are ``unicode`` objects are encoded as UTF-8 before insertion;
    other keys are inserted as they are.

    Args:
        dkt: iterable (typically a dict) whose items/keys are the strings
            to store.
    """
    dbpath = 'simstring_law/law.db'
    db = simstring.writer(dbpath, 3, False, True)
    try:
        for k in dkt:
            if isinstance(k, unicode):
                k = k.encode('utf-8')
            db.insert(k)
    finally:
        # The writer was previously never closed; close it explicitly so the
        # DB is finalised even if an insert raises.
        db.close()
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''Build a simstring DB containing the given strings.

    Args:
        strs: iterable of strings to insert into the DB.
        dbname: DB name; the on-disk location is obtained from
            ``__ssdb_path(dbname)``.
        ngram_length: n-gram length; only 3 is accepted at the moment.
        include_marks: begin/end marks flag; only False is accepted.

    Returns:
        The DB file name returned by ``__ssdb_path``.

    Raises:
        NoSimStringError: if the ``simstring`` module cannot be imported.
        AssertionError: if an unsupported option value is passed.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        db = simstring.writer(dbfn)
        try:
            for s in strs:
                db.insert(s)
        finally:
            # Close even when an insert fails so the writer is not leaked.
            db.close()
    except BaseException:
        # Version-neutral replacement for ``print >> sys.stderr, ...``.
        sys.stderr.write("Error building simstring DB\n")
        raise
    return dbfn
def __init__(self, dbfn,
             ngram_length=SimstringBase.DEFAULT_NGRAM_LENGTH,
             include_marks=SimstringBase.DEFAULT_INCLUDE_MARKS,
             threshold=SimstringBase.DEFAULT_THRESHOLD,
             similarity_measure=SimstringBase.DEFAULT_SIMILARITY_MEASURE,
             unicode=SimstringBase.DEFAULT_UNICODE,
             build=False):
    """Open the simstring DB at ``dbfn`` for writing or for reading.

    With ``build`` true a ``simstring.writer`` is stored in ``self.db``.
    Otherwise a ``simstring.reader`` is stored and configured with the
    measure looked up in ``SIMILARITY_MEASURES`` and the given threshold.

    Raises:
        AssertionError: if ``include_marks`` is not False or
            ``ngram_length`` is not 3.
    """
    assert include_marks == False, "Error: begin/end marks not supported"
    assert ngram_length == 3, "Error: unsupported n-gram length"

    base_options = {
        'ngram_length': ngram_length,
        'include_marks': include_marks,
        'threshold': threshold,
        'similarity_measure': similarity_measure,
        'unicode': unicode,
        'build': build,
    }
    super().__init__(dbfn, **base_options)

    if build:
        # Write mode: no retrieval settings to configure.
        self.db = simstring.writer(self.dbfn)
        return

    self.db = simstring.reader(self.dbfn)
    self.db.measure = SIMILARITY_MEASURES[similarity_measure]
    self.db.threshold = threshold
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    """Build a simstring DB containing the given strings.

    When ``SIMSTRING_BINARY`` is true the DB is written with
    ``simstring.writer``; otherwise a ``SQLite3Database`` fed by a
    ``CharacterNgramFeatureExtractor`` is used.

    Args:
        strs: iterable of strings to store.
        dbname: DB name; the file name is obtained from
            ``__ssdb_path(dbname)``.
        ngram_length: n-gram length; must be 3 on the binary path.
        include_marks: begin/end marks flag; only False is accepted.

    Returns:
        The DB file name returned by ``__ssdb_path``.

    Raises:
        AssertionError: if an unsupported option value is passed.
    """
    __import_simstring()
    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert include_marks == False, "Error: begin/end marks not supported"
        if SIMSTRING_BINARY:
            assert ngram_length == 3, "Error: unsupported n-gram length"
            db = simstring.writer(dbfn)
            try:
                for s in strs:
                    db.insert(s)
            finally:
                # Close even when an insert fails so the writer is not leaked.
                db.close()
        else:
            # NOTE(review): this path uses DEFAULT_NGRAM_LENGTH and ignores
            # the ngram_length argument -- confirm whether that is intended.
            fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            db = SQLite3Database(fx)
            db.use(dbfn)
            for s in strs:
                db.add(s)
    except BaseException:
        # ``print >> sys.stderr, ...`` raises TypeError on Python 3 and would
        # mask the original exception; write to stderr directly instead.
        sys.stderr.write("Error building simstring DB\n")
        raise
    return dbfn
def __init__(self, words, measure=3, n=3, be=True, unicode=True, file="sample.db"):
    """Write ``words`` to a simstring DB under ./db and reopen it for reading.

    Args:
        words: iterable of strings to insert.
        measure: value assigned to the ``measure`` attribute of both the
            writer and the reader.
        n: second positional argument of ``simstring.writer``; also kept
            as ``self.n``.
        be: third positional argument of ``simstring.writer``.
        unicode: fourth positional argument of ``simstring.writer``.
        file: DB file name inside the ``db`` directory.

    The resulting reader is stored in ``self.db``.
    """
    import os

    self.n = n
    # Create the directory in-process instead of spawning a shell for
    # ``mkdir -p``; this also works where no POSIX shell is available.
    os.makedirs("db", exist_ok=True)

    db = simstring.writer(f'./db/{file}', n, be, unicode)
    try:
        db.measure = measure
        for w in words:
            db.insert(w)
    finally:
        # Close even when an insert fails so the writer is not leaked.
        db.close()

    db = simstring.reader(f"./db/{file}")
    db.measure = measure
    self.db = db
def load_data(csv_file, db_file):
    """Load the first column of a CSV file into a simstring DB.

    Each first-column value is lower-cased, NFKD-normalised and reduced to
    ASCII (non-ASCII characters are dropped) before being inserted.

    Args:
        csv_file: path of the comma-separated input file.
        db_file: path of the simstring DB to create.
    """
    db = simstring.writer(db_file, 3, False, True)
    try:
        with open(csv_file, "rb") as f:
            for row in csv.reader(f, delimiter=','):
                if not row:
                    # Blank lines yield an empty row; row[0] would raise.
                    continue
                word = row[0]
                if isinstance(word, bytes):
                    # unicodedata.normalize() needs text, not a byte string.
                    # NOTE(review): assumes the CSV is UTF-8 encoded -- confirm.
                    word = word.decode('utf-8')
                word = word.lower()
                str_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
                try:
                    db.insert(str_word)
                except UnicodeEncodeError:
                    # Report the offending word and carry on with the rest.
                    print(word)
    finally:
        # Close even when a row fails so the writer is not leaked.
        db.close()
def __init__(self, path):
    """Open a simstring writer for 'umls-terms.simstring' inside ``path``.

    Args:
        path: existing directory that will hold the DB file.

    Raises:
        IOError: if ``path`` does not exist or is not a directory.
    """
    # isdir() is already False for a missing path, so a separate exists()
    # check is redundant. The former os.makedirs() call only ran when the
    # directory was known to exist and was therefore a no-op.
    if not os.path.isdir(path):
        err_msg = (
            '"{}" does not exists or it is not a directory.').format(path)
        raise IOError(err_msg)

    db_file = os.path.join(path, 'umls-terms.simstring')
    self.db = simstring.writer(
        prepare_string_for_db_input(db_file), 3, False, True)
def create_dbs():
    """Create one simstring DB per lexicon group listed in ``fe_settings``.

    For each of the groups 'people', 'places', 'departments' and
    'universities', every line of the group's lexicon files (read from
    ``fe_settings.lexicon_dir``) is stripped and inserted into
    ``<name>.db`` under ``fe_settings.simstringdb_dir``.
    """
    groups = [('people', fe_settings.people),
              ('places', fe_settings.places),
              ('departments', fe_settings.departments),
              ('universities', fe_settings.universities)]
    for name, fnames in groups:
        out_dbname = os.path.join(fe_settings.simstringdb_dir, name + '.db')
        # Enable creating the database in unicode mode.
        group_db = simstring.writer(out_dbname, 3, False, True)
        try:
            for fname in fnames:
                fname = os.path.join(fe_settings.lexicon_dir, fname)
                with open(fname, 'r') as lexicon:
                    for line in lexicon:
                        group_db.insert(line.strip())
        finally:
            # Close even when a lexicon file fails so the writer is not leaked.
            group_db.close()
        # Same text as the former ``print 'Wrote: ', out_dbname`` statement,
        # written so that it is valid on both Python 2 and 3.
        print('Wrote:  ' + out_dbname)
def build_simstring_db(strs, name):
    """Build a simstring database for a collection of strings.

    Args:
        strs: iterable of strings to insert.
        name: base name for the DB.

    Returns:
        The file name under which the DB is stored. It is based on, but
        not identical to, ``name``: the current process id is included.
    """
    try:
        # include pid to assure that there are no clashes.
        dbfn = os.path.join(DB_BASE_DIRECTORY,
                            name + "." + str(os.getpid()) + ".db")
        db = simstring.writer(dbfn)
        try:
            for s in strs:
                db.insert(s)
        finally:
            # Close even when an insert fails so the writer is not leaked.
            db.close()
    except BaseException:
        # Version-neutral replacement for ``print >> sys.stderr, ...``.
        sys.stderr.write("Error building simstring DB\n")
        raise
    return dbfn
def create_simstring_databases():
    """Create one simstring database per file in the ../Data/dicts directory.

    For every directory entry whose name before the first '.' is not
    blank, each stripped line of the file is inserted into
    ``<name>.db``, which is created relative to the current directory.

    :return: None
    """
    sep = get_dir_separator()
    ood_path = ".." + sep + "Data" + sep + "dicts"
    for dicts_file in os.listdir(ood_path):
        file_name = dicts_file.split(".")[0]
        if not file_name.strip():
            # e.g. dot-files, whose name before the first '.' is empty
            continue
        simstring_db = simstring.writer(file_name + '.db')
        try:
            # ``with`` closes the input file; it was previously left open.
            with open(ood_path + sep + dicts_file, 'r') as dict_lines:
                for dict_word in dict_lines:
                    simstring_db.insert(dict_word.strip())
        finally:
            # Close even when reading fails so the writer is not leaked.
            simstring_db.close()
def __init__(self, directory, filename, measure=simstring.overlap,
             threshold=0.65, mode='write'):
    """Open ``directory/filename`` as a simstring writer or reader.

    With ``mode == 'write'`` only ``self.writer`` is set. Any other mode
    sets ``self.reader`` and configures it with ``measure`` and
    ``threshold``. The attribute that is not used stays None.

    Raises:
        ValueError: if ``filename`` does not end with '.db' or
            ``directory`` is not an existing directory.
    """
    is_valid = filename.endswith('.db') and os.path.isdir(directory)
    if not is_valid:
        raise ValueError(
            "Incorrect file format for Database. Database must end with .db"
        )

    self.writer = None
    self.reader = None

    if mode == 'write':
        self.writer = simstring.writer(os.path.join(directory, filename))
        return

    self.reader = simstring.reader(os.path.join(directory, filename))
    self.reader.measure = measure
    self.reader.threshold = threshold
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''Build a simstring DB containing the given strings.

    Args:
        strs: iterable of strings to insert into the DB.
        dbname: DB name; the file name is obtained from
            ``__ssdb_filename(dbname)``.
        ngram_length: n-gram length; only 3 is accepted at the moment.
        include_marks: begin/end marks flag; only False is accepted.

    Returns:
        The DB file name returned by ``__ssdb_filename``.

    Raises:
        AssertionError: if an unsupported option value is passed.
    '''
    __import_simstring()
    dbfn = __ssdb_filename(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        db = simstring.writer(dbfn)
        try:
            for s in strs:
                db.insert(s)
        finally:
            # Close even when an insert fails so the writer is not leaked.
            db.close()
    except BaseException:
        # Version-neutral replacement for ``print >> sys.stderr, ...``.
        sys.stderr.write("Error building simstring DB\n")
        raise
    return dbfn
def main(argv):
    """Import a tab-separated normalization file into SQL and simstring DBs.

    Each input line is expected to hold an ID followed by one or more
    tab-separated TYPE:LABEL:STRING triples. Entities, labels and strings
    are inserted into the SQL DB; afterwards the strings selected by
    ``SELECT_SIMSTRING_STRINGS_COMMAND`` are written to a simstring DB.

    Args:
        argv: command-line arguments, including the program name at index 0.

    Returns:
        0 on success, 1 if the SQL DB could not be opened or its tables or
        indices could not be created.
    """
    arg = argparser().parse_args(argv[1:])

    # only simstring library default supported at the moment (TODO)
    assert DEFAULT_NGRAM_LENGTH == 3, "Error: unsupported n-gram length"
    assert DEFAULT_INCLUDE_MARKS == False, "Error: begin/end marks not supported"

    infn = arg.file

    if arg.database is None:
        # default database file name
        bn = splitext(basename(infn))[0]
        sqldbfn = sqldb_filename(bn)
        ssdbfn = ssdb_filename(bn)
    else:
        sqldbfn = arg.database + '.' + SQL_DB_FILENAME_EXTENSION
        ssdbfn = arg.database + '.' + SS_DB_FILENAME_EXTENSION

    if arg.verbose:
        print("Storing SQL DB as %s and" % sqldbfn, file=sys.stderr)
        print("  simstring DB as %s" % ssdbfn, file=sys.stderr)
    start_time = datetime.now()

    # NOTE(review): duplicate_count is reported at the end but never
    # incremented; rejected entity inserts are counted in error_count.
    import_count, duplicate_count, error_count, simstring_count = 0, 0, 0, 0

    # The 'U' mode flag is rejected by open() on Python 3.11+, so plain 'r'
    # is used instead.
    with codecs.open(infn, 'r', encoding=arg.encoding) as inf:

        # create SQL DB
        try:
            connection = sqlite.connect(sqldbfn)
        except sqlite.OperationalError as e:
            print("Error connecting to DB %s:" % sqldbfn, e, file=sys.stderr)
            return 1
        cursor = connection.cursor()

        # create SQL tables
        if arg.verbose:
            print("Creating tables ...", end=' ', file=sys.stderr)

        for command in CREATE_TABLE_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating %s:" % sqldbfn, e, "(DB exists?)",
                      file=sys.stderr)
                return 1

        # import data
        if arg.verbose:
            print("done.", file=sys.stderr)
            print("Importing data ...", end=' ', file=sys.stderr)

        next_eid = 1
        label_id = {}
        next_lid = 1
        next_pid = {t: 1 for t in TYPE_VALUES}

        for i, l in enumerate(inf):
            l = l.rstrip('\n')

            # parse line into ID and TYPE:LABEL:STRING triples
            try:
                id_, rest = l.split('\t', 1)
            except ValueError:
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated fields, got '%s'" % (
                        i + 1, l), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # parse TYPE:LABEL:STRING triples
            try:
                triples = []
                for triple in rest.split('\t'):
                    type_, label, string = triple.split(':', 2)
                    if type_ not in TYPE_VALUES:
                        print("Unknown TYPE %s" % type_, file=sys.stderr)
                        # next_pid only has keys from TYPE_VALUES, so an
                        # unknown type would raise KeyError further down;
                        # treat the line as invalid and skip it instead.
                        raise ValueError("unknown TYPE")
                    triples.append((type_, label, string))
            except ValueError:
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated TYPE:LABEL:STRING triples, got '%s'" % (
                        i + 1, rest), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # insert entity
            eid = next_eid
            next_eid += 1
            try:
                cursor.execute(
                    "INSERT into entities VALUES (?, ?)", (eid, id_))
            except sqlite.IntegrityError as e:
                if error_count < MAX_ERROR_LINES:
                    print("Error inserting %s (skipping): %s" % (
                        id_, e), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # insert new labels (if any)
            labels = {lbl for _, lbl, _ in triples}
            new_labels = [lbl for lbl in labels if lbl not in label_id]
            for label in new_labels:
                lid = next_lid
                next_lid += 1
                cursor.execute(
                    "INSERT into labels VALUES (?, ?)", (lid, label))
                label_id[label] = lid

            # insert associated strings
            for type_, label, string in triples:
                table = TABLE_FOR_TYPE[type_]
                pid = next_pid[type_]
                next_pid[type_] += 1
                lid = label_id[label]  # TODO
                if TABLE_HAS_NORMVALUE[table]:
                    normstring = string_norm_form(string)
                    cursor.execute(
                        "INSERT into %s VALUES (?, ?, ?, ?, ?)" % table,
                        (pid, eid, lid, string, normstring))
                else:
                    cursor.execute(
                        "INSERT into %s VALUES (?, ?, ?, ?)" % table,
                        (pid, eid, lid, string))

            import_count += 1

            if arg.verbose and (i + 1) % 10000 == 0:
                print('.', end=' ', file=sys.stderr)

        if arg.verbose:
            print("done.", file=sys.stderr)

        # create SQL indices
        if arg.verbose:
            print("Creating indices ...", end=' ', file=sys.stderr)

        for command in CREATE_INDEX_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating index", e, file=sys.stderr)
                return 1

        if arg.verbose:
            print("done.", file=sys.stderr)

        # wrap up SQL table creation
        connection.commit()

        # create simstring DB
        if arg.verbose:
            print("Creating simstring DB ...", end=' ', file=sys.stderr)

        try:
            ssdb = simstring.writer(ssdbfn)
            try:
                for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                    # encode as UTF-8 for simstring
                    s = row[0].encode('utf-8')
                    ssdb.insert(s)
                    simstring_count += 1
            finally:
                # Close even when an insert fails so the writer is not leaked.
                ssdb.close()
        except BaseException:
            print("Error building simstring DB", file=sys.stderr)
            raise

        if arg.verbose:
            print("done.", file=sys.stderr)

        cursor.close()

    # done
    delta = datetime.now() - start_time

    if arg.verbose:
        print(file=sys.stderr)
        # total_seconds() covers the whole interval and formats cleanly; the
        # former seconds + "." + microseconds / 10000 concatenation produced
        # strings such as "3.12.3456" under true division.
        print("Done in:", "%.2f" % delta.total_seconds(), "seconds",
              file=sys.stderr)

    print("Done, imported %d entries (%d strings), skipped %d duplicate keys, skipped %d invalid lines" %
          (import_count, simstring_count, duplicate_count, error_count))

    return 0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
Unicode usage sample for SimString.

The source file is assumed to be saved in UTF-8 (see the encoding
declaration on line 2), so string literals can be handed to SimString
without any conversion.
"""

import simstring

# Create the database in Unicode mode, store a single string and close it.
writer = simstring.writer('sample_unicode.db', 3, False, True)
writer.insert('スパゲティ')
writer.close()

# Reopen the same database for retrieval.
reader = simstring.reader('sample_unicode.db')

# Retrieval settings: cosine similarity with a threshold of 0.6.
reader.measure = simstring.cosine
reader.threshold = 0.6

# Query with a UTF-8 string literal and print the matches on one line.
matches = reader.retrieve('スパゲティー')
print(' '.join(matches))
except sqlite.OperationalError, e: print >> sys.stderr, "Error creating index", e return 1 if arg.verbose: print >> sys.stderr, "done." # wrap up SQL table creation connection.commit() # create simstring DB if arg.verbose: print >> sys.stderr, "Creating simstring DB ...", try: ssdb = simstring.writer(ssdbfn) for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND): # encode as UTF-8 for simstring s = row[0].encode('utf-8') ssdb.insert(s) simstring_count += 1 ssdb.close() except: print >> sys.stderr, "Error building simstring DB" raise if arg.verbose: print >> sys.stderr, "done." cursor.close()
import simstring import os # create the databases for Simstring to use for dicitonary matching during preprocessing # create name database db = simstring.writer('dicts' + os.sep + 'people.db') with open('dicts' + os.sep + 'chinese_only.txt') as f: for word in f: db.insert(word.strip()) with open('dicts' + os.sep + 'english_only.txt') as f: for word in f: db.insert(word.strip()) with open('dicts' + os.sep + 'frequent_last_names.txt') as f: for word in f: db.insert(word.strip()) with open('dicts' + os.sep + 'shared.txt') as f: for word in f: db.insert(word.strip()) db.close() # create place database db = simstring.writer('dicts' + os.sep + 'places.db') with open('dicts' + os.sep + 'city_full.txt') as f: for word in f: db.insert(word.strip()) with open('dicts' + os.sep + 'country_full.txt') as f: for word in f: db.insert(word.strip()) with open('dicts' + os.sep + 'region_full.txt') as f: for word in f:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
Unicode usage sample for SimString.

The source file is assumed to be saved in UTF-8 (see the encoding
declaration on line 2), so string literals can be handed to SimString
without any conversion.
"""

import simstring

# Create the database in Unicode mode, store a single string and close it.
writer = simstring.writer('sample_unicode.db', 3, False, True)
writer.insert('スパゲティ')
writer.close()

# Reopen the same database for retrieval.
reader = simstring.reader('sample_unicode.db')

# Retrieval settings: cosine similarity with a threshold of 0.6.
reader.measure = simstring.cosine
reader.threshold = 0.6

# Query with a UTF-8 string literal and print the matches on one line.
literal_matches = reader.retrieve('スパゲティー')
print(' '.join(literal_matches))

# Query again, this time encoding a Unicode object to UTF-8 first.
encoded_query = u'スパゲティー'.encode('utf-8')
encoded_matches = reader.retrieve(encoded_query)
print(' '.join(encoded_matches))
#!/usr/bin/env python

import simstring

# Build a SimString database holding two person names.
writer = simstring.writer('sample.db')
for person in ('Barack Hussein Obama II', 'James Gordon Brown'):
    writer.insert(person)
writer.close()

# Reopen the database for retrieval.
reader = simstring.reader('sample.db')

# First pass: cosine similarity with threshold 0.6.
reader.measure = simstring.cosine
reader.threshold = 0.6
print(reader.retrieve('Barack Obama'))  # OK.
print(reader.retrieve('Gordon Brown'))  # OK.
print(reader.retrieve('Obama'))         # Too dissimilar!

# Second pass: overlap coefficient with threshold 1.0.
reader.measure = simstring.overlap
reader.threshold = 1.
print(reader.retrieve('Obama'))         # OK.