def init():
    """Open every SimString database and load the affix count tables.

    Each reader is configured with the overlap measure and a threshold of 1.
    The handles and tables are published through the module-level globals
    declared below. A symbol database whose reader raises IOError on opening
    is skipped.

    Returns:
        The pair ``(symbDBs, orgDB)``: the list of ``(name, reader)`` tuples
        for the symbol databases that opened, and the species reader.
    """
    global orgDB, symbDBs, prefixDB, suffixDB, prefixCounts, suffixCounts

    def configure(reader):
        # Every database opened here is queried with the same settings.
        reader.measure = simstring.overlap
        reader.threshold = 1
        return reader

    orgDB = configure(simstring.reader("data/all_species_db"))

    symbol_kinds = (
        "default_symbol", "official_symbol", "official_name",
        "synonym", "homologene_id", "ensemblcluster_id",
        "ensembl_id", "locustag", "cd_antigen", "description",
        "locusname", "alergen",
    )
    symbDBs = []
    for kind in symbol_kinds:
        try:
            reader = simstring.reader("data/all_symbols_%s_db" % kind)
        except IOError:
            # No usable database for this symbol kind; leave it out.
            continue
        symbDBs.append((kind, configure(reader)))

    prefixDB = configure(simstring.reader("data/all_prefixes_db"))
    suffixDB = configure(simstring.reader("data/all_suffixes_rev_db"))

    prefixCounts = readAffixFile("src_data/prefixes2.txt", False)
    suffixCounts = readAffixFile("src_data/suffixes2.txt", True)

    print >> sys.stderr, "INIT DONE"
    return symbDBs, orgDB
Пример #2
0
def match_dictionaries(doc,
                       place_set,
                       department_set,
                       university_set,
                       person_set,
                       matching='approx'):
    """Tag every word of *doc* with binary dictionary-membership scores.

    Each word object receives the attributes ``place_score``,
    ``department_score``, ``university_score`` and ``person_score``, each set
    to 1 (match) or 0 (no match).

    Args:
        doc: object whose ``words()`` returns word objects exposing ``text``.
        place_set, department_set, university_set, person_set: containers of
            entries tested against the lower-cased, stripped word text. They
            are only consulted when ``matching == 'exact'``.
        matching: ``'exact'`` for plain membership tests, ``'approx'`` for
            SimString lookups (cosine measure, threshold 0.9) against the
            ``.db`` files in the ``dicts`` directory. Any other value leaves
            the words untouched.

    Returns:
        None; the word objects are modified in place.
    """
    words = doc.words()

    if matching == 'exact':
        exact_sources = (
            ('place_score', place_set),
            ('department_score', department_set),
            ('university_score', university_set),
            ('person_score', person_set),
        )
        for word in words:
            tok = word.text.lower().strip()
            for attr, entries in exact_sources:
                setattr(word, attr, 1 if tok in entries else 0)
    elif matching == 'approx':
        import simstring

        db_files = (
            ('place_score', 'places.db'),
            ('department_score', 'departments.db'),
            ('university_score', 'universities.db'),
            ('person_score', 'people.db'),
        )
        readers = []
        try:
            # load simstring dbs
            for attr, filename in db_files:
                reader = simstring.reader('dicts' + os.sep + filename)
                reader.measure = simstring.cosine
                reader.threshold = 0.9
                readers.append((attr, reader))

            # check all unigrams
            for word in words:
                # Non-ASCII characters are dropped before querying.
                tok = word.text.lower().strip().encode('ascii', 'ignore')
                for attr, reader in readers:
                    hit = len(reader.retrieve(tok)) > 0
                    setattr(word, attr, 1 if hit else 0)
        finally:
            # Release every database that was opened, even if a lookup failed.
            for _, reader in readers:
                reader.close()
    def __init__(self,
                 dbfn,
                 ngram_length=SimstringBase.DEFAULT_NGRAM_LENGTH,
                 include_marks=SimstringBase.DEFAULT_INCLUDE_MARKS,
                 threshold=SimstringBase.DEFAULT_THRESHOLD,
                 similarity_measure=SimstringBase.DEFAULT_SIMILARITY_MEASURE,
                 unicode=SimstringBase.DEFAULT_UNICODE,
                 build=False):
        """Open the SimString database *dbfn* for reading, or for writing.

        The settings are forwarded to the base class unchanged. A writer is
        opened when *build* is true, otherwise a reader. The chosen
        similarity measure and threshold are then applied to the handle.

        Raises:
            AssertionError: if *include_marks* is not false or
                *ngram_length* is not 3.
        """
        # Only trigrams without begin/end marks are accepted here.
        assert include_marks == False, "Error: begin/end marks not supported"
        assert ngram_length == 3, "Error: unsupported n-gram length"

        super().__init__(
            dbfn,
            ngram_length=ngram_length,
            include_marks=include_marks,
            threshold=threshold,
            similarity_measure=similarity_measure,
            unicode=unicode,
            build=build,
        )

        opener = simstring.writer if build else simstring.reader
        self.db = opener(self.dbfn)

        self.db.measure = SIMILARITY_MEASURES[similarity_measure]
        self.db.threshold = threshold
def check_dictionary_db(token, database):
    """Return 1 if *token* has a match in the SimString *database*, else 0.

    The lookup uses the cosine measure with a threshold of 0.9.

    Args:
        token: text to look up; it is UTF-8 encoded before querying.
        database: path of the SimString database file to open.
    """
    dict_db = simstring.reader(database)
    try:
        dict_db.measure = simstring.cosine
        dict_db.threshold = 0.9
        found = len(dict_db.retrieve(token.encode('utf-8'))) > 0
    finally:
        # The reader exists only for this lookup; release it even on error.
        dict_db.close()
    return 1 if found else 0
Пример #5
0
 def __init__(self, db_file, measure=COSINE, threshold=.8):
     """Open *db_file* for reading and apply the measure and threshold.

     The cosine measure is used when *measure* equals COSINE; any other
     value selects the overlap measure.
     """
     self.db = simstring.reader(db_file)
     self.db.measure = (simstring.cosine if measure == COSINE
                        else simstring.overlap)
     self.db.threshold = threshold
def modify_match_dis_normform(data_folder,
                              annoDict,
                              db_file,
                              len_text,
                              min_threshold=0.01):
    """Match annotated mentions against a SimString database.

    For every item the cosine threshold starts at 1.0 and is lowered in steps
    of 0.01 until something is retrieved or the threshold is no longer above
    *min_threshold*. A prediction is recorded only when exactly one concept
    was retrieved and the normalised mention is longer than *len_text*.

    Returns:
        A dict mapping doc_id to a list of
        ``[item[0], item[1], concept, '9606', threshold]`` entries.
    """
    db_path = data_folder + "simstring/" + db_file
    print(db_path)
    db = simstring.reader(db_path)
    db.measure = simstring.cosine

    predict_dict = dict()
    for doc_id, items in annoDict.iteritems():
        for item in items:
            db.threshold = 1.0
            mention = getNormform_space(item[2])
            match_concept = db.retrieve(mention)
            # Relax the threshold step by step until a candidate shows up.
            while match_concept == () and db.threshold > min_threshold:
                db.threshold = db.threshold - 0.01
                match_concept += db.retrieve(mention)

            if not (len(match_concept) == 1 and len(mention) > len_text):
                continue

            doc_hits = predict_dict.setdefault(doc_id, [])
            for concept in set(match_concept):
                doc_hits.append(
                    [item[0], item[1], concept, '9606', db.threshold])
    return predict_dict
Пример #7
0
def cell_match_simstring(ss_folder, annoDict, ab3p_dict, ms, mapping_dict, db_file, normform, min_threshold=0.01):
    """Match annotated mentions against a SimString database.

    For every item the threshold starts at 1.0 and is lowered in steps of
    0.01 until something is retrieved or the threshold is no longer above
    *min_threshold*.

    Args:
        ss_folder: prefix joined with *db_file* to form the database path.
        annoDict: mapping of doc_id to a list of items; ``item[2]`` is the
            mention text, and ``item[0]``/``item[1]`` are copied to the output.
        ab3p_dict: nested mapping ``ab3p_dict[doc_id][mention]`` giving a
            replacement for the mention; the mention itself is used when
            there is no such entry.
        ms: value assigned to the reader's ``measure`` attribute.
        mapping_dict: mapping of a retrieved concept to its concept ids.
        db_file: database file name appended to *ss_folder*.
        normform: when true, the mention is passed through ``getNormform``
            before querying.
        min_threshold: lower bound for the threshold relaxation.

    Returns:
        A dict mapping doc_id to one list per item. Each of those lists holds
        ``[item[0], item[1], concept, concept_id_set, threshold]`` entries, or
        a single ``[item[0], item[1], item[2], set(), 0]`` entry when nothing
        was retrieved.
    """
    db = simstring.reader(ss_folder + db_file)
    db.measure = ms
    predict_dict = dict()
    for doc_id, items in annoDict.iteritems():
        doc_predictions = predict_dict.setdefault(doc_id, [])
        for item in items:
            candidates = []
            doc_predictions.append(candidates)
            db.threshold = 1.0
            try:
                cell = ab3p_dict[doc_id][item[2]]
            except (LookupError, TypeError):
                # No usable replacement recorded for this mention.
                cell = item[2]
            mention = getNormform(cell) if normform else cell

            match_concept = db.retrieve(mention)
            while match_concept == () and db.threshold > min_threshold:
                db.threshold = db.threshold - 0.01
                match_concept = db.retrieve(mention)

            if len(match_concept) == 0:
                candidates.append([item[0], item[1], item[2], set([]), 0])
                continue
            for concept in set(match_concept):
                candidates.append([item[0], item[1], concept,
                                   set(mapping_dict[concept]), db.threshold])
    return predict_dict
Пример #8
0
def open_simstring_db(dbname):
    """Return a SimString reader for *dbname*.

    Any failure while opening is reported on stderr and then re-raised
    unchanged.
    """
    try:
        return simstring.reader(dbname)
    except:
        print >> sys.stderr, "Error opening simstring DBs for reading"
        raise
Пример #9
0
    def __init__(self, path, similarity_name, threshold):
        """Open the ``umls-terms.simstring`` database found in *path*.

        Args:
            path: directory that contains the database file.
            similarity_name: name of the attribute of the ``simstring``
                module to use as the similarity measure.
            threshold: value assigned to the reader's threshold.

        Raises:
            IOError: if *path* is missing or is not a directory.
        """
        # isdir() is already False for a missing path, so one check is enough.
        if not os.path.isdir(path):
            raise IOError(
                '"{}" does not exists or it is not a directory.'.format(path))

        db_file = os.path.join(path, 'umls-terms.simstring')
        self.db = simstring.reader(prepare_string_for_db_input(db_file))
        self.db.measure = getattr(simstring, similarity_name)
        self.db.threshold = threshold
Пример #10
0
 def __init__(self, words, measure=3, n=3, be=True, unicode=True, file="sample.db"):
     """Build a SimString database from *words* and reopen it for querying.

     The database is written to ``./db/<file>``; the ``db`` directory is
     created when missing. The reader is stored in ``self.db``.

     Args:
         words: iterable of strings to insert.
         measure: value assigned to the ``measure`` attribute of the handles.
         n, be, unicode: passed positionally to ``simstring.writer`` after
             the file name.
         file: database file name inside the ``db`` directory.
     """
     import os  # local import: only needed to create the database directory

     self.n = n
     # Create the directory directly instead of spawning a shell for mkdir.
     os.makedirs("db", exist_ok=True)
     db = simstring.writer(f'./db/{file}', n, be, unicode)
     try:
         db.measure = measure
         for w in words:
             db.insert(w)
     finally:
         # Always close the writer, even when an insert fails.
         db.close()
     db = simstring.reader(f"./db/{file}")
     db.measure = measure
     self.db = db
Пример #11
0
def test_matches():
    """
    Smoke test: opens each dictionary database and prints what it
    retrieves for 'london' (cosine measure, threshold 0.6).
    """
    query = u'london'.encode('utf-8')
    for dbname in ('people.db', 'places.db', 'departments.db',
                   'universities.db'):
        db_path = os.path.join(fe_settings.simstringdb_dir, dbname)
        db = simstring.reader(db_path)
        db.measure = simstring.cosine
        db.threshold = 0.6
        print(db.retrieve(query))
Пример #12
0
def ssdb_open(dbname):
    '''
    Opens the simstring DB with the given name and returns its handle.
    Calling close() on the returned handle is left to the caller.
    '''
    __import_simstring()

    try:
        handle = simstring.reader(__ssdb_filename(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise # TODO: raise specific exception
    else:
        return handle
Пример #13
0
def score_string(word_text, dbname):
    """
    Implements the simstring matching.
    Assumes the presence of the simstring databases.

    Opens the database *dbname* from the configured simstring directory and
    returns 1 if *word_text* (UTF-8 encoded) retrieves at least one entry
    under the cosine measure with threshold 0.6, otherwise 0.
    """
    db = simstring.reader(os.path.join(fes.simstringdb_dir, dbname))
    try:
        db.measure = simstring.cosine
        db.threshold = 0.6
        matched = bool(db.retrieve(word_text.encode('utf-8')))
    finally:
        # The reader is opened per call, so release it before returning.
        db.close()
    return 1 if matched else 0
Пример #14
0
def ssdb_open(dbname):
    '''
    Opens the simstring DB with the given name and returns its handle.
    Calling close() on the returned handle is left to the caller.

    Raises NoSimStringError when the simstring module cannot be imported
    and ssdbNotFoundError when opening the DB fails with an IOError.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    try:
        handle = simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
    else:
        return handle
Пример #15
0
def ssdb_open(dbname):
    """Open the simstring DB named *dbname* and return the reader handle.

    Closing the handle with close() is the caller's job.

    Raises:
        NoSimStringError: the simstring module cannot be imported.
        ssdbNotFoundError: opening the DB failed with an IOError.
    """
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    try:
        reader = simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
    return reader
Пример #16
0
def ssdb_open(dbname):
    """Open the simstring DB named *dbname* and return the handle.

    With SIMSTRING_BINARY set, the handle is a ``simstring.reader``.
    Otherwise it is whatever ``SQLite3Database.use`` returns for a database
    built on a character n-gram feature extractor.

    Closing the handle with close() is the caller's job.

    Raises:
        ssdbNotFoundError: opening the DB failed with an IOError.
    """
    __import_simstring()

    try:
        if not SIMSTRING_BINARY:
            fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            return SQLite3Database(fx).use(__ssdb_path(dbname))
        return simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
Пример #17
0
def normalize_org(org_dict, abbr_dict):
    """Normalise organism annotations against the NCBI SimString database.

    Loads the mapping data from the current working directory, matches the
    annotations with a cosine reader, combines the predictions and reshapes
    them into two per-document dictionaries.

    Returns:
        ``(temp_dict, final_dict, all_rank_count)``. ``temp_dict`` keeps
        fields 0, 1, 2, 3 and 5 of each combined annotation. ``final_dict``
        holds ``[anno[0], anno[1], anno[2], 'org', label]`` where the label is
        ``'NCBI taxon:<anno[3]>'``, or ``'organism:<anno[2]>'`` when
        ``anno[3]`` is empty.
    """
    print('load organism mapping files ...')
    data_folder = os.getcwd() + '/'
    (ncbi_symbol_dict, id_rank_dict, tax_tree, all_under_spp,
     lower_rank_map, all_rank_count,
     model_org) = load_normalize_org_mapping(data_folder)

    min_threshold = 0.6
    ss_folder = data_folder + "simstring/"
    ncbi_db = 'new_all_org/new_all_org.db'
    db = simstring.reader(ss_folder + ncbi_db)
    db.measure = simstring.cosine

    pred_tests = match_org_simstring(
        ss_folder, org_dict, ncbi_symbol_dict, db, min_threshold, abbr_dict,
        all_under_spp)
    combined_tests = combine_prediction(
        ss_folder, org_dict, pred_tests, id_rank_dict, tax_tree,
        ncbi_symbol_dict, all_rank_count, model_org, all_under_spp,
        lower_rank_map, abbr_dict)

    final_dict = {}
    temp_dict = {}
    for doc_id, annotations in combined_tests.iteritems():
        for anno in annotations:
            temp_dict.setdefault(doc_id, []).append(
                [anno[0], anno[1], anno[2], anno[3], anno[5]])
            if anno[3] == '':
                label = 'organism:' + anno[2]
            else:
                label = 'NCBI taxon:' + anno[3]
            final_dict.setdefault(doc_id, []).append(
                [anno[0], anno[1], anno[2], 'org', label])
    return temp_dict, final_dict, all_rank_count # combined_tests
Пример #18
0
 def __init__(self,
              directory,
              filename,
              measure=simstring.overlap,
              threshold=0.65,
              mode='write'):
     """Open the SimString database ``directory/filename``.

     With ``mode == 'write'`` a writer is stored in ``self.writer``. Any
     other mode stores a reader in ``self.reader`` and applies *measure* and
     *threshold* to it. The attribute that is not used stays ``None``.
     *measure* and *threshold* are ignored in write mode.

     Raises:
         ValueError: if *filename* does not end with ``.db`` or *directory*
             is not an existing directory.
     """
     if not filename.endswith('.db'):
         raise ValueError(
             "Incorrect file format for Database. Database must end with .db"
         )
     if not os.path.isdir(directory):
         # Report the real cause instead of the file-extension message.
         raise ValueError(
             "Database directory does not exist or is not a directory: %r"
             % (directory,))

     self.writer = None
     self.reader = None
     database_path = os.path.join(directory, filename)
     if mode == 'write':
         self.writer = simstring.writer(database_path)
     else:
         self.reader = simstring.reader(database_path)
         self.reader.measure = measure
         self.reader.threshold = threshold
def chem_match_simstring(ss_folder, annoDict, db_file, umls_chemical_symbol):
    """Match the entities of each annotation against a SimString database.

    Works on a deep copy of *annoDict* and returns that copy. Two empty lists
    are appended to every item, and the matching results are written into
    slots 5 (concepts), 6 (concept ids from *umls_chemical_symbol*) and
    7 (threshold at which each concept was found).

    NOTE(review): slot 7 only exists after the two appends if each incoming
    item already has six elements with a list at index 5 - confirm against
    the caller.

    Results are cached per ``item[2]`` in ``found_dict`` and reused for later
    items with the same key.
    """

    db = simstring.reader(ss_folder + db_file)
    db.measure = simstring.cosine
    pred_dict = copy.deepcopy(annoDict)
    # Cache: item[2] -> the [5:8] result slice computed for its first occurrence.
    found_dict = dict()
    for doc_id, items in annoDict.iteritems():
        for j, item in enumerate(items):
            pred_dict[doc_id][j].append([])
            pred_dict[doc_id][j].append([])
            try:
                # Cache hit: reuse the stored result lists (the same list
                # objects, not copies).
                pred_dict[doc_id][j][5:8] = found_dict[item[2]]
            # NOTE(review): the bare except is used as the cache-miss path
            # (KeyError), but it also swallows every other error raised by
            # the lookup above.
            except:
                for entity in item[4]:
                    db.threshold = 1.0
                    mention = getNormform(entity)
                    # Lower bound for the relaxation depends on mention length:
                    # under 5 chars -> 1.0, 5 to 10 chars -> 0.8, longer -> 0.6.
                    if len(mention) < 5:
                        threshold = 1.0
                    elif len(mention) in range(5, 11):
                        threshold = 0.8
                    else:
                        threshold = 0.6
                    match_concept = db.retrieve(mention)
                    # Lower the threshold in 0.01 steps until something is
                    # retrieved or the length-based bound is reached.
                    while match_concept == () and db.threshold > threshold:
                        db.threshold = db.threshold - 0.01
                        match_concept = db.retrieve(mention)
                    if len(match_concept) != 0:
                        for concept in match_concept:
                            # Record each concept only once per item.
                            if concept not in pred_dict[doc_id][j][5]:
                                pred_dict[doc_id][j][5].append(concept)
                                pred_dict[doc_id][j][7].append(db.threshold)
                                # concepts = socket_check_value(sock, 'umls_chemical_symbol', concept)
                                concepts = umls_chemical_symbol[concept]
                                for concept_id in concepts:
                                    pred_dict[doc_id][j][6].append(concept_id)
                # Only the first result for a given item[2] is kept in the cache.
                found_dict.setdefault(item[2], pred_dict[doc_id][j][5:8])
    return pred_dict
Пример #20
0
A Unicode sample.

We assume that the source code is written in UTF-8 encoding (see the
encoding declaration in line 2). We can use 8-bit strings as they are
with SimString.
"""

import simstring

# Open a SimString database for writing with Unicode mode.
# NOTE(review): the positional arguments are presumably n-gram length 3,
# begin/end marks off, Unicode mode on - confirm against the SimString docs.
db = simstring.writer('sample_unicode.db', 3, False, True)

# Write a single string, and close the database so it can be read back.
db.insert('スパゲティ')
db.close()


# Open the SimString database for reading.
db = simstring.reader('sample_unicode.db')

# Set a similarity measure and threshold.
db.measure = simstring.cosine
db.threshold = 0.6

# Query with a plain string literal. The query has one more character than
# the stored string, so this exercises approximate matching.
print(' '.join(db.retrieve('スパゲティー')))

# Same query, but built by converting a Unicode object into a UTF-8 string.
print(' '.join(db.retrieve(u'スパゲティー'.encode('utf-8'))))

Пример #21
0
# -*- coding:utf-8 -*-
"""
A Unicode sample.

We assume that the source code is written in UTF-8 encoding (see the
encoding declaration in line 2). We can use 8-bit strings as they are
with SimString.
"""

import simstring

# Open a SimString database for writing with Unicode mode.
# NOTE(review): the positional arguments are presumably n-gram length 3,
# begin/end marks off, Unicode mode on - confirm against the SimString docs.
db = simstring.writer('sample_unicode.db', 3, False, True)

# Write a single string, and close the database so it can be read back.
db.insert('スパゲティ')
db.close()

# Open the SimString database for reading.
db = simstring.reader('sample_unicode.db')

# Set a similarity measure and threshold.
db.measure = simstring.cosine
db.threshold = 0.6

# Query with a plain string literal. The query has one more character than
# the stored string, so this exercises approximate matching.
print(' '.join(db.retrieve('スパゲティー')))

# Same query, but built by converting a Unicode object into a UTF-8 string.
print(' '.join(db.retrieve(u'スパゲティー'.encode('utf-8'))))
Пример #22
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#
# Copyright 2014 Martin J Chorley
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import simstring

# Query the names database with cosine similarity at threshold 0.6.
db = simstring.reader('names.db')
db.measure = simstring.cosine
db.threshold = 0.6

# Single-argument print(...) behaves the same as a print statement on
# Python 2 and is valid on Python 3 as well.
print(', '.join(db.retrieve('Ocho Lounge')))

print('\n\n')

# Repeat the same query with Jaccard similarity at the same threshold.
db.measure = simstring.jaccard
db.threshold = 0.6

print(', '.join(db.retrieve('Ocho Lounge')))
Пример #23
0
    for word in f:
        db.insert(word.strip())
db.close()

# Build the university database from the full-name list followed by the
# keyword list, one entry per line.
db = simstring.writer('dicts' + os.sep + 'universities.db')
for source_name in ('university_full.txt', 'university_keywords.txt'):
    with open('dicts' + os.sep + source_name) as f:
        for word in f:
            db.insert(word.strip())
db.close()


# Smoke-test each database with two sample queries. No measure or threshold
# is assigned, so the readers run with whatever defaults they start with.
# NOTE(review): the readers are never closed; each handle is simply dropped
# when `db` is rebound.
db = simstring.reader('dicts' + os.sep + 'people.db')
print("testing person database")
print(db.retrieve('aaron'))
print(db.retrieve('abe'))

db = simstring.reader('dicts' + os.sep + 'places.db')
print("testing place database")
print(db.retrieve('boston'))
print(db.retrieve('china'))

db = simstring.reader('dicts' + os.sep + 'departments.db')
print("testing department database")
print(db.retrieve('medical'))
print(db.retrieve('association'))

# The university database is opened last; its queries are not shown here.
db = simstring.reader('dicts' + os.sep + 'universities.db')
Пример #24
0
#!/usr/bin/env python

import simstring

# Create a SimString database with two person names.
# Build a small SimString database holding two person names.
db = simstring.writer('sample.db')
for full_name in ('Barack Hussein Obama II', 'James Gordon Brown'):
    db.insert(full_name)
db.close()


# Reopen the database, this time for querying.
db = simstring.reader('sample.db')

# Cosine similarity, threshold 0.6: the first two queries are expected to
# match, while the bare surname is too dissimilar to any stored name.
db.measure = simstring.cosine
db.threshold = 0.6
for query in ('Barack Obama', 'Gordon Brown', 'Obama'):
    print(db.retrieve(query))

# Overlap coefficient, threshold 1.0: the bare surname is expected to match.
db.measure = simstring.overlap
db.threshold = 1.0
print(db.retrieve('Obama'))
Пример #25
0
out8 = codecs.getwriter("utf-8")(sys.stdout)


def match(pwd, vocabdb):
    """Print *pwd* and the vocabulary entries it contains, tab-separated.

    The lower-cased password is looked up in *vocabdb*. The retrieved
    entries are decoded from UTF-8, de-duplicated, and kept only when they
    occur inside the lower-cased password. The survivors are written to
    ``out8`` as a JSON list after the password. Returns None.
    """
    pwd_lower = pwd.lower()
    retrieved = vocabdb.retrieve(pwd_lower.encode("utf-8"))
    decoded = [unicode(m, "utf-8") for m in retrieved]
    # Quick first try - only use exact hits
    exact_hits = [m for m in set(decoded) if m in pwd_lower]
    print >> out8, pwd+u"\t"+json.dumps(exact_hits)

if __name__ == "__main__":
    # Command-line driver: look up every password read from stdin.
    parser = argparse.ArgumentParser(
        description=('Look up using a ready-made web-crawled vocabulary. '
                     'Pipe the passwords in on stdin.'))
    parser.add_argument(
        '--db',
        default="simdb/pb34_wf_exc.simdb",
        help=('SimString DB built using index_vocab.sh '
              'Default: %(default)s'))
    args = parser.parse_args()

    vocabdb = simstring.reader(args.db)
    vocabdb.measure = simstring.overlap
    vocabdb.threshold = 1.0

    decode_errors = 0
    for counter, pwd in enumerate(sys.stdin):
        pwd = pwd.strip()
        if pwd:
            try:
                # Some input lines are not valid UTF-8; count and skip them.
                pwd_u = unicode(pwd, "utf-8")
            except UnicodeDecodeError:
                decode_errors += 1
            else:
                matches = match(pwd_u, vocabdb)
Пример #26
0
def read_simstring():
    """Return a reader for the law database (cosine measure, threshold 0.9)."""
    db = simstring.reader('simstring_law/law.db')
    db.measure = simstring.cosine
    db.threshold = 0.9
    return db