def get_umls_data():
    umls_data = pd.read_csv(umls_df_data_path)
    print(f"Got UMLS data at length {len(umls_data)}")
    acronyms_umls_df = pd.read_csv(acronyms_dir + os.sep + 'acronyms_terms.csv')
    umls_data = pd.concat([umls_data, acronyms_umls_df])
    cuiless_umls_df = pd.read_csv(cuiless_dir + os.sep + 'cuiless_terms.csv')
    umls_data = pd.concat([umls_data, cuiless_umls_df])
    umls_data.reset_index(inplace=True)

    heb_umls_list = list(umls_data['HEB'].values)
    eng_umls_list = list(umls_data[STRING_COLUMN].values)

    heb_db = DictDatabase(CharacterNgramFeatureExtractor(2))
    eng_db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for heb_w in heb_umls_list:
        heb_db.add(heb_w)
    for eng_w in eng_umls_list:
        lower_eng_w = eng_w.lower()
        eng_db.add(lower_eng_w)
    return heb_db, eng_db, umls_data
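# A minimal usage sketch (not part of the original source): querying the
# databases returned by get_umls_data() with a Searcher. The CosineMeasure,
# the 0.85 threshold, and the query term are illustrative assumptions; the
# query is lowercased because the English terms were lowercased on insert.
from simstring.measure.cosine import CosineMeasure
from simstring.searcher import Searcher

heb_db, eng_db, umls_data = get_umls_data()
eng_searcher = Searcher(eng_db, CosineMeasure())
matches = eng_searcher.search('diabetes mellitus'.lower(), 0.85)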
def test_features(self):
    self.assertEqual(
        CharacterNgramFeatureExtractor().features('abcde'),
        ['$a_1', 'ab_1', 'bc_1', 'cd_1', 'de_1', 'e$_1'])
    self.assertEqual(
        CharacterNgramFeatureExtractor(3).features('abcde'),
        ['$$a_1', '$ab_1', 'abc_1', 'bcd_1', 'cde_1', 'de$_1', 'e$$_1'])
    # Japanese input
    self.assertEqual(
        CharacterNgramFeatureExtractor().features(u'あいうえお'),
        ['$あ_1', 'あい_1', 'いう_1', 'うえ_1', 'えお_1', 'お$_1'])
def output_similar_strings_of_each_line(path, measure):
    strings = []
    with open(path, "r") as lines:
        for line in lines:
            strings.append(line.rstrip("\r\n"))

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in strings:
        db.add(string)
    db.save("companies.db")

    dbl = DictDatabase.load("companies.db")
    searcher = Searcher(dbl, measure)

    profiler.start()
    for string in strings:
        result = searcher.search(string, 0.8)
        # result = [str(np.round(x[0], 5)) + ' ' + x[1]
        #           for x in searcher.ranked_search(string, 0.8)]
        # print("\t".join([string, ",".join(result)]))
    profiler.stop()
    profiler.print()
    profiler.open_in_browser()
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    """Given a list of strings, a DB name, and simstring options,
    builds a simstring DB for the strings."""
    __import_simstring()
    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert include_marks == False, "Error: begin/end marks not supported"
        if SIMSTRING_BINARY:
            assert ngram_length == 3, "Error: unsupported n-gram length"
            db = simstring.writer(dbfn)
            for s in strs:
                db.insert(s)
            db.close()
        else:
            fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            db = SQLite3Database(fx)
            db.use(dbfn)
            for s in strs:
                db.add(s)
    except BaseException:
        print("Error building simstring DB", file=sys.stderr)
        raise
    return dbfn
def create_umls_ss_db(umls_kb, char_ngram_len=3, n_max_tokens=5):
    logging.info('Loading scispacy ...')
    import spacy
    # module-level function, so a local variable is used (the original
    # referenced an undefined self here)
    sci_nlp = spacy.load('en_core_sci_md', disable=['tagger', 'parser', 'ner'])

    simstring_db = DictDatabase(CharacterNgramFeatureExtractor(char_ngram_len))

    # preprocessing aliases and labels
    logging.info('Preprocessing aliases ...')
    alias_mapping = defaultdict(set)
    aliases = []
    for cui in umls_kb.get_all_cuis():
        cui_aliases = set(
            [a.lower() for a in umls_kb.get_aliases(cui, include_name=True)])
        for alias in cui_aliases:
            alias_chars = set(alias)
            if len(alias_chars.intersection(fb_punctuation)) > 0:
                continue
            elif alias in en_stopwords:
                continue
            elif alias.isnumeric():
                continue
            # use same tokenizer as when splitting medmentions
            alias_doc = sci_nlp(alias)
            if len(alias_doc) > n_max_tokens:  # gets too big without restrictions
                continue
            alias_mapping[alias].add(cui)
            aliases.append(alias)

    logging.info('Adding to DB ...')
    for alias_idx, alias in enumerate(aliases):
        simstring_db.add(alias)
        if alias_idx % 1000000 == 0:
            logging.info('At %d/%d ...' % (alias_idx, len(aliases)))

    # setting paths
    db_path = '%s.aliases.%dgram.%dtoks.db' % (umls_kb.umls_version,
                                               char_ngram_len, n_max_tokens)
    map_path = '%s.aliases.%dtoks.map' % (umls_kb.umls_version, n_max_tokens)

    logging.info('Storing DB ...')
    with open(db_path, 'wb') as f:
        pickle.dump(simstring_db, f)

    logging.info('Storing Alias Mapping ...')
    with open(map_path, 'wb') as f:
        alias_mapping = dict(alias_mapping)
        pickle.dump(alias_mapping, f)
def __init__(self, word2vec):
    self.w2v = word2vec
    self.embedding_dim = self.w2v.vector_size
    self.vocab = set(self.w2v.vocab.keys())
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for vocab_word in self.vocab:
        self.db.add(vocab_word)
def setUp(self) -> None:
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    db.add("foo")
    db.add("bar")
    db.add("fooo")
    db.add("food")
    db.add("fool")
    db.add("follow")
    self.searcher = Searcher(db, JaccardMeasure())
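# A minimal sketch (not from the original test suite) of a test that could
# follow the setUp above; the 0.6 threshold and the expected hits are
# illustrative assumptions.
def test_search(self) -> None:
    results = self.searcher.search("foo", 0.6)
    self.assertIn("foo", results)      # exact match always passes
    self.assertNotIn("bar", results)   # shares no character bigrams with "foo"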
class GESSyntacticFeatureExtractor(BaseFeatureExtractor):

    def __init__(self, n_chars, n_words, special_words):
        self.n_chars = n_chars
        self.special_words = special_words
        self.__char_feature_extractor = CharacterNgramFeatureExtractor(n_chars)
        if not isinstance(n_words, list):
            self.__word_feature_extractors = [WordNgramFeatureExtractor(n_words)]
        else:
            self.__word_feature_extractors = [
                WordNgramFeatureExtractor(n) for n in n_words
            ]

    def features(self, string):
        # lowercase and strip accents
        normalized_string = string.lower()
        normalized_string = unidecode(normalized_string)
        # remove punctuation symbols
        normalized_string = re.sub(r'[,;()/+ -]+', ' ', normalized_string)
        # drop age-related phrases that carry no information:
        # 'en personas de' ("in people of")
        normalized_string = re.sub(r'en +personas +(de|desde)?', ' ', normalized_string)
        # 'mayores de XX anos y mas' ("older than XX years and over")
        normalized_string = re.sub(r'(mayor(es)?( *de)?)? *\d+ +anos?( +y +mas)?', ' ', normalized_string)
        # 'desde XX anos y menores de XX anos' / 'menores de XX anos' ("under XX years")
        normalized_string = re.sub(r'(desde +\d+ +anos? +y)? *menores +de +\d+ +anos?', ' ', normalized_string)
        # 'de XX a XX anos' ("from XX to XX years")
        normalized_string = re.sub(r'de +\d+ a +\d+ anos?', ' ', normalized_string)
        # remove ' y ' ("and")
        normalized_string = re.sub(' y ', ' ', normalized_string)
        # collapse extra whitespace
        normalized_string = re.sub(' +', ' ', normalized_string)
        normalized_string = normalized_string.strip()

        # character n-gram features
        char_features = self.__char_feature_extractor.features(normalized_string)

        # word n-gram features
        word_features = []
        for extractor in self.__word_feature_extractors:
            word_features += extractor.features(normalized_string)

        # special-word features
        special_features = []
        for word in self.special_words:
            re_word = f'(^{word})|( +{word} +)|({word}$)'
            if re.search(re_word, normalized_string):
                special_features.append(word)

        return char_features + word_features + special_features
def simstring_database(umls, nchar_val):
    db = DictDatabase(CharacterNgramFeatureExtractor(nchar_val))
    term_to_cui = dict()
    for value in umls:
        try:
            data = value.split('\t')
            cui = data[0]
            term = data[1].lower()
            db.add(term)
            term_to_cui[term] = cui
        except Exception:
            # skip malformed lines that lack both a CUI and a term
            continue
    with open('db.pickle', 'wb') as f:
        pickle.dump(db, f)
    with open('term_to_cui.pickle', 'wb') as f:
        pickle.dump(term_to_cui, f)
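# A minimal usage sketch (not part of the original source): load the pickled
# database and mapping back and run a fuzzy lookup. The CosineMeasure, the
# 0.8 threshold, and the query string are illustrative assumptions.
import pickle

from simstring.measure.cosine import CosineMeasure
from simstring.searcher import Searcher

with open('db.pickle', 'rb') as f:
    db = pickle.load(f)
with open('term_to_cui.pickle', 'rb') as f:
    term_to_cui = pickle.load(f)

searcher = Searcher(db, CosineMeasure())
for term in searcher.search('myocardial infarction', 0.8):
    print(term, term_to_cui[term])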
def output_similar_strings_of_each_line(path):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as lines:
        for line in lines:
            string = line.rstrip('\r\n')
            db.add(string)

    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as lines:
        for line in lines:
            string = line.rstrip('\r\n')
            result = [
                str(round(x[0], 5)) + ' ' + x[1]
                for x in searcher.ranked_search(string, 0.8)
            ]
            print("\t".join([string, ",".join(result)]))
def ssdb_open(dbname):
    """Given a DB name, opens it as a simstring DB and returns the handle.

    The caller is responsible for invoking close() on the handle.
    """
    __import_simstring()
    try:
        if SIMSTRING_BINARY:
            return simstring.reader(__ssdb_path(dbname))
        else:
            fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            db = SQLite3Database(fx)
            return db.use(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
def construct_ontology(ontology_data):
    '''
    Create an n-char simstring database and term-to-code mapping
    to enable rapid ontology querying
    '''
    database = DictDatabase(CharacterNgramFeatureExtractor(2))

    term_to_cui = {}
    for entry in ontology_data:
        entry_values = entry.split('\t')
        if len(entry_values) == 2:
            term = clean_selected_term(entry_values[1])
            term_to_cui[term] = entry_values[0].strip()

    for term in term_to_cui.keys():
        term = clean_selected_term(term)
        database.add(term)

    return database, term_to_cui
def output_similar_strings_of_each_line(path, Database):
    with open(path) as f:
        number_of_lines = len(f.readlines())

    with Benchmarker(width=20) as bench:
        db = Database(CharacterNgramFeatureExtractor(2))

        @bench("initialize database({0} lines)".format(number_of_lines))
        def _(bm):
            with open(path, 'r') as lines:
                for line in lines:
                    string = line.rstrip('\r\n')
                    db.add(string)

        @bench("search text({0} times)".format(
            min(number_of_lines, SEARCH_COUNT_LIMIT)))
        def _(bm):
            searcher = Searcher(db, CosineMeasure())
            with open(path, 'r') as lines:
                for i, line in enumerate(lines):
                    if i >= SEARCH_COUNT_LIMIT:
                        break
                    string = line.rstrip('\r\n')
                    result = searcher.search(string, 0.8)
def __init__(self,
             from_dir: str = None,
             json_path: str = "mapper.json",
             umls_words: Iterable[str] = None):
    # self.db = DictDatabase(WordNgramFeatureExtractor(2))
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    if from_dir:
        json_path = os.path.join(from_dir, json_path)
        if os.path.exists(json_path):
            print(f"initialize {self.__class__.__name__}... Load json")
            self.umls_dict, self.umls_reverse_dict = self.load_from_json(json_path)
            self.add_words_to_db(self.umls_dict.keys())
        else:
            print(f"initialize {self.__class__.__name__}... Load dir")
            self.umls_dict, self.umls_reverse_dict = self.load_umls_dict(from_dir)
            self.add_words_to_db(self.umls_dict.keys())
            self.save_as_json(path=json_path)
    else:
        self.add_words_to_db(umls_words)
def setUp(self):
    self.db = MongoDatabase(CharacterNgramFeatureExtractor(2),
                            database='simstring-test')
    self.db.reset_collection()
    for string in self.strings:
        self.db.add(string)
def test_features(self):
    self.assertEqual(CharacterNgramFeatureExtractor().features('abcde'),
                     [' a', 'ab', 'bc', 'cd', 'de', 'e '])
    self.assertEqual(CharacterNgramFeatureExtractor(3).features('abcde'),
                     [' ab', 'abc', 'bcd', 'cde', 'de '])
    self.assertEqual(CharacterNgramFeatureExtractor(4).features('abcde'),
                     [' abc', 'abcd', 'bcde', 'cde '])
    self.assertEqual(CharacterNgramFeatureExtractor(5).features('abcde'),
                     [' abcd', 'abcde', 'bcde '])
    # Japanese input
    self.assertEqual(CharacterNgramFeatureExtractor().features(u'あいうえお'),
                     [' あ', 'あい', 'いう', 'うえ', 'えお', 'お '])
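# Illustrative sketch (not part of the original tests): the padded bigrams
# above are the inputs to simstring-style similarity measures. Computing the
# standard simstring cosine, |X ∩ Y| / sqrt(|X| * |Y|), by hand for 'abcde'
# vs. 'abcdf' shows how feature overlap maps to a score.
x = set(CharacterNgramFeatureExtractor().features('abcde'))  # {' a','ab','bc','cd','de','e '}
y = set(CharacterNgramFeatureExtractor().features('abcdf'))  # {' a','ab','bc','cd','df','f '}
cosine = len(x & y) / (len(x) ** 0.5 * len(y) ** 0.5)  # 4 / 6 ≈ 0.667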
def setUp(self):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in self.strings:
        db.add(string)
    self.searcher = Searcher(db, CosineMeasure())
def setUp(self):
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in self.strings:
        self.db.add(string)
#############################
# Perform database cleaning #
#############################

# Read in branded foods CSV and clean it
df = pd.read_csv('branded_food.csv')
all_ingredients_final = get_cleaned_ingredients_list(df)

# Count every ingredient; used by the Peter Norvig implementation below
ingredients_count = Counter(all_ingredients_final)

##############################################
# Peter Norvig SimString Implementation Code #
##############################################

# Populate database with all ingredients
db = DictDatabase(CharacterNgramFeatureExtractor(2))
for ingredient in all_ingredients_final:
    db.add(ingredient)

# Create searcher object to be used by the candidates function
searcher = Searcher(db, CosineMeasure())


# Functions
def probability(word, N=sum(ingredients_count.values())):
    """
    Return the probability of the word appearing in the text.

    Usually, correctly spelled words have a higher count (and therefore
    probability) than their misspellings.
    """
    return ingredients_count[word] / N
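# Hypothetical sketch of the candidates function referenced in the comment
# above; the original implementation is not shown, so the 0.8 threshold, the
# fallback to the word itself, and the correction helper are assumptions in
# the spirit of Norvig's spelling corrector.
def candidates(word, threshold=0.8):
    """Return ingredient strings whose character-bigram cosine similarity
    to word is at least threshold, falling back to the word itself."""
    return searcher.search(word, threshold) or [word]


def correction(word):
    """Pick the most probable candidate as the corrected spelling."""
    return max(candidates(word), key=probability)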
def make_change_image_dict(drink_names):
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    ff = open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig")
    json_data2 = json.load(ff)
    ff.close()

    # lists of strings whose similarity will be compared against each other
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db = {re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]):
              d["drinks"][0]["strDrinkThumb"] for d in json_data2}
    TCD_name_db = list(TCD_db.keys())

    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)

    for str2 in TCD_name_db:
        result_dict[str2] = {}
        searcher = Searcher(db, CosineMeasure())
        i = 1.0
        # compute similarity; results are returned in the range 0.0-1.0
        flag = False
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if str1 in result_dict[str2]:
                    d = result_dict[str2][str1]
                    # update the running average
                    d = [(d[0] * d[1] + s) / (d[1] + 1), d[1] + 1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s, 1])
        temp = []
        while i >= 0.65:
            result = searcher.search(str2, i)
            if len(result):
                flag = True
                for str1 in result:
                    if str1 in temp:
                        continue
                    temp += [str1]
                    if str1 in result_dict[str2]:
                        d = result_dict[str2][str1]
                        # update the running average
                        d = [(d[0] * d[1] + i) / (d[1] + 1), d[1] + 1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i, 1])
            i -= 0.001
        if flag:
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> " + str2, file=f)
            print(">> " + str2)
            M = 0.0
            name = ""
            for key, value_list in result_dict[str2].items():
                if M < value_list[0]:
                    name = key
                    M = value_list[0]
            print(" " + name + ": " + str(M), file=f)
            if M != 0:
                if M >= 0.76:
                    print(" " + name + ": " + str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print(" " + name + ": " + str(M))
                    print("out", file=f)
                    print("out")
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length))
    return change_image_dict
def main(argv):
    arg = argparser().parse_args(argv[1:])

    # only simstring library default supported at the moment (TODO)
    if SIMSTRING_BINARY:
        assert DEFAULT_NGRAM_LENGTH == 3, "Error: unsupported n-gram length"
        assert DEFAULT_INCLUDE_MARKS == False, "Error: begin/end marks not supported"

    infn = arg.file

    if arg.database is None:
        # default database file name
        bn = splitext(basename(infn))[0]
        sqldbfn = sqldb_filename(bn)
        ssdbfn = ssdb_filename(bn)
    else:
        sqldbfn = arg.database + '.' + SQL_DB_FILENAME_EXTENSION
        ssdbfn = arg.database + '.' + SS_DB_FILENAME_EXTENSION

    if arg.verbose:
        print("Storing SQL DB as %s and" % sqldbfn, file=sys.stderr)
        print(" simstring DB as %s" % ssdbfn, file=sys.stderr)
    start_time = datetime.now()

    import_count, duplicate_count, error_count, simstring_count = 0, 0, 0, 0

    with codecs.open(infn, 'rU', encoding=arg.encoding) as inf:

        # create SQL DB
        try:
            connection = sqlite.connect(sqldbfn)
        except sqlite.OperationalError as e:
            print("Error connecting to DB %s:" % sqldbfn, e, file=sys.stderr)
            return 1
        cursor = connection.cursor()

        # create SQL tables
        if arg.verbose:
            print("Creating tables ...", end=' ', file=sys.stderr)

        for command in CREATE_TABLE_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating %s:" % sqldbfn, e, "(DB exists?)", file=sys.stderr)
                return 1

        # import data
        if arg.verbose:
            print("done.", file=sys.stderr)
            print("Importing data ...", end=' ', file=sys.stderr)

        next_eid = 1
        label_id = {}
        next_lid = 1
        next_pid = dict([(t, 1) for t in TYPE_VALUES])

        for i, l in enumerate(inf):
            l = l.rstrip('\n')

            # parse line into ID and TYPE:LABEL:STRING triples
            try:
                id_, rest = l.split('\t', 1)
            except ValueError:
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated fields, got '%s'" %
                          (i + 1, l), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # parse TYPE:LABEL:STRING triples
            try:
                triples = []
                for triple in rest.split('\t'):
                    type_, label, string = triple.split(':', 2)
                    if type_ not in TYPE_VALUES:
                        print("Unknown TYPE %s" % type_, file=sys.stderr)
                    triples.append((type_, label, string))
            except ValueError:
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated TYPE:LABEL:STRING triples, got '%s'" %
                          (i + 1, rest), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # insert entity
            eid = next_eid
            next_eid += 1
            try:
                cursor.execute("INSERT into entities VALUES (?, ?)", (eid, id_))
            except sqlite.IntegrityError as e:
                if error_count < MAX_ERROR_LINES:
                    print("Error inserting %s (skipping): %s" % (id_, e), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)",
                          file=sys.stderr)
                error_count += 1
                continue

            # insert new labels (if any)
            labels = set([l for t, l, s in triples])
            new_labels = [l for l in labels if l not in label_id]
            for label in new_labels:
                lid = next_lid
                next_lid += 1
                cursor.execute("INSERT into labels VALUES (?, ?)", (lid, label))
                label_id[label] = lid

            # insert associated strings
            for type_, label, string in triples:
                table = TABLE_FOR_TYPE[type_]
                pid = next_pid[type_]
                next_pid[type_] += 1
                lid = label_id[label]
                # TODO
                if TABLE_HAS_NORMVALUE[table]:
                    normstring = string_norm_form(string)
                    cursor.execute("INSERT into %s VALUES (?, ?, ?, ?, ?)" % table,
                                   (pid, eid, lid, string, normstring))
                else:
                    cursor.execute("INSERT into %s VALUES (?, ?, ?, ?)" % table,
                                   (pid, eid, lid, string))

            import_count += 1

            if arg.verbose and (i + 1) % 10000 == 0:
                print('.', end=' ', file=sys.stderr)

        if arg.verbose:
            print("done.", file=sys.stderr)

        # create SQL indices
        if arg.verbose:
            print("Creating indices ...", end=' ', file=sys.stderr)

        for command in CREATE_INDEX_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating index", e, file=sys.stderr)
                return 1

        if arg.verbose:
            print("done.", file=sys.stderr)

        # wrap up SQL table creation
        connection.commit()

        # create simstring DB
        if arg.verbose:
            print("Creating simstring DB ...", end=' ', file=sys.stderr)

        try:
            if SIMSTRING_BINARY:
                ssdb = simstring.writer(ssdbfn)
                for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                    s = row[0]
                    ssdb.insert(s)
                    simstring_count += 1
                ssdb.close()
            else:
                sys.path.append(join(dirname(abspath(__file__)), '..', 'server', 'src'))
                from simstring_pure_sqlite3 import SQLite3Database
                from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
                from simstring.searcher import Searcher

                fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
                ssdb = SQLite3Database(fx)
                ssdb.use(connection)
                for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                    s = row[0]
                    ssdb.add(s)
                    simstring_count += 1
        except BaseException:
            print("Error building simstring DB", file=sys.stderr)
            raise

        if arg.verbose:
            print("done.", file=sys.stderr)

        cursor.close()

    # done
    delta = datetime.now() - start_time
    if arg.verbose:
        print(file=sys.stderr)
        print("Done in:", str(delta.seconds) + "." + str(delta.microseconds / 10000),
              "seconds", file=sys.stderr)
    print("Done, imported %d entries (%d strings), skipped %d duplicate keys, skipped %d invalid lines" %
          (import_count, simstring_count, duplicate_count, error_count))

    return 0