Example #1
    def load_dawg_caches(self, correct_cache_fn, correction_indeces_fn,
                        corrections_list_fn):

        self.correct_cache = dawg.DAWG().load(correct_cache_fn)
        self.correction_indeces_cache = \
                dawg.IntDAWG().load(correction_indeces_fn)
        self.corrections_list = cPickle.load(open(corrections_list_fn))
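
A minimal round-trip sketch of the pattern this loader relies on (file name is illustrative): save() writes a DAWG to disk, and load() on a fresh instance returns the populated object itself.

import dawg

correct_cache = dawg.DAWG([u"color", u"colour"])
correct_cache.save("correct_cache.dawg")  # illustrative file name

reloaded = dawg.DAWG().load("correct_cache.dawg")
assert u"colour" in reloaded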
Example #2
def init():
    global surname_list, place_list, org_list, exclude_list
    global place_trie, org_trie, person_trie, custom_trie, ne_trie

    person_trie = load_ne_from_file(nr_file, nr_save, "PER")
    place_trie = load_ne_from_file(ns_file, ns_save, "LOC")
    org_trie = load_ne_from_file(nt_file, nt_save, "ORG")
    custom_trie = load_ne_from_file(custom_file, custom_save, "ALL")

    ne_list = {}
    get_ne(nr_file, "PER", ne_list)
    get_ne(ns_file, "LOC", ne_list)
    get_ne(nt_file, "ORG", ne_list)
    ne_trie = dawg.IntDAWG(zip(ne_list.keys(), ne_list.values()))
    if os.path.exists(surname_save):
        surname_list = cPickle.load(open(surname_save, "r"))
    else:
        f = open(surname_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            surname = seg[0]
            if len(surname) == 1:
                surname_list["single"].append(surname)
            else:
                surname_list["double"].append(surname)
        f.close()
        cPickle.dump(surname_list, open(surname_save, "w"))

    place_list = load_list_from_file(place_file, place_save)
    org_list = load_list_from_file(org_file, org_save)
    exclude_list = load_list_from_file(exclude_file, exclude_save)
Example #3
    def train_unigram(corpus):
        """
        Train the unigram model
        """
        freqs = defaultdict(int)  # Frequency dictionary
        # Read from the corpus file
        with codecs.open(pl(corpus), "r", encoding="UTF8") as fin:
            for line in fin:
                line = line.strip()
                if line == u"<S>":
                    freqs[u"*START*"] += 1
                    continue
                if line == u"</S>":
                    freqs[u"*STOP*"] += 1
                    continue
                if line.strip() == "" or len(line.split("\t")) <= 2:
                    continue
                lem = line.split("\t")[1]
                if lem != "":
                    freqs[line.split("\t")[1]] += 1

        d = dawg.IntDAWG([(word, freq) for word, freq in freqs.iteritems()])
        d.save(pl(corpus + "_freqs_1.dawg"))  # Сериализация

        with open(pl(corpus + "_1_sum.pkl"),
                  "wb") as fout:  # Запоминаем количество слов в корпусе
            pickle.dump(sum(freqs.values()), fout, pickle.HIGHEST_PROTOCOL)
        print corpus, "unigram saved"
        return True
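
A companion lookup sketch (paths are illustrative stand-ins for the pl(corpus + ...) names written above) could read the model back:

import pickle

import dawg

# Illustrative paths; train_unigram writes "<corpus>_freqs_1.dawg"
# and "<corpus>_1_sum.pkl".
freqs = dawg.IntDAWG().load("corpus_freqs_1.dawg")
with open("corpus_1_sum.pkl", "rb") as fin:
    total = pickle.load(fin)

# IntDAWG supports dict-style access and .get() with a default value
p = float(freqs.get(u"lemma", 0)) / total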
Example #4
 def load_from_files(pickle_fn, dawg_fn, prefix_dawg_fn):
     entity_db = cPickle.load(open(pickle_fn, "rb"))
     entity_db.dawg = dawg.IntCompletionDAWG()
     entity_db.dawg.load(dawg_fn)
     entity_db.long_entities = dawg.IntDAWG()
     entity_db.long_entities.load(prefix_dawg_fn)
     return entity_db
Example #5
    def train_collocations(corpus):
        """
        Collect collocations over the corpus (from non-adjacent word bigrams
        not separated by any punctuation marks except dashes).
        (Training of the collocation model)
        """
        bigrams = unpkl_1layered_i(pl(corpus + ".bigrams.pkl"))
        unigrams = unpkl_1layered_i(pl(corpus + ".unigrams.pkl"))
        collocs = defaultdict(int)
        N = sum(unigrams.values())

        for bigram, freq in bigrams.iteritems():
            try:
                collocs[bigram] = MI(freq, unigrams[bigram[0]],
                                     unigrams[bigram[1]], N)
            except Exception:
                print freq, unigrams[bigram[0]], unigrams[bigram[1]], N
                continue

        # Test output to a file
        with codecs.open(pl(corpus + ".collocs"), "w",
                         encoding="UTF8") as fout:
            fout.write("Collocate_1\tCollocate_2\tFreqs\tMI\n")
            for bigram, mi in sorted(collocs.iteritems(),
                                     key=lambda x: x[1],
                                     reverse=True):
                fout.write(u"{0}\t{1:d}\t{2:.4f}\n".format(
                    "\t".join(bigram), bigrams[bigram], mi))

        dcollocs = dawg.IntDAWG([("|".join(bigram), int(freq * 100))
                                 for bigram, freq in collocs.iteritems()])
        dcollocs.save(pl(corpus + ".collocs.dawg"))
        print "Collocations saved"
        return True
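
The MI helper is not included in the snippet; a common pointwise mutual information formulation it might implement (an assumption, not the confirmed original) is:

import math

def MI(pair_freq, freq1, freq2, N):
    # Hypothetical reconstruction: PMI = log2(P(w1, w2) / (P(w1) * P(w2))),
    # estimated from raw counts over a corpus of N tokens.
    return math.log(float(pair_freq) * N / (freq1 * freq2), 2)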
Example #6
def build_test_data():

    dawg.CompletionDAWG(['f', 'bar', 'foo',
                         'foobar']).save('dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    bytes_data = (('foo', b'data1'), ('bar', b'data2'), ('foo', b'data3'),
                  ('foobar', b'data4'))
    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')

    record_data = (('foo', (3, 2, 256)), ('bar', (3, 1, 0)),
                   ('foo', (3, 2, 1)), ('foobar', (6, 3, 0)))
    dawg.RecordDAWG(str(">3H"), record_data).save('dev_data/small/record.dawg')

    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save(
        'dev_data/small/int_completion_dawg.dawg')

    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    dawg.RecordDAWG(str("=H"), [
        (k, (len(k), )) for k in TestPrediction.DATA
    ]).save('dev_data/small/prediction-record.dawg')

    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
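
A sketch of reading one of these files back (assuming build_test_data has run and the dev_data directories exist):

import dawg

d = dawg.IntCompletionDAWG().load('dev_data/small/int_completion_dawg.dawg')
# IntCompletionDAWG supports both value lookup and prefix completion
assert d['foo'] == 1
assert sorted(d.keys('foo')) == ['foo', 'foobar']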
Example #7
def load_ne_from_file(fname, sname, tag):
    if os.path.exists(sname):
        temp_trie = cPickle.load(open(sname, "r"))
    else:
        temp_ne = get_ne(fname, tag)
        temp_trie = dawg.IntDAWG(zip(temp_ne.keys(), temp_ne.values()))
        cPickle.dump(temp_trie, open(sname, "w"))
    return temp_trie
Example #8
 def finalize_long_entities(self):
     logging.info("Creating prefix dawg...")
     self.long_values = {}
     self.long_entities = dawg.IntDAWG(
         (p, self.long_values.setdefault(frozenset(full), 
                                         len(self.long_values)))
         for p, full in self.long_entities.iteritems())
     self.long_values = [k for k, _ in 
                         sorted(self.long_values.iteritems(),
                                key=lambda x: x[1])]
Example #9
    def train_ngram(corpus, n):
        """
        Train the N-gram model (2- or 3-gram)
        """
        if n == 3:
            # Frequency dictionary
            freqs = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        elif n == 2:
            freqs = defaultdict(lambda: defaultdict(int))
        else:
            raise ValueError("Wrong n parameter for N-gram model training!")

        # Read from the corpus file
        with codecs.open(pl(corpus), "r", encoding="UTF8") as fin:
            for line in fin:
                line = line.strip()
                if line == u"<S>":
                    buff = ["*START*"] * n
                    continue
                if line == u"</S>":  # Конец предложения
                    buff.append(u"*STOP*")
                    Synonymizer.add_counts(freqs, buff, n)
                if line == "" or len(line.split("\t")) <= 2:  # Знак препинания
                    continue
                # Accumulate lemma frequencies
                lem = line.split("\t")[1]
                if lem != "":
                    buff.append(lem)

        if n == 3:
            d = dawg.IntDAWG([("|".join((word1, word2, word3)), freq)
                              for word1, v1 in freqs.iteritems()
                              for word2, v2 in v1.iteritems()
                              for word3, freq in v2.iteritems()])
        elif n == 2:
            d = dawg.IntDAWG([("|".join((word1, word2)), freq)
                              for word1, v1 in freqs.iteritems()
                              for word2, freq in v1.iteritems()])

        d.save(pl(corpus + "_freqs_" + str(n) + ".dawg"))  # Serialization
        print corpus, n, "gram saved"
        return True
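
Since the keys are pipe-joined n-grams, lookups mirror the join; a sketch with illustrative paths and words:

import dawg

# Illustrative: train_ngram saves "<corpus>_freqs_3.dawg" for n=3
freqs3 = dawg.IntDAWG().load("corpus_freqs_3.dawg")
trigram = (u"*START*", u"*START*", u"lemma")
count = freqs3.get(u"|".join(trigram), 0)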
Example #10
def test_contains():
    d = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3})

    assert 'foo' in d
    assert 'bar' in d
    assert 'foobar' in d
    assert 'fo' not in d
    assert 'x' not in d

    assert b'foo' in d
    assert b'x' not in d
Example #11
def buildDAWG(filename):

    my_list = []
    with open(filename, 'r') as fp:
        for each in fp:
            line = each.strip().split(' ')
            print(line)
            my_list.append((line[0], int(line[1])))

    my_dawg = dawg.IntDAWG(my_list)

    with open(PINYINDAWG,'wb') as f:
        my_dawg.write(f)
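
write() takes an already-open binary file object; a fresh instance's load() restores the structure from a path:

import dawg

# Counterpart to the write() call above; the path is illustrative,
# standing in for the module's PINYINDAWG constant.
my_dawg = dawg.IntDAWG().load("pinyin.dawg")
count = my_dawg.get(u"some_key", 0)  # hypothetical key, for illustration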
Example #12
def buildDAWG(filename, savename):
    with open(filename, 'r') as fp:
        result = json.load(fp)

    my_dict = {}
    for each in result:
        if "W" in each:
            my_dict.update(result[each])

    my_list = list(my_dict.items())
    my_list.append(("__total__", len(my_dict)))

    my_dawg = dawg.IntDAWG(my_list)

    with open(savename, 'wb') as f:
        my_dawg.write(f)
Example #13
    def train_bayes(corpus, n):
        """
        Train the context model for the naive Bayes classifier
        """
        syns = {}
        with open(pl("total.pkl"),
                  "rb") as fin:  # Подгружаем словарь синонимов
            syns = pickle.load(fin)
        all_syns = set(syns.keys())
        all_syns = all_syns.union(
            set(itertools.chain.from_iterable(syns.values())))  # All synonyms
        contexts = defaultdict(lambda: defaultdict(int))  # Context dictionary
        sentence = []
        # Read from the corpus file and write to the output file
        with codecs.open(pl(corpus), "r", encoding="UTF8") as fin:
            for line in fin:
                line = line.strip()
                if line == u"<S>":
                    sentence = [u"*START*"]
                    continue
                if line == u"</S>":
                    sentence.append(u"*STOP*")
                    Synonymizer.add_contexts(all_syns, contexts, sentence, n)
                    sentence = []
                if line == "" or len(line.split("\t")) <= 2:
                    continue
                sentence.append(line.split("\t")[1])

        d = dawg.IntDAWG([("|".join((word1, word2)), freq)
                          for word1, v1 in contexts.iteritems()
                          for word2, freq in v1.iteritems()])
        d.save(pl(corpus + "_contexts_" + str(n) +
                  ".dawg"))  # Serialize the dictionaries

        print corpus, "bayes saved"
        return True
Example #14
    def get_freqs(self, freq_file_path):

        if self.freq_struct_is_dawg:
            self.freqs = dawg.IntDAWG().load(freq_file_path)
        else:
            self.freqs = cPickle.load(open(freq_file_path))
Example #15
 def __init__(self, mode='one', debug='no'):
     if debug == 'no':
         logging.basicConfig()
     else:
         logging.basicConfig(level=logging.DEBUG)
     self.logger = logging.getLogger()
     self.logger.debug('Setting up the Accentor...')
     self.mode = mode
     self.__all_russian_letters = {
         'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л',
         'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш',
         'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'
     }
     self.__russian_vowels = {
         'а', 'о', 'у', 'э', 'ы', 'и', 'я', 'ё', 'ю', 'е'
     }
     self.__homonyms = None
     self.__simple_words_dawg = None
     self.__function_words = None
     self.__new_homonyms = {}
     self.__new_simple_words = set()
     self.__bad_words = []
     self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
     self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)
     assert mode in ('one',
                     'many'), 'Set either "one" or "many" variant mode!'
     assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'
     homograph_dictionary_name = os.path.join(os.path.dirname(__file__),
                                              'data', 'homographs.json')
     assert os.path.isfile(
         homograph_dictionary_name), 'File `{0}` does not exist!'.format(
             homograph_dictionary_name)
     simple_words_dawg_name = os.path.join(os.path.dirname(__file__),
                                           'data', 'simple_words.dawg')
     assert os.path.isfile(
         simple_words_dawg_name), 'File `{0}` does not exist!'.format(
             simple_words_dawg_name)
     function_words_name = os.path.join(os.path.dirname(__file__), 'data',
                                        'Function_words.json')
     assert os.path.isfile(
         function_words_name), 'File `{0}` does not exist!'.format(
             function_words_name)
     data = None
     try:
         d = dawg.IntDAWG()
         self.__simple_words_dawg = d.load(simple_words_dawg_name)
         ###
         with codecs.open(homograph_dictionary_name,
                          mode='r',
                          encoding='utf-8',
                          errors='ignore') as fp:
             data = json.load(fp)
         error_message_homographs = 'File `{0}` contains incorrect data!'.format(
             homograph_dictionary_name)
         assert isinstance(data, dict), error_message_homographs
         self.__homonyms = dict()
         for cur_wordform in data:
             assert self.check_source_wordform(cur_wordform), \
                 error_message_homographs + ' Word `{0}` is inadmissible!'.format(cur_wordform)
             assert (cur_wordform not in self.__homonyms) and (cur_wordform.lower() not in self.__simple_words_dawg), \
                 error_message_homographs + ' Word `{0}` is repeated!'.format(cur_wordform)
             assert isinstance(data[cur_wordform], dict), \
                 error_message_homographs + ' Word `{0}` has incorrect description of accents!'.format(cur_wordform)
             for cur_key in data[cur_wordform]:
                 assert self.check_morphotag(cur_key), \
                     error_message_homographs + ' Word `{0}` has incorrect description of accents!'.format(cur_wordform)
                 assert self.check_accented_wordform(data[cur_wordform][cur_key]), \
                     error_message_homographs + ' Word `{0}` has incorrect description of accents!'.format(cur_wordform)
             values = [data[cur_wordform][it] for it in data[cur_wordform]]
             self.__homonyms[cur_wordform] = copy.deepcopy(
                 data[cur_wordform])
         ###
         self.__function_words = None
         with codecs.open(function_words_name,
                          mode='r',
                          encoding='utf-8',
                          errors='ignore') as fp:
             function_words = json.load(fp)
         error_message_function_words = 'File `{0}` contains incorrect data!'.format(
             function_words_name)
         assert isinstance(function_words,
                           list), error_message_function_words
         assert isinstance(function_words[0],
                           str), error_message_function_words
         self.__function_words = function_words
     finally:
         if data is not None:
             del data
Example #16
def init():
    global ne_list, surname_list, place_list, org_list, trie, place_trie, org_trie, person_trie
    global kw_list1, kw_list2, rule_list
    if os.path.exists(ne_save):
        ne_list = cPickle.load(open(ne_save, "r"))
    else:
        get_ne(ns_file, "ns")
        get_ne(nr_file, "nr")
        get_ne(nt_file, "nt")
        ne_list = collections.OrderedDict(sorted(ne_list.items()))
        # save NEs:
        cPickle.dump(ne_list, open(ne_save, "w"))

    print len(ne_list.keys())
    trie = dawg.IntDAWG(zip(ne_list.keys(), ne_list.values()))
    # trie = dawg.IntCompletionDAWG(zip(ne_list.keys(), ne_list.values()))
    place_ne = {k: v for k, v in ne_list.items() if v == 1}
    place_trie = dawg.IntDAWG(zip(place_ne.keys(), place_ne.values()))
    org_ne = {k: v for k, v in ne_list.items() if v == 2}
    org_trie = dawg.IntDAWG(zip(org_ne.keys(), org_ne.values()))
    person_ne = {k: v for k, v in ne_list.items() if v == 0}
    person_trie = dawg.IntDAWG(zip(person_ne.keys(), person_ne.values()))
    # load second-phase dicts.
    if os.path.exists(surname_save):
        surname_list = cPickle.load(open(surname_save, "r"))
    else:
        f = open(surname_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            surname = seg[0]
            if len(surname) == 1:
                surname_list["single"].append(surname)
            else:
                surname_list["double"].append(surname)
        f.close()
        cPickle.dump(surname_list, open(surname_save, "w"))

    if os.path.exists(place_save):
        place_list = cPickle.load(open(place_save, "r"))
        # print place_list
    else:
        f = open(place_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            place = seg[0]
            place_list.append(place)
        f.close()
        cPickle.dump(place_list, open(place_save, "w"))

    if os.path.exists(org_save):
        org_list = cPickle.load(open(org_save, "r"))
    else:
        f = open(org_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            org = seg[0]
            org_list.append(org)
        f.close()
        cPickle.dump(org_list, open(org_save, "w"))

    sw = open(stopword_file, "r")
    for line in sw.readlines():
        stopwords_list.append(line.strip())
    sw.close()

    kw = open(kw_file1, "r")
    for line in kw.readlines():
        kw_list1.append(line.strip().decode("utf8"))
    kw.close()

    kw = open(kw_file2, "r")
    for line in kw.readlines():
        kw_list2.append(line.strip().decode("utf8"))
    kw.close()

    return trie
Example #17
 def __enter__(self):
     self.lookup = dawg.IntDAWG()
     self.lookup.load(self.path)
     return self
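
Only __enter__ is shown; a matching __exit__ (a minimal sketch, since DAWG objects need no explicit cleanup) completes the context-manager protocol:

 def __exit__(self, exc_type, exc_value, traceback):
     # Nothing to release explicitly; drop the reference and let any
     # exception propagate (returning False does not suppress it).
     self.lookup = None
     return False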
Example #18
def loaddict(f_index='pyindex.dawg', f_essay='essay.dawg'):
    global essay, p_index
    p_index = dawg.BytesDAWG()
    p_index.load(f_index)
    essay = dawg.IntDAWG()
    essay.load(f_essay)
Example #19
    def __init__(self, morph, params):
        """
        Initialize the synonymizer
        """
        args = set(params)
        self.morph = morph  # pymorphy morphology
        self.UseIdioms = "-i" in args
        self.UseBayes = "-b" in args
        self.UseDisambig = "-disamb" in args
        self.UseNGram = "-n" in args
        self.UseDetails = "-d" in args
        self.UseOnlySyn = "-os" in args
        self.UseViterbi = "-v" in args
        self.UseCollocs = "-col" in args

        # Print the parameters the synonymizer was run with (if requested)
        if self.UseDetails:
            print "<font color='blue'>" + " ".join(params) + "</font><br><br>"

        # "Уровень" подбора синонимов: 1 - Точный, 2 - Средний, 3 - Низкий, с макс. синонимизацией
        Level = 3
        if "-level" in args:
            try:
                Level = int(sys.argv[sys.argv.index("-level") + 1])
            except Exception:
                Level = 3

        # Name of the synonym dictionary file
        if "-dict" in args:
            try:
                synfile = sys.argv[sys.argv.index("-dict") + 1]
            except Exception:
                synfile = "base.pkl"
        else:
            raise IOError("Synonims dictionary not found!")

        # Probability cutoff thresholds, by "level", for the N-gram model
        SMALL_PROBS = {1: 0.5, 2: 0.2, 3: 0}
        # Maximum allowed number of equally ranked synonym variants, by "level"
        VARS_COUNT = {1: 3, 2: 5, 3: 10}

        self.vars_count = VARS_COUNT[Level]
        self.small_prob = SMALL_PROBS[Level]

        (self.samplefile, corpusfile) = params[1:3]

        # Parameter check: exactly one synonymization method must be chosen
        if sum(
                map(bool, (self.UseBayes, self.UseNGram, self.UseViterbi,
                           self.UseCollocs))) != 1:
            raise ValueError(
                "Choose exactly one of the four methods: "
                "Bayesian, N-gram, Viterbi or Collocations!")

        self.actions = {
            self.UseNGram: self.calc_ngram_sent,
            self.UseBayes: self.calc_bayes_sent,
            self.UseViterbi: self.calc_viterbi_sent,
            self.UseCollocs: self.calc_colloc_sent
        }

        # Load the serialized synonym dictionary. Format: {lexeme: set of synonym lexemes}.
        self.syns = unpkl_1layered_sets(pl(synfile))

        # Load the idiom dictionary if needed. Format: a set.
        if self.UseIdioms:
            self.idioms = set()
            with codecs.open(pl("idioms.txt.lemma"), "r",
                             encoding="UTF8") as fin:
                for line in fin:
                    self.idioms.add(line.strip())

        # Load collocations if needed
        if self.UseCollocs:
            self.collocs = dawg.IntDAWG()
            self.collocs.load(pl(corpusfile + ".collocs.dawg"))
            self.posfreqs = unpkl_3layered_f(pl(corpusfile + ".pos.pkl"))

        # Load unigram frequencies and the total word count of the corpus
        if not self.UseCollocs:
            self.freqs = dawg.IntDAWG()
            self.freqs.load(pl(corpusfile + "_freqs_1.dawg"))
            self.f_sum = 0
            with open(pl(corpusfile + "_1_sum.pkl"), "rb") as fin:
                self.f_sum = pickle.load(fin)

        # Load the context list if needed
        if self.UseBayes:
            self.N = 5
            self.contexts = dawg.IntDAWG()
            self.contexts.load(pl(corpusfile + "_contexts_5.dawg"))

        # Load n-gram frequencies if needed
        if self.UseNGram or self.UseViterbi:
            self.N = 3
            (self.freqs2, self.freqs3) = (dawg.IntDAWG(), dawg.IntDAWG())
            self.freqs2.load(pl(corpusfile + "_freqs_2.dawg"))
            self.freqs3.load(pl(corpusfile + "_freqs_3.dawg"))

        if self.UseCollocs:
            self.indexed_wsyns = defaultdict(
                str)  # Dictionary of the form {token index: synonym}
        else:
            self.indexed_syns = defaultdict(
                str)  # Dictionary of the form {token index: synonym}
Example #20
File: speed.py Project: yyht/DAWG
def create_int_dawg():
    words = words100k()
    values = [len(word) for word in words]
    return dawg.IntDAWG(zip(words, values))
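
The same construction pattern on a toy list, to show what the resulting mapping looks like:

import dawg

words = [u"foo", u"foobar"]
d = dawg.IntDAWG(zip(words, [len(w) for w in words]))
assert d[u"foobar"] == 6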
Example #21
    def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki=True):
        """Create an object for adding accents.

        Args:
            mode (str, optional): Accent variant mode, 'one' or 'many'. Defaults to 'one'.
            debug (str, optional): Print extra info useful for debugging. Defaults to 'no'.
            exception_for_unknown (bool, optional): Raise an exception in case of unknown word.
                Defaults to False.
            use_wiki (bool, optional): Look for a word in Wiktionary. Defaults to True.
        """
        if debug == 'no':
            logging.basicConfig()
        else:
            logging.basicConfig(level=logging.DEBUG)
        self.logger = logging.getLogger()
        self.logger.debug('Setting up the Accentor...')
        self.mode = mode
        self.__rus_letters = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
        self.__rus_vowels = set('аоуэыияёюе')
        self.exception_for_unknown = exception_for_unknown
        self.use_wiki = use_wiki
        self.__homonyms, self.__new_homonyms = {}, {}
        self.__simple_words_dawg = None  # maps unicode key -> int index of accented vowel
        self.__function_words = None
        self.__new_simple_words = set()
        self.__bad_words = []  # Too many accent variants, or unknown accent
        self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
        self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)

        assert mode in ('one', 'many'), 'Set either "one" or "many" variant mode!'
        assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'

        homograph_dictionary_name = os.path.join(os.path.dirname(__file__), 'data', 'homographs.json')
        assert os.path.isfile(homograph_dictionary_name), f'File `{homograph_dictionary_name}` does not exist!'
        simple_words_dawg_name = os.path.join(os.path.dirname(__file__), 'data', 'simple_words.dawg')
        assert os.path.isfile(simple_words_dawg_name), f'File `{simple_words_dawg_name}` does not exist!'
        function_words_name = os.path.join(os.path.dirname(__file__), 'data', 'Function_words.json')
        assert os.path.isfile(function_words_name), f'File `{function_words_name}` does not exist!'
        data = None

        try:
            d = dawg.IntDAWG()
            self.__simple_words_dawg = d.load(simple_words_dawg_name)

            with codecs.open(homograph_dictionary_name, mode='r', encoding='utf-8', errors='ignore') as fp:
                data = json.load(fp)
            error_message_homographs = f'File `{homograph_dictionary_name}` contains incorrect data!'
            assert isinstance(data, dict), error_message_homographs

            for cur_wordform in data:
                assert self.check_source_wordform(cur_wordform), \
                    error_message_homographs + f' Word `{cur_wordform}` is inadmissible!'
                assert (cur_wordform not in self.__homonyms) and \
                       (cur_wordform.lower() not in self.__simple_words_dawg), \
                       error_message_homographs + f' Word `{cur_wordform}` is repeated!'
                assert isinstance(data[cur_wordform], dict), \
                    error_message_homographs + \
                    f' Word `{cur_wordform}` has incorrect description of accents!'

                for cur_key in data[cur_wordform]:
                    assert self.check_morphotag(cur_key), \
                        error_message_homographs + \
                        f' Word `{cur_wordform}` has incorrect description of accents!'
                    assert self.check_accented_wordform(data[cur_wordform][cur_key]), \
                        error_message_homographs + \
                        f' Word `{cur_wordform}` has incorrect description of accents!'
                # values = [data[cur_wordform][it] for it in data[cur_wordform]]
                # FIXME variable `values` is unused
                self.__homonyms[cur_wordform] = copy.deepcopy(data[cur_wordform])

            self.__function_words = None
            with codecs.open(function_words_name, mode='r', encoding='utf-8', errors='ignore') as fp:
                function_words = json.load(fp)
            error_message_function_words = f'File `{function_words_name}` contains incorrect data!'
            assert isinstance(function_words, list), error_message_function_words
            assert isinstance(function_words[0], str), error_message_function_words
            self.__function_words = function_words
        finally:
            if data is not None:
                del data