Example #1
 def __init__(self, gram_dump_path_input: str, gram_dump_path_output: str):
     self.grammeme_vectorizer_input = GrammemeVectorizer(
         gram_dump_path_input)  # type: GrammemeVectorizer
     self.grammeme_vectorizer_output = GrammemeVectorizer(
         gram_dump_path_output)  # type: GrammemeVectorizer
     self.morph = pymorphy2.MorphAnalyzer()  # type: pymorphy2.MorphAnalyzer
     self.converter = converters.converter('opencorpora-int', 'ud14')
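For orientation, a minimal sketch of how the converter returned by converters.converter is applied in the examples below (the sample word is illustrative; the second argument is optional, as the one-argument calls further down show):

import pymorphy2
from russian_tagsets import converters

to_ud = converters.converter('opencorpora-int', 'ud14')
morph = pymorphy2.MorphAnalyzer()
word = 'человек'  # illustrative word
parse = morph.parse(word)[0]
# The converter maps an OpenCorpora-internal tag string to a
# 'POS Feature=Value|...' string.
ud_tag = to_ud(str(parse.tag), word)
pos, gram = ud_tag.split()[0], ud_tag.split()[1]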
Example #2
 def __init__(self):
     self.grammeme_vectorizer_input = GrammemeVectorizer()
     self.grammeme_vectorizer_output = GrammemeVectorizer()
     self.word_dictionary = WordDictionary()
     self.char_set = set()
     self.morph = MorphAnalyzer() # pyMorphy2
     self.converter = converters.converter('opencorpora-int', 'ud14')
Example #3
def transcribe_words(source_words_list):
    n_words = len(source_words_list)
    n_parts = 100
    part_size = n_words // n_parts
    while (part_size * n_parts) < n_words:
        part_size += 1
    transcriptions = []
    bad_words = []
    to_ud2 = converters.converter('opencorpora-int', 'ud20')
    morph = pymorphy2.MorphAnalyzer()
    accentor = Accentor(exception_for_unknown=True, use_wiki=False)
    g2p = Grapheme2Phoneme(exception_for_nonaccented=True)
    russian_letters = set(
        'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')
    russian_consonants = set('БбВвГгДдЖжЗзЙйКкЛлМмНнПпРрСсТтФфХхЦцЧчШшЩщЪъЬь')
    part_counter = 0
    for word_idx in range(len(source_words_list)):
        cur_word = source_words_list[word_idx].strip().lower()
        err_msg = 'Word {0} is wrong!'.format(word_idx)
        assert len(cur_word) > 0, err_msg + ' It is empty!'
        assert set(cur_word) <= (russian_letters | {'-'}), \
            err_msg + ' "{0}" contains inadmissible characters.'.format(cur_word)
        assert set(cur_word) != {'-'}, err_msg + ' It consists only of hyphens!'
        if (len(cur_word) > 1) and (set(cur_word) <= russian_consonants):
            bad_words.append(cur_word)
        else:
            morpho_variants = set(
                [to_ud2(str(it.tag)) for it in morph.parse(cur_word)])
            try:
                accentuation_variants = []
                for it in morpho_variants:
                    accentuation_variants += accentor.do_accents(
                        [[cur_word, it]])[0]
                variants_of_transcriptions = list(
                    set(
                        filter(
                            lambda it2: len(it2) > 0,
                            map(lambda it: tuple(g2p.word_to_phonemes(it)),
                                accentuation_variants))))
                if len(variants_of_transcriptions) > 0:
                    transcriptions.append(
                        (cur_word, ' '.join(variants_of_transcriptions[0])))
                    if len(variants_of_transcriptions) > 1:
                        for variant_idx in range(
                                1, len(variants_of_transcriptions)):
                            transcriptions.append(
                                ('{0}({1})'.format(cur_word, variant_idx + 1),
                                 ' '.join(
                                     variants_of_transcriptions[variant_idx])))
                else:
                    bad_words.append(cur_word)
            except Exception:
                bad_words.append(cur_word)
        if ((word_idx + 1) % part_size) == 0:
            part_counter += 1
            print('{0:.2%} of words have been processed...'.format(
                part_counter / float(n_parts)))
    if part_counter < n_parts:
        print('100.00% of words have been processed...')
    return transcriptions, bad_words
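A hedged usage sketch for transcribe_words; the word list is illustrative, and Accentor and Grapheme2Phoneme are assumed to be importable from the surrounding project (their import paths are not shown above):

words = ['привет', 'молоко', 'бргд']  # 'бргд' is consonant-only and lands in bad_words
transcriptions, bad_words = transcribe_words(words)
for word, phonemes in transcriptions:
    print('{0} -> {1}'.format(word, phonemes))
print('Unprocessed words:', bad_words)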
Example #4
    def __init__(self, gensim_emb, texts, trees_n=10):
        self.gensim_emb = gensim_emb
        self.morph = pymorphy2.MorphAnalyzer()
        self.tag_conv = converters.converter('opencorpora-int', 'ud20')
        self.tag_cache = {}

        self.id2text = list(sorted(set(texts)))

        textid2tokens = [[
            tok + '_' + self.get_tag(tok) for tok in txt.split(' ')
        ] for txt in self.id2text]
        tokenid2token = [
            tok for tok in sorted(
                set(tok for txt_toks in textid2tokens for tok in txt_toks))
            if tok in self.gensim_emb.vocab
        ]
        token2tokenid = {tok: i for i, tok in enumerate(tokenid2token)}
        self.tokenid2vec = [self.gensim_emb[tok] for tok in tokenid2token]

        self.tokenid2textid = collections.defaultdict(set)
        self.text2tokenid = collections.defaultdict(set)
        for txt_i, txt_toks in enumerate(textid2tokens):
            txt = self.id2text[txt_i]
            for tok in txt_toks:
                tok_id = token2tokenid.get(tok, None)
                if tok_id is not None:
                    self.tokenid2textid[tok_id].add(txt_i)
                    self.text2tokenid[txt].add(tok_id)

        self.vector_idx = annoy.AnnoyIndex(self.gensim_emb.vectors.shape[1],
                                           'angular')
        for tok_i, tok_vec in enumerate(self.tokenid2vec):
            self.vector_idx.add_item(tok_i, tok_vec)
        self.vector_idx.build(trees_n)
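The constructor above relies on a self.get_tag helper that is not shown in the source; a minimal sketch of what it could look like, given the morph, tag_conv and tag_cache attributes (an assumption, not the project's actual code):

    def get_tag(self, token):
        # Cache lookups: the same token tends to repeat across texts.
        tag = self.tag_cache.get(token)
        if tag is None:
            # Take the most probable pymorphy2 parse, convert its tag to
            # UD 2.0 and keep only the POS (the first whitespace field),
            # producing tokens like 'дом_NOUN' for the embedding vocabulary.
            parse = self.morph.parse(token)[0]
            tag = self.tag_conv(str(parse.tag)).split()[0]
            self.tag_cache[token] = tag
        return tag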
Example #5
 def pymorphy_process(input_filename, output_filename):
     """
     Сделать морфоразметку на вход генератору с помощью pymorphy2 и russian-tagsets.
     
     :param input_filename: входной файл - raw текст.
     :param output_filename: выходной файл - разметка.
     """
     morph = pymorphy2.MorphAnalyzer()
     to_ud = converters.converter('opencorpora-int', 'ud14')
     with open(input_filename, "r", encoding="utf-8") as inp:
         with open(output_filename, "w", encoding="utf-8") as out:
             for line in inp:
                 tokens = Tokenizer.tokenize(line)
                 accepted_types = [Token.TokenType.WORD]
                 tokens = [
                     token for token in tokens
                     if token.token_type in accepted_types
                 ]
                 for token in tokens:
                     text = token.text.lower()
                     parse = morph.parse(text)[0]
                     lemma = parse.normal_form
                     ud_tag = to_ud(str(parse.tag), text)
                     pos = ud_tag.split()[0]
                     gram = ud_tag.split()[1]
                     out.write("%s\t%s\t%s\t%s\n" %
                               (text, lemma, pos, gram))
                 out.write("\n")
Example #6
 def __init__(self, save_path: str, load_path: str, max_pymorphy_variants: int = -1, **kwargs) -> None:
     super().__init__(save_path, load_path, **kwargs)
     self.max_pymorphy_variants = max_pymorphy_variants
     self.load()
     self.memorized_word_indexes = dict()
     self.memorized_tag_indexes = dict()
     self.analyzer = MorphAnalyzer()
     self.converter = converters.converter('opencorpora-int', 'ud20')
Example #8
 def __init__(self):
     self.morph = MorphAnalyzer()  # uses pymorphy2
     self.converter = converters.converter('opencorpora-int', 'ud14')
     self.grammeme_vectorizer_input = GrammemeVectorizer()
     self.grammeme_vectorizer_output = GrammemeVectorizer()
     self.word_dictionary = WordDictionary()
     self.char_set = ""
     self.train_model = None
     self.main_model = None
Example #9
 def __init__(self, language: str):
     self.language = language  # type: str
     self.morph = MorphAnalyzer() if language == "ru" else None  # type: MorphAnalyzer
     self.converter = converters.converter('opencorpora-int', 'ud14') if self.language == "ru" else None
     self.grammeme_vectorizer_input = GrammemeVectorizer()  # type: GrammemeVectorizer
     self.grammeme_vectorizer_output = GrammemeVectorizer()  # type: GrammemeVectorizer
     self.word_vocabulary = WordVocabulary()  # type: WordVocabulary
     self.char_set = ""  # type: str
     self.train_model = None  # type: Model
     self.eval_model = None  # type: Model
Example #10
 def __init__(self,
              save_path: Optional[str] = None,
              load_path: Optional[str] = None,
              transform_lemmas=False,
              **kwargs) -> None:
     self.transform_lemmas = transform_lemmas
     self._reset()
     self.analyzer = MorphAnalyzer()
     self.converter = converters.converter("opencorpora-int", "ud20")
     super().__init__(save_path, load_path, **kwargs)
Example #11
 def __init__(self,
              save_path: Optional[str] = None,
              load_path: Optional[str] = None,
              rare_grammeme_penalty: float = 1.0,
              long_lemma_penalty: float = 1.0,
              **kwargs) -> None:
     self.rare_grammeme_penalty = rare_grammeme_penalty
     self.long_lemma_penalty = long_lemma_penalty
     self._reset()
     self.analyzer = MorphAnalyzer()
     self.converter = converters.converter("opencorpora-int", "ud20")
     super().__init__(save_path, load_path, **kwargs)
Example #12
File: loader.py, Project: che1974/rupo
 def __process_line(self, line: str) -> None:
     text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
     self.word_vocabulary.add_word(text)
     self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
     to_ud = converters.converter('opencorpora-int', 'ud14')
     for parse in self.morph.parse(text):
         ud_tag = to_ud(str(parse.tag), text)
         pos = ud_tag.split()[0]
         gram = ud_tag.split()[1].split("|")
         dropped = ["Animacy", "Aspect", "NumType"]
         gram = [
             grammem for grammem in gram
             if sum([drop in grammem for drop in dropped]) == 0
         ]
         gram = "|".join(gram)
         self.grammeme_vectorizer.add_grammemes(pos, gram)
Example #13
 def __init__(self,
              static_dir="../static",
              vectorizer="charwb_vectorizer",
              classifier="genre_classifier"):
     # self.morph_predictor = RNNMorphPredictor(language='ru')
     self.secondary_analyzer = MorphAnalyzer()
     with open(os.path.join(static_dir, "{}.pkl".format(vectorizer)),
               "rb") as inf:
         self.vectorizer = pickle.load(inf)
     with open(os.path.join(static_dir, "{}.pkl".format(classifier)),
               "rb") as inf:
         self.genre_classifier = pickle.load(inf)
     with open(os.path.join(static_dir, "lists.json")) as inf:
         self.dicts = json.load(inf)
     self.latin = re.compile("^[A-Za-z0-9]+$")
     self.digit = re.compile("^[0-9]+$")
     self.pm_to_ud = converters.converter('opencorpora-int', 'ud20')
Example #14
 def __init__(self, file_names: List[str], train_config,
              grammeme_vectorizer_input: GrammemsVectorizer,
              grammeme_vectorizer_output: GrammemsVectorizer,
              endings_vectorizer: EndingsVectorizer, indices: np.array,
              build_config):
     self.file_names = file_names  # type: List[str]
     # Batch parameters.
     self.batch_size = train_config.external_batch_size  # type: int
     self.bucket_borders = train_config.sentence_len_groups  # type: List[Tuple[int]]
     self.buckets = [list() for _ in range(len(self.bucket_borders))]
     self.build_config = build_config
     # Split into data sets.
     self.indices = indices  # type: np.array
     # Prepared dictionaries.
     self.grammeme_vectorizer_input = grammeme_vectorizer_input  # type: GrammemeVectorizer
     self.grammeme_vectorizer_output = grammeme_vectorizer_output  # type: GrammemeVectorizer
     self.endings_vectorizer = endings_vectorizer
     self.morph = MorphAnalyzer()
     self.converter = converters.converter('opencorpora-int', 'ud14')
Example #15
 def __init__(self, file_names: List[str], config: TrainConfig,
              grammeme_vectorizer_input: GrammemeVectorizer,
              grammeme_vectorizer_output: GrammemeVectorizer,
              indices: np.array, word_dictionary: WordDictionary,
              char_set: str, build_config: BuildModelConfig):
     self.file_names = file_names  # type: List[str]
     # Dataset parameters.
     self.training_set_size = config.external_batch_size  # type: int
     self.bucket_borders = config.sentence_len_groups  # type: List[Tuple[int]]
     self.buckets = [list() for _ in range(len(self.bucket_borders))]
     self.build_config = build_config
     self.word_dictionary = word_dictionary
     self.char_set = char_set
     # Split into data sets.
     self.indices = indices  # type: np.array
     # Prepared dictionaries.
     self.grammeme_vectorizer_input = grammeme_vectorizer_input  # type: GrammemeVectorizer
     self.grammeme_vectorizer_output = grammeme_vectorizer_output  # type: GrammemeVectorizer
     self.morph = MorphAnalyzer()  # type: MorphAnalyzer
     self.converter = converters.converter('opencorpora-int', 'ud14')
Example #16
    def __init__(self):
        # Load the configs
        main_model_config_path = MODELS_PATHS["main_model_config"]
        main_model_weights_path = MODELS_PATHS["main_model_weights"]
        gram_dict_input = MODELS_PATHS["gram_input"]
        gram_dict_output = MODELS_PATHS["gram_output"]
        word_dictionary = MODELS_PATHS["word_dictionary"]
        char_set_path = MODELS_PATHS["char_set"]
        build_config = MODELS_PATHS["build_config"]

        self.converter = converters.converter('opencorpora-int', 'ud14')
        self.morph = MorphAnalyzer()  #Pymorphy2

        self.build_config = BuildModelConfig()
        self.build_config.load(build_config)

        self.model = LSTMModel()
        self.model.prepare(gram_dict_input, gram_dict_output, word_dictionary,
                           char_set_path)
        self.model.load_main_model(self.build_config, main_model_config_path,
                                   main_model_weights_path)
Example #17
    def get_sample(sentence: List[str], morph: pymorphy2.MorphAnalyzer,
                   grammeme_vectorizer: GrammemeVectorizer, max_word_len: int):
        """
        Получние признаков для отдельного предложения.
        
        :param sentence: предложение.
        :param morph: морфология.
        :param grammeme_vectorizer: грамматический словарь. 
        :param max_word_len: количество обрабатываемых букв в слове.
        :return: индексы слов, грамматические векторы, индексы символов.
        """
        to_ud = converters.converter('opencorpora-int', 'ud14')
        word_char_vectors = []
        word_gram_vectors = []
        for word in sentence:
            char_indices = np.zeros(max_word_len)
            gram_value_indices = np.zeros(grammeme_vectorizer.grammemes_count())

            # Character indices of the word.
            word_char_indices = [CHAR_SET.index(ch) if ch in CHAR_SET else len(CHAR_SET) for ch in word][:max_word_len]
            char_indices[-min(len(word), max_word_len):] = word_char_indices
            word_char_vectors.append(char_indices)

            # Grammatical vector of the word.
            # Sum all possible parse variants elementwise.
            for parse in morph.parse(word):
                pos, gram = convert_from_opencorpora_tag(to_ud, parse.tag, word)
                gram = process_gram_tag(gram)
                gram_value_indices += np.array(grammeme_vectorizer.get_vector(pos + "#" + gram))
            # Normalize within each category separately.
            sorted_grammemes = sorted(grammeme_vectorizer.all_grammemes.items(), key=lambda x: x[0])
            index = 0
            for category, values in sorted_grammemes:
                mask = gram_value_indices[index:index+len(values)]
                s = sum(mask)
                if s > 0:  # guard against division by zero for empty categories
                    gram_value_indices[index:index+len(values)] = mask / s
                index += len(values)
            word_gram_vectors.append(gram_value_indices)

        return word_gram_vectors, word_char_vectors
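A hedged usage sketch, shown here as if get_sample were a plain function (in the project it is presumably a static method of the enclosing class); morph and a fitted GrammemeVectorizer are assumed to be set up as in the surrounding project:

morph = pymorphy2.MorphAnalyzer()
# grammeme_vectorizer is a fitted GrammemeVectorizer (assumed available).
word_grams, word_chars = get_sample(['мама', 'мыла', 'раму'],
                                    morph=morph,
                                    grammeme_vectorizer=grammeme_vectorizer,
                                    max_word_len=30)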
Example #18
    def __init__(self,
                 language="ru",
                 eval_model_config_path: str=None,
                 eval_model_weights_path: str=None,
                 gram_dict_input: str=None,
                 gram_dict_output: str=None,
                 word_vocabulary: str=None,
                 char_set_path: str=None,
                 build_config: str=None):
        if eval_model_config_path is None:
            eval_model_config_path = MODELS_PATHS[language]["eval_model_config"]
        if eval_model_weights_path is None:
            eval_model_weights_path = MODELS_PATHS[language]["eval_model_weights"]
        if gram_dict_input is None:
            gram_dict_input = MODELS_PATHS[language]["gram_input"]
        if gram_dict_output is None:
            gram_dict_output = MODELS_PATHS[language]["gram_output"]
        if word_vocabulary is None:
            word_vocabulary = MODELS_PATHS[language]["word_vocabulary"]
        if char_set_path is None:
            char_set_path = MODELS_PATHS[language]["char_set"]
        if build_config is None:
            build_config = MODELS_PATHS[language]["build_config"]

        self.language = language
        self.converter = converters.converter('opencorpora-int', 'ud14') if language == "ru" else None
        self.morph = MorphAnalyzer() if language == "ru" else None
        if self.language == "en":
            nltk.download("wordnet")
            nltk.download('averaged_perceptron_tagger')
            nltk.download('universal_tagset')

        self.build_config = BuildModelConfig()
        self.build_config.load(build_config)

        self.model = LSTMMorphoAnalysis(language=language)
        self.model.prepare(gram_dict_input, gram_dict_output, word_vocabulary, char_set_path)
        self.model.load_eval(self.build_config, eval_model_config_path, eval_model_weights_path)
Example #19
 def __get_lemma(self,
                 word: str,
                 pos_tag: str,
                 gram: str,
                 enable_gikrya_normalization: bool = True):
     """
     Получить лемму.
     
     :param word: слово.
     :param pos_tag: часть речи.
     :param gram: граммаическое значение.
     :param enable_gikrya_normalization: использовать ли нормализацию как в корпусе ГИКРЯ.
     :return: лемма.
     """
     if '_' in word:
         return word
     to_ud = converters.converter('opencorpora-int', 'ud14')
     guess = ""
     max_common_tags = 0
     for word_form in self.morph.parse(word):
         word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
             to_ud, word_form.tag, word)
         word_form_gram = process_gram_tag(word_form_gram)
         common_tags_len = len(
             set(word_form_gram.split("|")).intersection(
                 set(gram.split("|"))))
         if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
             max_common_tags = common_tags_len
             guess = word_form
     if guess == "":
         guess = self.morph.parse(word)[0]
     if enable_gikrya_normalization:
         lemma = self.__normalize_for_gikrya(guess)
     else:
         lemma = guess.normal_form
     return lemma
Example #20
import argparse
import logging
import string
from pathlib import Path

import pymystem3

from ufal import udpipe  # pylint: disable=no-name-in-module

from russian_tagsets import converters

mystem = pymystem3.Mystem()
mystem_to_ud20 = converters.converter('mystem', 'ud20')


def tag_with_mystem(sent):
    mystemed_words = mystem.analyze(sent.getText())

    root = udpipe.Word(0)
    udpipe_words = [root]  # not sure, but the first word is ignored

    index = 1
    space_after = False

    for w in mystemed_words:
        form = w['text'].strip()

        # Spaces?
        if not form:
            space_after = True
        else:
Example #21
def main():
    parser = ArgumentParser()
    parser.add_argument('-s', '--src', dest='src_name', type=str, required=True,
                        help='A text file with source dictionary.')
    parser.add_argument('-d', '--dst', dest='dst_name', type=str, required=True,
                        help='A JSON file with destination dictionary.')
    args = parser.parse_args()

    with codecs.open(args.src_name, mode='r', encoding='utf-8', errors='ignore') as fp:
        all_words = sorted(list(set(
            map(
                lambda it5: it5[1],
                filter(
                    lambda it4: check_word(it4[1]) and (int(it4[0]) >= 10),
                    map(
                        lambda it3: it3.lower().strip().split(),
                        filter(lambda it2: len(it2) > 0, map(lambda it1: it1.strip(), fp.readlines()))
                    )
                )
            )
        )))
    print('Number of selected words is {0}.'.format(len(all_words)))
    words_dict = dict()
    morph = pymorphy2.MorphAnalyzer()
    to_ud20 = converters.converter('opencorpora-int', 'ud20')
    engine = Engine(language="ru")
    engine.load(
        os.path.join(os.path.dirname(__file__), 'rupo', 'rupo', 'data', 'stress_models',
                     'stress_ru_LSTM64_dropout0.2_acc99_wer8.h5'),
        os.path.join(os.path.dirname(__file__), 'rupo', 'rupo', 'data', 'dict', 'zaliznyak.txt')
    )
    syllables_of_words = dict()
    counter = 0
    unknown_counter = 0
    for cur_word in all_words:
        if cur_word in syllables_of_words:
            n_syllables = syllables_of_words[cur_word]
        else:
            n_syllables = len(engine.get_word_syllables(cur_word))
            syllables_of_words[cur_word] = n_syllables
        if n_syllables == 0:
            continue
        parsing = morph.parse(cur_word)
        if unknown_word(parsing):
            unknown_counter += 1
        else:
            for it in parsing:
                morphodata = get_morphodata(to_ud20(str(it.tag)))
                if morphodata is None:
                    continue
                if morphodata in words_dict:
                    if n_syllables in words_dict[morphodata]:
                        words_dict[morphodata][n_syllables].add(cur_word)
                    else:
                        words_dict[morphodata][n_syllables] = {cur_word}
                else:
                    words_dict[morphodata] = {n_syllables: {cur_word}}
        counter += 1
        if counter % 10000 == 0:
            print('{0} words have been processed...'.format(counter))
    print('There are {0} unknown words.'.format(unknown_counter))
    for morphodata in words_dict:
        for n_syllables in words_dict[morphodata]:
            words_dict[morphodata][n_syllables] = sorted(list(words_dict[morphodata][n_syllables]))
    with codecs.open(args.dst_name, mode='w', encoding='utf-8', errors='ignore') as fp:
        json.dump(words_dict, fp, ensure_ascii=False, indent=4)
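Assuming the script is saved as prepare_words_dict.py (the file name is not given in the source), it could be invoked as shown below. Each line of the source file is expected to hold a frequency and a word; only words with a frequency of at least 10 pass the filter above.

python prepare_words_dict.py -s source_dict.txt -d prepared_dict.json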
Example #22
# -*- coding: utf-8 -*-

import sys
import argparse
import bz2
import xmltodict
import random

from russian_tagsets import converters

SKIP_DOCS = [
    '4063'  # "Слово о полку Игореве" (The Tale of Igor's Campaign)
]
SPACE_CHARS = [' ', '\t', '\u00a0', '\u200e', '\u200f']
all_docs = []
to_ud = converters.converter('opencorpora-int', 'ud20')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help='OpenCorpora corpus dump file')
    parser.add_argument('-o',
                        '--output',
                        help='Output file prefix',
                        type=str,
                        default='-')
    parser.add_argument('-s',
                        '--seed',
                        help='Random seed',
                        type=int,
                        default=0)
Example #23
from typing import List, Tuple, Set, Optional

from pymorphy2 import MorphAnalyzer
from razdel import tokenize as regex_tokenize
from b_labs_models import Tokenizer, POSTagger
from russian_tagsets import converters

opencorpora_to_ud_convert = converters.converter('opencorpora-int', 'ud14')


class BaseTokenizer:
    '''
    Base tokenizer interface
    '''
    def split(self,
              text: str,
              lemmatize: bool = True) -> List[Tuple[str, Optional[str]]]:
        '''
        Each subclass of BaseTokenizer must implement this method.
        Returns a list of tuples like:
        [('приехать', 'VERB'), ('дом', 'NOUN')]
        '''
        raise NotImplementedError


class RegexTokenizer(BaseTokenizer):
    '''
    Tokenizer based on razdel, one of the most accurate and fast
    tokenizers for Russian-language text. This tokenizer performs
    neither POS tagging nor lemmatization.
    '''
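The listing cuts off before RegexTokenizer.split; a minimal sketch of how it could be completed, based on the razdel API and the BaseTokenizer contract above (an assumption, since the body is not shown):

    def split(self,
              text: str,
              lemmatize: bool = True) -> List[Tuple[str, Optional[str]]]:
        # razdel's tokenize() yields substrings with a .text attribute;
        # no POS tagging or lemmatization is done here, so the tag is None.
        return [(token.text, None) for token in regex_tokenize(text)]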
Example #24
def main():
    if sys.argv.__len__() > 2:
        source_dictionary_name = os.path.normpath(sys.argv[1])
        prepared_dictionary_name = os.path.normpath(sys.argv[2])
        words_and_accents = load_accents_dictionary(source_dictionary_name)
        simple_wordsforms = list()
        homonyms = dict()
        morph = pymorphy2.MorphAnalyzer()
        to_ud20 = converters.converter('opencorpora-int', 'ud20')
        for cur_word in words_and_accents:
            variants_of_accents = sorted(list(words_and_accents[cur_word].keys()))
            if len(variants_of_accents) > 1:
                lemmas = set(words_and_accents[cur_word].values())
                str_width = len(str(len(variants_of_accents)))
                homonyms[cur_word] = {}
                if len(lemmas) == len(variants_of_accents):
                    for ind in range(len(variants_of_accents)):
                        lemma = words_and_accents[cur_word][variants_of_accents[ind]]
                        morpho_variants = morph.parse(cur_word)
                        best_morpho = None
                        for cur_morpho in morpho_variants:
                            if cur_morpho.normal_form == lemma:
                                best_morpho = cur_morpho
                                break
                        if best_morpho is None:
                            best_similarity = 0
                            for cur_morpho in morpho_variants:
                                if cur_morpho.normal_form.startswith(lemma) or lemma.startswith(cur_morpho.normal_form):
                                    cur_similarity = min(len(lemma), len(cur_morpho.normal_form))
                                    if cur_similarity > best_similarity:
                                        best_morpho = cur_morpho
                                        best_similarity = cur_similarity
                        if best_morpho is None:
                            homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
                        else:
                            if str(best_morpho.methods_stack[0][0]) == '<DictionaryAnalyzer>':
                                morpho_tag = to_ud20(str(best_morpho.tag))
                                if morpho_tag in homonyms[cur_word]:
                                    counter = 2
                                    while (morpho_tag + '({0})'.format(counter)) in homonyms[cur_word]:
                                        counter += 1
                                    morpho_tag += '({0})'.format(counter)
                                homonyms[cur_word][morpho_tag] = variants_of_accents[ind]
                            else:
                                homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
                else:
                    if len(lemmas) == 1:
                        lemma_morpho = morph.parse(list(lemmas)[0])[0]
                        if str(lemma_morpho.methods_stack[0][0]) == '<DictionaryAnalyzer>':
                            lexeme_counter = 0
                            for it in lemma_morpho.lexeme:
                                if it.word == cur_word:
                                    lexeme_counter += 1
                            if lexeme_counter == 1:
                                simple_wordsforms.append(variants_of_accents[0])
                                del homonyms[cur_word]
                            else:
                                for ind in range(len(variants_of_accents)):
                                    variant_name = '{0:>0{1}}'.format(ind + 1, str_width)
                                    homonyms[cur_word][variant_name] = variants_of_accents[ind]
                        else:
                            for ind in range(len(variants_of_accents)):
                                homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
                    else:
                        for ind in range(len(variants_of_accents)):
                            homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
            else:
                simple_wordsforms.append(variants_of_accents[0])
        print('`{0}`: dictionary has been loaded from this file...'.format(source_dictionary_name))
        with codecs.open(prepared_dictionary_name, mode='w', encoding='utf-8', errors='ignore') as fp:
            json.dump([homonyms, sorted(simple_wordsforms)], fp, ensure_ascii=False, indent=4, sort_keys=True)
        print('`{0}`: dictionary has been saved into this file...'.format(prepared_dictionary_name))
        print('Number of homonyms is {0}.'.format(len(homonyms)))
    else:
        print("Usage: input_dictionary.txt prepared_dictionary.json")
Example #25
 def __init__(self, n_endings=3, lower=True):
     self.grammeme_vectorizer_input = GrammemsVectorizer()
     self.grammeme_vectorizer_output = GrammemsVectorizer()
     self.endings_vectorizer = EndingsVectorizer(n_endings, lower)
     self.morph = MorphAnalyzer()
     self.converter = converters.converter('opencorpora-int', 'ud14')