def __init__(self, gram_dump_path_input: str, gram_dump_path_output: str):
    """Restore input/output grammeme vectorizers from their dump files.

    :param gram_dump_path_input: path to the serialized input grammeme vectorizer.
    :param gram_dump_path_output: path to the serialized output grammeme vectorizer.
    """
    # pymorphy2 analyzer plus an OpenCorpora -> UD 1.4 tag converter.
    self.morph = pymorphy2.MorphAnalyzer()  # type: pymorphy2.MorphAnalyzer
    self.converter = converters.converter('opencorpora-int', 'ud14')
    # Vectorizers are deserialized from the given dump paths.
    self.grammeme_vectorizer_input = GrammemeVectorizer(gram_dump_path_input)  # type: GrammemeVectorizer
    self.grammeme_vectorizer_output = GrammemeVectorizer(gram_dump_path_output)  # type: GrammemeVectorizer
def __init__(self):
    """Initialize empty vocabularies and analyzers for building a model from scratch."""
    # pymorphy2 analyzer and an OpenCorpora -> UD 1.4 tag converter.
    self.morph = MorphAnalyzer()
    self.converter = converters.converter('opencorpora-int', 'ud14')
    # These are populated while the corpus is processed.
    self.grammeme_vectorizer_input = GrammemeVectorizer()
    self.grammeme_vectorizer_output = GrammemeVectorizer()
    self.word_dictionary = WordDictionary()
    self.char_set = set()
def transcribe_words(source_words_list):
    """Phonetically transcribe a list of Russian words.

    Words that cannot be transcribed (consonant-only forms, accentuation or
    grapheme-to-phoneme failures) are collected separately.

    :param source_words_list: list of words (Russian letters and hyphens only).
    :return: tuple (transcriptions, bad_words) where transcriptions is a list of
        (word, space-joined phonemes) pairs — extra pronunciation variants get a
        "word(2)", "word(3)", ... key — and bad_words is a list of failed words.
    """
    n_words = len(source_words_list)
    n_parts = 100
    # Ceiling division so that part_size * n_parts >= n_words.
    part_size = n_words // n_parts
    while (part_size * n_parts) < n_words:
        part_size += 1
    transcriptions = []
    bad_words = []
    to_ud2 = converters.converter('opencorpora-int', 'ud20')
    morph = pymorphy2.MorphAnalyzer()
    accentor = Accentor(exception_for_unknown=True, use_wiki=False)
    g2p = Grapheme2Phoneme(exception_for_nonaccented=True)
    russian_letters = set(
        'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')
    russian_consonants = set('БбВвГгДдЖжЗзЙйКкЛлМмНнПпРрСсТтФфХхЦцЧчШшЩщЪъЬь')
    part_counter = 0
    for word_idx, source_word in enumerate(source_words_list):
        cur_word = source_word.strip().lower()
        err_msg = 'Word {0} is wrong!'.format(word_idx)
        assert len(cur_word) > 0, err_msg + ' It is empty!'
        assert set(cur_word) <= (russian_letters | {'-'}), \
            err_msg + ' "{0}" contains an inadmissible characters.'.format(cur_word)
        assert set(cur_word) != {'-'}, err_msg + ' It is empty!'
        if (len(cur_word) > 1) and (set(cur_word) <= russian_consonants):
            # Consonant-only multi-letter strings are abbreviations, not words.
            bad_words.append(cur_word)
        else:
            morpho_variants = {to_ud2(str(it.tag)) for it in morph.parse(cur_word)}
            try:
                accentuation_variants = []
                for it in morpho_variants:
                    accentuation_variants += accentor.do_accents([[cur_word, it]])[0]
                variants_of_transcriptions = list(set(filter(
                    lambda it2: len(it2) > 0,
                    map(lambda it: tuple(g2p.word_to_phonemes(it)),
                        accentuation_variants))))
                if len(variants_of_transcriptions) > 0:
                    transcriptions.append(
                        (cur_word, ' '.join(variants_of_transcriptions[0])))
                    # range() is empty when there is a single variant, so the
                    # original `len(...) > 1` guard is unnecessary.
                    for variant_idx in range(1, len(variants_of_transcriptions)):
                        transcriptions.append(
                            ('{0}({1})'.format(cur_word, variant_idx + 1),
                             ' '.join(variants_of_transcriptions[variant_idx])))
                else:
                    bad_words.append(cur_word)
            # FIX: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
            except Exception:
                bad_words.append(cur_word)
        if ((word_idx + 1) % part_size) == 0:
            part_counter += 1
            print('{0:.2%} of words have been processed...'.format(
                part_counter / float(n_parts)))
    if part_counter < n_parts:
        print('100.00% of words have been processed...')
    return transcriptions, bad_words
def __init__(self, gensim_emb, texts, trees_n=10):
    """Build token/text inverted indexes and an Annoy ANN index over embeddings.

    :param gensim_emb: gensim keyed-vectors object (has .vocab and .vectors).
    :param texts: iterable of texts; deduplicated and sorted for determinism.
    :param trees_n: number of trees for the Annoy index (accuracy/speed trade-off).
    """
    self.gensim_emb = gensim_emb
    self.morph = pymorphy2.MorphAnalyzer()
    self.tag_conv = converters.converter('opencorpora-int', 'ud20')
    self.tag_cache = {}
    # Deduplicated, deterministically ordered corpus.
    self.id2text = list(sorted(set(texts)))
    # Tokens are suffixed with their POS tag ("word_TAG") to match the embedding vocab.
    textid2tokens = [[
        tok + '_' + self.get_tag(tok) for tok in txt.split(' ')
    ] for txt in self.id2text]
    # Keep only tokens present in the embedding vocabulary.
    tokenid2token = [
        tok for tok in sorted(
            set(tok for txt_toks in textid2tokens for tok in txt_toks))
        if tok in self.gensim_emb.vocab
    ]
    token2tokenid = {tok: i for i, tok in enumerate(tokenid2token)}
    self.tokenid2vec = [self.gensim_emb[tok] for tok in tokenid2token]
    # Inverted indexes: token id -> set of text ids, text -> set of token ids.
    self.tokenid2textid = collections.defaultdict(set)
    self.text2tokenid = collections.defaultdict(set)
    for txt_i, txt_toks in enumerate(textid2tokens):
        txt = self.id2text[txt_i]
        for tok in txt_toks:
            tok_id = token2tokenid.get(tok, None)
            if tok_id is not None:
                self.tokenid2textid[tok_id].add(txt_i)
                self.text2tokenid[txt].add(tok_id)
    # Approximate nearest-neighbour index over token vectors (angular metric).
    self.vector_idx = annoy.AnnoyIndex(self.gensim_emb.vectors.shape[1],
                                       'angular')
    for tok_i, tok_vec in enumerate(self.tokenid2vec):
        self.vector_idx.add_item(tok_i, tok_vec)
    self.vector_idx.build(trees_n)
def pymorphy_process(input_filename, output_filename):
    """
    Produce morphological markup (for the generator input) using pymorphy2
    and russian-tagsets.

    :param input_filename: input file — raw text.
    :param output_filename: output file — the markup.
    """
    morph = pymorphy2.MorphAnalyzer()
    to_ud = converters.converter('opencorpora-int', 'ud14')
    with open(input_filename, "r", encoding="utf-8") as inp:
        with open(output_filename, "w", encoding="utf-8") as out:
            for line in inp:
                tokens = Tokenizer.tokenize(line)
                # Keep word tokens only (drops punctuation, numbers, etc.).
                accepted_types = [Token.TokenType.WORD]
                tokens = [
                    token for token in tokens
                    if token.token_type in accepted_types
                ]
                for token in tokens:
                    text = token.text.lower()
                    # Only the first (most probable) pymorphy2 parse is used.
                    parse = morph.parse(text)[0]
                    lemma = parse.normal_form
                    ud_tag = to_ud(str(parse.tag), text)
                    pos = ud_tag.split()[0]
                    gram = ud_tag.split()[1]
                    out.write("%s\t%s\t%s\t%s\n" % (text, lemma, pos, gram))
                # Blank line separates the markup for consecutive input lines.
                out.write("\n")
def __init__(self, save_path: str, load_path: str,
             max_pymorphy_variants: int = -1, **kwargs) -> None:
    """Initialize the serializable component and its morphology helpers.

    :param save_path: path used by the base class for saving.
    :param load_path: path used by the base class for loading.
    :param max_pymorphy_variants: cap on pymorphy2 parse variants
        (-1 presumably means "no cap" — confirm with callers).
    """
    super().__init__(save_path, load_path, **kwargs)
    self.max_pymorphy_variants = max_pymorphy_variants
    self.load()
    # Memoization caches for word/tag index lookups.
    self.memorized_word_indexes = {}
    self.memorized_tag_indexes = {}
    # pymorphy2 analyzer + OpenCorpora -> UD 2.0 tag converter.
    self.analyzer = MorphAnalyzer()
    self.converter = converters.converter('opencorpora-int', 'ud20')
def __init__(self, save_path: str, load_path: str,
             max_pymorphy_variants: int = -1, **kwargs) -> None:
    """Initialize the serializable component and its morphology helpers.

    :param save_path: path used by the base class for saving.
    :param load_path: path used by the base class for loading.
    :param max_pymorphy_variants: cap on pymorphy2 parse variants
        (-1 presumably means "no cap" — confirm with callers).
    """
    super().__init__(save_path, load_path, **kwargs)
    self.max_pymorphy_variants = max_pymorphy_variants
    self.load()
    # Memoization caches for word/tag index lookups.
    self.memorized_word_indexes = {}
    self.memorized_tag_indexes = {}
    # pymorphy2 analyzer + OpenCorpora -> UD 2.0 tag converter.
    self.analyzer = MorphAnalyzer()
    self.converter = converters.converter('opencorpora-int', 'ud20')
def __init__(self):
    """Create an untrained state: empty vectorizers/vocabulary and no models."""
    # pymorphy2 analyzer and an OpenCorpora -> UD 1.4 tag converter.
    self.morph = MorphAnalyzer()
    self.converter = converters.converter('opencorpora-int', 'ud14')
    # Filled during corpus processing.
    self.grammeme_vectorizer_input = GrammemeVectorizer()
    self.grammeme_vectorizer_output = GrammemeVectorizer()
    self.word_dictionary = WordDictionary()
    self.char_set = ""
    # Models are created elsewhere; left unset here.
    self.train_model = None
    self.main_model = None
def __init__(self, language: str):
    """Set up per-language resources; morphology exists for Russian only."""
    self.language = language  # type: str
    # pymorphy2 + tag converter are Russian-specific.
    if language == "ru":
        self.morph = MorphAnalyzer()  # type: MorphAnalyzer
        self.converter = converters.converter('opencorpora-int', 'ud14')
    else:
        self.morph = None
        self.converter = None
    self.grammeme_vectorizer_input = GrammemeVectorizer()  # type: GrammemeVectorizer
    self.grammeme_vectorizer_output = GrammemeVectorizer()  # type: GrammemeVectorizer
    self.word_vocabulary = WordVocabulary()  # type: WordVocabulary
    self.char_set = ""  # type: str
    # Models are created elsewhere; left unset here.
    self.train_model = None  # type: Model
    self.eval_model = None  # type: Model
def __init__(self, save_path: Optional[str] = None,
             load_path: Optional[str] = None,
             transform_lemmas=False, **kwargs) -> None:
    # Flag controlling extra lemma transformation — exact semantics live in the
    # methods that read it; confirm there (not visible from this block).
    self.transform_lemmas = transform_lemmas
    self._reset()
    # pymorphy2 analyzer + OpenCorpora -> UD 2.0 tag converter.
    self.analyzer = MorphAnalyzer()
    self.converter = converters.converter("opencorpora-int", "ud20")
    # NOTE(review): base-class init runs last — presumably so any load() hook
    # sees the analyzer/converter already set up; confirm before reordering.
    super().__init__(save_path, load_path, **kwargs)
def __init__(self, save_path: Optional[str] = None,
             load_path: Optional[str] = None,
             rare_grammeme_penalty: float = 1.0,
             long_lemma_penalty: float = 1.0, **kwargs) -> None:
    # Scoring penalties — exact semantics live in the methods that read them;
    # confirm there (not visible from this block).
    self.rare_grammeme_penalty = rare_grammeme_penalty
    self.long_lemma_penalty = long_lemma_penalty
    self._reset()
    # pymorphy2 analyzer + OpenCorpora -> UD 2.0 tag converter.
    self.analyzer = MorphAnalyzer()
    self.converter = converters.converter("opencorpora-int", "ud20")
    # NOTE(review): base-class init runs last — presumably so any load() hook
    # sees the analyzer/converter already set up; confirm before reordering.
    super().__init__(save_path, load_path, **kwargs)
def __process_line(self, line: str) -> None:
    """Register one tab-separated markup line (text, lemma, POS, grammemes).

    The line's own tag is added as-is; additionally every pymorphy2 parse of
    the word is converted to UD 1.4 and added, with unstable categories
    stripped, so the vectorizer covers all plausible analyses.

    :param line: "text\\tlemma\\tpos\\tgrammemes..." markup line.
    """
    text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
    self.word_vocabulary.add_word(text)
    self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
    to_ud = converters.converter('opencorpora-int', 'ud14')
    # Categories dropped from pymorphy2-derived tags; hoisted out of the loop
    # (it was rebuilt on every parse iteration).
    dropped = ("Animacy", "Aspect", "NumType")
    for parse in self.morph.parse(text):
        ud_tag = to_ud(str(parse.tag), text)
        pos = ud_tag.split()[0]
        gram = ud_tag.split()[1].split("|")
        # FIX idiom: `not any(...)` instead of summing a list of booleans.
        gram = [
            grammem for grammem in gram
            if not any(drop in grammem for drop in dropped)
        ]
        self.grammeme_vectorizer.add_grammemes(pos, "|".join(gram))
def __init__(self, static_dir="../static", vectorizer="charwb_vectorizer",
             classifier="genre_classifier"):
    """Load the pickled vectorizer/classifier and word lists for genre prediction.

    :param static_dir: directory with the pickled models and lists.json.
    :param vectorizer: base name of the vectorizer pickle (without .pkl).
    :param classifier: base name of the classifier pickle (without .pkl).
    """
    # self.morph_predictor = RNNMorphPredictor(language='ru')
    self.secondary_analyzer = MorphAnalyzer()
    # NOTE(review): pickle.load on these files is unsafe if static_dir is
    # ever user-controlled — confirm these artifacts are trusted.
    with open(os.path.join(static_dir, "{}.pkl".format(vectorizer)),
              "rb") as inf:
        self.vectorizer = pickle.load(inf)
    with open(os.path.join(static_dir, "{}.pkl".format(classifier)),
              "rb") as inf:
        self.genre_classifier = pickle.load(inf)
    with open(os.path.join(static_dir, "lists.json")) as inf:
        self.dicts = json.load(inf)
    # Regexes classifying latin-alphanumeric and purely numeric tokens.
    self.latin = re.compile("^[A-Za-z0-9]+$")
    self.digit = re.compile("^[0-9]+$")
    # OpenCorpora -> UD 2.0 tag converter.
    self.pm_to_ud = converters.converter('opencorpora-int', 'ud20')
def __init__(self, file_names: List[str], train_config, grammeme_vectorizer_input: GrammemsVectorizer, grammeme_vectorizer_output: GrammemsVectorizer, endings_vectorizer: EndingsVectorizer, indices: np.array, build_config): self.file_names = file_names # type: List[str] # Параметры батчей. self.batch_size = train_config.external_batch_size # type: int self.bucket_borders = train_config.sentence_len_groups # type: List[Tuple[int]] self.buckets = [list() for _ in range(len(self.bucket_borders))] self.build_config = build_config # Разбиение на выборки. self.indices = indices # type: np.array # Подготовленные словари. self.grammeme_vectorizer_input = grammeme_vectorizer_input # type: GrammemeVectorizer self.grammeme_vectorizer_output = grammeme_vectorizer_output # type: GrammemeVectorizer self.endings_vectorizer = endings_vectorizer self.morph = MorphAnalyzer() self.converter = converters.converter('opencorpora-int', 'ud14')
def __init__(self, file_names: List[str], config: TrainConfig, grammeme_vectorizer_input: GrammemeVectorizer, grammeme_vectorizer_output: GrammemeVectorizer, indices: np.array, word_dictionary: WordDictionary, char_set: str, build_config: BuildModelConfig): self.file_names = file_names # type: List[str] # Параметры наборов. self.training_set_size = config.external_batch_size # type: int self.bucket_borders = config.sentence_len_groups # type: List[Tuple[int]] self.buckets = [list() for _ in range(len(self.bucket_borders))] self.build_config = build_config self.word_dictionary = word_dictionary self.char_set = char_set # Разбиение на выборки. self.indices = indices # type: np.array # Подготовленные словари. self.grammeme_vectorizer_input = grammeme_vectorizer_input # type: GrammemeVectorizer self.grammeme_vectorizer_output = grammeme_vectorizer_output # type: GrammemeVectorizer self.morph = MorphAnalyzer() # type: MorphAnalyzer self.converter = converters.converter('opencorpora-int', 'ud14')
def __init__(self):
    # Resolve model artifact paths from the global configuration.
    main_model_config_path = MODELS_PATHS["main_model_config"]
    main_model_weights_path = MODELS_PATHS["main_model_weights"]
    gram_dict_input = MODELS_PATHS["gram_input"]
    gram_dict_output = MODELS_PATHS["gram_output"]
    word_dictionary = MODELS_PATHS["word_dictionary"]
    char_set_path = MODELS_PATHS["char_set"]
    build_config = MODELS_PATHS["build_config"]
    # OpenCorpora -> UD 1.4 tag converter.
    self.converter = converters.converter('opencorpora-int', 'ud14')
    self.morph = MorphAnalyzer()  # pymorphy2
    self.build_config = BuildModelConfig()
    self.build_config.load(build_config)
    # Prepare the LSTM model and load the trained main model.
    self.model = LSTMModel()
    self.model.prepare(gram_dict_input, gram_dict_output, word_dictionary,
                       char_set_path)
    self.model.load_main_model(self.build_config, main_model_config_path,
                               main_model_weights_path)
def get_sample(sentence: List[str], morph: pymorphy2.MorphAnalyzer,
               grammeme_vectorizer: GrammemeVectorizer, max_word_len: int):
    """
    Build features for a single sentence.

    :param sentence: the sentence (list of words).
    :param morph: morphology (pymorphy2 analyzer).
    :param grammeme_vectorizer: grammatical dictionary.
    :param max_word_len: number of characters of each word to process.
    :return: per-word grammeme vectors and per-word character index vectors.
    """
    to_ud = converters.converter('opencorpora-int', 'ud14')
    # Category layout is a property of the vectorizer, not of the word:
    # hoisted out of the per-word loop (was recomputed for every word).
    sorted_grammemes = sorted(grammeme_vectorizer.all_grammemes.items(),
                              key=lambda x: x[0])
    word_char_vectors = []
    word_gram_vectors = []
    for word in sentence:
        char_indices = np.zeros(max_word_len)
        gram_value_indices = np.zeros(grammeme_vectorizer.grammemes_count())
        # Character indices, right-aligned; unknown chars map to len(CHAR_SET).
        word_char_indices = [CHAR_SET.index(ch) if ch in CHAR_SET else len(CHAR_SET)
                             for ch in word][:max_word_len]
        char_indices[-min(len(word), max_word_len):] = word_char_indices
        word_char_vectors.append(char_indices)
        # Grammeme vector of the word: sum all parse variants element-wise.
        for parse in morph.parse(word):
            pos, gram = convert_from_opencorpora_tag(to_ud, parse.tag, word)
            gram = process_gram_tag(gram)
            gram_value_indices += np.array(
                grammeme_vectorizer.get_vector(pos + "#" + gram))
        # Normalize each grammatical category independently.
        index = 0
        for category, values in sorted_grammemes:
            mask = gram_value_indices[index:index + len(values)]
            s = sum(mask)
            # FIX: a category with zero total previously produced NaNs via
            # division by zero; leave the all-zero slice untouched instead.
            if s > 0:
                gram_value_indices[index:index + len(values)] = mask / s
            index += len(values)
        word_gram_vectors.append(gram_value_indices)
    return word_gram_vectors, word_char_vectors
def __init__(self, language="ru",
             eval_model_config_path: str = None,
             eval_model_weights_path: str = None,
             gram_dict_input: str = None,
             gram_dict_output: str = None,
             word_vocabulary: str = None,
             char_set_path: str = None,
             build_config: str = None):
    """Load a trained evaluation model for the given language.

    Any path left as None falls back to the bundled per-language default
    from MODELS_PATHS.
    """
    if eval_model_config_path is None:
        eval_model_config_path = MODELS_PATHS[language]["eval_model_config"]
    if eval_model_weights_path is None:
        eval_model_weights_path = MODELS_PATHS[language]["eval_model_weights"]
    if gram_dict_input is None:
        gram_dict_input = MODELS_PATHS[language]["gram_input"]
    if gram_dict_output is None:
        gram_dict_output = MODELS_PATHS[language]["gram_output"]
    if word_vocabulary is None:
        word_vocabulary = MODELS_PATHS[language]["word_vocabulary"]
    if char_set_path is None:
        char_set_path = MODELS_PATHS[language]["char_set"]
    if build_config is None:
        build_config = MODELS_PATHS[language]["build_config"]
    self.language = language
    # pymorphy2 + the tag converter are Russian-specific.
    self.converter = converters.converter('opencorpora-int', 'ud14') if language == "ru" else None
    self.morph = MorphAnalyzer() if language == "ru" else None
    # English relies on NLTK resources (downloaded here if missing).
    if self.language == "en":
        nltk.download("wordnet")
        nltk.download('averaged_perceptron_tagger')
        nltk.download('universal_tagset')
    self.build_config = BuildModelConfig()
    self.build_config.load(build_config)
    # Prepare the LSTM model and load the trained evaluation weights.
    self.model = LSTMMorphoAnalysis(language=language)
    self.model.prepare(gram_dict_input, gram_dict_output, word_vocabulary,
                       char_set_path)
    self.model.load_eval(self.build_config, eval_model_config_path,
                         eval_model_weights_path)
def __get_lemma(self, word: str, pos_tag: str, gram: str,
                enable_gikrya_normalization: bool = True):
    """
    Get the lemma of a word.

    Chooses the pymorphy2 parse whose UD tag shares the most grammemes with
    the given tag (same POS required); falls back to the first parse.

    :param word: the word.
    :param pos_tag: part of speech.
    :param gram: grammatical value.
    :param enable_gikrya_normalization: whether to normalize as in the GIKRYA corpus.
    :return: the lemma.
    """
    if '_' in word:
        return word
    to_ud = converters.converter('opencorpora-int', 'ud14')
    # FIX: parse once and reuse — the fallback below previously re-parsed the word.
    parses = self.morph.parse(word)
    # Hoisted: the target grammeme set is loop-invariant.
    target_grams = set(gram.split("|"))
    # FIX: use None as the "not found" sentinel instead of comparing a
    # pymorphy2 Parse object against the empty string.
    guess = None
    max_common_tags = 0
    for word_form in parses:
        word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
            to_ud, word_form.tag, word)
        word_form_gram = process_gram_tag(word_form_gram)
        common_tags_len = len(
            set(word_form_gram.split("|")).intersection(target_grams))
        if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
            max_common_tags = common_tags_len
            guess = word_form
    if guess is None:
        guess = parses[0]
    if enable_gikrya_normalization:
        lemma = self.__normalize_for_gikrya(guess)
    else:
        lemma = guess.normal_form
    return lemma
import argparse import logging import string from pathlib import Path import pymystem3 from ufal import udpipe # pylint: disable=no-name-in-module from russian_tagsets import converters mystem = pymystem3.Mystem() mystem_to_ud20 = converters.converter('mystem', 'ud20') def tag_with_mystem(sent): mystemed_words = mystem.analyze(sent.getText()) root = udpipe.Word(0) udpipe_words = [root] # не уверен, но первое слово игнориться index = 1 space_after = False for w in mystemed_words: form = w['text'].strip() # Spaces? if not form: space_after = True else:
def main():
    parser = ArgumentParser()
    parser.add_argument('-s', '--src', dest='src_name', type=str,
                        required=True,
                        help='A text file with source dictionary.')
    parser.add_argument('-d', '--dst', dest='dst_name', type=str,
                        required=True,
                        help='A JSON file with destination dictionary.')
    args = parser.parse_args()
    # Source file format: "<frequency> <word>" per line. Keep words that pass
    # check_word() and occur at least 10 times; dedupe and sort.
    with codecs.open(args.src_name, mode='r', encoding='utf-8',
                     errors='ignore') as fp:
        all_words = sorted(list(set(
            map(
                lambda it5: it5[1],
                filter(
                    lambda it4: check_word(it4[1]) and (int(it4[0]) >= 10),
                    map(
                        lambda it3: it3.lower().strip().split(),
                        filter(lambda it2: len(it2) > 0,
                               map(lambda it1: it1.strip(), fp.readlines()))
                    )
                )
            )
        )))
    print('Number of selected words is {0}.'.format(len(all_words)))
    words_dict = dict()
    morph = pymorphy2.MorphAnalyzer()
    to_ud20 = converters.converter('opencorpora-int', 'ud20')
    # Stress/syllable engine from the bundled rupo models.
    engine = Engine(language="ru")
    engine.load(
        os.path.join(os.path.dirname(__file__), 'rupo', 'rupo', 'data',
                     'stress_models',
                     'stress_ru_LSTM64_dropout0.2_acc99_wer8.h5'),
        os.path.join(os.path.dirname(__file__), 'rupo', 'rupo', 'data',
                     'dict', 'zaliznyak.txt')
    )
    # Cache of word -> syllable count (get_word_syllables is expensive).
    syllables_of_words = dict()
    counter = 0
    unknown_counter = 0
    for cur_word in all_words:
        if cur_word in syllables_of_words:
            n_syllables = syllables_of_words[cur_word]
        else:
            n_syllables = len(engine.get_word_syllables(cur_word))
            syllables_of_words[cur_word] = n_syllables
        if n_syllables == 0:
            continue
        parsing = morph.parse(cur_word)
        if unknown_word(parsing):
            unknown_counter += 1
        else:
            # Index the word under every (morphodata, syllable count) pair.
            for it in parsing:
                morphodata = get_morphodata(to_ud20(str(it.tag)))
                if morphodata is None:
                    continue
                if morphodata in words_dict:
                    if n_syllables in words_dict[morphodata]:
                        words_dict[morphodata][n_syllables].add(cur_word)
                    else:
                        words_dict[morphodata][n_syllables] = {cur_word}
                else:
                    words_dict[morphodata] = {n_syllables: {cur_word}}
        # NOTE(review): flat source; the progress counter is assumed to be
        # per-word (loop level), which only affects the progress message.
        counter += 1
        if counter % 10000 == 0:
            print('{0} words have been processed...'.format(counter))
    print('There are {0} unknown words.'.format(unknown_counter))
    # Sets -> sorted lists so the structure is JSON-serializable.
    for morphodata in words_dict:
        for n_syllables in words_dict[morphodata]:
            words_dict[morphodata][n_syllables] = sorted(
                list(words_dict[morphodata][n_syllables]))
    with codecs.open(args.dst_name, mode='w', encoding='utf-8',
                     errors='ignore') as fp:
        json.dump(words_dict, fp, ensure_ascii=False, indent=4)
# -*- coding: utf-8 -*- import sys import argparse import bz2 import xmltodict import random from russian_tagsets import converters SKIP_DOCS = [ '4063' # "Слово о полку Игореве" ] SPACE_CHARS = [' ', '\t', '\u00a0', '\u200e', '\u200f'] all_docs = [] to_ud = converters.converter('opencorpora-int', 'ud20') def main(): parser = argparse.ArgumentParser() parser.add_argument('-i', '--input', help='OpenCorpora corpus dump file') parser.add_argument('-o', '--output', help='Output file prefix', type=str, default='-') parser.add_argument('-s', '--seed', help='Random seed', type=int, default=0)
from typing import List, Tuple, Set, Optional
from pymorphy2 import MorphAnalyzer
from razdel import tokenize as regex_tokenize
from b_labs_models import Tokenizer, POSTagger
from russian_tagsets import converters

# OpenCorpora -> Universal Dependencies 1.4 tag converter shared by tokenizers.
opencorpora_to_ud_convert = converters.converter('opencorpora-int', 'ud14')


class BaseTokenizer:
    '''
    Base tokenizer interface
    '''

    def split(self, text: str, lemmatize: bool = True) -> List[Tuple[str, Optional[str]]]:
        '''
        Each subclass of BaseTokenizer must implement this method

        Returns list with tuples like that one:
        [('приехать', 'VERB'), ('дом', 'NOUN')]
        '''
        raise NotImplementedError


class RegexTokenizer(BaseTokenizer):
    '''
    Tokenizer based on one of the most accurate and fast tokenizers
    for russian language text - razdel.
    This tokenizer doesn't perform POS tagging and lemmatization.
    '''
def main():
    if sys.argv.__len__() > 2:
        source_dictionary_name = os.path.normpath(sys.argv[1])
        prepared_dictionary_name = os.path.normpath(sys.argv[2])
        words_and_accents = load_accents_dictionary(source_dictionary_name)
        simple_wordsforms = list()
        homonyms = dict()
        morph = pymorphy2.MorphAnalyzer()
        to_ud20 = converters.converter('opencorpora-int', 'ud20')
        for cur_word in words_and_accents:
            variants_of_accents = sorted(list(words_and_accents[cur_word].keys()))
            if len(variants_of_accents) > 1:
                # Several accent variants: the word is a (potential) homonym.
                lemmas = set(words_and_accents[cur_word].values())
                str_width = len(str(len(variants_of_accents)))
                homonyms[cur_word] = {}
                if len(lemmas) == len(variants_of_accents):
                    # One lemma per accent variant: try to key each variant by
                    # the UD tag of the pymorphy2 parse matching that lemma.
                    for ind in range(len(variants_of_accents)):
                        lemma = words_and_accents[cur_word][variants_of_accents[ind]]
                        morpho_variants = morph.parse(cur_word)
                        best_morpho = None
                        for cur_morpho in morpho_variants:
                            if cur_morpho.normal_form == lemma:
                                best_morpho = cur_morpho
                                break
                        if best_morpho is None:
                            # No exact lemma match: fall back to the longest
                            # common prefix between lemma and normal_form.
                            best_similarity = 0
                            for cur_morpho in morpho_variants:
                                if cur_morpho.normal_form.startswith(lemma) or lemma.startswith(cur_morpho.normal_form):
                                    cur_similarity = min(len(lemma), len(cur_morpho.normal_form))
                                    if cur_similarity > best_similarity:
                                        best_morpho = cur_morpho
                                        best_similarity = cur_similarity
                        if best_morpho is None:
                            # Still nothing: key by a zero-padded ordinal.
                            homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
                        else:
                            if str(best_morpho.methods_stack[0][0]) == '<DictionaryAnalyzer>':
                                # Dictionary-backed parse: key by its UD 2.0 tag,
                                # disambiguating duplicates with "(2)", "(3)", ...
                                morpho_tag = to_ud20(str(best_morpho.tag))
                                if morpho_tag in homonyms[cur_word]:
                                    counter = 2
                                    while (morpho_tag + '({0})'.format(counter)) in homonyms[cur_word]:
                                        counter += 1
                                    morpho_tag += '({0})'.format(counter)
                                homonyms[cur_word][morpho_tag] = variants_of_accents[ind]
                            else:
                                homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
                else:
                    if len(lemmas) == 1:
                        # All variants share one lemma: if its lexeme contains
                        # the word form exactly once, the homonymy is spurious.
                        lemma_morpho = morph.parse(list(lemmas)[0])[0]
                        if str(lemma_morpho.methods_stack[0][0]) == '<DictionaryAnalyzer>':
                            lexeme_counter = 0
                            for it in lemma_morpho.lexeme:
                                if it.word == cur_word:
                                    lexeme_counter += 1
                            if lexeme_counter == 1:
                                simple_wordsforms.append(variants_of_accents[0])
                                del homonyms[cur_word]
                            else:
                                for ind in range(len(variants_of_accents)):
                                    variant_name = '{0:>0{1}}'.format(ind + 1, str_width)
                                    homonyms[cur_word][variant_name] = variants_of_accents[ind]
                        else:
                            for ind in range(len(variants_of_accents)):
                                homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
                    else:
                        for ind in range(len(variants_of_accents)):
                            homonyms[cur_word]['{0:>0{1}}'.format(ind + 1, str_width)] = variants_of_accents[ind]
            else:
                # A single accent variant: an unambiguous word form.
                simple_wordsforms.append(variants_of_accents[0])
        print('`{0}`: dictionary has been loaded from this file...'.format(source_dictionary_name))
        with codecs.open(prepared_dictionary_name, mode='w', encoding='utf-8', errors='ignore') as fp:
            json.dump([homonyms, sorted(simple_wordsforms)], fp, ensure_ascii=False, indent=4, sort_keys=True)
        print('`{0}`: dictionary has been saved into this file...'.format(prepared_dictionary_name))
        print('Number of homonyms is {0}.'.format(len(homonyms)))
    else:
        print("Usage: input_dictionary.txt prepared_dictionary.json")
def __init__(self, n_endings=3, lower=True):
    """Create empty vectorizers for training.

    :param n_endings: word-ending length used by the endings vectorizer
        (presumably number of trailing characters — confirm in EndingsVectorizer).
    :param lower: whether the endings vectorizer lowercases words.
    """
    # pymorphy2 analyzer and an OpenCorpora -> UD 1.4 tag converter.
    self.morph = MorphAnalyzer()
    self.converter = converters.converter('opencorpora-int', 'ud14')
    # Filled during corpus processing.
    self.grammeme_vectorizer_input = GrammemsVectorizer()
    self.grammeme_vectorizer_output = GrammemsVectorizer()
    self.endings_vectorizer = EndingsVectorizer(n_endings, lower)