class MorphPredictor(PreProcesser):
    """Pre-processing step that fills lemma/POS/features for every token
    using RNNMorph.

    Tokens that start with Latin letters are transliterated to Cyrillic
    before analysis; for those tokens the lemma is kept as the lowercased
    surface form instead of RNNMorph's normal form.
    """

    def __init__(self):
        self.rnnmorph = RNNMorphPredictor(language='ru')

    def translit(self, form):
        """Return (was_latin, normalized_form) for a single token form."""
        if re.match(r'[a-zA-Z]+', form):
            return True, translit(form, 'ru')
        return False, form

    def transform_sent(self, sent):
        """Return a copy of *sent* with lemma/upos/feats set on each token."""
        sent = sent.copy()
        latin_flags, norm_forms = zip(
            *(self.translit(token.form) for token in sent.tokens)
        )
        analyses = self.rnnmorph.predict(norm_forms)
        for token, analysis, was_latin in zip(sent.tokens, analyses, latin_flags):
            # Keep the lowercased original form as lemma for transliterated
            # (originally Latin) tokens; trust RNNMorph otherwise.
            token.lemma = token.form.lower() if was_latin else analysis.normal_form
            token.upos = analysis.pos
            token.feats = analysis.tag
        return sent

    def transform_item(self, x):
        """Apply transform_sent to every sentence in *x*."""
        return [self.transform_sent(sent) for sent in x]
def prepare_text(text):
    """Tokenize Russian *text* and return a list of "lemma_POS" strings.

    Punctuation tokens are dropped before morphological analysis.

    :param text: raw Russian text.
    :return: list of strings like "мама_NOUN", one per non-punctuation token;
             empty list for text without such tokens.
    """
    words = [
        w for w in nltk.word_tokenize(text, language="russian")
        if w not in punctuation
    ]
    # Guard: do not feed an empty batch to the predictor.
    if not words:
        return []
    # Loading the RNNMorph model is expensive — build it once and cache it
    # on the function object instead of re-creating it on every call.
    predictor = getattr(prepare_text, "_predictor", None)
    if predictor is None:
        predictor = RNNMorphPredictor(language="ru")
        prepare_text._predictor = predictor
    morphs = predictor.predict(words)
    return ["{}_{}".format(m.normal_form, m.pos) for m in morphs]
def find_rhyme(src: str, russian_lexemes: dict, rnn_morph: RNNMorphPredictor,
               phonetic_dict: Dict[str, tuple]) -> List[str]:
    """Generate rhyming variants for the Russian phrase *src*.

    :param src: source phrase; tokens containing non-Russian characters are
        filtered out before analysis.
    :param russian_lexemes: lexeme dictionary used by select_new_variant.
    :param rnn_morph: shared RNNMorph predictor instance.
    :param phonetic_dict: word -> phonetic representation mapping.
    :return: list of candidate phrases; ``[src]`` unchanged when no purely
        Russian words remain after filtering.
    """
    russian_letters = set('АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')
    # Keep only lowercased tokens consisting entirely of Russian letters.
    src_words = [
        word for word in (tok.strip().lower() for tok in word_tokenize(src))
        if set(word) <= russian_letters
    ]
    if not src_words:
        return [src]
    morphotags = [
        get_morphodata(cur.pos + ' ' + cur.tag)
        for cur in rnn_morph.predict(src_words)
    ]
    syllables_of_words = [
        str(calc_number_of_syllables(cur_word)) for cur_word in src_words
    ]
    # NOTE: debug print() calls and a redundant `del` were removed here;
    # the generated variants themselves are unchanged.
    return [
        ' '.join(variant)
        for variant in select_new_variant(
            src_words, morphotags, syllables_of_words,
            russian_lexemes, phonetic_dict, 0, []
        )
    ]
class TaggerEnsemble:
    """Ensemble of RuPosTagger and RNNMorph.

    RuPosTagger provides the base (word, tagset) output; for nouns, the
    ``Case`` feature is replaced with the one predicted by RNNMorph, which
    is more reliable for case disambiguation.
    """

    def __init__(self):
        self.predictor = RNNMorphPredictor(language="ru")
        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()

    def tag(self, words):
        """Tag *words*.

        :param words: list of token strings.
        :return: list of (word, "POS|Feat=Val|...") tuples, one per input word.
        """
        base_tokens = self.tagger.tag(words)
        rnn_tokens = self.predictor.predict(words)
        new_tokens = []
        for token1, token2 in zip(base_tokens, rnn_tokens):
            tags1 = token1[1].split('|')
            if tags1[0] == 'NOUN' and 'Case' in token2.tag:
                # Take Case from RNNMorph, keep every other feature from
                # RuPosTagger.
                tags_rnn = dict(
                    z.split('=') for z in token2.tag.split('|') if '=' in z)
                new_tagset = [t for t in tags1 if not t.startswith('Case')]
                new_tagset.append('Case=' + tags_rnn['Case'])
                new_tokens.append((token1[0], '|'.join(new_tagset)))
            else:
                new_tokens.append(token1)
        return new_tokens
from rnnmorph.predictor import RNNMorphPredictor
from pprint import pprint


def main():
    """Demo: run RNNMorph on sample Russian input and print the analyses."""
    predictor = RNNMorphPredictor(language='ru')

    # Single-sentence API: one flat list of word forms.
    forms = predictor.predict(words=['мама', 'мыла', 'раму'])
    for form in forms:
        print('{:<15} {:<10} {}'.format(form.normal_form, form.pos, form.tag))

    # Batch API: list of sentences, each a list of word forms.
    forms = predictor.predict_sentences(sentences=[['Всем', 'привет']])
    for form in forms[0]:
        print('{:<15} {:<10} {}'.format(form.normal_form, form.pos, form.tag))
    pprint(forms)


if __name__ == '__main__':
    main()
class RNNMorphWrapper:
    """
    Enriches token descriptions with grammeme (morphological) information,
    combining RNNMorph predictions with extra features from pymorphy2.
    """

    def __init__(self):
        # Dedicated TF graph + session so the RNNMorph Keras model does not
        # interfere with other models in the process default graph.
        self._graph = tf.Graph()
        self._session = tf.Session(graph=self._graph)
        with self._session.as_default():
            with self._graph.as_default():
                self.rnnmorph = RNNMorphPredictor(language="ru")
        self.pymorphy_analyzer = pymorphy2.MorphAnalyzer()
        # Token made of Latin letters, optionally framed by digits.
        self.latin = re.compile("^[0-9]*[A-Za-z]+[0-9]*$")
        # Run of Cyrillic letters. NOTE(review): the class ends with "Ёе" —
        # lowercase "ё" (U+0451, outside the а-я range) is NOT matched while
        # "е" is redundant; looks like a typo for "Ёё" — confirm intent.
        self.cyrillic = re.compile("[А-Яа-яЁе]+")

    def _choose_pymorphy_form(self, word, lemma, pos):
        """Pick a pymorphy2 parse matching *lemma* and derive extra data.

        Returns a tuple ``(other, tags_to_add, changed_lemma)`` where
        ``other`` marks surnames/patronymics, ``tags_to_add`` holds extra
        grammemes (transitivity/animacy/aspect) and ``changed_lemma`` is the
        lemma with "ё" normalized to "е".
        """
        hypotheses = self.pymorphy_analyzer.parse(word)
        hyp = None
        tags_to_add = {}
        other = ""
        # Stop at the first parse whose normal form equals the RNNMorph
        # lemma; if none matches, `hyp` stays bound to the LAST parse
        # (or None when pymorphy produced no parses at all).
        for hyp in hypotheses:
            if hyp.normal_form == lemma:
                break
        changed_lemma = lemma.replace("ё", "е")
        if not hyp:
            # No parses at all: return defaults with the normalized lemma.
            return other, tags_to_add, changed_lemma
        str_tag = str(hyp.tag)
        if "Surn" in str_tag:
            # Surname: keep the surface form (lowercased) as the lemma.
            other = "фам"
            changed_lemma = word.lower().replace("ё", "е")
        elif "Patr" in str_tag:
            other = "отч"
            changed_lemma = word.lower().replace(
                "ё", "е")  # for "Петрович" the lemma is unexpectedly "Пётр"
        # Grammemes that RNNMorph does not provide but pymorphy2 does.
        if hyp.tag.transitivity:
            tags_to_add[TRANSITIVITY] = str(hyp.tag.transitivity)
        if hyp.tag.animacy and pos == "NOUN":
            tags_to_add[ANIMACY] = str(hyp.tag.animacy)
        if hyp.tag.aspect:
            tags_to_add[ASPECT] = str(hyp.tag.aspect)
        return other, tags_to_add, changed_lemma

    def _change_pos(self, token, analysis):
        """Overwrite the POS on *token*: Latin-looking or Cyrillic tokens
        mis-tagged as PUNCT become "X"; otherwise keep the RNNMorph POS."""
        if re.match(self.latin, analysis.word):
            token[GRAMMEM_INFO][PART_OF_SPEECH] = "X"
        elif analysis.pos == "PUNCT" and re.search(self.cyrillic, analysis.word):
            token[GRAMMEM_INFO][PART_OF_SPEECH] = "X"
        else:
            token[GRAMMEM_INFO][PART_OF_SPEECH] = analysis.pos
        return token

    def _gram_info_processing(self, tags_to_add, analysis):
        """Merge RNNMorph tags ("Key=Val|..." or "_") with *tags_to_add*.

        Returns ``(sorted_gramme_info, raw_gram_info)``: a key-sorted dict of
        lowercased grammemes and its "key=value|..." string form.
        """
        gramme_info = {}
        raw_gram_data = []
        if analysis.tag != "_":
            for tag in analysis.tag.split("|"):
                gramme_info[tag.split("=")[0].lower()] = tag.split(
                    "=")[1].lower()
        # pymorphy2 extras override/extend the RNNMorph grammemes.
        gramme_info.update(tags_to_add)
        sorted_gramme_info = {
            key: gramme_info[key]
            for key in sorted(gramme_info.keys())
        }
        for key in sorted_gramme_info:
            raw_gram_data.append(key + "=" + sorted_gramme_info[key])
        raw_gram_info = "|".join(raw_gram_data)
        return sorted_gramme_info, raw_gram_info

    # NOTE(review): "dicti" in the name looks like a typo for "dict";
    # kept as-is since external callers may rely on it.
    def _rnnmorph_to_token_dicti(self, token, analysis):
        """Fill one token dict with grammeme info, POS and lemma derived
        from the RNNMorph *analysis* plus pymorphy2 refinements."""
        additional_info, tags_to_add, changed_lemma = self._choose_pymorphy_form(
            analysis.word, analysis.normal_form, analysis.pos)
        sorted_gramme_info, raw_gram_info = self._gram_info_processing(
            tags_to_add, analysis)
        token[GRAMMEM_INFO] = sorted_gramme_info
        token[GRAMMEM_INFO][RAW_GRAM_INFO] = raw_gram_info
        if additional_info:
            token[GRAMMEM_INFO][OTHER] = additional_info
        token = self._change_pos(token, analysis)
        token[LEMMA] = changed_lemma
        return token

    def token_desc_list_processing(self, token_desc_list):
        """
        Get the list of tokens with their descriptions.
        :param token_desc_list: list of token dicts (each holding TEXT)
        :return: list of dicts enriched with morphological information
        """
        raw_token_list = [token[TEXT] for token in token_desc_list]
        # Re-enter the private graph/session created in __init__ so the
        # model runs against the right TF state.
        with self._session.as_default():
            with self._graph.as_default():
                analyze_result = self.rnnmorph.predict(raw_token_list)
        res = []
        for i in range(len(token_desc_list)):
            analysis = analyze_result[i]
            tokenized_element = token_desc_list[i]
            final_tokenized_element = self._rnnmorph_to_token_dicti(
                tokenized_element, analysis)
            res.append(final_tokenized_element)
        return res

    def __call__(self, token_desc_list):
        """
        Collects grammeme information from RNNMorph + pymorphy2.
        Input: a list of token dicts.
        Output: the same tokens with grammatical attributes filled in,
        followed by a synthetic sentence-end "." token when non-empty.
        :param token_desc_list: (list of dicts)
        :return: final_result (enriched list of dicts)
        """
        final_result = []
        # Process sentence by sentence — RNNMorph context is sentence-level.
        sentences = token_list_to_sentences(token_desc_list)
        for sentence in sentences:
            final_result.extend(self.token_desc_list_processing(sentence))
        if final_result:
            # Append a synthetic sentence-terminator token.
            final_result.append({
                TEXT: ".",
                LEMMA: ".",
                TOKEN_TYPE: SENTENCE_ENDPOINT_TOKEN,
                TOKEN_VALUE: {
                    VALUE: "."
                },
                LIST_OF_TOKEN_TYPES_DATA: [{
                    TOKEN_TYPE: SENTENCE_ENDPOINT_TOKEN,
                    TOKEN_VALUE: {
                        VALUE: "."
                    }
                }]
            })
        return final_result