示例#1
0
def main():
    """Report prompts that cannot be accented automatically.

    The input directory is taken from ``sys.argv[1]``. The first 100 prompts
    (in sorted order) returned by ``get_all_prompts`` are split into
    sub-sentences, tagged with RNNMorph and passed to the accentor. For every
    prompt the function prints:

    * a message when at least one sub-sentence produced more than one
      accentuation variant;
    * a message for every distinct word that has more than one vowel but
      contains no ``+`` character (presumably the accentor's stress mark).

    The number of ambiguous prompts is printed at the end. Without a
    command-line argument only a usage hint is printed.

    Raises:
        AssertionError: if the given path is not an existing directory.
    """
    if len(sys.argv) <= 1:
        print("Usage: input_directory_with_voxforge_ru")
        return

    init_dir_name = os.path.normpath(sys.argv[1])
    if not os.path.isdir(init_dir_name):
        # Raised explicitly (same exception type as before) so that the check
        # is not stripped when Python runs with -O.
        raise AssertionError('Directory `{0}` does not exist!'.format(init_dir_name))

    all_prompts = sorted(get_all_prompts(init_dir_name))
    accentor = Accentor()
    morpho_predictor = RNNMorphPredictor()
    ambiguous_prompts = 0
    for cur_prompt in all_prompts[:100]:
        trouble = False
        unknown_words = []
        for cur_subsentence in select_subsentences(cur_prompt):
            morphotags = ['{0} {1}'.format(cur_morpho.pos, cur_morpho.tag)
                          for cur_morpho in morpho_predictor.predict_sentence_tags(cur_subsentence)]
            accent_variants = accentor.do_accents(cur_subsentence, morphotags)
            if len(accent_variants) > 1:
                trouble = True
                continue
            for cur_word in accent_variants[0]:
                vowels_counter = sum(1 for cur_char in cur_word.lower()
                                     if cur_char in VOWEL_LETTERS)
                if '+' not in cur_word and vowels_counter > 1:
                    unknown_words.append(cur_word)
        if trouble:
            print('`{0}`: this phrase cannot be unambiguously accented!'.format(cur_prompt))
            ambiguous_prompts += 1
        # sorted() makes the report order deterministic between runs.
        for unknown_word in sorted(set(unknown_words)):
            print('`{0}`: word `{1}` in this phrase is unknown!'.format(cur_prompt, unknown_word))
    print(ambiguous_prompts)
示例#2
0
    def __init__(self, batch_size=1):
        """Store the batch size and create the RNNMorph predictor for Russian.

        Args:
            batch_size (int, optional): Value kept on the instance as
                ``self.batch_size``. Defaults to 1.
        """
        self.predictor = RNNMorphPredictor(language='ru')
        self.batch_size = batch_size
 def __init__(self):
     """Create the analyzers and the token-classification patterns.

     The RNNMorph predictor is constructed while a dedicated TensorFlow
     graph and session are set as defaults, so it is built inside
     ``self._graph``. A pymorphy2 analyzer and two regular expressions
     (Latin-with-digits tokens, Cyrillic letter runs) are prepared as well.
     """
     self._graph = tf.Graph()
     self._session = tf.Session(graph=self._graph)
     with self._session.as_default(), self._graph.as_default():
         self.rnnmorph = RNNMorphPredictor(language="ru")
     self.pymorphy_analyzer = pymorphy2.MorphAnalyzer()
     self.latin = re.compile("^[0-9]*[A-Za-z]+[0-9]*$")
     # "ё" (U+0451) lies outside the а-я range and must be listed explicitly;
     # the previous pattern repeated "е" instead and never matched "ё".
     self.cyrillic = re.compile("[А-Яа-яЁё]+")
示例#4
0
def prepare_text(text):
    """Tokenize Russian *text* and return ``"<normal_form>_<POS>"`` strings.

    The text is tokenized with NLTK (Russian rules), tokens found in
    ``punctuation`` are dropped and the remaining words are analysed with
    RNNMorph. One string is returned per analysed word.

    Returns an empty list, without loading the model, when no words remain
    after filtering.
    """
    words = [
        w for w in nltk.word_tokenize(text, language="russian")
        if w not in punctuation
    ]
    if not words:
        # Nothing to tag: skip the expensive model construction.
        return []

    predictor = RNNMorphPredictor(language="ru")
    morphs = predictor.predict(words)

    return ["{}_{}".format(m.normal_form, m.pos) for m in morphs]
示例#5
0
class MorphPredictor(PreProcesser):
    """Pre-processing step that fills lemma, UPOS and features using RNNMorph."""

    def __init__(self):
        self.rnnmorph = RNNMorphPredictor(language='ru')

    def translit(self, form):
        """Transliterate *form* to Russian when it starts with Latin letters.

        Returns:
            tuple: ``(True, transliterated_form)`` if the form begins with
            ``[a-zA-Z]``, otherwise ``(False, form)`` unchanged.
        """
        # Inside the method body `translit` resolves to the module-level
        # function, not to this method.
        if re.match(r'[a-zA-Z]+', form):
            return True, translit(form, 'ru')
        return False, form

    def transform_sent(self, sent):
        """Return a copy of *sent* whose tokens carry predicted morphology.

        For transliterated tokens the lemma is the lower-cased original form;
        for all others it is the normal form predicted by RNNMorph.
        """
        sent = sent.copy()

        if not sent.tokens:
            # zip(*[]) below would fail to unpack on an empty sentence.
            return sent

        translit_flags, translit_forms = zip(
            *[self.translit(token.form) for token in sent.tokens])
        morph_forms = self.rnnmorph.predict(translit_forms)

        for token, morph_form, translit_flag in zip(sent.tokens, morph_forms, translit_flags):
            token.lemma = token.form.lower() if translit_flag else morph_form.normal_form
            token.upos = morph_form.pos
            token.feats = morph_form.tag

        return sent

    def transform_item(self, x):
        """Apply :meth:`transform_sent` to every sentence of *x*."""
        return [self.transform_sent(sent) for sent in x]
示例#6
0
 def pos_tag(self):
     """Replace every word of every review sentence with a ``PosTaggedWord``.

     Runs only when ``self.language`` is ``"ru"`` or ``"en"``. All sentences
     of all reviews are tagged in one ``predict_sentences`` call, then each
     word is replaced in place by a ``PosTaggedWord`` built from the original
     word, the predicted POS, the tag and the integer-converted form vector.

     ``CUDA_VISIBLE_DEVICES`` is set to ``-1`` for the duration of the
     prediction and set to ``0`` afterwards, even when prediction fails.
     """
     if self.language not in ("ru", "en"):
         return
     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
     try:
         predictor = RNNMorphPredictor(language=self.language)
         sentences = [
             [word.text for word in sentence]
             for review in self.reviews
             for sentence in review.sentences
         ]
         sentences_forms = predictor.predict_sentences(sentences, 32, False)
         # `offset` is the index of the current review's first sentence in
         # the flat `sentences_forms` list.
         offset = 0
         for review in self.reviews:
             for i, sentence in enumerate(review.sentences):
                 forms = sentences_forms[offset + i]
                 for word_idx, form in enumerate(forms):
                     sentence[word_idx] = PosTaggedWord(
                         sentence[word_idx], form.pos, form.tag,
                         [int(j) for j in form.vector])
             offset += len(review.sentences)
     finally:
         # Previously skipped on error, leaving the process stuck on '-1'.
         os.environ['CUDA_VISIBLE_DEVICES'] = '0'
示例#7
0
 def handle_new_messages(self):
     """Process every not-yet-handled message of every ``Chat*`` collection.

     For each collection whose name starts with ``Chat`` the documents
     without a ``handled`` field are passed to ``self.handle_message``
     together with a shared RNNMorph predictor; the returned data is written
     back to the same document with ``$set``.
     """
     predictor = RNNMorphPredictor()
     chat_names = (name for name in self.db.collection_names()
                   if name.startswith('Chat'))
     for chat_name in chat_names:
         print('Обрабатываем сообщения чата', chat_name)
         collection = self.db[chat_name]
         unhandled = collection.find({'handled': {"$exists": False}})
         for message in unhandled:
             update = self.handle_message(message, predictor)
             collection.update_one({'_id': message['_id']}, {'$set': update})
示例#8
0
 def handle_new_messages(self):
     """Extract emoticons and tagged words from all unhandled messages.

     Selects the messages whose ``handled`` column is NULL. For each message
     with text it stores every grapheme cluster containing an emoji character
     in ``emoticons``, splits the text (with "ё" replaced by "е") into
     sentences of Cyrillic words, tags them with RNNMorph and stores every
     word form in ``words``. Each message is then marked as handled and the
     transaction is committed.

     Returns:
         tuple: ``(number of messages, number of words, number of emoticons)``.
     """
     predictor = RNNMorphPredictor()
     insert_emoticon = "INSERT INTO emoticons (message_id, emoticon) VALUES (%s, %s);"
     insert_word = ("INSERT INTO words (message_id, word, normal_form, pos, tag) "
                    "VALUES (%s, %s, %s, %s, %s);")

     self.mycursor.execute(
         "SELECT id, text FROM messages WHERE handled IS NULL")
     rows = self.mycursor.fetchall()

     n_emoticons = 0
     n_words = 0
     for row in rows:
         msg_id, text = row
         if text:
             # \X matches one extended grapheme cluster (third-party `regex`).
             for grapheme in regex.findall(r'\X', text):
                 if any(char in emoji.UNICODE_EMOJI for char in grapheme):
                     self.mycursor.execute(insert_emoticon, (msg_id, grapheme))
                     n_emoticons += 1

             normalized = re.sub(r'[ёЁ]', 'е', text)
             sentences = []
             for chunk in re.split(r'[.!?]+', normalized):
                 tokens = re.findall(
                     r'[а-яА-ЯёЁ]+-[а-яА-ЯёЁ]+|[а-яА-ЯёЁ]+', chunk)
                 if tokens:
                     sentences.append(tokens)

             if sentences:
                 for tagged_sentence in predictor.predict_sentences(sentences=sentences):
                     for form in tagged_sentence:
                         self.mycursor.execute(
                             insert_word,
                             (msg_id, form.word, form.normal_form,
                              form.pos, form.tag))
                         n_words += 1

         self.mycursor.execute(
             "UPDATE messages SET handled = %s WHERE id = %s",
             (True, msg_id))
         self.mydb.commit()
     return len(rows), n_words, n_emoticons
示例#9
0
def main():
    """Load the global predictor and serve HTTP requests until interrupted.

    If the predictor cannot be created the error is logged and the function
    returns without starting the server. The server socket is always closed
    on exit; Ctrl+C is logged rather than propagated.
    """
    global PREDICTOR

    try:
        log('Loading predictor')
        PREDICTOR = RNNMorphPredictor(language="ru")
    except Exception as exc:
        log('Can not load analyzer: "%s"', exc)
        return

    address = (HOST, PORT)
    server = HTTPServer(address, HTTPHandler)
    try:
        log('Listening http://%s:%d', HOST, PORT)
        server.serve_forever()
    except KeyboardInterrupt:
        log('Quiting')
    finally:
        server.server_close()
示例#10
0
 def __process_line(line: str, output_file: TextIO,
                    sentence_splitter: SentenceSplitter,
                    morph_predictor: RNNMorphPredictor):
     """Write the morphological markup of one text line to *output_file*.

     The line is split into sentences; each sentence is tokenized (empty and
     space tokens are dropped) and tagged. Every non-punctuation form becomes
     a tab-separated ``word, normal_form, pos, tag`` row, and an empty line
     is written after each sentence that had at least one word.
     """
     for sentence in sentence_splitter.split(line):
         words = []
         for token in Tokenizer.tokenize(sentence):
             if token.text == '' or token.token_type == Token.TokenType.SPACE:
                 continue
             words.append(token.text)
         if words:
             for form in morph_predictor.predict_sentence_tags(words):
                 if form.pos != "PUNCT":
                     row = (form.word, form.normal_form, form.pos, form.tag)
                     output_file.write("%s\t%s\t%s\t%s\n" % row)
             output_file.write("\n")
示例#11
0
def find_rhyme(src: str, russian_lexemes: dict, rnn_morph: RNNMorphPredictor,
               phonetic_dict: Dict[str, tuple]) -> List[str]:
    """Generate variants of *src* via ``select_new_variant``.

    Tokens of *src* are stripped and lower-cased; only those made entirely of
    Russian letters are kept. If none remain, ``[src]`` is returned. Otherwise
    the words' morphological data and syllable counts are computed (and
    printed for debugging) and every variant yielded by ``select_new_variant``
    is joined with spaces and collected.
    """
    russian_letters = set('АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')

    src_words = []
    for token in word_tokenize(src):
        normalized = token.strip().lower()
        if set(normalized) <= russian_letters:
            src_words.append(normalized)
    if not src_words:
        return [src]

    morphotags = [get_morphodata(form.pos + ' ' + form.tag)
                  for form in rnn_morph.predict(src_words)]
    print('morphotags', morphotags)
    syllables_of_words = [str(calc_number_of_syllables(word)) for word in src_words]
    print('syllables_of_words', syllables_of_words)

    start_variant = []
    candidates = select_new_variant(src_words, morphotags, syllables_of_words,
                                    russian_lexemes, phonetic_dict, 0, start_variant)
    return [' '.join(candidate) for candidate in candidates]
示例#12
0
    def get_morph_markup(input_filenames: List[str], output_filename: str):
        """
        Produce grammatical-value markup for a set of text files.

        The markup of all input files is written, in order, to a single
        output file. Any existing output file is replaced.

        :param input_filenames: input text files
        :param output_filename: path of the file the markup is saved to
        """
        if os.path.exists(output_filename):
            os.remove(output_filename)

        sentence_splitter = SentenceSplitter(language='ru')
        morph_predictor = RNNMorphPredictor()

        # Open the output once: reopening it in "w+" mode for every input
        # file truncated it each time, so only the last file's markup survived.
        with open(output_filename, "w", encoding="utf-8") as w:
            for filename in input_filenames:
                with open(filename, "r", encoding="utf-8") as r:
                    for line in r:
                        Morph.__process_line(line, w, sentence_splitter,
                                             morph_predictor)
示例#13
0
class Preprocessor():
    """Turn raw text into ``[word, "POS tag"]`` pairs using RNNMorph."""

    def __init__(self):
        self.predictor = RNNMorphPredictor()

    def gettags(self, text):
        """Tag a list of words.

        Args:
            text: list of word strings forming one sentence.

        Returns:
            list: ``[word, pos + ' ' + tag]`` for each analysed word form.
        """
        analysis = self.predictor.predict_sentence_tags(text)
        return [[form.word, form.pos + ' ' + form.tag] for form in analysis]

    def preprocessing(self, text):
        """Normalize *text* and return its tagged tokens.

        Punctuation runs, spaced dashes and newlines are replaced by the
        ``<sil>`` marker, repeated whitespace is collapsed and special
        symbols are removed; the result is split on single spaces and tagged.
        """
        # Raw strings: the patterns contain regex escapes such as `\.` and
        # `\s`, which are invalid escape sequences in ordinary literals.
        text = sub(r'[\.\,\?\!\(\);:]+', ' <sil>', text)
        text = sub(r' [–-] |\n', ' <sil> ', text)
        text = sub(r'\s{2,}', ' ', text)
        text = sub(r'^\s|[\\\/@#~¬`£€\$%\^\&\*–_=+\'\"\|«»–-]+', '', text)
        return self.gettags(text.split(' '))
示例#14
0
def tag(predictor: RNNMorphPredictor, untagged_filename: str,
        tagged_filename: str):
    sentences = []
    with open(untagged_filename, "r", encoding='utf-8') as r:
        words = []
        for line in r:
            if line != "\n":
                records = line.strip().split("\t")
                word = records[1]
                words.append(word)
            else:
                sentences.append([word.lower() for word in words])
                words = []
    with open(tagged_filename, "w", encoding='utf-8') as w:
        all_forms = predictor.predict_sentences_tags(sentences)
        for forms in all_forms:
            for i, form in enumerate(forms):
                line = "{}\t{}\t{}\t{}\t{}\n".format(str(i + 1), form.word,
                                                     form.normal_form,
                                                     form.pos, form.tag)
                w.write(line)
            w.write("\n")
示例#15
0
class TaggerEnsemble:
    """Combine RuPosTagger output with the grammatical case from RNNMorph."""

    def __init__(self):
        self.predictor = RNNMorphPredictor(language="ru")

        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()

        # NOTE: a third tagger (a UDPipe SynTagRus pipeline) was wired in
        # here at some point; it is currently disabled.

    def tag(self, words):
        """Tag *words* and return ``(word, tagset)`` tokens.

        Tokens come from RuPosTagger. For a token whose first tag is ``NOUN``
        and whose RNNMorph tag contains ``Case``, every ``Case...`` entry of
        the RuPosTagger tagset is dropped and the RNNMorph case is appended;
        all other tokens are returned unchanged.
        """
        base_tokens = self.tagger.tag(words)
        rnn_forms = self.predictor.predict(words)

        merged = []
        for base_token, rnn_form in zip(base_tokens, rnn_forms):
            base_tags = base_token[1].split('|')
            if base_tags[0] != 'NOUN' or 'Case' not in rnn_form.tag:
                merged.append(base_token)
                continue

            rnn_features = dict(
                part.split('=') for part in rnn_form.tag.split('|') if '=' in part)
            kept_tags = [t for t in base_tags if not t.startswith('Case')]
            kept_tags.append('Case=' + rnn_features['Case'])
            merged.append((base_token[0], '|'.join(kept_tags)))

        return merged
示例#16
0
 def __init__(self):
     """Create the RNNMorph predictor with its default settings."""
     predictor = RNNMorphPredictor()
     self.predictor = predictor
示例#17
0
#!/usr/bin/env python
# coding: utf-8

from nltk import sent_tokenize, word_tokenize
import ufal.udpipe
from rnnmorph.predictor import RNNMorphPredictor
from udpipe_model import Model

# Module-level RNNMorph predictor for Russian, created once at import time.
predictor = RNNMorphPredictor(language="ru")

# Download model from https://rusvectores.org/static/models/udpipe_syntagrus.model
# The path is relative, so the file is looked up in the current working directory.
model_file = 'udpipe_syntagrus.model'
# `Model` is the wrapper imported from the local `udpipe_model` module.
model = Model(model_file)


def to_conllu(wordforms):
    """Render word forms as tab-separated, newline-joined 10-column rows.

    Each row holds the 1-based index, the word, its normal form, the POS and
    the tag, followed by five ``_`` placeholder columns. An empty input gives
    an empty string.
    """
    padding = ['_'] * 5
    rows = (
        [str(index), form.word, form.normal_form, form.pos, form.tag] + padding
        for index, form in enumerate(wordforms, start=1)
    )
    return '\n'.join('\t'.join(row) for row in rows)


def pipeline(sentence):
    tokens = word_tokenize(sentence)
    forms = predictor.predict(tokens)
    sentences = model.read(to_conllu(forms), 'conllu')
    for s in sentences:
示例#18
0
 def __init__(self):
     """Create the RNNMorph predictor for the Russian language."""
     self.rnnmorph = RNNMorphPredictor(language="ru")
示例#19
0
 def setUpClass(cls):
     """Send DEBUG-level logging to stdout and create one shared predictor."""
     logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
     cls.predictor = RNNMorphPredictor()
示例#20
0
    def __init__(self):
        """Create the RNNMorph predictor and a loaded RuPosTagger."""
        self.predictor = RNNMorphPredictor(language='ru')

        # The tagger must be loaded explicitly after construction.
        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()
# In[33]:

# Token-level accuracy (in percent, one decimal) of `nltk_result` against
# `test_sents`: a token counts as correct when the second element of the
# predicted pair equals the second element of the reference pair.
# Both names are defined in earlier cells that are not part of this excerpt.
true_pred = sum((1 for t1, t2 in zip(nltk_result, test_sents)
                 for tt1, tt2 in zip(t1, t2) if tt1[1] == tt2[1]))
# Total number of reference tokens.
num_pred = sum(1 for s in test_sents for _ in s)
print(f"{true_pred / num_pred * 100:.1f}")

# In[34]:

# Install rnnmorph quietly; the `capture` cell magic hides the cell output.
get_ipython().run_cell_magic('capture', '', '!pip install -q rnnmorph')

# In[35]:

from rnnmorph.predictor import RNNMorphPredictor
predictor = RNNMorphPredictor(language="en")

# In[36]:

# Pass only the first element of every reference pair (the token) to the predictor.
rnnmorph_result = predictor.predict_sentences(
    [list(map(lambda t: t[0], s)) for s in test_sents])

# In[37]:

# Same accuracy computation as above, comparing the `pos` attribute of each
# RNNMorph form with the reference tag.
true_pred = sum((1 for t1, t2 in zip(rnnmorph_result, test_sents)
                 for tt1, tt2 in zip(t1, t2) if tt1.pos == tt2[1]))
num_pred = sum(1 for s in test_sents for _ in s)
print(f"{true_pred / num_pred * 100:.1f}")

# ### Вопрос 7:
# * Какое качество вы получили, используя каждую из двух библиотек? Сравните их результаты.
示例#22
0
class Preprocessor():
    """Split texts into words and ``<sil>`` markers and tag them with RNNMorph."""

    def __init__(self, batch_size=1):
        self.batch_size = batch_size
        self.predictor = RNNMorphPredictor(language="ru")

    def __del__(self):
        if hasattr(self, 'predictor'):
            del self.predictor

    def __copy__(self):
        """Return a copy that shares the predictor with the original."""
        cls = self.__class__
        result = cls.__new__(cls)
        # batch_size was previously not copied, so gettags() on a copy
        # failed with AttributeError.
        result.batch_size = self.batch_size
        result.predictor = self.predictor
        return result

    def __deepcopy__(self, memodict=None):
        """Same as ``__copy__``: the predictor is shared, not duplicated."""
        return self.__copy__()

    def gettags(self, texts):
        """Tag pre-tokenized texts.

        Args:
            texts (list): list of token lists; ``<sil>`` tokens separate
                phonetic phrases.

        Returns:
            list: for each text, a list of ``[word, "POS tag"]`` pairs that
            starts with ``['<sil>', 'SIL _']`` and has one such marker after
            every non-empty phonetic phrase.

        Raises:
            ValueError: if *texts* is not a list.
        """
        if not isinstance(texts, list):
            raise ValueError('Expected `{0}`, but got `{1}`.'.format(
                type([1, 2]), type(texts)))
        if len(texts) == 0:
            return []
        all_phonetic_phrases = []
        all_phrases_for_rnnmorph = []
        for cur_text in texts:
            list_of_phonetic_phrases = [
                cur.strip() for cur in ' '.join(cur_text).split('<sil>')
            ]
            united_phrase_for_rnnmorph = []
            for phonetic_phrase in list_of_phonetic_phrases:
                united_phrase_for_rnnmorph += phonetic_phrase.split()
            if united_phrase_for_rnnmorph:
                all_phrases_for_rnnmorph.append(united_phrase_for_rnnmorph)
                all_phonetic_phrases.append(list_of_phonetic_phrases)
            else:
                all_phonetic_phrases.append([])
        if all_phrases_for_rnnmorph:
            all_forms = self.predictor.predict_sentences(
                all_phrases_for_rnnmorph, batch_size=self.batch_size)
        else:
            all_forms = []
        all_words_and_tags = []
        phrase_ind = 0
        for cur in all_phonetic_phrases:
            words_and_tags = [['<sil>', 'SIL _']]
            if len(cur) > 0:
                token_ind = 0
                for phonetic_phrase in cur:
                    if len(phonetic_phrase) > 0:
                        # Count tokens exactly as they were produced above
                        # (whitespace split); split(' ') over-counted when a
                        # phrase contained consecutive spaces and shifted
                        # every following phrase.
                        n = len(phonetic_phrase.split())
                        analysis = all_forms[phrase_ind][token_ind:token_ind + n]
                        for word in analysis:
                            words_and_tags.append(
                                [word.word, word.pos + ' ' + word.tag])
                        words_and_tags.append(['<sil>', 'SIL _'])
                        token_ind += n
                phrase_ind += 1
            all_words_and_tags.append(words_and_tags)
        return all_words_and_tags

    def preprocessing(self, texts):
        """Normalize every text of *texts* and return their tags (see gettags)."""
        def prepare(src):
            # Raw strings: the patterns contain regex escapes that are
            # invalid escape sequences in ordinary string literals.
            dst = sub(r'[\.\,\?\!\(\);:]+', ' <sil>', src.lower())
            dst = sub(r' [–-] |\n', ' <sil> ', dst)
            dst = sub(r'\s{2,}', ' ', dst)
            dst = sub(r'^\s|(?<!\w)[\\\/@#~¬`£€\$%\^\&\*–_=+\'\"\|«»–-]+', '',
                      dst)
            return dst.strip().split(' ')

        words_and_tags = self.gettags([prepare(cur) for cur in texts])
        return words_and_tags
示例#23
0
class Preprocessor():
    """Normalize Russian texts and tag their words with RNNMorph.

    Punctuation is turned into ``<sil>`` markers, which delimit phonetic
    phrases; every word gets a ``"POS tag"`` string.
    """
    def __init__(self, batch_size=1):
        """Create the preprocessor.

        Args:
            batch_size (int, optional): Batch size passed to
                ``predict_sentences``. Defaults to 1.
        """

        self.batch_size = batch_size
        self.predictor = RNNMorphPredictor(language="ru")

    def __del__(self):
        if hasattr(self, 'predictor'):
            del self.predictor

    def __copy__(self):
        """Return a copy that shares the predictor with the original."""
        cls = self.__class__
        result = cls.__new__(cls)
        # batch_size was previously not copied, so gettags() on a copy
        # failed with AttributeError.
        result.batch_size = self.batch_size
        result.predictor = self.predictor
        return result

    def __deepcopy__(self, memodict=None):
        """Same as ``__copy__``: the predictor is shared, not duplicated."""
        return self.__copy__()

    def gettags(self, texts: list) -> list:
        """Get morpho tags for the `texts`

        Args:
            texts (list): List of lists

        Raises:
            ValueError: if `texts` is not a list.

        Returns:
            list: list of lists -- words and morpho tags

        Example:
            Preprocessor().gettags([['я купил самолёт и ракеты'], ['ух ты']])
            [[['<sil>', 'SIL _'],
              ['я', 'PRON Case=Nom|Number=Sing|Person=1'],
              ['купил', 'VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'],
              ['самолёт', 'NOUN Case=Acc|Gender=Masc|Number=Sing'],
              ['и', 'CONJ _'],
              ['ракеты', 'NOUN Case=Acc|Gender=Fem|Number=Plur'],
              ['<sil>', 'SIL _']],
             [['<sil>', 'SIL _'],
              ['ух', 'INTJ _'],
              ['ты', 'PRON Case=Nom|Number=Sing|Person=2'],
              ['<sil>', 'SIL _']]]
        """

        if not isinstance(texts, list):
            raise ValueError(
                f'Expected `{type([1, 2])}`, but got `{type(texts)}`.')
        if len(texts) == 0:
            return []
        all_phonetic_phrases = []
        all_phrases_for_rnnmorph = []
        for cur_text in texts:
            list_of_phonetic_phrases = [
                cur.strip() for cur in ' '.join(cur_text).split('<sil>')
            ]
            united_phrase_for_rnnmorph = []
            for phonetic_phrase in list_of_phonetic_phrases:
                united_phrase_for_rnnmorph += phonetic_phrase.split()
            if united_phrase_for_rnnmorph:
                all_phrases_for_rnnmorph.append(united_phrase_for_rnnmorph)
                all_phonetic_phrases.append(list_of_phonetic_phrases)
            else:
                all_phonetic_phrases.append([])
        if all_phrases_for_rnnmorph:
            all_forms = self.predictor.predict_sentences(
                all_phrases_for_rnnmorph, batch_size=self.batch_size)
        else:
            all_forms = []
        all_words_and_tags = []
        phrase_ind = 0

        for cur in all_phonetic_phrases:
            words_and_tags = [['<sil>', 'SIL _']]
            if len(cur) > 0:
                token_ind = 0
                for phonetic_phrase in cur:
                    if len(phonetic_phrase) > 0:
                        # Count tokens exactly as they were produced above
                        # (whitespace split); split(' ') over-counted when a
                        # phrase contained consecutive spaces and shifted
                        # every following phrase.
                        n = len(phonetic_phrase.split())
                        analysis = all_forms[phrase_ind][token_ind:token_ind + n]
                        for word in analysis:
                            words_and_tags.append(
                                [word.word, word.pos + ' ' + word.tag])
                        words_and_tags.append(['<sil>', 'SIL _'])
                        token_ind += n
                phrase_ind += 1
            all_words_and_tags.append(words_and_tags)
        return all_words_and_tags

    def __call__(self, texts: "str | list"):
        """Call the instance like function. Use in pipelines, too.

        Returns the tagged tokens of the first (or only) text.
        """
        return self.preprocessing(texts)[0]

    def preprocessing(self, texts: "str | list"):
        """Normalize and tag one text or a list of texts.

        Args:
            texts (str | list): A single text or a list of texts. A single
                string is treated as one text (it used to be iterated
                character by character).

        Returns:
            list: For each text, a list of processed words and tags.
        """
        def prepare(text: str) -> list:
            """Replace punctuation marks with <sil> tag; remove special symbols."""

            text = sub(r'[\.\,\?\!\(\);:]+', ' <sil>', text.lower())
            text = sub(r' [–-] |\n', ' <sil> ', text)
            text = sub(r'\s{2,}', ' ', text)
            text = sub(r'^\s|(?<!\w)[\\\/@#~¬`£€\$%\^\&\*–_=+\'\"\|«»–-]+', '',
                       text)
            return text.strip().split(' ')

        if isinstance(texts, str):
            texts = [texts]
        return self.gettags([prepare(cur) for cur in texts])
示例#24
0
from rnnmorph.predictor import RNNMorphPredictor
from pprint import pprint

if __name__ == '__main__':
    # Small demo: tag one word list and one batch of sentences, printing
    # normal form, POS and tag in aligned columns.
    row_template = '{:<15} {:<10} {}'
    pr = RNNMorphPredictor(language='ru')

    forms = pr.predict(words=['мама', 'мыла', 'раму'])
    for form in forms:
        print(row_template.format(form.normal_form, form.pos, form.tag))

    # predict_sentences returns one list of forms per input sentence.
    forms = pr.predict_sentences(sentences=[['Всем', 'привет']])
    for form in forms[0]:
        print(row_template.format(form.normal_form, form.pos, form.tag))

    pprint(forms)
class RNNMorphWrapper:
    """
    Obtains grammeme information for tokens using RNNMorph and pymorphy2.
    """
    def __init__(self):
        self._graph = tf.Graph()
        self._session = tf.Session(graph=self._graph)
        # Build the predictor while the private graph and session are the defaults.
        with self._session.as_default():
            with self._graph.as_default():
                self.rnnmorph = RNNMorphPredictor(language="ru")
        self.pymorphy_analyzer = pymorphy2.MorphAnalyzer()
        self.latin = re.compile("^[0-9]*[A-Za-z]+[0-9]*$")
        # "ё" (U+0451) lies outside the а-я range and must be listed
        # explicitly; the previous pattern repeated "е" instead.
        self.cyrillic = re.compile("[А-Яа-яЁё]+")

    def _choose_pymorphy_form(self, word, lemma, pos):
        """Pick a pymorphy2 hypothesis for *word* and derive extra attributes.

        :return: tuple ``(other, tags_to_add, changed_lemma)`` where ``other``
            marks surnames ("фам") and patronymics ("отч"), ``tags_to_add``
            holds transitivity / animacy (nouns only) / aspect, and
            ``changed_lemma`` is the lemma with "ё" replaced by "е".
        """
        hypotheses = self.pymorphy_analyzer.parse(word)
        hyp = None
        tags_to_add = {}
        other = ""
        # NOTE(review): when no hypothesis matches the lemma, `hyp` is left
        # pointing at the LAST hypothesis rather than None - confirm this
        # fallback is intended.
        for hyp in hypotheses:
            if hyp.normal_form == lemma:
                break
        changed_lemma = lemma.replace("ё", "е")
        if not hyp:
            return other, tags_to_add, changed_lemma
        str_tag = str(hyp.tag)
        if "Surn" in str_tag:
            other = "фам"
            changed_lemma = word.lower().replace("ё", "е")
        elif "Patr" in str_tag:
            other = "отч"
            # For the patronymic "Петрович" the lemma unexpectedly comes back
            # as "Пётр", so the word itself is used instead.
            changed_lemma = word.lower().replace("ё", "е")
        if hyp.tag.transitivity:
            tags_to_add[TRANSITIVITY] = str(hyp.tag.transitivity)
        if hyp.tag.animacy and pos == "NOUN":
            tags_to_add[ANIMACY] = str(hyp.tag.animacy)
        if hyp.tag.aspect:
            tags_to_add[ASPECT] = str(hyp.tag.aspect)
        return other, tags_to_add, changed_lemma

    def _change_pos(self, token, analysis):
        """Store the part of speech in *token*, forcing "X" for Latin tokens
        and for Cyrillic words that RNNMorph tagged as punctuation."""
        if re.match(self.latin, analysis.word):
            token[GRAMMEM_INFO][PART_OF_SPEECH] = "X"
        elif analysis.pos == "PUNCT" and re.search(self.cyrillic,
                                                   analysis.word):
            token[GRAMMEM_INFO][PART_OF_SPEECH] = "X"
        else:
            token[GRAMMEM_INFO][PART_OF_SPEECH] = analysis.pos
        return token

    def _gram_info_processing(self, tags_to_add, analysis):
        """Merge the RNNMorph tag with *tags_to_add*.

        :return: tuple of a key-sorted dict of lower-cased grammemes and the
            same data serialized as ``key=value|key=value``. Both are empty
            when the RNNMorph tag is ``"_"`` (``tags_to_add`` is then ignored).
        """
        gramme_info = {}
        if analysis.tag != "_":
            for tag in analysis.tag.split("|"):
                parts = tag.split("=")
                gramme_info[parts[0].lower()] = parts[1].lower()
            gramme_info.update(tags_to_add)
        sorted_gramme_info = {
            key: gramme_info[key]
            for key in sorted(gramme_info)
        }
        raw_gram_info = "|".join(
            key + "=" + value for key, value in sorted_gramme_info.items())
        return sorted_gramme_info, raw_gram_info

    def _rnnmorph_to_token_dicti(self, token, analysis):
        """Write grammeme info, part of speech and lemma into the *token* dict."""
        additional_info, tags_to_add, changed_lemma = self._choose_pymorphy_form(
            analysis.word, analysis.normal_form, analysis.pos)
        sorted_gramme_info, raw_gram_info = self._gram_info_processing(
            tags_to_add, analysis)
        token[GRAMMEM_INFO] = sorted_gramme_info
        token[GRAMMEM_INFO][RAW_GRAM_INFO] = raw_gram_info
        if additional_info:
            token[GRAMMEM_INFO][OTHER] = additional_info
        token = self._change_pos(token, analysis)
        token[LEMMA] = changed_lemma
        return token

    def token_desc_list_processing(self, token_desc_list):
        """
        Get the list of tokens with their descriptions.

        :param token_desc_list: list of token dicts
        :return: list of dicts enriched with morphological information
        """
        raw_token_list = [token[TEXT] for token in token_desc_list]
        with self._session.as_default():
            with self._graph.as_default():
                analyze_result = self.rnnmorph.predict(raw_token_list)

        res = []
        for i, tokenized_element in enumerate(token_desc_list):
            res.append(
                self._rnnmorph_to_token_dicti(tokenized_element,
                                              analyze_result[i]))
        return res

    def __call__(self, token_desc_list):
        """
        Collect grammeme information from RNNMorph + pymorphy2.

        The input token list is split into sentences; each sentence is
        enriched with grammatical attributes and followed by a
        sentence-endpoint token.

        :param token_desc_list (list of dicts)
        :return: final_result (enriched list of dicts)
        """
        final_result = []
        sentences = token_list_to_sentences(token_desc_list)
        for sentence in sentences:
            final_result.extend(self.token_desc_list_processing(sentence))
            if final_result:
                final_result.append({
                    TEXT:
                    ".",
                    LEMMA:
                    ".",
                    TOKEN_TYPE:
                    SENTENCE_ENDPOINT_TOKEN,
                    TOKEN_VALUE: {
                        VALUE: "."
                    },
                    LIST_OF_TOKEN_TYPES_DATA: [{
                        TOKEN_TYPE: SENTENCE_ENDPOINT_TOKEN,
                        TOKEN_VALUE: {
                            VALUE: "."
                        }
                    }]
                })
        return final_result
示例#26
0
 def __init__(self, batch_size=1):
     """Store *batch_size* (default 1) and create the Russian RNNMorph predictor."""
     self.batch_size = batch_size
     self.predictor = RNNMorphPredictor(language='ru')
示例#27
0
def get_morph_predictor():
    """Create and return a new RNNMorph predictor for the Russian language."""
    predictor = RNNMorphPredictor(language="ru")
    return predictor