예제 #1
0
    def process_text(text: str, stress_predictor) -> 'Markup':
        """
        Build the initial syllable-and-stress markup for a text.

        The text is split on newlines. In every line only tokens whose type is
        ``Token.TokenType.WORD`` are kept; each becomes a ``Word`` whose
        begin/end offsets are shifted by the offset of its line in the text.

        :param text: text to mark up.
        :param stress_predictor: object whose ``predict(word_text)`` result is
            passed to ``Word.set_stresses``.
        :return: markup built from the original text and one ``Line`` per
            newline-separated line.
        """
        from rupo.g2p.graphemes import Graphemes
        markup_lines = []
        line_start = 0
        for raw_line in text.split("\n"):
            line_words = []
            for token in Tokenizer.tokenize(raw_line):
                if token.token_type != Token.TokenType.WORD:
                    continue
                syllables = Graphemes.get_syllables(token.text)
                word = Word(line_start + token.begin, line_start + token.end,
                            token.text, syllables)
                # The predictor is queried for every word, even when the
                # result ends up unused below.
                predicted = stress_predictor.predict(token.text)
                # Stresses are attached only to words with more than one
                # syllable.
                if len(word.syllables) > 1:
                    word.set_stresses(predicted)
                line_words.append(word)
            line_stop = line_start + len(raw_line)
            markup_lines.append(Line(line_start, line_stop, raw_line, line_words))
            # Skip the "\n" that separated this line from the next one.
            line_start = line_stop + 1
        return Markup(text, markup_lines)
예제 #2
0
 def inflate_vocab(self, top_n=None) -> None:
     """
     Build and save a vocabulary of stressed words from this object's word forms.

     For every selected word form the stresses are predicted, a ``Word`` is
     built from the form's text and syllables, and it is added to a
     ``Vocabulary`` created from ``GENERATOR_VOCAB_PATH`` under the form's
     index. The vocabulary is saved once all forms are processed.

     :param top_n: how many leading entries of ``self.word_forms`` to take;
         ``None`` takes all of them.
     """
     vocab = Vocabulary(GENERATOR_VOCAB_PATH)
     stress_predictor = CombinedStressPredictor()
     forms = self.word_forms
     if top_n is not None:
         forms = forms[:top_n]
     # tqdm wraps the sequence itself rather than enumerate(): enumerate()
     # hides the length, which would leave the progress bar without a total.
     for index, word_form in enumerate(tqdm(forms, desc="Accenting words")):
         text = word_form.text
         stresses = stress_predictor.predict(text)
         # Positions are -1: the word is not tied to any place in a text.
         word = Word(-1, -1, text, Graphemes.get_syllables(text))
         word.set_stresses(stresses)
         vocab.add_word(word, index)
     vocab.save()
예제 #3
0
    def from_raw(self, text: str) -> 'Markup':
        """
        Import markup from raw text in which every word ends with its stress.

        Each line is split on single spaces. For every word the trailing run
        of digits and "-" characters is parsed as an integer stress and the
        remaining prefix is the word text. Empty tokens (produced by repeated,
        leading or trailing spaces) and lines without any word are skipped.
        The stored text is rebuilt from the stripped words, and all offsets
        refer to that rebuilt text.

        :param text: raw text with a stress number appended to every word.
        :return: this object, with ``text`` and ``lines`` replaced.
        :raises ValueError: if a word has no stress suffix, consists only of
            a stress suffix, or the suffix is not a valid integer.
        """

        pos = 0
        lines = []
        for line in text.split("\n"):
            line_tokens = []
            for raw_word in line.split(" "):
                if not raw_word:
                    # Repeated or trailing spaces yield empty tokens.
                    continue
                # Find where the trailing stress suffix starts, never
                # scanning past the beginning of the word.
                cut = len(raw_word)
                while cut > 0 and (raw_word[cut - 1].isdigit() or raw_word[cut - 1] == "-"):
                    cut -= 1
                token, stress_text = raw_word[:cut], raw_word[cut:]
                if not token or not stress_text:
                    raise ValueError(
                        "Expected a word followed by its stress, got {!r}".format(raw_word))
                line_tokens.append((token, int(stress_text)))
            if not line_tokens:
                # Empty or whitespace-only line: nothing to add.
                continue
            # Local import as in the original code; presumably avoids a
            # circular import at module load time - TODO confirm.
            from rupo.g2p.graphemes import Graphemes
            words = []
            line_begin = pos
            for token, stress in line_tokens:
                syllables = Graphemes.get_syllables(token)
                # Shift syllable borders from word-relative to text offsets.
                for syllable in syllables:
                    syllable.begin += pos
                    syllable.end += pos
                word = Word(pos, pos + len(token), token, syllables)
                word.set_stresses([stress])
                words.append(word)
                # +1 accounts for the separator that follows the word.
                pos += len(token) + 1
            # NOTE: pos already includes the separator counted after the last
            # word, so the recorded line end is the offset at which the next
            # line starts (kept as in the original implementation).
            lines.append(
                Line(line_begin, pos,
                     " ".join([token for token, _ in line_tokens]), words))
        self.text = "\n".join([line.text for line in lines])
        self.lines = lines
        return self
예제 #4
0
 def inflate_vocab(self, dump_path, top_n=None) -> None:
     """
     Build and save a vocabulary of stressed words from this object's word forms.

     For every selected word form the stresses are predicted, a ``Word`` is
     built from the form's text and syllables, and it is added to a
     ``Vocabulary`` created from ``dump_path`` under the form's index. The
     vocabulary is saved once all forms are processed.

     :param dump_path: path passed to ``Vocabulary``, where the vocabulary is saved.
     :param top_n: how many leading entries of ``self.word_forms`` to take;
         ``None`` takes all of them.
     """
     from rupo.main.vocabulary import Vocabulary
     from rupo.stress.predictor import CombinedStressPredictor
     vocab = Vocabulary(dump_path)
     stress_predictor = CombinedStressPredictor()
     forms = self.word_forms
     if top_n is not None:
         forms = forms[:top_n]
     # tqdm wraps the sequence itself rather than enumerate(): enumerate()
     # hides the length, which would leave the progress bar without a total.
     for index, word_form in enumerate(tqdm(forms, desc="Accenting words")):
         text = word_form.text
         stresses = stress_predictor.predict(text)
         # Positions are -1: the word is not tied to any place in a text.
         word = Word(-1, -1, text, Graphemes.get_syllables(text))
         word.set_stresses(stresses)
         vocab.add_word(word, index)
     vocab.save()
예제 #5
0
파일: api.py 프로젝트: che1974/rupo
 def count_syllables(word: str) -> int:
     """
     Count the syllables of a word.

     :param word: the word.
     :return: number of syllables ``Graphemes.get_syllables`` finds in it.
     """
     syllables = Graphemes.get_syllables(word)
     return len(syllables)
예제 #6
0
파일: api.py 프로젝트: che1974/rupo
 def get_word_syllables(word: str) -> List[str]:
     """
     Split a word into syllables and return their texts.

     :param word: the word.
     :return: the ``text`` of every syllable, in the order
         ``Graphemes.get_syllables`` returns them.
     """
     syllables = Graphemes.get_syllables(word)
     return [part.text for part in syllables]
예제 #7
0
    def test_syllables(self):
        """
        Check ``Graphemes.get_syllables`` against hand-written syllable splits.

        Every word is verified in its own sub-test, so a failure names the
        offending word and the remaining words are still checked.
        """
        # word -> expected syllables, each built as
        # Syllable(begin, end, number, text).
        checks = {
            'я': [Syllable(0, 1, 0, 'я')],
            # Words without vowels have no syllables.
            'в': [],
            'лдж': [],
            'кронв': [Syllable(0, 5, 0, 'кронв')],
            'он': [Syllable(0, 2, 0, 'он')],
            'когда': [Syllable(0, 2, 0, 'ко'),
                      Syllable(2, 5, 1, 'гда')],
            'майка': [Syllable(0, 3, 0, 'май'),
                      Syllable(3, 5, 1, 'ка')],
            'сонька': [Syllable(0, 4, 0, 'сонь'),
                       Syllable(4, 6, 1, 'ка')],
            'соломка': [
                Syllable(0, 2, 0, 'со'),
                Syllable(2, 5, 1, 'лом'),
                Syllable(5, 7, 2, 'ка')
            ],
            'изжить': [Syllable(0, 1, 0, 'и'),
                       Syllable(1, 6, 1, 'зжить')],
            'виться': [Syllable(0, 2, 0, 'ви'),
                       Syllable(2, 6, 1, 'ться')],
            'данный': [Syllable(0, 2, 0, 'да'),
                       Syllable(2, 6, 1, 'нный')],
            'марка': [Syllable(0, 3, 0, 'мар'),
                      Syllable(3, 5, 1, 'ка')],
            'зорька': [Syllable(0, 4, 0, 'зорь'),
                       Syllable(4, 6, 1, 'ка')],
            'банка': [Syllable(0, 3, 0, 'бан'),
                      Syllable(3, 5, 1, 'ка')],
            'банька': [Syllable(0, 4, 0, 'бань'),
                       Syllable(4, 6, 1, 'ка')],
            'лайка': [Syllable(0, 3, 0, 'лай'),
                      Syllable(3, 5, 1, 'ка')],
            'оттечь': [Syllable(0, 1, 0, 'о'),
                       Syllable(1, 6, 1, 'ттечь')],
            'дяденька': [
                Syllable(0, 2, 0, 'дя'),
                Syllable(2, 6, 1, 'день'),
                Syllable(6, 8, 2, 'ка')
            ],
            'подъезд': [Syllable(0, 2, 0, 'по'),
                        Syllable(2, 7, 1, 'дъезд')],
            'морские': [
                Syllable(0, 3, 0, 'мор'),
                Syllable(3, 6, 1, 'ски'),
                Syllable(6, 7, 2, 'е')
            ],
            'мерзкие': [
                Syllable(0, 3, 0, 'мер'),
                Syllable(3, 6, 1, 'зки'),
                Syllable(6, 7, 2, 'е')
            ],
            'полный': [Syllable(0, 2, 0, 'по'),
                       Syllable(2, 6, 1, 'лный')],
            'зародыш': [
                Syllable(0, 2, 0, 'за'),
                Syllable(2, 4, 1, 'ро'),
                Syllable(4, 7, 2, 'дыш')
            ],
            'война': [Syllable(0, 3, 0, 'вой'),
                      Syllable(3, 5, 1, 'на')],
            # The hyphen at index 5 belongs to no syllable.
            'когда-нибудь': [
                Syllable(0, 2, 0, 'ко'),
                Syllable(2, 5, 1, 'гда'),
                Syllable(6, 8, 2, 'ни'),
                Syllable(8, 12, 3, 'будь')
            ],
        }

        for word, borders in checks.items():
            # subTest labels a failure with the word being checked.
            with self.subTest(word=word):
                self.assertEqual(Graphemes.get_syllables(word), borders)
예제 #8
0
 def __init__(self, text: str, stresses: Set[Stress]) -> None:
     """
     Store the word text and stresses, split the text into syllables and
     run the private accenting step.

     :param text: text of the word.
     :param stresses: stresses of the word.
     """
     self.text = text
     self.stresses = stresses
     # Syllables are computed from the raw text before the accenting step
     # runs; presumably that step reads self.syllables and self.stresses -
     # verify against its definition.
     self.syllables = Graphemes.get_syllables(text)
     self.__accent_syllables()