示例#1
0
    def process_text(text: str, stress_predictor) -> 'Markup':
        """
        Build the initial syllable and stress markup for a text.

        The text is split on newline characters. Every WORD token of a line
        becomes a ``Word`` whose begin/end offsets are counted from the start
        of the whole text, and every line becomes a ``Line`` holding its words.

        :param text: text to mark up.
        :param stress_predictor: stress predictor; its ``predict`` method is
            called with each lowercased word.
        :return markup: syllable and stress markup of the text.
        """
        markup_lines = []
        line_start = 0
        for raw_line in text.split("\n"):
            line_words = []
            for token in Tokenizer.tokenize(raw_line):
                # Only word tokens take part in the markup.
                if token.token_type != Token.TokenType.WORD:
                    continue
                # Token offsets are relative to the line; shift them so they
                # are relative to the whole text.
                word = Word(line_start + token.begin, line_start + token.end,
                            token.text, get_syllables(token.text))
                # The predictor is consulted for every word, but stresses are
                # only applied to words with more than one syllable.
                predicted = stress_predictor.predict(token.text.lower())
                if len(word.syllables) > 1:
                    word.set_stresses(predicted)
                line_words.append(word)
            line_stop = line_start + len(raw_line)
            markup_lines.append(Line(line_start, line_stop, raw_line, line_words))
            # Skip the newline character that separated this line from the next.
            line_start = line_stop + 1
        return Markup(text, markup_lines)
示例#2
0
    def from_raw(self, text: str) -> 'Markup':
        """
        Import markup from raw text in which every word ends with a stress marker.

        Each space-separated word must consist of the word itself followed by
        an integer marker made of digits and "-" characters; the parsed
        integer is passed to ``Word.set_stresses``. Empty lines are skipped,
        and so are empty tokens produced by repeated, leading or trailing
        spaces (lines that contain no words at all are skipped too).

        The stored text is rebuilt from the words with their markers removed,
        joined by single spaces within a line and by newlines between lines.
        All offsets refer to that rebuilt text.

        :param text: raw text with a stress marker at the end of each word.
        :return: this markup, with ``text`` and ``lines`` replaced.
        :raises ValueError: if a word has no stress marker, consists of the
            marker only, or its marker is not a valid integer.
        """
        pos = 0
        lines = []
        for line in text.split("\n"):
            line_tokens = []
            for raw_word in line.split(" "):
                if not raw_word:
                    # Produced by consecutive/leading/trailing spaces.
                    continue
                # Walk back from the end of the word over the marker
                # characters, never stepping past the start of the word.
                cut = len(raw_word)
                while cut > 0 and (raw_word[cut - 1].isdigit()
                                   or raw_word[cut - 1] == "-"):
                    cut -= 1
                token, marker = raw_word[:cut], raw_word[cut:]
                if not token or not marker:
                    raise ValueError(
                        "Expected a word followed by a stress marker, got %r"
                        % raw_word)
                # int() raises ValueError for malformed markers such as "-".
                line_tokens.append((token, int(marker)))
            if not line_tokens:
                continue
            words = []
            line_begin = pos
            for token, stress in line_tokens:
                syllables = get_syllables(token)
                # Shift syllable offsets by the word's position in the text.
                for syllable in syllables:
                    syllable.begin += pos
                    syllable.end += pos
                word = Word(pos, pos + len(token), token, syllables)
                word.set_stresses([stress])
                words.append(word)
                # +1 accounts for the separator that follows the word.
                pos += len(token) + 1
            # pos has already been advanced past the separator after the last
            # word, i.e. it is the offset at which the next line starts.
            lines.append(
                Line(line_begin, pos,
                     " ".join(token for token, _ in line_tokens), words))
        self.text = "\n".join(line.text for line in lines)
        self.lines = lines
        return self
示例#3
0
 def count_syllables(word: str) -> int:
     """
     Count the syllables of a word.

     :param word: the word.
     :return: number of syllables that ``get_syllables`` finds in it.
     """
     syllables = get_syllables(word)
     return len(syllables)
示例#4
0
 def get_word_syllables(word: str) -> List[str]:
     """
     Split a word into syllables and return them as plain strings.

     :param word: the word.
     :return: the ``text`` of each syllable returned by ``get_syllables``,
         in the same order.
     """
     texts = []
     for syllable in get_syllables(word):
         texts.append(syllable.text)
     return texts
示例#5
0
 def __init__(self, text: str, stresses: Set[Stress]) -> None:
     """
     Store the text and its stresses and split the text into syllables.

     After the attributes are set, the private ``__accent_syllables`` step is
     run (defined elsewhere in the class; presumably it maps the stresses
     onto the syllables - confirm against its implementation).

     :param text: the text to split into syllables.
     :param stresses: set of stresses for the text.
     """
     self.text = text
     self.stresses = stresses
     syllables = get_syllables(text)
     self.syllables = syllables
     # Must run last: all three attributes above are in place by now.
     self.__accent_syllables()