def process_text(text: str, stress_predictor) -> 'Markup':
    """
    Build the initial syllable-and-stress markup for a text.

    The text is split on newline characters. For every line only the
    tokens of type WORD are kept; each becomes a Word whose offsets are
    shifted by the offset of the line start within the whole text.

    :param text: text to mark up.
    :param stress_predictor: object with a ``predict(word)`` method; it is
        called with the lower-cased text of every word token.
    :return: Markup built from the original text and the per-line markup.
    """
    markup_lines = []
    line_start = 0
    for raw_line in text.split("\n"):
        word_tokens = [
            candidate
            for candidate in Tokenizer.tokenize(raw_line)
            if candidate.token_type == Token.TokenType.WORD
        ]
        line_words = []
        for token in word_tokens:
            word = Word(
                line_start + token.begin,
                line_start + token.end,
                token.text,
                get_syllables(token.text),
            )
            # Predict the stresses for the word.
            predicted = stress_predictor.predict(token.text.lower())
            # Attach the stresses to syllables; words with at most one
            # syllable are left without explicit stresses.
            if len(word.syllables) > 1:
                word.set_stresses(predicted)
            line_words.append(word)
        line_stop = line_start + len(raw_line)
        markup_lines.append(Line(line_start, line_stop, raw_line, line_words))
        # Skip the newline separator that followed this line.
        line_start = line_stop + 1
    return Markup(text, markup_lines)
def from_raw(self, text: str) -> 'Markup':
    """
    Import markup from raw text in which every word ends with its stress.

    The text is split on newline characters and each line on single
    spaces. Every word must consist of the word itself followed by a
    trailing run of digits and/or "-" characters; that run is parsed with
    ``int`` and passed to ``Word.set_stresses`` as a one-element list.
    Empty lines, lines made only of spaces, and empty chunks produced by
    repeated, leading or trailing spaces are skipped.

    The stored text is rebuilt from the stripped words (joined with single
    spaces, lines joined with newlines), and all offsets refer to that
    rebuilt text.

    :param text: raw text with stress markers at the end of words.
    :return: this object, with ``text`` and ``lines`` replaced.
    :raises ValueError: if a word has no trailing stress marker, consists
        only of a marker, or the marker is not a valid integer.
    """
    pos = 0
    lines = []
    for line in text.split("\n"):
        if line == "":
            continue
        line_tokens = []
        for word in line.split(" "):
            if not word:
                # Repeated/leading/trailing spaces give empty chunks;
                # indexing them used to raise IndexError.
                continue
            # Scan backwards over the trailing stress marker, never moving
            # past the beginning of the word.
            cut = len(word)
            while cut > 0 and (word[cut - 1].isdigit() or word[cut - 1] == "-"):
                cut -= 1
            token, marker = word[:cut], word[cut:]
            if not token or not marker:
                raise ValueError(
                    "Malformed raw word {!r}: expected a word followed by "
                    "a stress marker".format(word))
            line_tokens.append((token, int(marker)))
        if not line_tokens:
            # The line contained nothing but spaces.
            continue
        words = []
        line_begin = pos
        for token, stress in line_tokens:
            syllables = get_syllables(token)
            # Shift syllable offsets by the word's offset in the text.
            for syllable in syllables:
                syllable.begin += pos
                syllable.end += pos
            word = Word(pos, pos + len(token), token, syllables)
            word.set_stresses([stress])
            words.append(word)
            # +1 accounts for the separator following the word.
            pos += len(token) + 1
        # NOTE(review): `pos` here is one past the end of the line text
        # (it already includes the separator); kept as in the original.
        lines.append(
            Line(line_begin, pos, " ".join(token for token, _ in line_tokens), words))
    self.text = "\n".join(line.text for line in lines)
    self.lines = lines
    return self
def count_syllables(word: str) -> int:
    """
    Count the syllables of a word.

    :param word: the word to inspect.
    :return: number of syllables that get_syllables finds in it.
    """
    syllables = get_syllables(word)
    return len(syllables)
def get_word_syllables(word: str) -> List[str]:
    """
    Split a word into syllables and return them as plain strings.

    :param word: the word to split.
    :return: the ``text`` of every syllable produced by get_syllables,
        in the same order.
    """
    texts = []
    for syllable in get_syllables(word):
        texts.append(syllable.text)
    return texts
def __init__(self, text: str, stresses: Set[Stress]) -> None:
    """
    Store the word text and its stresses, then split it into syllables.

    :param text: text of the word.
    :param stresses: stresses of the word.
    """
    self.stresses, self.text = stresses, text
    self.syllables = get_syllables(text)
    # Must run last: it is called once stresses and syllables are both set.
    self.__accent_syllables()