class DictStressPredictor(StressPredictor):
    """Stress predictor that answers from a ``StressDict``."""

    def __init__(self, language="ru", raw_dict_path=None, trie_path=None,
                 zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        """
        Create the predictor and the stress dictionary it queries.

        Every argument is forwarded unchanged to ``StressDict``; their exact
        meaning is defined there.

        :param language: passed as the first positional ``StressDict`` argument.
        :param raw_dict_path: forwarded as ``raw_dict_path``.
        :param trie_path: forwarded as ``trie_path``.
        :param zalyzniak_dict: forwarded as ``zalyzniak_dict``.
        :param cmu_dict: forwarded as ``cmu_dict``.
        """
        dict_options = {
            "raw_dict_path": raw_dict_path,
            "trie_path": trie_path,
            "zalyzniak_dict": zalyzniak_dict,
            "cmu_dict": cmu_dict,
        }
        self.stress_dict = StressDict(language, **dict_options)

    def predict(self, word: str) -> List[int]:
        """
        Determine the stress of a word using the dictionary.

        Several stress variants are possible.

        :param word: the word to place stresses in.
        :return stresses: positions of the letters that carry the stress.
        """
        vowel_total = count_vowels(word)
        if vowel_total == 0:
            # No vowels, hence no stresses.
            return []
        if vowel_total == 1:
            # A single vowel always carries the stress.
            return [get_first_vowel_position(word)]

        yo_index = word.find("ё")
        if yo_index != -1:
            # If the word has the letter "ё", only it can be stressed.
            return [yo_index]

        # Ask the dictionary for known stressed forms: primary, then secondary.
        lookup = self.stress_dict.get_stresses
        stresses = lookup(word, Stress.Type.PRIMARY) + lookup(word, Stress.Type.SECONDARY)
        if 'е' not in word:
            return stresses

        # Enumerate every way of turning some 'е' letters into 'ё'.
        # Each 'е' splits the word into a segment ending right before the
        # next 'е' (or at the end of the word for the last one).
        ye_positions = [idx for idx, letter in enumerate(word) if letter == 'е']
        segment_ends = ye_positions[1:] + [len(word)]
        variants = [word[:ye_positions[0]]]
        for start, end in zip(ye_positions, segment_ends):
            tail = word[start + 1:end]
            # For every prefix the 'ё' variant comes before the 'е' one.
            variants = [
                head + letter + tail
                for head in variants
                for letter in ('ё', 'е')
            ]

        # Check every variant against the dictionary; a variant that is found
        # contributes the position of its first 'ё' (if it has one).
        for candidate in variants:
            if len(self.stress_dict.get_stresses(candidate)) == 0:
                continue
            yo_index = candidate.find("ё")
            if yo_index != -1:
                stresses.append(yo_index)
        return stresses
def __init__(self, language="ru", raw_dict_path=None, trie_path=None,
             zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
    """
    Create the stress dictionary used by this object.

    Every argument is forwarded unchanged to ``StressDict``; their exact
    meaning is defined there.

    :param language: passed as the first positional ``StressDict`` argument.
    :param raw_dict_path: forwarded as ``raw_dict_path``.
    :param trie_path: forwarded as ``trie_path``.
    :param zalyzniak_dict: forwarded as ``zalyzniak_dict``.
    :param cmu_dict: forwarded as ``cmu_dict``.
    """
    dict_options = dict(
        raw_dict_path=raw_dict_path,
        trie_path=trie_path,
        zalyzniak_dict=zalyzniak_dict,
        cmu_dict=cmu_dict,
    )
    self.stress_dict = StressDict(language, **dict_options)
def convert_to_phoneme_stress(source_file, destination_file, g2p_dict_path, g2p_model):
    """
    Convert a grapheme stress dictionary into a phoneme stress dictionary.

    The source file is first converted with
    ``ZalyzniakDict.convert_to_accent_only`` into ``ru_grapheme_stress.txt``,
    placed in the same directory as ``source_file``. For every entry of the
    resulting ``StressDict`` the word is transcribed by the G2P model, aligned
    with its transcription, and its stresses are mapped through
    ``ZalyzniakDict.align_stresses``.

    An entry is written only if every mapped stress index points at a phoneme
    from ``Phonemes.VOWELS``; each offending stress is printed. Output lines
    have the form ``phonemes<TAB>primary<TAB>secondary``, with the stress
    indices comma-separated. The running count of written entries is printed
    every 1000 entries.

    :param source_file: path of the dictionary to convert.
    :param destination_file: path of the file to write (UTF-8, overwritten).
    :param g2p_dict_path: passed to the ``RNNG2PModel`` constructor.
    :param g2p_model: passed to ``RNNG2PModel.load``.
    """
    from rupo.g2p.rnn import RNNG2PModel
    from rupo.g2p.aligner import Aligner
    from rupo.stress.dict import StressDict

    g2p_predictor = RNNG2PModel(g2p_dict_path)
    g2p_predictor.load(g2p_model)
    aligner = Aligner()

    # The intermediate accent-only dictionary lives next to the source file.
    source_dir = os.path.dirname(os.path.abspath(source_file))
    grapheme_stress_dict_path = os.path.join(source_dir, "ru_grapheme_stress.txt")
    ZalyzniakDict.convert_to_accent_only(source_file, grapheme_stress_dict_path)
    stress_dict = StressDict(raw_dict_path=grapheme_stress_dict_path)
    vowel_phonemes = set(Phonemes.VOWELS)

    with open(destination_file, 'w', encoding='utf-8') as out:
        written = 0
        for word, accents in stress_dict.get_all():
            # Each accent is indexed as (position, stress type).
            primary_graphemes = [
                int(accent[0]) for accent in accents
                if accent[1] == StressDict.StressType.PRIMARY
            ]
            secondary_graphemes = [
                int(accent[0]) for accent in accents
                if accent[1] == StressDict.StressType.SECONDARY
            ]

            phonemes = g2p_predictor.predict([word])[0]
            g, p = aligner.align(word, phonemes)
            primary = ZalyzniakDict.align_stresses(g, p, primary_graphemes)
            secondary = ZalyzniakDict.align_stresses(g, p, secondary_graphemes)

            # Report every stress that does not land on a vowel phoneme; no
            # early exit, so all offenders of the entry get printed.
            is_valid = True
            for index in primary + secondary:
                if p[index] not in vowel_phonemes:
                    print(g, p, index, p[index])
                    is_valid = False
            if not is_valid:
                continue

            primary_field = ",".join(map(str, primary))
            secondary_field = ",".join(map(str, secondary))
            out.write("\t".join([phonemes, primary_field, secondary_field]) + "\n")

            written += 1
            if written % 1000 == 0:
                print(written)
def setUpClass(cls):
    """Build a Russian ``StressDict`` and store it on the class as ``dict``."""
    stress_dict = StressDict(
        language="ru",
        zalyzniak_dict=ZALYZNYAK_DICT,
        raw_dict_path=RU_GRAPHEME_STRESS_PATH,
        trie_path=RU_GRAPHEME_STRESS_TRIE_PATH,
    )
    cls.dict = stress_dict