示例#1
0
class DictStressPredictor(StressPredictor):
    """Stress predictor backed by a ``StressDict`` lookup."""

    def __init__(self,
                 language="ru",
                 raw_dict_path=None,
                 trie_path=None,
                 zalyzniak_dict=ZALYZNYAK_DICT,
                 cmu_dict=CMU_DICT):
        """
        Create the stress dictionary used for lookups.

        Every argument is forwarded unchanged to ``StressDict``:
        ``language`` positionally, the rest as keyword arguments.
        """
        dict_options = dict(raw_dict_path=raw_dict_path,
                            trie_path=trie_path,
                            zalyzniak_dict=zalyzniak_dict,
                            cmu_dict=cmu_dict)
        self.stress_dict = StressDict(language, **dict_options)

    def predict(self, word: str) -> List[int]:
        """
        Determine the stress of a word from the dictionary.
        Several stress variants may be returned.

        :param word: the word to place stresses in.
        :return stresses: positions of the letters that carry stress.
        """
        vowel_count = count_vowels(word)
        if vowel_count == 0:
            # No vowels, hence no stresses.
            return []
        if vowel_count == 1:
            # A single vowel is the only place the stress can fall.
            return [get_first_vowel_position(word)]

        yo_index = word.find("ё")
        if yo_index != -1:
            # If the word contains "ё", only that letter can be stressed.
            return [yo_index]

        # Look the word up in the dictionary: primary stresses first,
        # then secondary ones.
        lookup = self.stress_dict.get_stresses
        stresses = (lookup(word, Stress.Type.PRIMARY) +
                    lookup(word, Stress.Type.SECONDARY))
        if 'е' not in word:
            return stresses

        # Build every variant of the word in which each 'е' is either kept
        # or replaced by 'ё'. The number of variants doubles per 'е'.
        e_positions = [idx for idx, letter in enumerate(word) if letter == 'е']
        segment_ends = e_positions[1:] + [len(word)]
        variants = [word[:e_positions[0]]]
        for start, end in zip(e_positions, segment_ends):
            # Text between this 'е' and the next one (or the end of word).
            tail = word[start + 1:end]
            variants = [head + letter + tail
                        for head in variants
                        for letter in ('ё', 'е')]

        # Check each variant against the dictionary. For a variant that is
        # present, the position of its first "ё" (if any) is recorded.
        for variant in variants:
            if len(self.stress_dict.get_stresses(variant)) != 0:
                first_yo = variant.find("ё")
                if first_yo != -1:
                    stresses.append(first_yo)
        return stresses
示例#2
0
 def __init__(self,
              language="ru",
              raw_dict_path=None,
              trie_path=None,
              zalyzniak_dict=ZALYZNYAK_DICT,
              cmu_dict=CMU_DICT):
     """
     Create the stress dictionary and store it as ``self.stress_dict``.

     Every argument is forwarded unchanged to ``StressDict``:
     ``language`` positionally, the rest as keyword arguments.
     """
     dict_options = dict(raw_dict_path=raw_dict_path,
                         trie_path=trie_path,
                         zalyzniak_dict=zalyzniak_dict,
                         cmu_dict=cmu_dict)
     self.stress_dict = StressDict(language, **dict_options)
示例#3
0
 def convert_to_phoneme_stress(source_file, destination_file, g2p_dict_path,
                               g2p_model):
     """
     Write a phoneme-level stress file derived from ``source_file``.

     An intermediate file named ``ru_grapheme_stress.txt`` is produced next
     to ``source_file`` by ``ZalyzniakDict.convert_to_accent_only`` and then
     loaded as a ``StressDict``. For every entry the word is transcribed
     with the G2P model, graphemes and phonemes are aligned, and the stress
     positions are mapped with ``ZalyzniakDict.align_stresses``.

     Each output line is ``phonemes<TAB>primary<TAB>secondary``, where the
     stress lists are comma-separated. An entry is skipped when any mapped
     stress points at an aligned symbol that is not in ``Phonemes.VOWELS``;
     each such stress is printed. A running count is printed every 1000
     processed entries.

     :param source_file: path to the source dictionary.
     :param destination_file: path of the file to write (UTF-8).
     :param g2p_dict_path: argument passed to the ``RNNG2PModel`` constructor.
     :param g2p_model: argument passed to ``RNNG2PModel.load``.
     """
     from rupo.g2p.rnn import RNNG2PModel
     from rupo.g2p.aligner import Aligner
     from rupo.stress.dict import StressDict

     g2p_predictor = RNNG2PModel(g2p_dict_path)
     g2p_predictor.load(g2p_model)
     aligner = Aligner()

     # The intermediate grapheme-stress file lives beside the source file.
     source_dir = os.path.dirname(os.path.abspath(source_file))
     grapheme_stress_dict_path = os.path.join(source_dir,
                                              "ru_grapheme_stress.txt")
     ZalyzniakDict.convert_to_accent_only(source_file,
                                          grapheme_stress_dict_path)
     stress_dict = StressDict(raw_dict_path=grapheme_stress_dict_path)
     vowels = set(Phonemes.VOWELS)

     with open(destination_file, 'w', encoding='utf-8') as out:
         entries = enumerate(stress_dict.get_all(), start=1)
         for samples, (word, accents) in entries:
             # Each accent is indexed as (position, stress type, ...).
             primary_in_dict = [
                 int(accent[0]) for accent in accents
                 if accent[1] == StressDict.StressType.PRIMARY
             ]
             secondary_in_dict = [
                 int(accent[0]) for accent in accents
                 if accent[1] == StressDict.StressType.SECONDARY
             ]

             phonemes = g2p_predictor.predict([word])[0]
             g, p = aligner.align(word, phonemes)
             primary = ZalyzniakDict.align_stresses(g, p, primary_in_dict)
             secondary = ZalyzniakDict.align_stresses(
                 g, p, secondary_in_dict)

             # Report every stress that does not land on a vowel; a single
             # offender is enough to drop the entry from the output.
             is_valid = True
             for position in primary + secondary:
                 if p[position] not in vowels:
                     print(g, p, position, p[position])
                     is_valid = False

             if is_valid:
                 primary_field = ",".join(map(str, primary))
                 secondary_field = ",".join(map(str, secondary))
                 out.write("\t".join(
                     [phonemes, primary_field, secondary_field]) + "\n")

             # Progress report; skipped entries are counted as well.
             if samples % 1000 == 0:
                 print(samples)
示例#4
0
 def setUpClass(cls):
     """Build the Russian ``StressDict`` once and store it as ``cls.dict``."""
     dict_options = {
         "language": "ru",
         "zalyzniak_dict": ZALYZNYAK_DICT,
         "raw_dict_path": RU_GRAPHEME_STRESS_PATH,
         "trie_path": RU_GRAPHEME_STRESS_TRIE_PATH,
     }
     cls.dict = StressDict(**dict_options)