예제 #1
0
def g2p_en():
    """Build an English RNN G2P model and train it.

    The model is created from ``EN_G2P_DICT_PATH`` and trained into
    ``G2P_CURRENT_MODEL_DIR`` with checkpoints enabled.
    """
    # Keyword options are grouped so the hyper-parameters read as one unit.
    options = dict(language="en", rnn=LSTM, units1=256, dropout=0.5)
    # NOTE(review): the positional 40 is presumably a maximum word length;
    # confirm against the RNNG2PModel signature.
    model = RNNG2PModel(EN_G2P_DICT_PATH, 40, **options)
    model.build()
    model.train(G2P_CURRENT_MODEL_DIR, enable_checkpoints=True)
예제 #2
0
파일: api.py 프로젝트: che1974/rupo
 def get_g2p_model(self, language="ru", model_path=None):
     """Return the G2P model for ``language``, loading and caching it on first use.

     :param language: language code; only "ru" and "en" are supported.
     :param model_path: path of the model to load; when None, the default
         model of the language is used.
     :return: the cached or freshly loaded model, or None when the language
         is not supported.
     """
     cached = self.g2p_models.get(language)
     if cached is not None:
         return cached

     if language == "ru":
         default_path = RU_G2P_DEFAULT_MODEL
     elif language == "en":
         default_path = EN_G2P_DEFAULT_MODEL
     else:
         # Unsupported language: report it without leaving an unloaded
         # model behind in the cache.
         return None

     # An explicitly supplied path takes precedence over the default.
     if model_path is None:
         model_path = default_path

     # Store the model only after load() succeeded, so a failed load can
     # never be served from the cache by a later call.
     model = RNNG2PModel(language=language)
     model.load(model_path)
     self.g2p_models[language] = model
     return model
예제 #3
0
def g2p_ru():
    """Build a Russian RNN G2P model and train it.

    The model is created from ``RU_G2P_DICT_PATH`` and trained into
    ``G2P_CURRENT_MODEL_DIR`` with checkpoints enabled.
    """
    # Keyword options are grouped so the hyper-parameters read as one unit.
    options = dict(
        language="ru",
        rnn=LSTM,
        units1=512,
        units2=512,
        dropout=0.4,
        batch_size=128,
        emb_dimension=50,
    )
    # NOTE(review): the positional 30 is presumably a maximum word length;
    # confirm against the RNNG2PModel signature.
    model = RNNG2PModel(RU_G2P_DICT_PATH, 30, **options)
    model.build()
    model.train(G2P_CURRENT_MODEL_DIR, enable_checkpoints=True)
예제 #4
0
    def __init__(self, language: str = "ru", g2p_model_path: str = None):
        """Set up the object and load its G2P model.

        :param language: language code, "ru" or "en".
        :param g2p_model_path: path of the G2P model file; stored as given
            before the language defaults are initialised.
        :raises RuntimeError: if the language is neither "ru" nor "en", or
            if the resulting model path does not exist.
        """
        self.language = language
        self.g2p_model_path = g2p_model_path

        # Reject unknown languages up front.
        if language not in ("ru", "en"):
            raise RuntimeError("Wrong language")
        default_model = (RU_G2P_DEFAULT_MODEL
                         if language == "ru" else EN_G2P_DEFAULT_MODEL)
        # NOTE(review): presumably fills in g2p_model_path when it was not
        # given; the helper is defined outside this view.
        self.__init_language_defaults(default_model)

        model_file_present = os.path.exists(self.g2p_model_path)
        if not model_file_present:
            raise RuntimeError("No g2p model available (or wrong path)")

        self.g2p_model = RNNG2PModel(language=language)
        self.g2p_model.load(self.g2p_model_path)
예제 #5
0
 def convert_to_phoneme_stress(source_file, destination_file, g2p_dict_path,
                               g2p_model):
     """Convert a stress dictionary into a phoneme-level stress file.

     An accent-only dictionary ("ru_grapheme_stress.txt") is first written
     next to ``source_file``. Each of its words is then transcribed with the
     G2P model, aligned with its graphemes, and its stress positions are
     mapped onto phoneme positions. A word is written to
     ``destination_file`` as "phonemes<TAB>primary<TAB>secondary" only when
     every mapped stress falls on a vowel phoneme.

     :param source_file: path of the source dictionary.
     :param destination_file: path of the file to write.
     :param g2p_dict_path: dictionary path passed to the G2P model.
     :param g2p_model: path of the trained G2P model to load.
     """
     from rupo.g2p.rnn import RNNG2PModel
     from rupo.g2p.aligner import Aligner
     from rupo.stress.dict import StressDict

     predictor = RNNG2PModel(g2p_dict_path)
     predictor.load(g2p_model)
     aligner = Aligner()

     source_dir = os.path.dirname(os.path.abspath(source_file))
     grapheme_stress_dict_path = os.path.join(source_dir,
                                              "ru_grapheme_stress.txt")
     ZalyzniakDict.convert_to_accent_only(source_file,
                                          grapheme_stress_dict_path)
     stress_dict = StressDict(raw_dict_path=grapheme_stress_dict_path)
     vowels = set(Phonemes.VOWELS)

     with open(destination_file, 'w', encoding='utf-8') as out:
         entries = enumerate(stress_dict.get_all(), start=1)
         for samples, (word, accents) in entries:
             primary_in_dict = [
                 int(mark[0]) for mark in accents
                 if mark[1] == StressDict.StressType.PRIMARY
             ]
             secondary_in_dict = [
                 int(mark[0]) for mark in accents
                 if mark[1] == StressDict.StressType.SECONDARY
             ]

             phonemes = predictor.predict([word])[0]
             g, p = aligner.align(word, phonemes)
             primary = ZalyzniakDict.align_stresses(g, p, primary_in_dict)
             secondary = ZalyzniakDict.align_stresses(
                 g, p, secondary_in_dict)

             # Every offending stress is reported, so the loop must not
             # stop at the first one.
             is_valid = True
             for position in primary + secondary:
                 if p[position] in vowels:
                     continue
                 print(g, p, position, p[position])
                 is_valid = False

             if is_valid:
                 primary_field = ",".join([str(i) for i in primary])
                 secondary_field = ",".join([str(i) for i in secondary])
                 out.write(phonemes + "\t" + primary_field + "\t" +
                           secondary_field + "\n")

             # Progress report every 1000 processed words.
             if samples % 1000 == 0:
                 print(samples)
예제 #6
0
 def convert_to_g2p_only(dict_file, g2p_dict_path, g2p_model):
     """Write a "word<TAB>transcription" file for the words of a dictionary.

     Each line of ``dict_file`` is split on "#"; the second field is a
     comma-separated word list. Stress marks (' and `) are removed from
     every word, all words are transcribed in a single predict() call, and
     the pairs are written to ``g2p_dict_path``.

     :param dict_file: path of the input dictionary.
     :param g2p_dict_path: path of the file to write.
     :param g2p_model: path of the trained G2P model to load.
     """
     from rupo.g2p.rnn import RNNG2PModel

     predictor = RNNG2PModel()
     predictor.load(g2p_model)

     with open(dict_file, 'r', encoding='utf-8') as source:
         lines = source.readlines()

     with open(g2p_dict_path, 'w', encoding='utf-8') as out:
         # Strip surrounding whitespace, then drop both stress-mark characters.
         words = [
             form.strip().replace("'", "").replace("`", "")
             for line in lines
             for form in line.split("#")[1].split(",")
         ]
         transcriptions = predictor.predict(words)
         for index, word in enumerate(words):
             out.write(word + "\t" + transcriptions[index] + "\n")
예제 #7
0
    def __init__(self,
                 language: str = "ru",
                 stress_model_path: str = None,
                 g2p_model_path: str = None,
                 grapheme_set=RU_GRAPHEME_SET,
                 g2p_dict_path=None,
                 aligner_dump_path=None,
                 ru_wiki_dict=RU_WIKI_DICT,
                 cmu_dict=CMU_DICT):
        """Load the stress and G2P models and create the aligner.

        :param language: language code, "ru" or "en".
        :param stress_model_path: path of the stress model file.
        :param g2p_model_path: path of the G2P model file.
        :param grapheme_set: forwarded to the Aligner.
        :param g2p_dict_path: forwarded to the Aligner.
        :param aligner_dump_path: forwarded to the Aligner.
        :param ru_wiki_dict: forwarded to the Aligner.
        :param cmu_dict: forwarded to the Aligner.
        :raises RuntimeError: if the language is neither "ru" nor "en", or
            if either model path does not exist.
        """
        self.language = language
        self.stress_model_path = stress_model_path
        self.g2p_model_path = g2p_model_path

        # Reject unknown languages up front.
        if language not in ("ru", "en"):
            raise RuntimeError("Wrong language")
        if language == "ru":
            default_models = (RU_STRESS_DEFAULT_MODEL, RU_G2P_DEFAULT_MODEL)
        else:
            default_models = (EN_STRESS_DEFAULT_MODEL, EN_G2P_DEFAULT_MODEL)
        # NOTE(review): presumably fills in the model paths that were not
        # given; the helper is defined outside this view.
        self.__init_language_defaults(*default_models)

        # The G2P path is only checked when the stress path exists.
        models_present = (os.path.exists(self.stress_model_path)
                          and os.path.exists(self.g2p_model_path))
        if not models_present:
            raise RuntimeError(
                "No stress or g2p models available (or wrong paths)")

        self.stress_model = RNNStressModel(language=language)
        self.stress_model.load(self.stress_model_path)
        self.g2p_model = RNNG2PModel(language=language)
        self.g2p_model.load(self.g2p_model_path)

        aligner_dictionaries = dict(ru_wiki_dict=ru_wiki_dict,
                                    cmu_dict=cmu_dict)
        self.aligner = Aligner(language, grapheme_set, g2p_dict_path,
                               aligner_dump_path, **aligner_dictionaries)