def generate_g2p_dict(args):
    """Generate a pronunciation dictionary with a pretrained G2P model.

    Reads ``temp_directory``, ``input_path``, ``include_bracketed``,
    ``g2p_model_path`` and ``output_path`` from ``args``.

    If ``input_path`` is a directory, it is loaded as a ``Corpus`` and the
    words are collected with ``get_word_set``. Otherwise it is read as a
    UTF-8 text file of whitespace-separated words.

    ``args`` is not modified.
    """
    if args.temp_directory:
        temp_dir = os.path.expanduser(args.temp_directory)
    else:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')

    # Expand "~" once, before any filesystem access. Checking isdir() on the
    # raw value made "~/corpus" look like a non-directory, which then failed
    # when opened as a word-list file.
    input_path = os.path.expanduser(args.input_path)

    if os.path.isdir(input_path):
        corpus_name = os.path.basename(input_path)
        if corpus_name == '':
            # A trailing separator leaves an empty basename; use the name of
            # the directory itself instead.
            corpus_name = os.path.basename(os.path.dirname(input_path))
        data_directory = os.path.join(temp_dir, corpus_name)
        corpus = Corpus(input_path, data_directory)
        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        word_set = set()
        with open(input_path, 'r', encoding='utf8') as f:
            for line in f:
                word_set.update(line.strip().split())

    model = G2PModel(args.g2p_model_path)
    gen = PhonetisaurusDictionaryGenerator(model, word_set, args.output_path,
                                           temp_directory=temp_dir)
    gen.generate()
def test_training(sick_dict, sick_g2p_model_path):
    """Train a Phonetisaurus G2P model and check the metadata of the result."""
    g2p_trainer = PhonetisaurusTrainer(sick_dict, sick_g2p_model_path,
                                       korean=False)
    g2p_trainer.train()

    # Reload the model from the path the trainer was given.
    trained = G2PModel(sick_g2p_model_path)
    assert trained.meta['version'] == __version__
    assert trained.meta['architecture'] == 'phonetisaurus'
    assert trained.meta['phones'] == sick_dict.nonsil_phones
def test_training(sick_dict, sick_g2p_model_path):
    """Validate and train a Phonetisaurus G2P model, then check its metadata."""
    g2p_trainer = PhonetisaurusTrainer(
        sick_dict,
        sick_g2p_model_path,
        window_size=2,
    )
    g2p_trainer.validate()
    g2p_trainer.train()

    # Reload the model from the path the trainer was given.
    trained = G2PModel(sick_g2p_model_path)
    assert (trained.meta['version'] == __version__)
    assert (trained.meta['architecture'] == 'phonetisaurus')
    assert (trained.meta['phones'] == sick_dict.nonsil_phones)
def generate_dict(args):
    """Generate a pronunciation dictionary for the words of a corpus.

    Reads ``temp_directory``, ``corpus_directory``, ``g2p_model_path`` and
    ``output_path`` from ``args``. Falls back to ``TEMP_DIR`` when no
    temporary directory is given.
    """
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)

    corpus_root = os.path.expanduser(args.corpus_directory)
    corpus_output = os.path.join(temp_dir, 'corpus')
    loaded_corpus = Corpus(corpus_root, corpus_output)
    words = loaded_corpus.word_set

    g2p_model = G2PModel(args.g2p_model_path)
    generator = PhonetisaurusDictionaryGenerator(
        g2p_model,
        words,
        args.output_path,
        temp_directory=temp_dir,
    )
    generator.generate()
def generate_dict(args):
    """Generate a pronunciation dictionary for a corpus with a G2P model.

    Reads ``temp_directory``, ``corpus_directory``, ``g2p_model_path``,
    ``output_path`` and ``korean`` from ``args``. Falls back to ``TEMP_DIR``
    when no temporary directory is given. The corpus object itself is
    handed to the generator.
    """
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)

    corpus_root = os.path.expanduser(args.corpus_directory)
    # The corpus is created with an empty output directory argument.
    loaded_corpus = Corpus(corpus_root, "")

    g2p_model = G2PModel(args.g2p_model_path)
    generator = PhonetisaurusDictionaryGenerator(
        g2p_model,
        loaded_corpus,
        args.output_path,
        temp_directory=temp_dir,
        korean=args.korean,
    )
    generator.generate()
def generate_dictionary(args):
    """Write a pronunciation dictionary for a corpus or a word list.

    Reads ``temp_directory``, ``input_path``, ``include_bracketed``,
    ``g2p_model_path`` and ``output_path`` from ``args``.

    If ``input_path`` is a directory, it is loaded as a ``Corpus`` and the
    words are collected with ``get_word_set``. Otherwise it is read as a
    UTF-8 text file of whitespace-separated words, and bracketed words are
    dropped unless ``include_bracketed`` is set.

    When ``g2p_model_path`` is ``None``, no model is used: every word is
    written to ``output_path`` with its own characters, space-separated, as
    the pronunciation.

    ``args`` is not modified.
    """
    print("Generating pronunciations from G2P model")
    if args.temp_directory:
        temp_dir = os.path.expanduser(args.temp_directory)
    else:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')

    # Expand "~" once, before any filesystem access. Checking isdir() on the
    # raw value made "~/corpus" look like a non-directory, which then failed
    # when opened as a word-list file.
    input_path = os.path.expanduser(args.input_path)

    if os.path.isdir(input_path):
        corpus_name = os.path.basename(input_path)
        if corpus_name == '':
            # A trailing separator leaves an empty basename; use the name of
            # the directory itself instead.
            corpus_name = os.path.basename(os.path.dirname(input_path))
        data_directory = os.path.join(temp_dir, corpus_name)
        corpus = Corpus(input_path, data_directory)
        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        word_set = set()
        with open(input_path, 'r', encoding='utf8') as f:
            for line in f:
                word_set.update(line.strip().split())
        if not args.include_bracketed:
            # Stay a set so the collection type does not depend on the flag.
            word_set = {x for x in word_set if not check_bracketed(x)}

    if args.g2p_model_path is not None:
        model = G2PModel(args.g2p_model_path)
        gen = PhonetisaurusDictionaryGenerator(model, word_set,
                                               args.output_path,
                                               temp_directory=temp_dir)
        gen.generate()
    else:
        # No model given: spell each word out character by character.
        with open(args.output_path, "w", encoding='utf8') as f:
            for word in sorted(word_set):
                f.write('{} {}\n'.format(word, ' '.join(word)))
def test_generator(sick_g2p_model_path, sick_corpus, g2p_sick_output):
    """Run the dictionary generator and check that the output path exists."""
    g2p_model = G2PModel(sick_g2p_model_path)
    generator = PhonetisaurusDictionaryGenerator(
        g2p_model,
        sick_corpus,
        g2p_sick_output,
    )
    generator.generate()

    assert os.path.exists(g2p_sick_output)
outdir = tempfile.mkdtemp() # Encode audio into corpse dir subprocess.call([ 'ffmpeg', '-i', AUDIO_PATH, '-ar', '16000', '-ac', '1', os.path.join(corpse_dir_in, '1.wav') ]) # Copy text file open(os.path.join(corpse_dir_in, '1.lab'), 'w').write(open(TEXT_PATH).read()) corpus = Corpus(corpse_dir_in, corpse_dir_out) acoustic_model = AcousticModel('spanish.zip') g2p_model = G2PModel('spanish_g2p.zip') dict_dir = tempfile.mkdtemp() with tempfile.NamedTemporaryFile() as g2pfh: d_gen = PhonetisaurusDictionaryGenerator(g2p_model, WORDS, g2pfh.name) d_gen.generate() dictionary = Dictionary(g2pfh.name, dict_dir) acoustic_model.validate(dictionary) aligner = PretrainedAligner(corpus, dictionary, acoustic_model, outdir,
outdir = tempfile.mkdtemp() # Encode audio into corpse dir subprocess.call([ 'ffmpeg', '-i', AUDIO_PATH, '-ar', '16000', '-ac', '1', os.path.join(corpse_dir_in, '1.wav') ]) # Copy text file open(os.path.join(corpse_dir_in, '1.lab'), 'w').write(open(TEXT_PATH).read()) corpus = Corpus(corpse_dir_in, corpse_dir_out) acoustic_model = AcousticModel('%s.zip' % (LANGUAGE)) g2p_model = G2PModel('%s_g2p.zip' % (LANGUAGE)) dict_dir = 'rush/dict' #tempfile.mkdtemp() os.makedirs(dict_dir) g2pname = 'rush/g2p' with tempfile.NamedTemporaryFile() as g2pfh: d_gen = PhonetisaurusDictionaryGenerator(g2p_model, WORDS, g2pname) #g2pfh.name) d_gen.generate() dictionary = Dictionary(g2pname, dict_dir) #g2pfh.name, dict_dir) acoustic_model.validate(dictionary) aligner = PretrainedAligner(corpus,