Exemplo n.º 1
0
def generate_g2p_dict(args):
    """Generate a pronunciation dictionary with a G2P model.

    The words to transcribe come from ``args.input_path``: if it is a
    directory it is loaded as a ``Corpus`` and its words are collected with
    ``get_word_set``; otherwise it is read as a UTF-8 text file of
    whitespace-separated words.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``temp_directory``, ``input_path``, ``include_bracketed``,
            ``g2p_model_path`` and ``output_path``.
    """
    if args.temp_directory:
        temp_dir = os.path.expanduser(args.temp_directory)
    else:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')

    # Expand "~" once, up front, so the directory test, the corpus loader and
    # the word-list reader all look at the same path.
    input_path = os.path.expanduser(args.input_path)

    if os.path.isdir(input_path):
        corpus_name = os.path.basename(input_path)
        if not corpus_name:
            # A trailing separator leaves basename empty; take the name of
            # the directory itself (without modifying ``args``).
            corpus_name = os.path.basename(os.path.dirname(input_path))
        data_directory = os.path.join(temp_dir, corpus_name)

        corpus = Corpus(input_path, data_directory)
        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        word_set = set()
        with open(input_path, 'r', encoding='utf8') as f:
            for line in f:
                word_set.update(line.split())

    model = G2PModel(args.g2p_model_path)
    gen = PhonetisaurusDictionaryGenerator(model,
                                           word_set,
                                           args.output_path,
                                           temp_directory=temp_dir)
    gen.generate()
Exemplo n.º 2
0
def test_training(sick_dict, sick_g2p_model_path):
    """Train a Phonetisaurus model and check the metadata of the saved model.

    Args:
        sick_dict: Dictionary object handed to the trainer; its
            ``nonsil_phones`` must match the phones stored in the model.
        sick_g2p_model_path: Path the trainer is given and from which the
            resulting ``G2PModel`` is loaded.
    """
    g2p_trainer = PhonetisaurusTrainer(sick_dict, sick_g2p_model_path, korean=False)
    g2p_trainer.train()

    trained_model = G2PModel(sick_g2p_model_path)
    assert trained_model.meta['version'] == __version__
    assert trained_model.meta['architecture'] == 'phonetisaurus'
    assert trained_model.meta['phones'] == sick_dict.nonsil_phones
Exemplo n.º 3
0
def test_training(sick_dict, sick_g2p_model_path):
    """Validate and train a Phonetisaurus model, then check the saved metadata.

    Args:
        sick_dict: Dictionary object handed to the trainer; its
            ``nonsil_phones`` must match the phones stored in the model.
        sick_g2p_model_path: Path the trainer is given and from which the
            resulting ``G2PModel`` is loaded.
    """
    g2p_trainer = PhonetisaurusTrainer(sick_dict, sick_g2p_model_path, window_size=2)
    # Validation runs before training, in that order.
    g2p_trainer.validate()
    g2p_trainer.train()

    trained_model = G2PModel(sick_g2p_model_path)
    assert (trained_model.meta['version'] == __version__)
    assert (trained_model.meta['architecture'] == 'phonetisaurus')
    assert (trained_model.meta['phones'] == sick_dict.nonsil_phones)
Exemplo n.º 4
0
def generate_dict(args):
    """Generate a pronunciation dictionary for the words of a corpus.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``temp_directory``, ``corpus_directory``, ``g2p_model_path`` and
            ``output_path``.
    """
    # Fall back to the module-level TEMP_DIR when no directory was given.
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)
    corpus_directory = os.path.expanduser(args.corpus_directory)

    # The corpus keeps its working files in a 'corpus' subfolder of temp_dir.
    loaded_corpus = Corpus(corpus_directory, os.path.join(temp_dir, 'corpus'))
    words = loaded_corpus.word_set
    g2p_model = G2PModel(args.g2p_model_path)

    generator = PhonetisaurusDictionaryGenerator(
        g2p_model, words, args.output_path, temp_directory=temp_dir)
    generator.generate()
def generate_dict(args):
    """Generate a pronunciation dictionary for a corpus with a G2P model.

    Unlike a word-set based call, the ``Corpus`` object itself is handed to
    the generator, together with the ``korean`` flag from ``args``.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``temp_directory``, ``corpus_directory``, ``g2p_model_path``,
            ``output_path`` and ``korean``.
    """
    # Fall back to the module-level TEMP_DIR when no directory was given.
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)
    corpus_directory = os.path.expanduser(args.corpus_directory)
    # The second Corpus argument is deliberately the empty string here.
    loaded_corpus = Corpus(corpus_directory, "")

    g2p_model = G2PModel(args.g2p_model_path)

    generator = PhonetisaurusDictionaryGenerator(
        g2p_model, loaded_corpus, args.output_path,
        temp_directory=temp_dir, korean=args.korean)
    generator.generate()
Exemplo n.º 6
0
def generate_dictionary(args):
    """Write a pronunciation dictionary for a corpus or a word list.

    The words come from ``args.input_path``: if it is a directory it is
    loaded as a ``Corpus`` and its words are collected with ``get_word_set``;
    otherwise it is read as a UTF-8 text file of whitespace-separated words,
    from which bracketed words (as judged by ``check_bracketed``) are dropped
    unless ``args.include_bracketed`` is set.

    When ``args.g2p_model_path`` is given, pronunciations are produced by the
    G2P model. Otherwise each word is written with its own characters,
    space-separated, as its pronunciation.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``temp_directory``, ``input_path``, ``include_bracketed``,
            ``g2p_model_path`` and ``output_path``.
    """
    print("Generating pronunciations from G2P model")
    if args.temp_directory:
        temp_dir = os.path.expanduser(args.temp_directory)
    else:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')

    # Expand "~" once, up front, so the directory test, the corpus loader and
    # the word-list reader all look at the same path.
    input_path = os.path.expanduser(args.input_path)

    if os.path.isdir(input_path):
        corpus_name = os.path.basename(input_path)
        if not corpus_name:
            # A trailing separator leaves basename empty; take the name of
            # the directory itself (without modifying ``args``).
            corpus_name = os.path.basename(os.path.dirname(input_path))
        data_directory = os.path.join(temp_dir, corpus_name)

        corpus = Corpus(input_path, data_directory)
        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        word_set = set()
        with open(input_path, 'r', encoding='utf8') as f:
            for line in f:
                word_set.update(line.split())
        if not args.include_bracketed:
            # Keep this a set so both branches hand over the same type.
            word_set = {x for x in word_set if not check_bracketed(x)}

    if args.g2p_model_path is not None:
        model = G2PModel(args.g2p_model_path)
        gen = PhonetisaurusDictionaryGenerator(model,
                                               word_set,
                                               args.output_path,
                                               temp_directory=temp_dir)
        gen.generate()
    else:
        # No model: spell each word out character by character.
        with open(args.output_path, "w", encoding='utf8') as f:
            for word in sorted(word_set):
                f.write('{} {}\n'.format(word, ' '.join(word)))
Exemplo n.º 7
0
def test_generator(sick_g2p_model_path, sick_corpus, g2p_sick_output):
    """Generate a dictionary from a G2P model and check the output exists.

    Args:
        sick_g2p_model_path: Path from which the ``G2PModel`` is loaded.
        sick_corpus: Corpus object handed to the generator as word source.
        g2p_sick_output: Path the generator is told to write to.
    """
    g2p_model = G2PModel(sick_g2p_model_path)
    generator = PhonetisaurusDictionaryGenerator(g2p_model, sick_corpus, g2p_sick_output)
    generator.generate()
    output_was_written = os.path.exists(g2p_sick_output)
    assert output_was_written
Exemplo n.º 8
0
# Scratch directory later handed to the aligner.
outdir = tempfile.mkdtemp()

# Encode the audio into the corpus input directory as 1.wav
# (ffmpeg flags: -ar 16000 sets the sample rate, -ac 1 a single channel).
subprocess.call([
    'ffmpeg', '-i', AUDIO_PATH, '-ar', '16000', '-ac', '1',
    os.path.join(corpse_dir_in, '1.wav')
])

# Copy the transcript next to the audio as 1.lab. The context managers close
# both handles, so the copy is flushed to disk before the corpus is loaded.
with open(TEXT_PATH) as text_in, \
        open(os.path.join(corpse_dir_in, '1.lab'), 'w') as lab_out:
    lab_out.write(text_in.read())

corpus = Corpus(corpse_dir_in, corpse_dir_out)

acoustic_model = AcousticModel('spanish.zip')
g2p_model = G2PModel('spanish_g2p.zip')

dict_dir = tempfile.mkdtemp()

# The generated pronunciation file only needs to live long enough to be
# loaded into a Dictionary, so a temporary file is used for it.
with tempfile.NamedTemporaryFile() as g2pfh:
    d_gen = PhonetisaurusDictionaryGenerator(g2p_model, WORDS, g2pfh.name)
    d_gen.generate()

    dictionary = Dictionary(g2pfh.name, dict_dir)

acoustic_model.validate(dictionary)

aligner = PretrainedAligner(corpus,
                            dictionary,
                            acoustic_model,
                            outdir,
Exemplo n.º 9
0
# Scratch directory later handed to the aligner.
outdir = tempfile.mkdtemp()

# Encode the audio into the corpus input directory as 1.wav
# (ffmpeg flags: -ar 16000 sets the sample rate, -ac 1 a single channel).
subprocess.call([
    'ffmpeg', '-i', AUDIO_PATH, '-ar', '16000', '-ac', '1',
    os.path.join(corpse_dir_in, '1.wav')
])

# Copy the transcript next to the audio as 1.lab. The context managers close
# both handles, so the copy is flushed to disk before the corpus is loaded.
with open(TEXT_PATH) as text_in, \
        open(os.path.join(corpse_dir_in, '1.lab'), 'w') as lab_out:
    lab_out.write(text_in.read())

corpus = Corpus(corpse_dir_in, corpse_dir_out)

# Model archives are looked up by language name.
acoustic_model = AcousticModel('%s.zip' % LANGUAGE)
g2p_model = G2PModel('%s_g2p.zip' % LANGUAGE)

# Fixed output locations (instead of temporary ones) so the generated
# pronunciations and dictionary files can be inspected afterwards.
dict_dir = 'rush/dict'
# exist_ok lets the script be re-run without failing on the existing folder.
os.makedirs(dict_dir, exist_ok=True)
g2pname = 'rush/g2p'

d_gen = PhonetisaurusDictionaryGenerator(g2p_model, WORDS, g2pname)
d_gen.generate()

dictionary = Dictionary(g2pname, dict_dir)

acoustic_model.validate(dictionary)

aligner = PretrainedAligner(corpus,