def test_wfst_decoder_normal_transition():
    """Check that normal_transition extends paths along label-matching arcs.

    A small FST is compiled with arcs 0 -> 1 (blank, 0.2), 1 -> 2 ('a', 0.1)
    and 1 -> 3 ('i', 0.2). Two consecutive transitions (blank at frame 0,
    then 'a' at frame 1) must reach state 1 and then state 2, with the path
    scores accumulating to 0.2 and 0.3 respectively.
    """
    table = PhonemeTable()
    table.add_labels(phonemes)
    compiler = _FstCompiler()
    eps = table.get_epsilon_id()
    blank = table.get_blank_id()
    label_a = table.get_label_id('a')
    label_i = table.get_label_id('i')

    # (source state, destination state, input label, output label, weight)
    arc_specs = (
        (0, 1, blank, eps, 0.2),
        (1, 2, label_a, eps, 0.1),
        (1, 3, label_i, eps, 0.2),
    )
    for arc_spec in arc_specs:
        compiler.add_arc(*arc_spec)
    decoder = WFSTDecoder(compiler.compile())

    # Seed the search with a single zero-score path sitting on state 0.
    initial_path = decoder.Path(
        score=0, prev_path=None, frame_index=0, olabel=None)
    previous = {0: initial_path}

    # Frame 0: the blank label moves the path from state 0 to state 1.
    current = {}
    decoder.normal_transition(previous, current, 0, blank)
    assert 1 in current
    reached = current[1]
    assert round(reached.score, 6) == 0.2
    assert round(reached.prev_path.score, 6) == 0

    # Frame 1: label 'a' moves the path from state 1 to state 2.
    previous, current = current, {}
    decoder.normal_transition(previous, current, 1, label_a)
    assert 2 in current
    reached = current[2]
    assert round(reached.score, 6) == 0.3
    assert reached.frame_index == 1
    assert round(reached.prev_path.score, 6) == 0.2
def test_token_create_fst_with_auxiliary_labels():
    """Check the arcs of the token FST built from a table with auxiliary labels.

    The phoneme table holds the labels 'a' and 'i' plus the auxiliary labels
    '#0' and '#1'. The resulting FST is expected to have five states; for
    each state the number of arcs and every arc (in iteration order) is
    compared against the expectation table below.
    """
    phoneme_table = PhonemeTable()
    phoneme_table.add_labels(['a', 'i'])
    eps = phoneme_table.get_epsilon_id()
    blank = phoneme_table.get_blank_id()
    label_a = phoneme_table.get_label_id('a')
    label_i = phoneme_table.get_label_id('i')
    for aux_name in ('#0', '#1'):
        phoneme_table.set_auxiliary_label(aux_name)
    aux0 = phoneme_table.get_auxiliary_label_id('#0')
    aux1 = phoneme_table.get_auxiliary_label_id('#1')

    fst = Token().create_fst(phoneme_table)
    assert (fst.num_states() == 5)

    # Each entry is (state, [arguments passed to is_expected_arc after the
    # arc itself]). The triples are presumably (input label, output label,
    # next state) -- confirm against the is_expected_arc helper.
    expectations = [
        # start state
        (0, [(blank, eps, 0), (label_a, label_a, 3), (label_i, label_i, 4)]),
        # second state
        (1, [(blank, eps, 1), (eps, eps, 2)]),
        # final (auxiliary) state
        (2, [(eps, eps, 0), (eps, aux0, 2), (eps, aux1, 2)]),
        # state for label 'a'
        (3, [(label_a, eps, 3), (eps, eps, 1)]),
        # state for label 'i'
        (4, [(label_i, eps, 4), (eps, eps, 1)]),
    ]
    for state, expected_arcs in expectations:
        assert (fst.num_arcs(state) == len(expected_arcs))
        arc_iter = fst.arcs(state)
        for expected in expected_arcs:
            is_expected_arc(next(arc_iter), *expected)
def test_wfst_decoder_decode(workdir, words_for_corpus_with_homophones):
    """Decode a hand-written frame label sequence end to end.

    A corpus, vocabulary symbol table, lexicon FST, token FST and grammar FST
    are built inside ``workdir``, combined by ``WFSTDecoder.create_fst``, and
    a fixed sequence of per-frame phoneme labels is decoded. The decoded text
    must equal the expected sentence.
    """
    corpus_path = os.path.join(workdir, 'corpus.txt')
    create_corpus(corpus_path, words_for_corpus_with_homophones)
    vocab_path = os.path.join(workdir, 'vocab.syms')
    vocab = create_vocabulary_symbol_table(vocab_path, corpus_path)

    phoneme_table = PhonemeTable()
    phoneme_table.add_labels(phonemes)

    # Build the three component transducers.
    lexicon_fst = get_lexicon(words_for_corpus_with_homophones).create_fst(
        phoneme_table, vocab, min_freq=0)
    token_fst = Token().create_fst(phoneme_table)
    grammar_path = os.path.join(workdir, 'grammar.fst')
    grammar_fst = Grammar().create_fst(grammar_path, vocab_path, corpus_path)

    decoder = WFSTDecoder()
    decoder.create_fst(token_fst, lexicon_fst, grammar_fst)

    blank_id = phoneme_table.get_blank_id()
    # Look the labels up in the same order as they are first needed.
    ids = {
        label: phoneme_table.get_label_id(label)
        for label in ('a', 'i', 'd', 'e', 's', 'o', 'm', 'r', 'u')
    }

    # Run-length description of the frame sequence: (label id, repeat count).
    runs = [
        (blank_id, 2), (ids['a'], 2), (ids['i'], 3), (ids['d'], 1),
        (ids['e'], 1), (blank_id, 1), (ids['s'], 2), (ids['o'], 3),
        (ids['m'], 1), (ids['e'], 1), (ids['r'], 2), (ids['u'], 1),
    ]
    frame_labels = []
    for label_id, count in runs:
        frame_labels.extend([label_id] * count)

    decoded = decoder.decode(frame_labels, vocab)
    assert decoded == '藍で染める'
args.development_data_dirname) repository_dev = DevelopmentDatasetRepository(development_data_dirpath) dataloaders_dev = [] for dataset_dev in AudioDataset.load_all(repository_dev, phoneme_table): dataloader_dev = DataLoader(dataset_dev, batch_size=args.batch_size, collate_fn=collate_for_ctc) dataloaders_dev.append(dataloader_dev) feature_params_path = os.path.join(args.workdir, args.feature_params_file) feature_params = FeatureParams.load(feature_params_path) model_path = os.path.join(args.workdir, args.model_file) if args.resume is True: print('Loading model ...') model = EESENAcousticModel.load(model_path) else: print('Initializing model ...') blank = phoneme_table.get_blank_id() model = EESENAcousticModel(feature_params.feature_size, args.hidden_size, args.num_layers, phoneme_table.num_labels(), blank=blank) model.to(torch.device(args.device)) model.set_optimizer(args.optimizer, args.lr) print('Training ...') model.train(dataloader_tr, dataloaders_dev, args.epochs) print('Saving model ...') model.save(model_path)
def test_phoneme_table_get_blank_id():
    """Check that a fresh table maps the '<blank>' label to id 1 and back."""
    expected_id = 1
    blank_label = '<blank>'
    table = PhonemeTable()

    # The dedicated accessor and the generic lookups must all agree.
    assert table.get_blank_id() == expected_id
    assert table.get_label_id(blank_label) == expected_id
    assert table.get_label(expected_id) == blank_label
# Prediction flow: load the acoustic model and feature parameters, run all
# input wav files through the model as one batch, then decode each file's
# frame labels into text with the WFST decoder.

# Build the phoneme table; its epsilon id is handed to the decoder below.
phoneme_table = PhonemeTable()
phoneme_table.add_labels(phonemes)
epsilon_id = phoneme_table.get_epsilon_id()

print('Loading model ...')
model_path = os.path.join(args.workdir, args.model_file)
model = EESENAcousticModel.load(model_path)

feature_params_path = os.path.join(args.workdir, args.feature_params_file)
feature_params = FeatureParams.load(feature_params_path)

# Extract features for every wav file and collect them as tensors so they
# can be padded and predicted together.
batch = []
for wav_file in args.wav_files:
    data = extract_feature_from_wavfile(wav_file, feature_params)
    batch.append(torch.from_numpy(data))
output = model.predict(pad_sequence(batch))

for idx, wav_file in enumerate(args.wav_files):
    print('Decoding {} ... '.format(wav_file))
    # The second axis of `output` is indexed by the file's position in the
    # batch; each element of the selected column is converted to an int label.
    # NOTE(review): presumably the first axis is time (frames) -- confirm
    # against EESENAcousticModel.predict.
    frame_labels = [int(frame_label) for frame_label in output[:, idx]]
    # Show the predicted labels with blank frames filtered out.
    print(' acoustic labels = {}'.format(' '.join(
        [phoneme_table.get_label(frame_label)
         for frame_label in frame_labels
         if frame_label != phoneme_table.get_blank_id()]))
    )
    # NOTE(review): the vocabulary and the decoder FST do not depend on the
    # loop variables but are re-read for every wav file; loading them once
    # before the loop may be possible -- confirm that decode() keeps no
    # per-utterance state on the decoder first.
    vocabulary_symbol_path = os.path.join(
        args.workdir, args.vocabulary_symbol_file)
    vocab_symbol = VocabularySymbolTable.load_symbol(
        vocabulary_symbol_path)
    decoder_fst_path = os.path.join(args.workdir, args.decoder_fst_file)
    wfst_decoder = WFSTDecoder()
    wfst_decoder.read_fst(decoder_fst_path)
    print(' text = {} '.format(wfst_decoder.decode(
        frame_labels, vocab_symbol, epsilon_id=epsilon_id)))