def train(): """DeepSpeech2 training.""" # check if set use_gpu=True in paddlepaddle cpu version check_cuda(args.use_gpu) # check if paddlepaddle version is satisfied check_version() if args.use_gpu: place = fluid.CUDAPlace(0) else: place = fluid.CPUPlace() train_generator = DataGenerator(vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config=io.open(args.augment_conf_path, mode='r', encoding='utf8').read(), max_duration=args.max_duration, min_duration=args.min_duration, specgram_type=args.specgram_type, place=place) dev_generator = DataGenerator(vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config="{}", specgram_type=args.specgram_type, place=place) train_batch_reader = train_generator.batch_reader_creator(manifest_path=args.train_manifest, batch_size=args.batch_size, sortagrad=args.use_sortagrad if args.init_from_pretrained_model is None else False, shuffle_method=args.shuffle_method) dev_batch_reader = dev_generator.batch_reader_creator(manifest_path=args.dev_manifest, batch_size=args.batch_size, sortagrad=False, shuffle_method=None) ds2_model = DeepSpeech2Model(vocab_size=train_generator.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, share_rnn_weights=args.share_rnn_weights, place=place, init_from_pretrained_model=args.init_from_pretrained_model, output_model_dir=args.output_model_dir) ds2_model.train(train_batch_reader=train_batch_reader, dev_batch_reader=dev_batch_reader, learning_rate=args.learning_rate, gradient_clipping=400, batch_size=args.batch_size, num_samples=args.num_samples, num_epoch=args.num_epoch, save_epoch=args.save_epoch, num_iterations_print=args.num_iter_print, test_off=args.test_off)
def load_model():
    """Load a trained DeepSpeech2 model and its data generator."""
    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # Load model
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)

    # decoders only accept string encoded in utf-8
    vocab_list = data_generator.vocab_list
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              vocab_list)
    return ds2_model, data_generator, vocab_list
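# Usage sketch for load_model(): load the model once, then decode repeatedly.
# `transcribe_first_utterance` is a hypothetical helper; every call inside it
# already appears elsewhere in this file, and batch_size=1 plus the reuse of
# args.infer_manifest are assumptions made for the example.
def transcribe_first_utterance():
    ds2_model, data_generator, vocab_list = load_model()
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    infer_data = next(batch_reader())
    probs_split = ds2_model.infer_batch_probs(
        infer_data=infer_data,
        feeding_dict=data_generator.feeding)
    result_transcripts = ds2_model.decode_batch_beam_search(
        probs_split=probs_split,
        beam_alpha=args.alpha,
        beam_beta=args.beta,
        beam_size=args.beam_size,
        cutoff_prob=args.cutoff_prob,
        cutoff_top_n=args.cutoff_top_n,
        vocab_list=vocab_list,
        num_processes=args.num_proc_bsearch)
    print(result_transcripts[0])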
def evaluate(): """Evaluate on whole test data for DeepSpeech2.""" # check if set use_gpu=True in paddlepaddle cpu version check_cuda(args.use_gpu) # check if paddlepaddle version is satisfied check_version() if args.use_gpu: place = fluid.CUDAPlace(0) else: place = fluid.CPUPlace() data_generator = DataGenerator( vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, keep_transcription_text=True, place = place, is_training = False) batch_reader = data_generator.batch_reader_creator( manifest_path=args.test_manifest, batch_size=args.batch_size, sortagrad=False, shuffle_method=None) ds2_model = DeepSpeech2Model( vocab_size=data_generator.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, share_rnn_weights=args.share_rnn_weights, place=place, init_from_pretrained_model=args.model_path) # decoders only accept string encoded in utf-8 vocab_list = [chars for chars in data_generator.vocab_list] if args.decoding_method == "ctc_beam_search": ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, vocab_list) errors_func = char_errors if args.error_rate_type == 'cer' else word_errors errors_sum, len_refs, num_ins = 0.0, 0, 0 ds2_model.logger.info("start evaluation ...") for infer_data in batch_reader(): probs_split = ds2_model.infer_batch_probs( infer_data=infer_data, feeding_dict=data_generator.feeding) if args.decoding_method == "ctc_greedy": result_transcripts = ds2_model.decode_batch_greedy( probs_split=probs_split, vocab_list=vocab_list) else: result_transcripts = ds2_model.decode_batch_beam_search( probs_split=probs_split, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, cutoff_top_n=args.cutoff_top_n, vocab_list=vocab_list, num_processes=args.num_proc_bsearch) target_transcripts = infer_data[1] for target, result in zip(target_transcripts, result_transcripts): errors, len_ref = errors_func(target, result) errors_sum += errors len_refs += len_ref num_ins += 1 print("Error rate [%s] (%d/?) = %f" % (args.error_rate_type, num_ins, errors_sum / len_refs)) print("Final error rate [%s] (%d/%d) = %f" % (args.error_rate_type, num_ins, num_ins, errors_sum / len_refs)) ds2_model.logger.info("finish evaluation")
def evaluate():
    """Evaluate on the whole test set for DeepSpeech2."""
    # check the PaddlePaddle environment
    check_cuda(args.use_gpu)
    check_version()
    # select GPU or CPU
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    # create the data generator
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    # create the evaluation batch reader
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        shuffle_method=None)
    # build the DeepSpeech2 model and set it to inference mode
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path,
        is_infer=True)

    # count the entries in the test manifest
    with open(args.test_manifest, 'r', encoding='utf-8') as f_m:
        test_len = len(f_m.readlines())

    # set up the external scorer for beam-search decoding
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta,
                                  args.lang_model_path,
                                  data_generator.vocab_list)
    # choose the error function: character error rate or word error rate
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    errors_sum, len_refs, num_ins = 0.0, 0, 0
    ds2_model.logger.info("start evaluation ...")
    start = time.time()
    # run the evaluation
    for infer_data in batch_reader():
        # get the output probabilities for one batch
        probs_split = ds2_model.infer_batch_probs(infer_data=infer_data)
        # decode
        if args.decoding_method == "ctc_greedy":
            # greedy (best-path) decoding
            result_transcripts = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=data_generator.vocab_list)
        else:
            # beam-search decoding
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=data_generator.vocab_list,
                num_processes=args.num_proc_bsearch)
        target_transcripts = infer_data[1]
        # accumulate the error counts
        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
        print("Error rate [%s] (%d/%d) = %f" %
              (args.error_rate_type, num_ins, test_len, errors_sum / len_refs))
    end = time.time()
    print("Time elapsed: %ds, final error rate [%s] (%d/%d) = %f" %
          ((end - start), args.error_rate_type, num_ins, num_ins,
           errors_sum / len_refs))
    ds2_model.logger.info("finish evaluation!")
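# Note on the running "Error rate" printed above: it is a corpus-level rate,
# errors_sum / len_refs, i.e. total edit operations divided by total reference
# length, not the mean of per-utterance rates. A worked example: references of
# lengths 10 and 90 with 5 and 9 errors give (5 + 9) / (10 + 90) = 0.14,
# whereas averaging the per-utterance rates would give (0.5 + 0.1) / 2 = 0.30.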
def train():
    """DeepSpeech2 training."""
    # check the PaddlePaddle environment
    check_cuda(args.use_gpu)
    check_version()
    # select GPU or CPU
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # create the training data generator
    train_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config=io.open(
            args.augment_conf_path, mode='r', encoding='utf8').read(),
        max_duration=args.max_duration,
        min_duration=args.min_duration,
        specgram_type=args.specgram_type,
        place=place)
    # create the test data generator
    test_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config="{}",
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    # create the training batch reader
    train_batch_reader = train_generator.batch_reader_creator(
        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        sortagrad=args.use_sortagrad
        if args.init_from_pretrained_model is None else False,
        shuffle_method=args.shuffle_method)
    # create the test batch reader
    test_batch_reader = test_generator.batch_reader_creator(
        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)
    # build the DeepSpeech2 model
    ds2_model = DeepSpeech2Model(
        vocab_size=train_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.init_from_pretrained_model,
        output_model_dir=args.output_model_dir,
        vocab_list=test_generator.vocab_list)

    # count the training samples
    num_samples = get_data_len(args.train_manifest, args.max_duration,
                               args.min_duration)
    print("[%s] Number of training samples: %d\n" % (datetime.now(), num_samples))

    # start training
    ds2_model.train(
        train_batch_reader=train_batch_reader,
        dev_batch_reader=test_batch_reader,
        learning_rate=args.learning_rate,
        gradient_clipping=400,
        batch_size=args.batch_size,
        num_samples=num_samples,
        num_epoch=args.num_epoch,
        save_epoch=args.save_epoch,
        num_iterations_print=args.num_iter_print,
        test_off=args.test_off)
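# A hedged sketch of the `get_data_len` helper called above, assuming the
# usual DeepSpeech manifest format: one JSON object per line with a
# "duration" field in seconds. Only utterances inside the duration window are
# counted, mirroring the generator's max/min_duration filtering.
# `get_data_len_sketch` is a hypothetical name, not the repo's exact code.
import io
import json

def get_data_len_sketch(manifest_path, max_duration, min_duration):
    num = 0
    with io.open(manifest_path, mode='r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            duration = json.loads(line)["duration"]
            if min_duration <= duration <= max_duration:
                num += 1
    return num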
def infer(): """Inference for DeepSpeech2.""" # check if set use_gpu=True in paddlepaddle cpu version check_cuda(args.use_gpu) # check if paddlepaddle version is satisfied check_version() if args.use_gpu: place = fluid.CUDAPlace(0) else: place = fluid.CPUPlace() data_generator = DataGenerator( vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, keep_transcription_text=True, place = place, is_training = False) batch_reader = data_generator.batch_reader_creator( manifest_path=args.infer_manifest, batch_size=args.num_samples, sortagrad=False, shuffle_method=None) infer_data = next(batch_reader()) ds2_model = DeepSpeech2Model( vocab_size=data_generator.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, share_rnn_weights=args.share_rnn_weights, place=place, init_from_pretrained_model=args.model_path) # decoders only accept string encoded in utf-8 vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list] if args.decoding_method == "ctc_greedy": ds2_model.logger.info("start inference ...") probs_split = ds2_model.infer_batch_probs( infer_data=infer_data, feeding_dict=data_generator.feeding) result_transcripts = ds2_model.decode_batch_greedy( probs_split=probs_split, vocab_list=vocab_list) else: ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, vocab_list) ds2_model.logger.info("start inference ...") probs_split= ds2_model.infer_batch_probs( infer_data=infer_data, feeding_dict=data_generator.feeding) result_transcripts= ds2_model.decode_batch_beam_search( probs_split=probs_split, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, cutoff_top_n=args.cutoff_top_n, vocab_list=vocab_list, num_processes=args.num_proc_bsearch) error_rate_func = cer if args.error_rate_type == 'cer' else wer target_transcripts = infer_data[1] for target, result in zip(target_transcripts, result_transcripts): print("\nTarget Transcription: %s\nOutput Transcription: %s" % (target, result)) print("Current error rate [%s] = %f" % (args.error_rate_type, error_rate_func(target, result))) ds2_model.logger.info("finish inference")
def infer(transcript_name):
    """Inference for DeepSpeech2."""
    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    infer_data = next(batch_reader())
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    if args.decoding_method == "ctc_greedy":
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_greedy(
            probs_split=probs_split,
            vocab_list=vocab_list)
    else:
        ds2_model.init_ext_scorer(args.alpha, args.beta,
                                  args.lang_model_path, vocab_list)
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)

    transcription = result_transcripts[0].capitalize() + '.'
    print(transcription)
    with codecs.open('dataset/tap/transcription/' + transcript_name + '.txt',
                     'w', 'utf-8') as out_file:
        out_file.write(transcription)
    ds2_model.logger.info("finish inference")
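# Usage sketch: decode the utterances listed in args.infer_manifest and write
# the top transcription to dataset/tap/transcription/sample001.txt
# ('sample001' is a hypothetical file stem chosen for the example).
infer('sample001')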