def generate_data(label_type, model, batch_size=1):
    """Build a toy mini-batch from the bundled LDC93S1 sample utterance.

    The sample transcript is encoded once and the resulting label sequence is
    repeated `batch_size` times.

    Args:
        label_type: 'character', 'phone' or 'multitask' ('multitask' is not
            available when model is 'joint_ctc_attention')
        model: 'ctc', 'attention' or 'joint_ctc_attention'
        batch_size: int, forwarded to `read_wav` and used as the number of
            label copies
    Returns:
        A tuple whose layout depends on `model` and `label_type`:
            ctc, character / phone:
                (inputs, labels, inputs_seq_len)
            ctc, multitask:
                (inputs, labels_char, labels_phone, inputs_seq_len)
            attention, character / phone:
                (inputs, labels, inputs_seq_len, labels_seq_len)
            attention, multitask:
                (inputs, labels_char, labels_phone, inputs_seq_len,
                 target_len_char, target_len_phone)
            joint_ctc_attention, character / phone:
                (inputs, att_labels, inputs_seq_len, labels_seq_len,
                 ctc_labels)
        `inputs` and `inputs_seq_len` are whatever `read_wav` returns.
        CTC labels are passed through `list2sparsetensor` (padded_value=-1).
        Attention labels are plain lists of length `batch_size`, and each
        `*_seq_len` / `target_len_*` is a list holding the label length
        `batch_size` times.
    Raises:
        ValueError: if the model / label_type combination is not supported
    """
    supported = {
        'ctc': ('character', 'phone', 'multitask'),
        'attention': ('character', 'phone', 'multitask'),
        'joint_ctc_attention': ('character', 'phone'),
    }
    # Validate before touching any file so that a typo fails fast with a
    # clear message instead of silently returning None
    if model not in supported:
        raise ValueError(
            'model must be one of %s, got %r' % (sorted(supported), model))
    if label_type not in supported[model]:
        raise ValueError(
            'label_type must be one of %s for model %r, got %r' %
            (list(supported[model]), model, label_type))

    def _replicate(ids):
        # Every example in the toy batch shares the same label sequence
        return [ids] * batch_size

    def _lengths(labels):
        return [len(labels[0])] * batch_size

    def _read_chars():
        return read_text('./sample/LDC93S1.txt').replace('.', '')

    def _read_phones():
        return read_phone('./sample/LDC93S1.phn')

    # Make input data
    inputs, inputs_seq_len = read_wav('./sample/LDC93S1.wav',
                                      feature_type='logmelfbank',
                                      batch_size=batch_size)

    if model == 'ctc':
        # CTC transcripts are padded with a space on both sides and the
        # labels are converted to SparseTensor form
        if label_type == 'character':
            labels = _replicate(alpha2num(' ' + _read_chars() + ' '))
            labels = list2sparsetensor(labels, padded_value=-1)
            return inputs, labels, inputs_seq_len

        if label_type == 'phone':
            labels = _replicate(phone2num(_read_phones()))
            labels = list2sparsetensor(labels, padded_value=-1)
            return inputs, labels, inputs_seq_len

        # multitask
        transcript_char = _read_chars()
        transcript_phone = _read_phones()
        labels_char = _replicate(alpha2num(' ' + transcript_char + ' '))
        labels_phone = _replicate(phone2num(transcript_phone))
        labels_char = list2sparsetensor(labels_char, padded_value=-1)
        labels_phone = list2sparsetensor(labels_phone, padded_value=-1)
        return inputs, labels_char, labels_phone, inputs_seq_len

    if model == 'attention':
        # Attention transcripts are wrapped in '<' ... '>' markers
        if label_type == 'character':
            labels = _replicate(alpha2num('<' + _read_chars() + '>'))
            return inputs, labels, inputs_seq_len, _lengths(labels)

        if label_type == 'phone':
            labels = _replicate(phone2num('< ' + _read_phones() + ' >'))
            return inputs, labels, inputs_seq_len, _lengths(labels)

        # multitask
        transcript_char = _read_chars()
        transcript_phone = _read_phones()
        labels_char = _replicate(alpha2num('<' + transcript_char + '>'))
        labels_phone = _replicate(phone2num('< ' + transcript_phone + ' >'))
        return (inputs, labels_char, labels_phone, inputs_seq_len,
                _lengths(labels_char), _lengths(labels_phone))

    # model == 'joint_ctc_attention': the same transcript is encoded twice,
    # once per loss
    if label_type == 'character':
        transcript = _read_chars()
        att_labels = _replicate(alpha2num('<' + transcript + '>'))
        ctc_labels = _replicate(alpha2num(' ' + transcript + ' '))
    else:
        transcript = _read_phones()
        att_labels = _replicate(phone2num('< ' + transcript + ' >'))
        ctc_labels = _replicate(phone2num(transcript))
    labels_seq_len = _lengths(att_labels)
    # Convert to SparseTensor
    ctc_labels = list2sparsetensor(ctc_labels, padded_value=-1)
    return inputs, att_labels, inputs_seq_len, labels_seq_len, ctc_labels
def check_training(self, model_type, label_type):
    """Overfit the attention model on the sample utterance as a smoke test.

    Builds the graph, trains for at most 400 steps on one replicated
    utterance and prints loss / label error rate every 10 steps. Training
    stops early once the LER has failed to improve on the previous check
    five times in a row.

    Args:
        model_type: string, only used in the printed banner; the data and
            the network built here are always the attention model
        label_type: string, 'character' or 'phone' (the 4-way unpack of
            `generate_data` below does not fit other label types)
    """
    print('----- ' + model_type + ', ' + label_type + ' -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
            label_type=label_type,
            model='attention',
            batch_size=batch_size)

        # Define placeholders
        inputs_pl = tf.placeholder(
            tf.float32,
            shape=[batch_size, None, inputs.shape[-1]],
            name='input')

        # `[batch_size, max_time]`
        labels_pl = tf.placeholder(tf.int32,
                                   shape=[None, None],
                                   name='label')

        # These are prepared for computing LER
        indices_true_pl = tf.placeholder(tf.int64, name='indices')
        values_true_pl = tf.placeholder(tf.int32, name='values')
        shape_true_pl = tf.placeholder(tf.int64, name='shape')
        labels_st_true_pl = tf.SparseTensor(indices_true_pl,
                                            values_true_pl,
                                            shape_true_pl)
        indices_pred_pl = tf.placeholder(tf.int64, name='indices')
        values_pred_pl = tf.placeholder(tf.int32, name='values')
        shape_pred_pl = tf.placeholder(tf.int64, name='shape')
        labels_st_pred_pl = tf.SparseTensor(indices_pred_pl,
                                            values_pred_pl,
                                            shape_pred_pl)
        inputs_seq_len_pl = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name='inputs_seq_len')
        labels_seq_len_pl = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name='labels_seq_len')
        keep_prob_input_pl = tf.placeholder(tf.float32,
                                            name='keep_prob_input')
        keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                             name='keep_prob_hidden')

        # Define model graph
        # Two extra classes are reserved for the start / end symbols, whose
        # indices are the last two (see sos_index / eos_index below)
        output_size = 26 + 2 if label_type == 'character' else 61 + 2
        network = BLSTMAttetion(batch_size=batch_size,
                                input_size=inputs[0].shape[1],
                                encoder_num_unit=256,
                                encoder_num_layer=2,
                                attention_dim=128,
                                decoder_num_unit=256,
                                decoder_num_layer=1,
                                embedding_dim=20,
                                output_size=output_size,
                                sos_index=output_size - 2,
                                eos_index=output_size - 1,
                                max_decode_length=50,
                                attention_weights_tempareture=1,
                                logits_tempareture=1,
                                parameter_init=0.1,
                                clip_grad=5.0,
                                clip_activation_encoder=50,
                                clip_activation_decoder=50,
                                dropout_ratio_input=1.0,
                                dropout_ratio_hidden=1.0,
                                weight_decay=0,
                                beam_width=0,
                                time_major=False)

        # Add to the graph each operation
        loss_op, logits, decoder_outputs_train, decoder_outputs_infer = network.compute_loss(
            inputs_pl,
            labels_pl,
            inputs_seq_len_pl,
            labels_seq_len_pl,
            keep_prob_input_pl,
            keep_prob_hidden_pl)
        learning_rate = 1e-3
        train_op = network.train(loss_op,
                                 optimizer='rmsprop',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op_train, decode_op_infer = network.decoder(
            decoder_outputs_train,
            decoder_outputs_infer,
            decode_type='greedy',
            beam_width=1)
        ler_op = network.compute_ler(labels_st_true_pl, labels_st_pred_pl)
        # Not used below; kept so the attribute access is still exercised
        attention_weights = decoder_outputs_infer.attention_scores

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name,
                             parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict
        feed_dict = {
            inputs_pl: inputs,
            labels_pl: labels,
            inputs_seq_len_pl: inputs_seq_len,
            labels_seq_len_pl: labels_seq_len,
            keep_prob_input_pl: network.dropout_ratio_input,
            keep_prob_hidden_pl: network.dropout_ratio_hidden,
            network.lr: learning_rate
        }

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_pre = 1
            not_improved_count = 0
            for step in range(max_steps):

                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                if (step + 1) % 10 == 0:
                    # Evaluate with dropout disabled. A copy is used so the
                    # training feed_dict keeps its own keep_prob values for
                    # the following training steps
                    feed_dict_eval = dict(feed_dict)
                    feed_dict_eval[keep_prob_input_pl] = 1.0
                    feed_dict_eval[keep_prob_hidden_pl] = 1.0

                    # Predict class ids
                    predicted_ids_train, predicted_ids_infer = sess.run(
                        [decode_op_train, decode_op_infer],
                        feed_dict=feed_dict_eval)

                    # Compute accuracy
                    feed_dict_ler = {
                        labels_st_true_pl: list2sparsetensor(labels),
                        labels_st_pred_pl: list2sparsetensor(
                            predicted_ids_infer)
                    }
                    ler_train = sess.run(ler_op, feed_dict=feed_dict_ler)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / ler = %.4f (%.3f sec)' %
                          (step + 1, loss_train, ler_train, duration_step))
                    start_time_step = time.time()

                    # Visualize
                    if label_type == 'character':
                        print('True : %s' % num2alpha(labels[0]))
                        print('Pred (Training) : <%s' %
                              num2alpha(predicted_ids_train[0]))
                        print('Pred (Inference): <%s' %
                              num2alpha(predicted_ids_infer[0]))
                    else:
                        print('True : %s' % num2phone(labels[0]))
                        print('Pred (Training) : < %s' %
                              num2phone(predicted_ids_train[0]))
                        print('Pred (Inference): < %s' %
                              num2phone(predicted_ids_infer[0]))

                    # Early stopping: count consecutive checks where the LER
                    # did not get strictly better than the previous check
                    if ler_train >= ler_train_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 5:
                        print('Model is Converged.')
                        break
                    ler_train_pre = ler_train

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
def do_eval_per(session, decode_op, per_op, network, dataset, label_type,
                eos_index, eval_batch_size=None, is_progressbar=False):
    """Evaluate trained model by Phone Error Rate.

    Decoded hypotheses are always mapped to the 39-phone set before scoring.
    References are mapped the same way unless the dataset's label type is
    already 'phone39', in which case they are used as-is.

    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate (not used; kept so
            existing callers keep working)
        network: network to evaluate
        dataset: An instance of a `Dataset' class
        label_type: string, phone39 or phone48 or phone61
        eos_index: int, the index of <EOS> class
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
    Returns:
        per_global: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    train_label_type = label_type
    data_label_type = dataset.label_type

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    batch_ratio = num_examples / batch_size
    iteration = int(batch_ratio)
    if batch_ratio != iteration:
        iteration += 1
    per_global = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    phone2num_map_file_path = '../metrics/mapping_files/attention/phone2num_' + \
        train_label_type[5:7] + '.txt'
    phone2num_39_map_file_path = '../metrics/mapping_files/attention/phone2num_39.txt'
    phone2phone_map_file_path = '../metrics/mapping_files/phone2phone.txt'

    def _to_phone39_ids(label_ids):
        # indices -> phone strings -> 39-phone strings -> 39-phone indices
        # NOTE(review): references are decoded with the *training* label
        # mapping as well, as in the original code; confirm this is intended
        # when dataset.label_type differs from label_type
        phone_list = num2phone(label_ids, phone2num_map_file_path).split(' ')
        phone_list = map_to_39phone(phone_list,
                                    train_label_type,
                                    phone2phone_map_file_path)
        return phone2num(phone_list, phone2num_39_map_file_path)

    # Loop-invariant: whether the references still need mapping
    map_references = data_label_type != 'phone39'

    for _step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini-batch
        inputs, att_labels_true, _, inputs_seq_len, _, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        # Evaluate by 39 phones
        predicted_ids = session.run(decode_op, feed_dict=feed_dict)
        predicted_ids_phone39 = [_to_phone39_ids(predicted_ids[i_batch])
                                 for i_batch in range(batch_size_each)]
        if map_references:
            labels_true_phone39 = [_to_phone39_ids(att_labels_true[i_batch])
                                   for i_batch in range(batch_size_each)]
        else:
            labels_true_phone39 = att_labels_true

        # Compute edit distance
        labels_true_st = list2sparsetensor(labels_true_phone39,
                                           padded_value=eos_index)
        labels_pred_st = list2sparsetensor(predicted_ids_phone39,
                                           padded_value=eos_index)
        per_local = compute_edit_distance(session,
                                          labels_true_st,
                                          labels_pred_st)
        # Weight by the actual batch size; the last batch may be smaller
        per_global += per_local * batch_size_each

    per_global /= dataset.data_num

    return per_global
def next_batch(self, batch_size=None, session=None):
    """Make mini-batch.

    Infinite generator: once the remaining examples no longer exceed
    `batch_size`, they are emitted as the last mini-batch of the epoch and
    the pool of remaining indices is reset to the whole dataset.

    Args:
        batch_size: int, the size of mini-batch (defaults to
            `self.batch_size`)
        session: session used to evaluate the split tensors; required when
            `self.num_gpu` is not 1
    Returns:
        inputs: list of input data, size `[batch_size]`
        att_labels: list of target labels, size `[batch_size]`
        ctc_labels_st: list of SparseTensor of target labels
        inputs_seq_len: list of length of inputs of size `[batch_size]`
        att_labels_seq_len: list of length of target labels of size
            `[batch_size]`
        input_names: list of file name of input data of size `[batch_size]`

        If num_gpu > 1, each return is divided into a list of size
        `[num_gpu]`.
    Raises:
        ValueError: if `session` is None while `self.num_gpu` is not 1
    """
    if session is None and self.num_gpu != 1:
        raise ValueError('Set session when using multiple GPUs.')

    if batch_size is None:
        batch_size = self.batch_size

    ctc_padded_value = -1

    while True:
        is_last_batch = False
        if len(self.rest) > batch_size:
            if self.is_sorted:
                data_indices = list(self.rest)[:batch_size]
            else:
                # Randomly sample mini-batch
                data_indices = random.sample(list(self.rest), batch_size)
            self.rest -= set(data_indices)
        else:
            # Last mini-batch of the epoch: take what is left and refill
            data_indices = list(self.rest)
            self.rest = set(range(0, self.data_num, 1))
            is_last_batch = True
            if self.data_type == 'train':
                print('---Next epoch---')
            if not self.is_sorted:
                # Shuffle selected mini-batch
                random.shuffle(data_indices)

        num_data = len(data_indices)

        # Compute max frame num in mini-batch
        max_frame_num = max(
            x.shape[0] for x in self.input_list[data_indices])

        # Compute max target label length in mini-batch
        att_max_seq_len = max(map(len, self.att_label_list[data_indices]))
        ctc_max_seq_len = max(map(len, self.ctc_label_list[data_indices]))

        # Initialization
        # float32 so that real-valued features are not truncated to
        # integers when they are copied into the padded buffer
        inputs = np.zeros((num_data, max_frame_num, self.input_size),
                          dtype=np.float32)
        # Padding with <EOS>
        att_labels = np.array([[self.eos_index] * att_max_seq_len] * num_data,
                              dtype=np.int32)
        ctc_labels = np.array([[ctc_padded_value] * ctc_max_seq_len] * num_data,
                              dtype=np.int32)
        inputs_seq_len = np.zeros((num_data,), dtype=np.int32)
        att_labels_seq_len = np.zeros((num_data,), dtype=np.int32)
        input_names = [
            basename(path).split('.')[0]
            for path in np.take(self.input_paths, data_indices, axis=0)]

        # Set values of each data in mini-batch
        for i_batch, x in enumerate(data_indices):
            data_i = self.input_list[x]
            frame_num = data_i.shape[0]
            att_label = self.att_label_list[x]
            ctc_label = self.ctc_label_list[x]
            inputs[i_batch, :frame_num, :] = data_i
            att_labels[i_batch, :len(att_label)] = att_label
            ctc_labels[i_batch, :len(ctc_label)] = ctc_label
            inputs_seq_len[i_batch] = frame_num
            att_labels_seq_len[i_batch] = len(att_label)

        ##########
        # GPU
        ##########
        if self.num_gpu > 1:
            divide_num = self.num_gpu
            if is_last_batch:
                # The last mini-batch may be smaller than batch_size, so
                # pick the largest split count that divides *this* batch
                # evenly (self.rest has already been refilled and must not
                # be used here)
                for i in range(self.num_gpu, 0, -1):
                    if num_data % i == 0:
                        divide_num = i
                        break

            # Now we split the mini-batch data by num_gpu
            inputs = tf.split(inputs, divide_num, axis=0)
            att_labels = tf.split(att_labels, divide_num, axis=0)
            ctc_labels = tf.split(ctc_labels, divide_num, axis=0)
            inputs_seq_len = tf.split(inputs_seq_len, divide_num, axis=0)
            att_labels_seq_len = tf.split(
                att_labels_seq_len, divide_num, axis=0)
            input_names = tf.split(input_names, divide_num, axis=0)

            # Evaluate the split tensors back into concrete values
            inputs = list(map(session.run, inputs))
            att_labels = list(map(session.run, att_labels))
            ctc_labels = list(map(session.run, ctc_labels))
            ctc_labels_st = [list2sparsetensor(ctc_labels_each,
                                               ctc_padded_value)
                             for ctc_labels_each in ctc_labels]
            inputs_seq_len = list(map(session.run, inputs_seq_len))
            att_labels_seq_len = list(map(session.run, att_labels_seq_len))
            input_names = list(map(session.run, input_names))
        else:
            ctc_labels_st = list2sparsetensor(ctc_labels,
                                              padded_value=ctc_padded_value)

        yield (inputs, att_labels, ctc_labels_st, inputs_seq_len,
               att_labels_seq_len, input_names)
def do_train(network, param):
    """Run training.

    Builds the graph, trains for `param['num_epoch']` epochs, logs loss and
    label error rate every 200 steps, saves a checkpoint at the end of every
    epoch and runs the dev evaluation from the 5th epoch on. On completion
    the loss / LER curves are saved and an empty `complete.txt` is written
    into `network.model_dir`.

    Args:
        network: network to train
        param: A dictionary of parameters. The keys read here are
            'label_type', 'train_data_size', 'batch_size', 'num_stack',
            'num_skip', 'optimizer', 'learning_rate' and 'num_epoch'
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         label_type=param['label_type'],
                         train_data_size=param['train_data_size'],
                         batch_size=param['batch_size'],
                         num_stack=param['num_stack'],
                         num_skip=param['num_skip'],
                         is_sorted=True)
    dev_data_step = Dataset(data_type='dev',
                            label_type=param['label_type'],
                            train_data_size=param['train_data_size'],
                            batch_size=param['batch_size'],
                            num_stack=param['num_stack'],
                            num_skip=param['num_skip'],
                            is_sorted=False)
    dev_data_epoch = Dataset(data_type='dev',
                             label_type=param['label_type'],
                             train_data_size=param['train_data_size'],
                             batch_size=param['batch_size'],
                             num_stack=param['num_stack'],
                             num_skip=param['num_skip'],
                             is_sorted=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        network.inputs = tf.placeholder(
            tf.float32,
            shape=[None, None, network.input_size],
            name='input')
        indices_pl = tf.placeholder(tf.int64, name='indices')
        values_pl = tf.placeholder(tf.int32, name='values')
        shape_pl = tf.placeholder(tf.int64, name='shape')
        network.labels = tf.SparseTensor(indices_pl, values_pl, shape_pl)
        network.inputs_seq_len = tf.placeholder(tf.int64,
                                                shape=[None],
                                                name='inputs_seq_len')
        network.keep_prob_input = tf.placeholder(tf.float32,
                                                 name='keep_prob_input')
        network.keep_prob_hidden = tf.placeholder(tf.float32,
                                                  name='keep_prob_hidden')

        # Add to the graph each operation (including model definition)
        loss_op, logits = network.compute_loss(network.inputs,
                                               network.labels,
                                               network.inputs_seq_len,
                                               network.keep_prob_input,
                                               network.keep_prob_hidden)
        train_op = network.train(
            loss_op,
            optimizer=param['optimizer'],
            learning_rate_init=float(param['learning_rate']),
            is_scheduled=False)
        decode_op = network.decoder(logits,
                                    network.inputs_seq_len,
                                    decode_type='beam_search',
                                    beam_width=20)
        ler_op = network.compute_ler(decode_op, network.labels)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name,
                             parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_train_loss, csv_dev_loss = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(network.model_dir,
                                                   sess.graph)
            try:
                # Initialize parameters
                sess.run(init_op)

                # Make mini-batch generator
                mini_batch_train = train_data.next_batch()
                mini_batch_dev = dev_data_step.next_batch()

                # Train model
                # Steps per epoch, rounded up so a partial last mini-batch
                # still counts as a step
                train_step = train_data.data_num / param['batch_size']
                iter_per_epoch = int(train_step)
                if train_step != iter_per_epoch:
                    iter_per_epoch += 1
                max_steps = iter_per_epoch * param['num_epoch']
                start_time_train = time.time()
                start_time_epoch = time.time()
                start_time_step = time.time()
                error_best = 1
                for step in range(max_steps):

                    # Create feed dictionary for next mini batch (train)
                    with tf.device('/cpu:0'):
                        inputs, labels, inputs_seq_len, _ = next(
                            mini_batch_train)
                    feed_dict_train = {
                        network.inputs: inputs,
                        network.labels: list2sparsetensor(labels,
                                                          padded_value=-1),
                        network.inputs_seq_len: inputs_seq_len,
                        network.keep_prob_input: network.dropout_ratio_input,
                        network.keep_prob_hidden: network.dropout_ratio_hidden,
                        network.lr: float(param['learning_rate'])
                    }

                    # Update parameters
                    sess.run(train_op, feed_dict=feed_dict_train)

                    if (step + 1) % 200 == 0:

                        # Create feed dictionary for next mini batch (dev)
                        with tf.device('/cpu:0'):
                            inputs, labels, inputs_seq_len, _ = next(
                                mini_batch_dev)
                        feed_dict_dev = {
                            network.inputs: inputs,
                            network.labels: list2sparsetensor(
                                labels, padded_value=-1),
                            network.inputs_seq_len: inputs_seq_len,
                            network.keep_prob_input: network.dropout_ratio_input,
                            network.keep_prob_hidden: network.dropout_ratio_hidden
                        }

                        # Compute loss
                        # NOTE(review): both losses are computed with the
                        # training keep_prob values, as in the original;
                        # only the LER below runs with dropout disabled
                        loss_train = sess.run(loss_op,
                                              feed_dict=feed_dict_train)
                        loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                        csv_steps.append(step)
                        csv_train_loss.append(loss_train)
                        csv_dev_loss.append(loss_dev)

                        # Change to evaluation mode
                        feed_dict_train[network.keep_prob_input] = 1.0
                        feed_dict_train[network.keep_prob_hidden] = 1.0
                        feed_dict_dev[network.keep_prob_input] = 1.0
                        feed_dict_dev[network.keep_prob_hidden] = 1.0

                        # Compute accuracy & update event file
                        ler_train, summary_str_train = sess.run(
                            [ler_op, summary_train],
                            feed_dict=feed_dict_train)
                        ler_dev, summary_str_dev = sess.run(
                            [ler_op, summary_dev],
                            feed_dict=feed_dict_dev)
                        csv_ler_train.append(ler_train)
                        csv_ler_dev.append(ler_dev)
                        summary_writer.add_summary(summary_str_train,
                                                   step + 1)
                        summary_writer.add_summary(summary_str_dev, step + 1)
                        summary_writer.flush()

                        duration_step = time.time() - start_time_step
                        print(
                            'Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)' %
                            (step + 1, loss_train, loss_dev, ler_train,
                             ler_dev, duration_step / 60))
                        sys.stdout.flush()
                        start_time_step = time.time()

                    # Save checkpoint and evaluate model per epoch
                    if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                        duration_epoch = time.time() - start_time_epoch
                        epoch = (step + 1) // iter_per_epoch
                        print('-----EPOCH:%d (%.3f min)-----' %
                              (epoch, duration_epoch / 60))

                        # Save model (check point)
                        checkpoint_file = join(network.model_dir,
                                               'model.ckpt')
                        save_path = saver.save(sess,
                                               checkpoint_file,
                                               global_step=epoch)
                        print("Model saved in file: %s" % save_path)

                        if epoch >= 5:
                            start_time_eval = time.time()
                            print('=== Dev Evaluation ===')
                            cer_dev_epoch = do_eval_cer(
                                session=sess,
                                decode_op=decode_op,
                                network=network,
                                dataset=dev_data_epoch,
                                label_type=param['label_type'],
                                eval_batch_size=param['batch_size'])
                            if param['label_type'] in ['kana', 'kanji']:
                                print(' CER: %f %%' % (cer_dev_epoch * 100))
                            else:
                                print(' PER: %f %%' % (cer_dev_epoch * 100))

                            if cer_dev_epoch < error_best:
                                error_best = cer_dev_epoch
                                print('■■■ ↑Best Score↑ ■■■')

                            duration_eval = time.time() - start_time_eval
                            print('Evaluation time: %.3f min' %
                                  (duration_eval / 60))

                        # Restart the timers every epoch so the next
                        # epoch / step durations exclude the time spent on
                        # checkpointing and evaluation
                        start_time_epoch = time.time()
                        start_time_step = time.time()

                duration_train = time.time() - start_time_train
                print('Total time: %.3f hour' % (duration_train / 3600))

                # Save train & dev loss, ler
                save_loss(csv_steps, csv_train_loss, csv_dev_loss,
                          save_path=network.model_dir)
                save_ler(csv_steps, csv_ler_train, csv_ler_dev,
                         save_path=network.model_dir)

                # Training was finished correctly
                with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                    f.write('')
            finally:
                # Release the event file even if training is interrupted
                summary_writer.close()
def do_eval_per(session, decode_op, per_op, network, dataset, label_type,
                eos_index, eval_batch_size=None, is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Phone Error Rate.

    Hypotheses and references are both mapped to the 39-phone set before the
    edit distance is computed. Hypotheses are decoded with the training
    label set (`label_type`), references with the dataset's own label set
    (`dataset.label_type`).

    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate (not used; kept so
            existing callers keep working)
        network: network to evaluate
        dataset: An instance of a `Dataset' class
        label_type: string, phone39 or phone48 or phone61
        eos_index: int, the index of <EOS> class
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        per_mean: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    train_label_type = label_type
    eval_label_type = dataset.label_type

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    batch_ratio = num_examples / batch_size
    iteration = int(batch_ratio)
    if batch_ratio != iteration:
        iteration += 1
    per_mean = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    train_phone2num_map_file_path = '../metrics/mapping_files/ctc/' + \
        train_label_type + '_to_num.txt'
    # Reference labels are indexed in the dataset's label set, so they must
    # be decoded with the mapping of eval_label_type, not the training one
    eval_phone2num_map_file_path = '../metrics/mapping_files/ctc/' + \
        eval_label_type + '_to_num.txt'
    phone2num_39_map_file_path = '../metrics/mapping_files/ctc/phone39_to_num.txt'
    phone2phone_map_file_path = '../metrics/mapping_files/phone2phone.txt'

    def _to_phone39_ids(label_ids, source_label_type, source_map_file_path):
        # indices -> phone strings -> 39-phone strings -> 39-phone indices
        phone_list = num2phone(label_ids, source_map_file_path).split(' ')
        phone_list = map_to_39phone(phone_list,
                                    source_label_type,
                                    phone2phone_map_file_path)
        return phone2num(phone_list, phone2num_39_map_file_path)

    for _step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini-batch
        if not is_multitask:
            inputs, labels_true, inputs_seq_len, _, _ = next(mini_batch)
        else:
            inputs, _, labels_true, inputs_seq_len, _, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        # Evaluate by 39 phones
        predicted_ids = session.run(decode_op, feed_dict=feed_dict)

        labels_pred_mapped, labels_true_mapped = [], []
        for i_batch in range(batch_size_each):
            # Hypothesis
            labels_pred_mapped.append(
                _to_phone39_ids(predicted_ids[i_batch],
                                train_label_type,
                                train_phone2num_map_file_path))
            # Reference
            labels_true_mapped.append(
                _to_phone39_ids(labels_true[i_batch],
                                eval_label_type,
                                eval_phone2num_map_file_path))

        # Compute edit distance
        labels_true_st = list2sparsetensor(labels_true_mapped,
                                           padded_value=eos_index)
        labels_pred_st = list2sparsetensor(labels_pred_mapped,
                                           padded_value=eos_index)
        per_each = compute_edit_distance(session,
                                         labels_true_st,
                                         labels_pred_st)
        # Weight by the actual batch size; the last batch may be smaller
        per_mean += per_each * batch_size_each

    per_mean /= dataset.data_num

    return per_mean