def decode_test(session, decode_op, network, dataset, label_type,
                rate=1.0, is_multitask=False):
    """Visualize label outputs.
    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        label_type: phone39 or phone48 or phone61 or character
        rate: rate of evaluation data to use
        is_multitask: if True, evaluate the multitask model
    """
    batch_size = 1
    num_examples = dataset.data_num * rate
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1

    map_file_path_phone = '../evaluation/mapping_files/ctc/phone2num_' + \
        label_type[5:7] + '.txt'
    map_file_path_char = '../evaluation/mapping_files/ctc/char2num.txt'

    for step in range(iteration):
        # Create feed dictionary for next mini batch
        inputs, labels_true, seq_len, input_names = dataset.next_batch(
            batch_size=batch_size)

        # if is_multitask:
        #     if label_type == 'character':
        #         labels_true = labels_true[0]
        #     else:
        #         labels_true = labels_true[1]

        feed_dict = {
            network.inputs_pl: inputs,
            network.seq_len_pl: seq_len,
            network.keep_prob_input_pl: 1.0,
            network.keep_prob_hidden_pl: 1.0
        }

        # Visualize
        batch_size_each = len(labels_true)
        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)
        for i_batch in range(batch_size_each):
            if label_type == 'character':
                print('----- wav: %s -----' % input_names[i_batch])
                print('True: %s' % num2char(
                    labels_true[i_batch], map_file_path_char))
                print('Pred: %s' % num2char(
                    labels_pred[i_batch], map_file_path_char))
            else:
                # Decode test (39 phones)
                print('----- wav: %s -----' % input_names[i_batch])
                print('True: %s' % num2phone(
                    labels_true[i_batch], map_file_path_phone))
                print('Pred: %s' % num2phone(
                    labels_pred[i_batch], map_file_path_phone))
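
# num2phone and num2char are project helpers that are not defined in this file.
# The sketch below is a minimal assumption of how num2phone might work, assuming
# each mapping file holds one "<symbol> <index>" pair per line; num2char is
# assumed to be analogous, concatenating characters (with '_' standing in for a
# space, as suggested by check_reading below).
def num2phone(num_list, map_file_path):
    """Convert a list of label indices into a space-separated phone string."""
    index2phone = {}
    with open(map_file_path, 'r') as f:
        for line in f:
            phone, index = line.strip().split()
            index2phone[int(index)] = phone
    return ' '.join(index2phone[i] for i in num_list)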
def decode_test(session, decode_op, network, dataset, label_type,
                save_path=None):
    """Visualize label outputs of CTC model.
    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        label_type: string, phone39 or phone48 or phone61 or character
        save_path: path to save decoding results
    """
    # Batch size is expected to be 1
    iteration = dataset.data_num

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=1)

    if label_type == 'character':
        map_file_path = '../metric/mapping_files/ctc/char2num.txt'
    else:
        map_file_path = '../metric/mapping_files/ctc/phone2num_' + \
            label_type[5:7] + '.txt'

    if save_path is not None:
        sys.stdout = open(join(network.model_dir, 'decode.txt'), 'w')

    for step in range(iteration):
        # Create feed dictionary for next mini batch
        inputs, labels_true_st, inputs_seq_len, input_names = \
            mini_batch.__next__()

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        # Visualize
        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_true = sparsetensor2list(labels_true_st, batch_size=1)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size=1)

        if label_type == 'character':
            print('----- wav: %s -----' % input_names[0])
            print('True: %s' % num2char(labels_true[0], map_file_path))
            print('Pred: %s' % num2char(labels_pred[0], map_file_path))
        else:
            print('----- wav: %s -----' % input_names[0])
            print('True: %s' % num2phone(labels_true[0], map_file_path))
            print('Pred: %s' % num2phone(labels_pred[0], map_file_path))
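
# A minimal sketch (assumption) of sparsetensor2list, which the functions above
# rely on: it splits a tf.SparseTensorValue produced by the CTC decoder into one
# label list per utterance, using the first index column as the batch index.
def sparsetensor2list(labels_st, batch_size):
    indices = labels_st.indices
    values = labels_st.values
    labels = []
    for i_batch in range(batch_size):
        labels.append(values[indices[:, 0] == i_batch].tolist())
    return labels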
def check_reading(self, num_gpu, is_sorted):
    print('----- num_gpu: ' + str(num_gpu) +
          ', is_sorted: ' + str(is_sorted) + ' -----')
    batch_size = 64
    dataset = DataSet(data_type='train', label_type_second='phone61',
                      batch_size=batch_size,
                      num_stack=3, num_skip=3,
                      is_sorted=is_sorted, is_progressbar=True,
                      num_gpu=num_gpu)

    tf.reset_default_graph()
    with tf.Session().as_default() as sess:
        print('=> Reading mini-batch...')
        map_file_path_char = '../metric/mapping_files/ctc/char2num.txt'
        map_file_path_phone = '../metric/mapping_files/ctc/phone2num_61.txt'

        mini_batch = dataset.next_batch(session=sess)

        iter_per_epoch = int(dataset.data_num /
                             (batch_size * num_gpu)) + 1
        for i in range(iter_per_epoch + 1):
            inputs, labels_char_st, labels_phone_st, inputs_seq_len, input_names = \
                mini_batch.__next__()

            if num_gpu > 1:
                for inputs_gpu in inputs:
                    print(inputs_gpu.shape)
                labels_char_st = labels_char_st[0]
                labels_phone_st = labels_phone_st[0]
            labels_char = sparsetensor2list(labels_char_st,
                                            batch_size=len(inputs))
            labels_phone = sparsetensor2list(labels_phone_st,
                                             batch_size=len(inputs))

            if num_gpu == 1:
                for inputs_i, labels_i in zip(inputs, labels_char):
                    if len(inputs_i) < len(labels_i):
                        print(len(inputs_i))
                        print(len(labels_i))
                        raise ValueError
                for inputs_i, labels_i in zip(inputs, labels_phone):
                    if len(inputs_i) < len(labels_i):
                        print(len(inputs_i))
                        print(len(labels_i))
                        raise ValueError

            str_true_char = num2char(labels_char[0], map_file_path_char)
            str_true_char = re.sub(r'_', ' ', str_true_char)
            str_true_phone = num2phone(labels_phone[0], map_file_path_phone)
            print(str_true_char)
            print(str_true_phone)
            print('-----')
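
# check_reading is written as a test method (it takes self). A hedged sketch of
# how it might be driven as a unit test; the class name and the chosen argument
# combinations are assumptions, not taken from the source.
import unittest


class TestReadDatasetMultitaskCTC(unittest.TestCase):

    def test(self):
        check_reading(self, num_gpu=1, is_sorted=True)
        check_reading(self, num_gpu=1, is_sorted=False)
        check_reading(self, num_gpu=2, is_sorted=True)


if __name__ == '__main__':
    unittest.main()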
def decode_test(session, decode_op, network, dataset, label_type, is_test,
                eval_batch_size=None, rate=1.0):
    """Visualize label outputs.
    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        label_type: phone or character or kanji
        is_test: set to True when evaluating on the test set
        eval_batch_size: batch size on evaluation
        rate: rate of evaluation data to use
    """
    batch_size = 1
    num_examples = dataset.data_num * rate
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1

    map_file_path_phone = '../evaluation/mapping_files/ctc/phone2num.txt'
    if label_type == 'character':
        map_file_path = '../evaluation/mapping_files/ctc/char2num.txt'
    elif label_type == 'kanji':
        map_file_path = '../evaluation/mapping_files/ctc/kanji2num.txt'

    for step in range(iteration):
        # Create feed dictionary for next mini batch
        inputs, labels_true, seq_len, input_names = dataset.next_batch(
            batch_size=batch_size)

        feed_dict = {
            network.inputs_pl: inputs,
            network.seq_len_pl: seq_len,
            network.keep_prob_input_pl: 1.0,
            network.keep_prob_hidden_pl: 1.0
        }

        # Visualize
        batch_size_each = len(labels_true)
        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)
        for i_batch in range(batch_size_each):
            if label_type in ['character', 'kanji']:
                # Convert from list of indices to string
                str_pred = num2char(labels_pred[i_batch], map_file_path)
                if label_type == 'kanji' and is_test:
                    str_true = labels_true[i_batch]
                else:
                    str_true = num2char(labels_true[i_batch], map_file_path)
                print('----- wav: %s -----' % input_names[i_batch])
                print('True: %s' % str_true)
                print('Pred: %s' % str_pred)

            elif label_type == 'phone':
                print('----- wav: %s -----' % input_names[i_batch])
                print('True: %s' % num2phone(labels_true[i_batch],
                                             map_file_path_phone))
                print('Pred: %s' % num2phone(labels_pred[i_batch],
                                             map_file_path_phone))
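
# A hedged usage sketch: restoring a trained model and visualizing decoding
# results on an evaluation set. `network` and `eval1_data` are assumed to be
# built the same way as in do_train below, and the checkpoint is assumed to
# live in network.model_dir; the function name do_decode is hypothetical.
def do_decode(network, eval1_data, label_type):
    with tf.Graph().as_default():
        network.define()
        decode_op = network.decoder(decode_type='beam_search', beam_width=20)
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Restore the latest checkpoint and decode the evaluation set
            ckpt = tf.train.latest_checkpoint(network.model_dir)
            saver.restore(sess, ckpt)
            decode_test(session=sess, decode_op=decode_op, network=network,
                        dataset=eval1_data, label_type=label_type,
                        is_test=True)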
def do_eval_per(session, decode_op, per_op, network, dataset, label_type,
                eval_batch_size=1, rate=1.0, is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Phone Error Rate.
    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        label_type: phone39 or phone48 or phone61
        eval_batch_size: batch size on evaluation
        rate: A float value. Rate of evaluation data to use
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        per_global: phone error rate
    """
    if label_type not in ['phone39', 'phone48', 'phone61']:
        raise ValueError(
            'label_type must be "phone39", "phone48" or "phone61".')

    batch_size = eval_batch_size
    num_examples = dataset.data_num * rate
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1
    per_global = 0

    p2n_map_file_path = '../evaluation/mapping_files/ctc/phone2num_' + \
        label_type[5:7] + '.txt'
    p2n39_map_file_path = '../evaluation/mapping_files/ctc/phone2num_39.txt'
    p2p_map_file_path = '../evaluation/mapping_files/phone2phone.txt'

    iterator = tqdm(range(iteration)) if is_progressbar else range(iteration)
    for step in iterator:
        # Create feed dictionary for next mini batch
        if not is_multitask:
            inputs, labels_true, seq_len, _ = dataset.next_batch(
                batch_size=batch_size)
        else:
            inputs, _, labels_true, seq_len, _ = dataset.next_batch(
                batch_size=batch_size)

        feed_dict = {
            network.inputs_pl: inputs,
            network.seq_len_pl: seq_len,
            network.keep_prob_input_pl: 1.0,
            network.keep_prob_hidden_pl: 1.0
        }

        batch_size_each = len(labels_true)

        if False:
            # Evaluate by 61 phones
            per_local = session.run(per_op, feed_dict=feed_dict)
            per_global += per_local * batch_size_each
        else:
            # Evaluate by 39 phones
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)
            for i_batch in range(batch_size_each):
                # Convert to phone (list of phone strings)
                phone_pred_seq = num2phone(labels_pred[i_batch],
                                           p2n_map_file_path)
                phone_pred_list = phone_pred_seq.split(' ')

                # Map to 39 phones (list of phone strings)
                phone_pred_list = map_to_39phone(phone_pred_list,
                                                 label_type,
                                                 p2p_map_file_path)

                # Convert to num (list of phone indices)
                phone_pred_list = phone2num(phone_pred_list,
                                            p2n39_map_file_path)
                labels_pred[i_batch] = phone_pred_list

            # Compute edit distance
            labels_true_st = list2sparsetensor(labels_true)
            labels_pred_st = list2sparsetensor(labels_pred)
            per_local = compute_edit_distance(session,
                                              labels_true_st,
                                              labels_pred_st)
            per_global += per_local * batch_size_each

    per_global /= dataset.data_num
    print(' Phone Error Rate: %f %%' % (per_global * 100))

    return per_global
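
# A minimal sketch (assumption) of list2sparsetensor, the inverse of
# sparsetensor2list: it packs a list of per-utterance label lists into a
# tf.SparseTensorValue, whose (indices, values, dense_shape) fields can also be
# unpacked as a 3-tuple, as do_train below does. numpy is assumed to be
# imported as np.
import numpy as np


def list2sparsetensor(labels):
    indices, values = [], []
    for i_batch, label_seq in enumerate(labels):
        for i_frame, label in enumerate(label_seq):
            indices.append([i_batch, i_frame])
            values.append(label)
    dense_shape = [len(labels), max(len(label_seq) for label_seq in labels)]
    return tf.SparseTensorValue(np.array(indices, dtype=np.int64),
                                np.array(values, dtype=np.int32),
                                np.array(dense_shape, dtype=np.int64))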
def do_eval_per(session, decode_op, per_op, network, dataset,
                train_label_type, eval_batch_size=None,
                is_progressbar=False, is_multitask=False):
    """Evaluate trained model by Phone Error Rate.
    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        train_label_type: string, phone39 or phone48 or phone61
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        per_global: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    data_label_type = dataset.label_type
    num_examples = dataset.data_num
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1
    per_global = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    phone2num_map_file_path = '../metric/mapping_files/ctc/phone2num_' + \
        train_label_type[5:7] + '.txt'
    phone2num_39_map_file_path = '../metric/mapping_files/ctc/phone2num_39.txt'
    phone2phone_map_file_path = '../metric/mapping_files/phone2phone.txt'

    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        if not is_multitask:
            inputs, labels_true_st, inputs_seq_len, _ = mini_batch.__next__()
        else:
            inputs, _, labels_true_st, inputs_seq_len, _ = \
                mini_batch.__next__()

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        if False:
            # Evaluate by the same phone set as used for training
            per_local = session.run(per_op, feed_dict=feed_dict)
            per_global += per_local * batch_size_each
        else:
            # Evaluate by 39 phones
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            labels_true = sparsetensor2list(labels_true_st, batch_size_each)
            labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)

            for i_batch in range(batch_size_each):
                # Convert from num to phone (-> list of phone strings)
                phone_pred_seq = num2phone(labels_pred[i_batch],
                                           phone2num_map_file_path)
                phone_pred_list = phone_pred_seq.split(' ')

                # Map to 39 phones (-> list of phone strings)
                phone_pred_list = map_to_39phone(phone_pred_list,
                                                 train_label_type,
                                                 phone2phone_map_file_path)

                # Convert from phone to num (-> list of phone indices)
                phone_pred_list = phone2num(phone_pred_list,
                                            phone2num_39_map_file_path)
                labels_pred[i_batch] = phone_pred_list

                if data_label_type != 'phone39':
                    # Convert from num to phone (-> list of phone strings)
                    phone_true_seq = num2phone(labels_true[i_batch],
                                               phone2num_map_file_path)
                    phone_true_list = phone_true_seq.split(' ')

                    # Map to 39 phones (-> list of phone strings)
                    phone_true_list = map_to_39phone(
                        phone_true_list, data_label_type,
                        phone2phone_map_file_path)

                    # Convert from phone to num (-> list of phone indices)
                    phone_true_list = phone2num(phone_true_list,
                                                phone2num_39_map_file_path)
                    labels_true[i_batch] = phone_true_list

            # Compute edit distance
            labels_true_st = list2sparsetensor(labels_true)
            labels_pred_st = list2sparsetensor(labels_pred)
            per_local = compute_edit_distance(session,
                                              labels_true_st,
                                              labels_pred_st)
            per_global += per_local * batch_size_each

    per_global /= dataset.data_num

    return per_global
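
# A minimal sketch (assumption) of compute_edit_distance: the mean normalized
# edit distance between two label batches given as tf.SparseTensorValue. For
# simplicity it builds the ops on every call; the project version is likely to
# reuse a prebuilt graph.
def compute_edit_distance(session, labels_true_st, labels_pred_st):
    truth = tf.sparse_placeholder(tf.int32)
    hypothesis = tf.sparse_placeholder(tf.int32)
    edit_distance_op = tf.reduce_mean(
        tf.edit_distance(hypothesis, truth, normalize=True))
    return session.run(edit_distance_op,
                       feed_dict={truth: labels_true_st,
                                  hypothesis: labels_pred_st})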
def do_train(network, optimizer, learning_rate, batch_size, epoch_num,
             label_type, num_stack, num_skip, train_data_size):
    """Run training.
    Args:
        network: network to train
        optimizer: string, the name of the optimizer. ex.) adam, rmsprop
        learning_rate: initial learning rate
        batch_size: size of mini batch
        epoch_num: number of epochs to train
        label_type: phone or character or kanji
        num_stack: int, the number of frames to stack
        num_skip: int, the number of frames to skip
        train_data_size: default or large
    """
    # Load dataset
    train_data = DataSet(data_type='train', label_type=label_type,
                         train_data_size=train_data_size,
                         num_stack=num_stack, num_skip=num_skip,
                         is_sorted=True)
    dev_data = DataSet(data_type='dev', label_type=label_type,
                       train_data_size=train_data_size,
                       num_stack=num_stack, num_skip=num_skip,
                       is_sorted=False)
    eval1_data = DataSet(data_type='eval1', label_type=label_type,
                         train_data_size=train_data_size,
                         num_stack=num_stack, num_skip=num_skip,
                         is_sorted=False)
    eval2_data = DataSet(data_type='eval2', label_type=label_type,
                         train_data_size=train_data_size,
                         num_stack=num_stack, num_skip=num_skip,
                         is_sorted=False)
    eval3_data = DataSet(data_type='eval3', label_type=label_type,
                         train_data_size=train_data_size,
                         num_stack=num_stack, num_skip=num_skip,
                         is_sorted=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define the model
        network.define()
        # NOTE: define the model under tf.Graph()

        # Add operations to the graph
        loss_op = network.loss()
        train_op = network.train(optimizer=optimizer,
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op = network.decoder(decode_type='beam_search',
                                    beam_width=20)
        per_op = network.ler(decode_op)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps = []
        csv_train_loss = []
        csv_dev_loss = []

        # Create a session for running operations on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(network.model_dir,
                                                   sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            iter_per_epoch = int(train_data.data_num / batch_size)
            if (train_data.data_num / batch_size) != int(
                    train_data.data_num / batch_size):
                iter_per_epoch += 1
            max_steps = iter_per_epoch * epoch_num
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            error_best = 1
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train)
                inputs, labels, seq_len, _ = train_data.next_batch(
                    batch_size=batch_size)
                indices, values, dense_shape = list2sparsetensor(labels)
                feed_dict_train = {
                    network.inputs_pl: inputs,
                    network.label_indices_pl: indices,
                    network.label_values_pl: values,
                    network.label_shape_pl: dense_shape,
                    network.seq_len_pl: seq_len,
                    network.keep_prob_input_pl: network.dropout_ratio_input,
                    network.keep_prob_hidden_pl: network.dropout_ratio_hidden,
                    network.lr_pl: learning_rate
                }

                # Create feed dictionary for next mini batch (dev)
                inputs, labels, seq_len, _ = dev_data.next_batch(
                    batch_size=batch_size)
                indices, values, dense_shape = list2sparsetensor(labels)
                feed_dict_dev = {
                    network.inputs_pl: inputs,
                    network.label_indices_pl: indices,
                    network.label_values_pl: values,
                    network.label_shape_pl: dense_shape,
                    network.seq_len_pl: seq_len,
                    network.keep_prob_input_pl: network.dropout_ratio_input,
                    network.keep_prob_hidden_pl: network.dropout_ratio_hidden
                }

                # Update parameters & compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict_train)
                loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                csv_steps.append(step)
                csv_train_loss.append(loss_train)
                csv_dev_loss.append(loss_dev)

                if (step + 1) % 100 == 0:
                    # Change feed dict for evaluation
                    feed_dict_train[network.keep_prob_input_pl] = 1.0
                    feed_dict_train[network.keep_prob_hidden_pl] = 1.0
                    feed_dict_dev[network.keep_prob_input_pl] = 1.0
                    feed_dict_dev[network.keep_prob_hidden_pl] = 1.0

                    # Compute accuracy & update event file
                    ler_train, summary_str_train = sess.run(
                        [per_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev, labels_st = sess.run(
                        [per_op, summary_dev, decode_op],
                        feed_dict=feed_dict_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    # Decode
                    try:
                        labels_pred = sparsetensor2list(labels_st, batch_size)
                    except Exception:
                        labels_pred = [[0] * batch_size]

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)' %
                          (step + 1, loss_train, loss_dev, ler_train, ler_dev,
                           duration_step / 60))
                    # print('Step %d: loss = %.3f / ler = %.4f (%.3f min)' %
                    #       (step + 1, loss_train, ler_train,
                    #        duration_step / 60))

                    if label_type == 'kanji':
                        map_file_path = '../evaluation/mapping_files/ctc/kanji2num.txt'
                        print('True: %s' % num2char(labels[-1],
                                                    map_file_path))
                        print('Pred: %s' % num2char(labels_pred[-1],
                                                    map_file_path))
                    elif label_type == 'character':
                        map_file_path = '../evaluation/mapping_files/ctc/char2num.txt'
                        print('True: %s' % num2char(labels[-1],
                                                    map_file_path))
                        print('Pred: %s' % num2char(labels_pred[-1],
                                                    map_file_path))
                    elif label_type == 'phone':
                        map_file_path = '../evaluation/mapping_files/ctc/phone2num.txt'
                        print('True: %s' % num2phone(labels[-1],
                                                     map_file_path))
                        print('Pred: %s' % num2phone(labels_pred[-1],
                                                     map_file_path))

                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (checkpoint)
                    checkpoint_file = join(network.model_dir, 'model.ckpt')
                    save_path = saver.save(sess, checkpoint_file,
                                           global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    start_time_eval = time.time()
                    if label_type in ['character', 'kanji']:
                        print('■Dev Evaluation:■')
                        error_epoch = do_eval_cer(session=sess,
                                                  decode_op=decode_op,
                                                  network=network,
                                                  dataset=dev_data,
                                                  label_type=label_type,
                                                  eval_batch_size=batch_size)
                        if error_epoch < error_best:
                            error_best = error_epoch
                            print('■■■ ↑Best Score (CER)↑ ■■■')

                        print('■eval1 Evaluation:■')
                        cer_eval1 = do_eval_cer(session=sess,
                                                decode_op=decode_op,
                                                network=network,
                                                dataset=eval1_data,
                                                label_type=label_type,
                                                is_test=True,
                                                eval_batch_size=batch_size)
                        print('■eval2 Evaluation:■')
                        cer_eval2 = do_eval_cer(session=sess,
                                                decode_op=decode_op,
                                                network=network,
                                                dataset=eval2_data,
                                                label_type=label_type,
                                                is_test=True,
                                                eval_batch_size=batch_size)
                        print('■eval3 Evaluation:■')
                        cer_eval3 = do_eval_cer(session=sess,
                                                decode_op=decode_op,
                                                network=network,
                                                dataset=eval3_data,
                                                label_type=label_type,
                                                is_test=True,
                                                eval_batch_size=batch_size)
                        cer_mean = (cer_eval1 + cer_eval2 + cer_eval3) / 3.
                        print('■Mean:■')
                        print(' CER: %f %%' % (cer_mean * 100))
                    else:
                        print('■Dev Evaluation:■')
                        error_epoch = do_eval_per(session=sess,
                                                  per_op=per_op,
                                                  network=network,
                                                  dataset=dev_data,
                                                  eval_batch_size=batch_size)
                        if error_epoch < error_best:
                            error_best = error_epoch
                            print('■■■ ↑Best Score (PER)↑ ■■■')

                        print('■eval1 Evaluation:■')
                        per_eval1 = do_eval_per(session=sess,
                                                per_op=per_op,
                                                network=network,
                                                dataset=eval1_data,
                                                eval_batch_size=batch_size)
                        print('■eval2 Evaluation:■')
                        per_eval2 = do_eval_per(session=sess,
                                                per_op=per_op,
                                                network=network,
                                                dataset=eval2_data,
                                                eval_batch_size=batch_size)
                        print('■eval3 Evaluation:■')
                        per_eval3 = do_eval_per(session=sess,
                                                per_op=per_op,
                                                network=network,
                                                dataset=eval3_data,
                                                eval_batch_size=batch_size)
                        per_mean = (per_eval1 + per_eval2 + per_eval3) / 3.
                        print('■Mean:■')
                        print(' PER: %f %%' % (per_mean * 100))

                    duration_eval = time.time() - start_time_eval
                    print('Evaluation time: %.3f min' % (duration_eval / 60))

                    start_time_epoch = time.time()
                    start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss
            save_loss(csv_steps, csv_train_loss, csv_dev_loss,
                      save_path=network.model_dir)

            # Training was finished correctly
            with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
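
# A minimal sketch (assumption) of count_total_parameters, used in do_train to
# report model size: it returns a dict of per-variable parameter counts and
# the total number of trainable parameters.
def count_total_parameters(trainable_variables):
    parameters_dict, total_parameters = {}, 0
    for var in trainable_variables:
        num_params = 1
        for dim in var.get_shape().as_list():
            num_params *= dim
        parameters_dict[var.name] = num_params
        total_parameters += num_params
    return parameters_dict, total_parameters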