def do_eval(model, params, epoch, beam_width, eval_batch_size, temperature):
    """Evaluate the model.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        beam_width (int): beam width for beam search.
            1 disables beam search, which means greedy decoding.
        eval_batch_size (int): the size of mini-batch in evaluation
        temperature (int): softmax temperature in the inference stage
    """
    if 'temp1' in params['teacher_model_path']:
        teacher_train_temperature = 1
    elif 'temp2' in params['teacher_model_path']:
        teacher_train_temperature = 2

    print('=' * 40)
    print(' frame stack %d' % int(params['num_stack']))
    print(' splice %d' % int(params['splice']))
    print(' beam width: %d' % beam_width)
    print(' temperature (teacher, training): %d' % teacher_train_temperature)
    print(' temperature (teacher, inference): %d' % params['teacher_temperature'])
    print(' temperature (training): %d' % params['student_temperature'])
    print(' temperature (inference): %d' % temperature)
    print('=' * 40)

    # Load dataset
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)

    with tf.name_scope('tower_gpu0') as scope:
        # Define placeholders
        model.create_placeholders_ctc()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_ctc_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0],
            scope,
            softmax_temperature=temperature,  # this is for training
            # is_training=False)
            is_training=True)
        logits /= temperature
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=beam_width)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If check point exists
        if ckpt:
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are not any checkpoints.')

        print('Test Data Evaluation:')
        cer_clean_test, wer_clean_test = do_eval_cer(
            session=sess,
            decode_ops=[decode_op],
            model=model,
            dataset=test_clean_data,
            label_type=params['label_type'],
            is_test=True,
            eval_batch_size=eval_batch_size,
            progressbar=True)
        print(' CER (clean): %f %%' % (cer_clean_test * 100))
        print(' WER (clean): %f %%' % (wer_clean_test * 100))

        cer_other_test, wer_other_test = do_eval_cer(
            session=sess,
            decode_ops=[decode_op],
            model=model,
            dataset=test_other_data,
            label_type=params['label_type'],
            is_test=True,
            eval_batch_size=eval_batch_size,
            progressbar=True)
        print(' CER (other): %f %%' % (cer_other_test * 100))
        print(' WER (other): %f %%' % (wer_other_test * 100))
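

# ----------------------------------------------------------------------------
# NOTE: illustrative sketch, not part of the original script. The evaluation
# above divides the logits by `temperature` before decoding; the snippet below
# shows, in plain NumPy with made-up values, how a temperature > 1 flattens
# the resulting posterior distribution (the usual setting for distillation).
# ----------------------------------------------------------------------------
import numpy as np


def softmax_with_temperature(logits, temperature=1.0):
    """Temperature-scaled softmax: larger temperature gives smoother outputs."""
    scaled = logits / temperature
    scaled -= scaled.max(axis=-1, keepdims=True)  # numerical stability
    exp = np.exp(scaled)
    return exp / exp.sum(axis=-1, keepdims=True)


# Example: one frame of hypothetical logits over 4 classes
frame_logits = np.array([4.0, 1.0, 0.5, 0.1])
print(softmax_with_temperature(frame_logits, temperature=1))  # peaky
print(softmax_with_temperature(frame_logits, temperature=2))  # smoother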
def check_loading(self, label_type, data_type='dev_clean',
                  shuffle=False, sort_utt=False, sort_stop_epoch=None,
                  frame_stacking=False, splice=1, num_gpu=1):

    print('========================================')
    print(' label_type: %s' % label_type)
    print(' data_type: %s' % data_type)
    print(' shuffle: %s' % str(shuffle))
    print(' sort_utt: %s' % str(sort_utt))
    print(' sort_stop_epoch: %s' % str(sort_stop_epoch))
    print(' frame_stacking: %s' % str(frame_stacking))
    print(' splice: %d' % splice)
    print(' num_gpu: %d' % num_gpu)
    print('========================================')

    num_stack = 3 if frame_stacking else 1
    num_skip = 3 if frame_stacking else 1
    dataset = Dataset(
        data_type=data_type,
        train_data_size='train_clean100',
        label_type=label_type,
        batch_size=64,
        max_epoch=1,
        splice=splice,
        num_stack=num_stack,
        num_skip=num_skip,
        shuffle=shuffle,
        sort_utt=sort_utt,
        sort_stop_epoch=sort_stop_epoch,
        progressbar=True,
        num_gpu=num_gpu)

    print('=> Loading mini-batch...')
    if label_type == 'character':
        map_file_path = '../../metrics/mapping_files/ctc/character.txt'
    elif label_type == 'character_capital_divide':
        map_file_path = '../../metrics/mapping_files/ctc/character_capital.txt'
    elif label_type == 'word':
        map_file_path = '../../metrics/mapping_files/ctc/word_' + \
            dataset.train_data_size + '.txt'
    idx2char = Idx2char(map_file_path)
    idx2word = Idx2word(map_file_path)

    for data, is_new_epoch in dataset:
        inputs, labels, inputs_seq_len, input_names = data

        if not self.length_check:
            for i, l in zip(inputs[0], labels[0]):
                if len(i) < len(l):
                    raise ValueError(
                        'input length must be longer than label length.')
            self.length_check = True

        if num_gpu > 1:
            for inputs_gpu in inputs:
                print(inputs_gpu.shape)

        if label_type == 'word':
            if 'test' not in data_type:
                str_true = ' '.join(idx2word(labels[0][0]))
            else:
                word_list = np.delete(labels[0][0], np.where(
                    labels[0][0] == None), axis=0)
                str_true = ' '.join(word_list)
        else:
            str_true = idx2char(labels[0][0])
            str_true = re.sub(r'_', ' ', str_true)

        print('----- %s (epoch: %.3f) -----' %
              (input_names[0][0], dataset.epoch_detail))
        print(inputs[0].shape)
        print(str_true)

        if dataset.epoch_detail >= 0.05:
            break
def do_decode(model, params, epoch, beam_width):
    """Decode the CTC outputs.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        beam_width (int, optional): beam width for beam search.
            1 disables beam search, which means greedy decoding.
    """
    # Load dataset
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=1,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=1,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(model.inputs_pl_list[0],
                                       model.labels_pl_list[0],
                                       model.inputs_seq_len_pl_list[0],
                                       model.keep_prob_input_pl_list[0],
                                       model.keep_prob_hidden_pl_list[0],
                                       model.keep_prob_output_pl_list[0])
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=beam_width)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If check point exists
        if ckpt:
            # Use last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are not any checkpoints.')

        # Visualize
        decode_test(session=sess,
                    decode_op=decode_op,
                    model=model,
                    dataset=test_clean_data,
                    label_type=params['label_type'],
                    train_data_size=params['train_data_size'],
                    save_path=None)
        # save_path=model.save_path)
        decode_test(session=sess,
                    decode_op=decode_op,
                    model=model,
                    dataset=test_other_data,
                    label_type=params['label_type'],
                    train_data_size=params['train_data_size'],
                    save_path=None)
def do_eval(save_paths, params, beam_width, temperature_infer,
            result_save_path):
    """Evaluate the model.
    Args:
        save_paths (list): paths to the trained models used for the ensemble
        params (dict): A dictionary of parameters
        beam_width (int): beam width for beam search.
            1 disables beam search, which means greedy decoding.
        temperature_infer (int): temperature in the inference stage
        result_save_path (string, optional): path to save the evaluation log
    """
    if 'temp1' in save_paths[0]:
        temperature_train = 1
    elif 'temp2' in save_paths[0]:
        temperature_train = 2
    else:
        raise ValueError

    if result_save_path is not None:
        sys.stdout = open(
            join(result_save_path,
                 '4models_traintemp' + str(temperature_train) +
                 '_inftemp' + str(temperature_infer) + '.log'), 'w')

    print('=' * 30)
    print(' frame stack %d' % int(params['num_stack']))
    print(' beam width: %d' % beam_width)
    print(' ensemble: %d' % len(save_paths))
    print(' temperature (training): %d' % temperature_train)
    print(' temperature (inference): %d' % temperature_infer)
    print('=' * 30)

    # Load dataset
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=1,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=1,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True)

    print('Test Data Evaluation:')
    cer_clean_test, wer_clean_test = do_eval_cer(
        save_paths=save_paths,
        dataset=test_clean_data,
        data_type='test_clean',
        label_type=params['label_type'],
        num_classes=params['num_classes'] + 1,
        beam_width=beam_width,
        temperature_infer=temperature_infer,
        is_test=True,
        progressbar=True)
    print(' CER (clean): %f %%' % (cer_clean_test * 100))
    print(' WER (clean): %f %%' % (wer_clean_test * 100))

    cer_other_test, wer_other_test = do_eval_cer(
        save_paths=save_paths,
        dataset=test_other_data,
        data_type='test_other',
        label_type=params['label_type'],
        num_classes=params['num_classes'] + 1,
        beam_width=beam_width,
        temperature_infer=temperature_infer,
        is_test=True,
        progressbar=True)
    print(' CER (other): %f %%' % (cer_other_test * 100))
    print(' WER (other): %f %%' % (wer_other_test * 100))
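

# ----------------------------------------------------------------------------
# NOTE: illustrative sketch, not part of the original script. The ensemble
# evaluation above delegates the actual model combination to do_eval_cer; a
# common way to combine CTC models (not necessarily this project's exact
# implementation) is to average the frame-level posteriors of the restored
# models before beam-search decoding. The helper below is hypothetical.
# ----------------------------------------------------------------------------
import numpy as np


def average_ensemble_posteriors(posteriors_list):
    """Average per-frame posteriors from several models.

    posteriors_list: list of arrays, each of shape (num_frames, num_classes),
    one per model in the ensemble (assumed input format).
    """
    stacked = np.stack(posteriors_list, axis=0)   # (num_models, T, C)
    averaged = stacked.mean(axis=0)               # (T, C)
    # Re-normalize in case of numerical drift
    return averaged / averaged.sum(axis=-1, keepdims=True)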
def do_eval(model, params, epoch, eval_batch_size, beam_width):
    """Evaluate the model.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
        beam_width (int, optional): beam width for beam search.
            1 disables beam search, which means greedy decoding.
    """
    # Load dataset
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(model.inputs_pl_list[0],
                                       model.labels_pl_list[0],
                                       model.inputs_seq_len_pl_list[0],
                                       model.keep_prob_pl_list[0],
                                       is_training=False)
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=beam_width)
        posteriors_op = model.posteriors(logits)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If check point exists
        if ckpt:
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are not any checkpoints.')

        print('Test Data Evaluation:')
        if 'char' in params['label_type']:
            if DECODER_TYPE == 1:
                cer_clean_test, wer_clean_test = do_eval_cer(
                    session=sess,
                    decode_ops=[decode_op],
                    model=model,
                    dataset=test_clean_data,
                    label_type=params['label_type'],
                    is_test=True,
                    # eval_batch_size=eval_batch_size,
                    progressbar=True)
                print(' WER (clean): %f %%' % (wer_clean_test * 100))
                print(' CER (clean): %f %%' % (cer_clean_test * 100))

                cer_other_test, wer_other_test = do_eval_cer(
                    session=sess,
                    decode_ops=[decode_op],
                    model=model,
                    dataset=test_other_data,
                    label_type=params['label_type'],
                    is_test=True,
                    # eval_batch_size=eval_batch_size,
                    progressbar=True)
                print(' WER (other): %f %%' % (wer_other_test * 100))
                print(' CER (other): %f %%' % (cer_other_test * 100))

            elif DECODER_TYPE == 2:
                cer_clean_test, wer_clean_test = do_eval_cer2(
                    session=sess,
                    # beam_width=beam_width,
                    beam_width=20,
                    posteriors_ops=[posteriors_op],
                    model=model,
                    dataset=test_clean_data,
                    label_type=params['label_type'],
                    is_test=True,
                    eval_batch_size=20,
                    # eval_batch_size=eval_batch_size,
                    progressbar=True)
                print(' WER (clean): %f %%' % (wer_clean_test * 100))
                print(' CER (clean): %f %%' % (cer_clean_test * 100))

                cer_other_test, wer_other_test = do_eval_cer2(
                    session=sess,
                    # beam_width=beam_width,
                    beam_width=20,
                    posteriors_ops=[posteriors_op],
                    model=model,
                    dataset=test_other_data,
                    label_type=params['label_type'],
                    is_test=True,
                    eval_batch_size=20,
                    # eval_batch_size=eval_batch_size,
                    progressbar=True)
                print(' WER (other): %f %%' % (wer_other_test * 100))
                print(' CER (other): %f %%' % (cer_other_test * 100))
        else:
            wer_clean_test = do_eval_wer(
                session=sess,
                decode_ops=[decode_op],
                model=model,
                dataset=test_clean_data,
                train_data_size=params['train_data_size'],
                is_test=True,
                # eval_batch_size=eval_batch_size,
                progressbar=True)
            print(' WER (clean): %f %%' % (wer_clean_test * 100))

            wer_other_test = do_eval_wer(
                session=sess,
                decode_ops=[decode_op],
                model=model,
                dataset=test_other_data,
                train_data_size=params['train_data_size'],
                is_test=True,
                # eval_batch_size=eval_batch_size,
                progressbar=True)
            print(' WER (other): %f %%' % (wer_other_test * 100))
def check(self, label_type, data_type='dev_clean',
          shuffle=False, sort_utt=False, sort_stop_epoch=None,
          frame_stacking=False, splice=1, num_gpu=1):

    print('========================================')
    print(' label_type: %s' % label_type)
    print(' data_type: %s' % data_type)
    print(' shuffle: %s' % str(shuffle))
    print(' sort_utt: %s' % str(sort_utt))
    print(' sort_stop_epoch: %s' % str(sort_stop_epoch))
    print(' frame_stacking: %s' % str(frame_stacking))
    print(' splice: %d' % splice)
    print(' num_gpu: %d' % num_gpu)
    print('========================================')

    num_stack = 3 if frame_stacking else 1
    num_skip = 3 if frame_stacking else 1
    dataset = Dataset(data_type=data_type,
                      train_data_size='train100h',
                      label_type=label_type,
                      batch_size=64,
                      max_epoch=2,
                      splice=splice,
                      num_stack=num_stack,
                      num_skip=num_skip,
                      shuffle=shuffle,
                      sort_utt=sort_utt,
                      sort_stop_epoch=sort_stop_epoch,
                      progressbar=True,
                      num_gpu=num_gpu)

    print('=> Loading mini-batch...')
    if label_type == 'character':
        map_file_path = '../../metrics/mapping_files/character.txt'
    else:
        map_file_path = '../../metrics/mapping_files/' + label_type + '_' + \
            dataset.train_data_size + '.txt'
    idx2char = Idx2char(map_file_path)
    idx2word = Idx2word(map_file_path)

    for data, is_new_epoch in dataset:
        inputs, labels, inputs_seq_len, input_names = data

        if data_type == 'train':
            for i, l in zip(inputs[0], labels[0]):
                if len(i) < len(l):
                    raise ValueError(
                        'input length must be longer than label length.')

        if num_gpu > 1:
            for inputs_gpu in inputs:
                print(inputs_gpu.shape)

        if 'test' in data_type:
            str_true = labels[0][0][0]
        else:
            if 'word' in label_type:
                str_true = '_'.join(idx2word(labels[0][0]))
            else:
                str_true = idx2char(labels[0][0])

        print('----- %s (epoch: %.3f) -----' %
              (input_names[0][0], dataset.epoch_detail))
        print(inputs[0].shape)
        print(str_true)

        if dataset.epoch_detail >= 0.1:
            break
def do_train(model, params, gpu_indices):
    """Run CTC training.
    Args:
        model: the model to train
        params (dict): A dictionary of parameters
        gpu_indices (list): GPU indices
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         train_data_size=params['train_data_size'],
                         label_type=params['label_type'],
                         batch_size=params['batch_size'],
                         max_epoch=params['num_epoch'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True,
                         sort_stop_epoch=params['sort_stop_epoch'],
                         num_gpu=len(gpu_indices))
    dev_data_clean = Dataset(data_type='dev_clean',
                             train_data_size=params['train_data_size'],
                             label_type=params['label_type'],
                             batch_size=params['batch_size'],
                             splice=params['splice'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             sort_utt=False,
                             num_gpu=len(gpu_indices))
    dev_data_other = Dataset(data_type='dev_other',
                             train_data_size=params['train_data_size'],
                             label_type=params['label_type'],
                             batch_size=params['batch_size'],
                             splice=params['splice'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             sort_utt=False,
                             num_gpu=len(gpu_indices))

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to track the global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set optimizer
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = model._set_optimizer(params['optimizer'], learning_rate_pl)

        # Calculate the gradients for each model tower
        total_grads_and_vars, total_losses = [], []
        decode_ops, ler_ops = [], []
        all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(len(gpu_indices))]
        # NOTE: /cpu:0 is prepared for evaluation
        with tf.variable_scope(tf.get_variable_scope()):
            for i_gpu in range(len(all_devices)):
                with tf.device(all_devices[i_gpu]):
                    with tf.name_scope('tower_gpu%d' % i_gpu) as scope:
                        # Define placeholders in each tower
                        model.create_placeholders()

                        # Calculate the total loss for the current tower of the
                        # model. This function constructs the entire model but
                        # shares the variables across all towers.
                        tower_loss, tower_logits = model.compute_loss(
                            model.inputs_pl_list[i_gpu],
                            model.labels_pl_list[i_gpu],
                            model.inputs_seq_len_pl_list[i_gpu],
                            model.keep_prob_input_pl_list[i_gpu],
                            model.keep_prob_hidden_pl_list[i_gpu],
                            model.keep_prob_output_pl_list[i_gpu],
                            scope)
                        tower_loss = tf.expand_dims(tower_loss, axis=0)
                        total_losses.append(tower_loss)

                        # Reuse variables for the next tower
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on this
                        # tower
                        tower_grads_and_vars = optimizer.compute_gradients(
                            tower_loss)

                        # Gradient clipping
                        tower_grads_and_vars = model._clip_gradients(
                            tower_grads_and_vars, _clip_norm=False)

                        # TODO: Optionally add gradient noise

                        # Keep track of the gradients across all towers
                        total_grads_and_vars.append(tower_grads_and_vars)

                        # Add to the graph each operation per tower
                        decode_op_tower = model.decoder(
                            tower_logits,
                            model.inputs_seq_len_pl_list[i_gpu],
                            beam_width=params['beam_width'])
                        decode_ops.append(decode_op_tower)
                        ler_op_tower = model.compute_ler(
                            decode_op_tower, model.labels_pl_list[i_gpu])
                        ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
                        ler_ops.append(ler_op_tower)

        # Aggregate losses, then calculate average loss
        total_losses = tf.concat(axis=0, values=total_losses)
        loss_op = tf.reduce_mean(total_losses, axis=0)
        ler_ops = tf.concat(axis=0, values=ler_ops)
        ler_op = tf.reduce_mean(ler_ops, axis=0)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers
        average_grads_and_vars = average_gradients(total_grads_and_vars)

        # Apply the gradients to adjust the shared variables.
        train_op = optimizer.apply_gradients(average_grads_and_vars,
                                             global_step=global_step)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        # NOTE: Start running operations on the Graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the ops do not
        # have GPU implementations.
        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_dev_best = 1
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                inputs, labels, inputs_seq_len, _ = data
                feed_dict_train = {}
                for i_gpu in range(len(gpu_indices)):
                    feed_dict_train[model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
                    feed_dict_train[model.labels_pl_list[i_gpu]] = list2sparsetensor(
                        labels[i_gpu], padded_value=train_data.padded_value)
                    feed_dict_train[model.inputs_seq_len_pl_list[i_gpu]] = inputs_seq_len[i_gpu]
                    feed_dict_train[model.keep_prob_input_pl_list[i_gpu]] = params['dropout_input']
                    feed_dict_train[model.keep_prob_hidden_pl_list[i_gpu]] = params['dropout_hidden']
                    feed_dict_train[model.keep_prob_output_pl_list[i_gpu]] = params['dropout_output']
                feed_dict_train[learning_rate_pl] = learning_rate

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % int(params['print_step'] / len(gpu_indices)) == 0:

                    # Create feed dictionary for next mini batch (dev)
                    (inputs, labels, inputs_seq_len, _), _ = dev_data_other.next()
                    feed_dict_dev = {}
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_dev[model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
                        feed_dict_dev[model.labels_pl_list[i_gpu]] = list2sparsetensor(
                            labels[i_gpu], padded_value=dev_data_other.padded_value)
                        feed_dict_dev[model.inputs_seq_len_pl_list[i_gpu]] = inputs_seq_len[i_gpu]
                        feed_dict_dev[model.keep_prob_input_pl_list[i_gpu]] = 1.0
                        feed_dict_dev[model.keep_prob_hidden_pl_list[i_gpu]] = 1.0
                        feed_dict_dev[model.keep_prob_output_pl_list[i_gpu]] = 1.0

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_train[model.keep_prob_input_pl_list[i_gpu]] = 1.0
                        feed_dict_train[model.keep_prob_hidden_pl_list[i_gpu]] = 1.0
                        feed_dict_train[model.keep_prob_output_pl_list[i_gpu]] = 1.0

                    # Compute accuracy & update event files
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail, loss_train, loss_dev,
                           ler_train, ler_dev, learning_rate, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save figure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    # Save model (check point)
                    checkpoint_file = join(model.save_path, 'model.ckpt')
                    save_path = saver.save(
                        sess, checkpoint_file, global_step=train_data.epoch)
                    print("Model saved in file: %s" % save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        if params['label_type'] != 'word':
                            print('=== Dev Data Evaluation ===')
                            # Dev-clean
                            ler_dev_clean_epoch, wer_dev_clean_epoch = do_eval_cer(
                                session=sess,
                                decode_ops=decode_ops,
                                model=model,
                                dataset=dev_data_clean,
                                label_type=params['label_type'],
                                eval_batch_size=params['batch_size'])
                            print(' CER (clean): %f %%' %
                                  (ler_dev_clean_epoch * 100))
                            print(' WER (clean): %f %%' %
                                  (wer_dev_clean_epoch * 100))

                            # Dev-other
                            ler_dev_other_epoch, wer_dev_other_epoch = do_eval_cer(
                                session=sess,
                                decode_ops=decode_ops,
                                model=model,
                                dataset=dev_data_other,
                                label_type=params['label_type'],
                                eval_batch_size=params['batch_size'])
                            print(' CER (other): %f %%' %
                                  (ler_dev_other_epoch * 100))
                            print(' WER (other): %f %%' %
                                  (wer_dev_other_epoch * 100))

                            if ler_dev_other_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_other_epoch
                                print('■■■ ↑Best Score (CER)↑ ■■■')
                        else:
                            print('=== Dev Data Evaluation ===')
                            # Dev-clean
                            ler_dev_clean_epoch = do_eval_wer(
                                session=sess,
                                decode_ops=decode_ops,
                                model=model,
                                dataset=dev_data_clean,
                                train_data_size=params['train_data_size'],
                                eval_batch_size=params['batch_size'])
                            print(' WER (clean): %f %%' %
                                  (ler_dev_clean_epoch * 100))

                            # Dev-other
                            ler_dev_other_epoch = do_eval_wer(
                                session=sess,
                                decode_ops=decode_ops,
                                model=model,
                                dataset=dev_data_other,
                                train_data_size=params['train_data_size'],
                                eval_batch_size=params['batch_size'])
                            print(' WER (other): %f %%' %
                                  (ler_dev_other_epoch * 100))

                            if ler_dev_other_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_other_epoch
                                print('■■■ ↑Best Score (WER)↑ ■■■')

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=ler_dev_other_epoch)

                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
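

# ----------------------------------------------------------------------------
# NOTE: illustrative sketch, not part of the original script. `average_gradients`
# is called in do_train above but defined elsewhere; a common multi-tower
# implementation (patterned on the TensorFlow multi-GPU tutorials, not
# necessarily identical to this project's version) looks like this. Gradients
# are assumed to be non-None for every variable on every tower.
# ----------------------------------------------------------------------------
import tensorflow as tf


def average_gradients(tower_grads_and_vars):
    """Average the gradient of each shared variable across all towers.

    tower_grads_and_vars: list (one entry per tower) of lists of
    (gradient, variable) pairs, as returned by optimizer.compute_gradients().
    """
    average_grads_and_vars = []
    for grads_and_vars in zip(*tower_grads_and_vars):
        # grads_and_vars is ((grad_gpu0, var), (grad_gpu1, var), ...)
        grads = [tf.expand_dims(g, 0) for g, _ in grads_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # The variable is shared across towers, so the first tower's is enough
        average_grads_and_vars.append((grad, grads_and_vars[0][1]))
    return average_grads_and_vars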
def do_save(model, params, epoch, eval_batch_size):
    """Save the CTC outputs.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         train_data_size=params['train_data_size'],
                         label_type=params['label_type'],
                         batch_size=eval_batch_size,
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0],
            softmax_temperature=params['softmax_temperature'])
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If check point exists
        if ckpt:
            # Use last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are not any checkpoints.')

        for data, is_new_epoch in train_data:

            # Create feed dictionary for next mini batch
            inputs, _, inputs_seq_len, input_names = data
            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0
            }

            batch_size, max_frame_num = inputs[0].shape[:2]
            posteriors = sess.run(posteriors_op, feed_dict=feed_dict)
            posteriors = posteriors.reshape(-1, max_frame_num,
                                            model.num_classes)

            for i_batch in range(batch_size):
                prob = posteriors[i_batch][:int(inputs_seq_len[0][i_batch]), :]

                # Save as a npy file
                np.save(mkdir_join(model.save_path, 'probs',
                                   input_names[0][i_batch]), prob)

            if is_new_epoch:
                break
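

# ----------------------------------------------------------------------------
# NOTE: illustrative sketch, not part of the original script. The loop above
# writes one .npy posterior file per utterance under <save_path>/probs; the
# files can be read back with NumPy as shown below. The path and utterance id
# are made up for the example.
# ----------------------------------------------------------------------------
import os
import numpy as np

save_path = '/path/to/model'       # hypothetical value of model.save_path
utt_name = '1089-134686-0000'      # hypothetical utterance id (input_names entry)
prob = np.load(os.path.join(save_path, 'probs', utt_name + '.npy'))

# prob has shape (num_frames, num_classes); each row sums to roughly 1
print(prob.shape, prob[0].sum())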
def do_save(model, params, epoch, eval_batch_size, temperature):
    """Save the CTC outputs.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
        temperature (int): softmax temperature applied to the logits
            before computing the posteriors
    """
    print('=' * 30)
    print(' frame stack %d' % int(params['num_stack']))
    print(' splice %d' % int(params['splice']))
    print(' temperature (training): %d' % temperature)
    print('=' * 30)

    # Load dataset
    train_data = Dataset(
        data_type='train',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        max_epoch=3,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=True,
        num_gpu=1)
    dev_clean_data = Dataset(
        data_type='dev_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        max_epoch=3,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=True,
        num_gpu=1)
    dev_other_data = Dataset(
        data_type='dev_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        max_epoch=3,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=True,
        num_gpu=1)
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        max_epoch=3,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=True,
        num_gpu=1)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        max_epoch=3,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=True,
        num_gpu=1)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        logits /= temperature
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If check point exists
        if ckpt:
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are not any checkpoints.')

        #########################
        # Save soft targets
        #########################
        # train100h
        # save(session=sess,
        #      posteriors_op=posteriors_op,
        #      model=model,
        #      dataset=train_data,
        #      data_type='train',
        #      num_stack=params['num_stack'],
        #      save_prob=False,
        #      save_soft_targets=True,
        #      save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'train'))

        # dev
        # save(session=sess,
        #      posteriors_op=posteriors_op,
        #      model=model,
        #      dataset=dev_clean_data,
        #      data_type='dev_clean',
        #      num_stack=params['num_stack'],
        #      save_prob=False,
        #      save_soft_targets=True,
        #      save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'dev_clean'))
        # save(session=sess,
        #      posteriors_op=posteriors_op,
        #      model=model,
        #      dataset=dev_other_data,
        #      data_type='dev_other',
        #      num_stack=params['num_stack'],
        #      save_prob=False,
        #      save_soft_targets=True,
        #      save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'dev_other'))

        # test
        save(session=sess,
             posteriors_op=posteriors_op,
             model=model,
             dataset=test_clean_data,
             data_type='test_clean',
             num_stack=params['num_stack'],
             save_prob=True,
             save_soft_targets=False,
             save_path=mkdir_join(model.save_path,
                                  'temp' + str(temperature), 'test_clean'))
        save(session=sess,
             posteriors_op=posteriors_op,
             model=model,
             dataset=test_other_data,
             data_type='test_other',
             num_stack=params['num_stack'],
             save_prob=True,
             save_soft_targets=False,
             save_path=mkdir_join(model.save_path,
                                  'temp' + str(temperature), 'test_other'))
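

# ----------------------------------------------------------------------------
# NOTE: illustrative sketch, not part of the original script. The smoothed
# posteriors saved above are presumably used as teacher soft targets for
# knowledge distillation. The NumPy function below shows, under that
# assumption, the usual frame-level soft-target cross-entropy; it is not this
# repository's actual training loss, and the shapes and temperature are
# illustrative.
# ----------------------------------------------------------------------------
import numpy as np


def soft_target_loss(student_logits, teacher_probs, temperature=2.0):
    """Cross-entropy between teacher soft targets and the student's
    temperature-softened output distribution.

    student_logits: (num_frames, num_classes) raw student outputs
    teacher_probs:  (num_frames, num_classes) saved teacher posteriors
    """
    scaled = student_logits / temperature
    scaled -= scaled.max(axis=-1, keepdims=True)
    log_probs = scaled - np.log(np.exp(scaled).sum(axis=-1, keepdims=True))
    # The T**2 factor keeps gradient magnitudes comparable to the
    # hard-target loss (Hinton et al., 2015)
    return -(teacher_probs * log_probs).sum(axis=-1).mean() * temperature ** 2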
def do_train(model, params, gpu_indices):
    """Run CTC training.
    Args:
        model: the model to train
        params (dict): A dictionary of parameters
        gpu_indices (list): GPU indices
    """
    # Load dataset
    train_data = Dataset(
        data_type='train',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'],
        max_epoch=params['num_epoch'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        sort_stop_epoch=params['sort_stop_epoch'],
        num_gpu=len(gpu_indices))
    dev_clean_data = Dataset(
        data_type='dev_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        num_gpu=len(gpu_indices))
    dev_other_data = Dataset(
        data_type='dev_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        num_gpu=len(gpu_indices))
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to track the global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set optimizer
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = model._set_optimizer(params['optimizer'], learning_rate_pl)

        # Calculate the gradients for each model tower
        total_grads_and_vars, total_losses = [], []
        decode_ops, ler_ops = [], []
        all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(len(gpu_indices))]
        # NOTE: /cpu:0 is prepared for evaluation
        with tf.variable_scope(tf.get_variable_scope()):
            for i_gpu in range(len(all_devices)):
                with tf.device(all_devices[i_gpu]):
                    with tf.name_scope('tower_gpu%d' % i_gpu) as scope:
                        # Define placeholders in each tower
                        model.create_placeholders()

                        # Calculate the total loss for the current tower of the
                        # model. This function constructs the entire model but
                        # shares the variables across all towers.
                        tower_loss, tower_logits = model.compute_loss(
                            model.inputs_pl_list[i_gpu],
                            model.labels_pl_list[i_gpu],
                            model.inputs_seq_len_pl_list[i_gpu],
                            model.keep_prob_pl_list[i_gpu],
                            scope,
                            softmax_temperature=params['softmax_temperature'])
                        # NOTE: tower_logits have NOT been divided by
                        # softmax_temperature
                        tower_loss = tf.expand_dims(tower_loss, axis=0)
                        total_losses.append(tower_loss)

                        # Reuse variables for the next tower
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on this
                        # tower
                        tower_grads_and_vars = optimizer.compute_gradients(
                            tower_loss)

                        # Gradient clipping
                        tower_grads_and_vars = model._clip_gradients(
                            tower_grads_and_vars)

                        # TODO: Optionally add gradient noise

                        # Keep track of the gradients across all towers
                        total_grads_and_vars.append(tower_grads_and_vars)

                        # Add to the graph each operation per tower
                        decode_op_tower = model.decoder(
                            tower_logits,
                            model.inputs_seq_len_pl_list[i_gpu],
                            beam_width=params['beam_width'])
                        decode_ops.append(decode_op_tower)
                        ler_op_tower = model.compute_ler(
                            decode_op_tower, model.labels_pl_list[i_gpu])
                        ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
                        ler_ops.append(ler_op_tower)

        # Aggregate losses, then calculate average loss
        total_losses = tf.concat(axis=0, values=total_losses)
        loss_op = tf.reduce_mean(total_losses, axis=0)
        ler_ops = tf.concat(axis=0, values=ler_ops)
        ler_op = tf.reduce_mean(ler_ops, axis=0)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers
        average_grads_and_vars = average_gradients(total_grads_and_vars)

        # Apply the gradients to adjust the shared variables.
        train_op = optimizer.apply_gradients(average_grads_and_vars,
                                             global_step=global_step)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        # NOTE: Start running operations on the Graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the ops do not
        # have GPU implementations.
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=False)) as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_dev_best = 1
            not_improved_epoch = 0
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                inputs, labels, inputs_seq_len, _ = data
                feed_dict_train = {}
                for i_gpu in range(len(gpu_indices)):
                    feed_dict_train[model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
                    feed_dict_train[model.labels_pl_list[i_gpu]] = list2sparsetensor(
                        labels[i_gpu], padded_value=train_data.padded_value)
                    feed_dict_train[model.inputs_seq_len_pl_list[i_gpu]] = inputs_seq_len[i_gpu]
                    feed_dict_train[model.keep_prob_pl_list[i_gpu]] = 1 - \
                        float(params['dropout'])
                feed_dict_train[learning_rate_pl] = learning_rate

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % int(params['print_step'] / len(gpu_indices)) == 0:

                    # Create feed dictionary for next mini batch (dev)
                    if params['train_data_size'] in ['train100h', 'train460h']:
                        inputs, labels, inputs_seq_len, _ = dev_clean_data.next()[0]
                    else:
                        inputs, labels, inputs_seq_len, _ = dev_other_data.next()[0]
                    feed_dict_dev = {}
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_dev[model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
                        feed_dict_dev[model.labels_pl_list[i_gpu]] = list2sparsetensor(
                            labels[i_gpu], padded_value=dev_other_data.padded_value)
                        feed_dict_dev[model.inputs_seq_len_pl_list[i_gpu]] = inputs_seq_len[i_gpu]
                        feed_dict_dev[model.keep_prob_pl_list[i_gpu]] = 1.0

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_train[model.keep_prob_pl_list[i_gpu]] = 1.0

                    # Compute accuracy & update event files
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail, loss_train, loss_dev,
                           ler_train, ler_dev, learning_rate, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save figure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        print('=== Dev Data Evaluation ===')
                        # dev-clean
                        cer_dev_clean_epoch, wer_dev_clean_epoch = do_eval_cer(
                            session=sess,
                            decode_ops=decode_ops,
                            model=model,
                            dataset=dev_clean_data,
                            label_type=params['label_type'],
                            eval_batch_size=params['batch_size'])
                        print(' CER (clean): %f %%' %
                              (cer_dev_clean_epoch * 100))
                        print(' WER (clean): %f %%' %
                              (wer_dev_clean_epoch * 100))

                        # dev-other
                        cer_dev_other_epoch, wer_dev_other_epoch = do_eval_cer(
                            session=sess,
                            decode_ops=decode_ops,
                            model=model,
                            dataset=dev_other_data,
                            label_type=params['label_type'],
                            eval_batch_size=params['batch_size'])
                        print(' CER (other): %f %%' %
                              (cer_dev_other_epoch * 100))
                        print(' WER (other): %f %%' %
                              (wer_dev_other_epoch * 100))

                        if params['train_data_size'] in ['train100h', 'train460h']:
                            metric_epoch = cer_dev_clean_epoch
                        else:
                            metric_epoch = cer_dev_other_epoch

                        if metric_epoch < ler_dev_best:
                            ler_dev_best = metric_epoch
                            not_improved_epoch = 0
                            print('■■■ ↑Best Score (CER)↑ ■■■')

                            # Save model (check point)
                            checkpoint_file = join(model.save_path, 'model.ckpt')
                            save_path = saver.save(
                                sess, checkpoint_file,
                                global_step=train_data.epoch)
                            print("Model saved in file: %s" % save_path)

                            print('=== Test Data Evaluation ===')
                            # test-clean
                            cer_test_clean_epoch, wer_test_clean_epoch = do_eval_cer(
                                session=sess,
                                decode_ops=decode_ops,
                                model=model,
                                dataset=test_clean_data,
                                label_type=params['label_type'],
                                is_test=True,
                                eval_batch_size=params['batch_size'])
                            print(' CER (clean): %f %%' %
                                  (cer_test_clean_epoch * 100))
                            print(' WER (clean): %f %%' %
                                  (wer_test_clean_epoch * 100))

                            # test-other
                            cer_test_other_epoch, wer_test_other_epoch = do_eval_cer(
                                session=sess,
                                decode_ops=decode_ops,
                                model=model,
                                dataset=test_other_data,
                                label_type=params['label_type'],
                                is_test=True,
                                eval_batch_size=params['batch_size'])
                            print(' CER (other): %f %%' %
                                  (cer_test_other_epoch * 100))
                            print(' WER (other): %f %%' %
                                  (wer_test_other_epoch * 100))
                        else:
                            not_improved_epoch += 1

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Early stopping
                        if not_improved_epoch == params['not_improved_patient_epoch']:
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=metric_epoch)

                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
def do_plot(model, params, epoch, eval_batch_size):
    """Plot the CTC posteriors.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'] if eval_batch_size == -1 else eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            # softmax_temperature=params['softmax_temperature'])
            softmax_temperature=10)
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If check point exists
        if ckpt:
            # Use last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are not any checkpoints.')

        plot(session=sess,
             posteriors_op=posteriors_op,
             model=model,
             dataset=test_clean_data,
             label_type=params['label_type'],
             num_stack=params['num_stack'],
             # save_path=None)
             save_path=mkdir_join(model.save_path, 'ctc_output', 'test-clean'))
        plot(session=sess,
             posteriors_op=posteriors_op,
             model=model,
             dataset=test_other_data,
             label_type=params['label_type'],
             num_stack=params['num_stack'],
             # save_path=None)
             save_path=mkdir_join(model.save_path, 'ctc_output', 'test-other'))
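

# ----------------------------------------------------------------------------
# NOTE: illustrative sketch, not part of the original script. The plot()
# helper used above is defined elsewhere; the hypothetical stand-in below
# shows one straightforward way to visualize a (num_frames, num_classes)
# posterior matrix with matplotlib. The data in the usage example is random.
# ----------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt


def plot_ctc_posteriors(probs, blank_index, save_path=None):
    """Plot per-class posterior curves over time.

    probs: array of shape (num_frames, num_classes); real CTC posteriors
    typically show a dominant, spiky blank class between label spikes.
    """
    times = np.arange(probs.shape[0])
    for c in range(probs.shape[1]):
        style = '--' if c == blank_index else '-'
        plt.plot(times, probs[:, c], style, linewidth=1)
    plt.xlabel('frame')
    plt.ylabel('posterior')
    plt.ylim(0, 1)
    if save_path is not None:
        plt.savefig(save_path, dpi=150)
    else:
        plt.show()
    plt.close()


# Usage example with random data (100 frames, 30 classes including the blank)
plot_ctc_posteriors(np.random.dirichlet(np.ones(30), size=100), blank_index=29)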