def do_train(model, params): """Run training. If target labels are phone, the model is evaluated by PER with 39 phones. Args: model: the model to train params (dict): A dictionary of parameters """ # Load dataset train_data = Dataset(data_type='train', label_type=params['label_type'], batch_size=params['batch_size'], eos_index=params['eos_index'], max_epoch=params['num_epoch'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=True) dev_data = Dataset(data_type='dev', label_type=params['label_type'], batch_size=params['batch_size'], eos_index=params['eos_index'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False) if 'char' in params['label_type']: test_data = Dataset(data_type='test', label_type=params['label_type'], batch_size=1, eos_index=params['eos_index'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False) else: test_data = Dataset(data_type='test', label_type='phone39', batch_size=1, eos_index=params['eos_index'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False) # TODO(hirofumi): add frame_stacking and splice # Tell TensorFlow that the model will be built into the default graph with tf.Graph().as_default(): # Define placeholders model.create_placeholders() learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate') # Add to the graph each operation (including model definition) loss_op, att_logits, ctc_logits, decoder_outputs_train, decoder_outputs_infer = model.compute_loss( model.inputs_pl_list[0], model.att_labels_pl_list[0], model.inputs_seq_len_pl_list[0], model.att_labels_seq_len_pl_list[0], model.ctc_labels_pl_list[0], model.keep_prob_input_pl_list[0], model.keep_prob_hidden_pl_list[0], model.keep_prob_output_pl_list[0]) train_op = model.train(loss_op, optimizer=params['optimizer'], learning_rate=learning_rate_pl) _, decode_op_infer = model.decoder(decoder_outputs_train, decoder_outputs_infer, decode_type='greedy', beam_width=20) ler_op = model.compute_ler(model.att_labels_st_true_pl, model.att_labels_st_pred_pl) # Define learning rate controller lr_controller = Controller( learning_rate_init=params['learning_rate'], decay_start_epoch=params['decay_start_epoch'], decay_rate=params['decay_rate'], decay_patient_epoch=params['decay_patient_epoch'], lower_better=True) # Build the summary tensor based on the TensorFlow collection of # summaries summary_train = tf.summary.merge(model.summaries_train) summary_dev = tf.summary.merge(model.summaries_dev) # Add the variable initializer operation init_op = tf.global_variables_initializer() # Create a saver for writing training checkpoints saver = tf.train.Saver(max_to_keep=None) # Count total param parameters_dict, total_parameters = count_total_parameters( tf.trainable_variables()) for parameter_name in sorted(parameters_dict.keys()): print("%s %d" % (parameter_name, parameters_dict[parameter_name])) print("Total %d variables, %s M param" % (len(parameters_dict.keys()), "{:,}".format( total_parameters / 1000000))) csv_steps, csv_loss_train, csv_loss_dev = [], [], [] csv_ler_train, csv_ler_dev = [], [] # Create a session for running operation on the graph with tf.Session() as sess: # Instantiate a SummaryWriter to output summaries and the graph summary_writer = tf.summary.FileWriter(model.save_path, sess.graph) # Initialize param sess.run(init_op) # Train model start_time_train = time.time() start_time_epoch = time.time() start_time_step 
= time.time() ler_dev_best = 1 learning_rate = float(params['learning_rate']) for step, (data, is_new_epoch) in enumerate(train_data): # Create feed dictionary for next mini batch (train) inputs, att_labels_train, ctc_labels, inputs_seq_len, att_labels_seq_len, _ = data feed_dict_train = { model.inputs_pl_list[0]: inputs, model.att_labels_pl_list[0]: att_labels_train, model.inputs_seq_len_pl_list[0]: inputs_seq_len, model.att_labels_seq_len_pl_list[0]: att_labels_seq_len, model.ctc_labels_pl_list[0]: list2sparsetensor( ctc_labels, padded_value=train_data.ctc_padded_value), model.keep_prob_input_pl_list[0]: params['dropout_input'], model.keep_prob_hidden_pl_list[0]: params['dropout_hidden'], model.keep_prob_output_pl_list[0]: params['dropout_output'], learning_rate_pl: learning_rate } # Update param sess.run(train_op, feed_dict=feed_dict_train) if (step + 1) % params['print_step'] == 0: # Create feed dictionary for next mini batch (dev) (inputs, att_labels_dev, ctc_labels, inputs_seq_len, att_labels_seq_len, _), _ = dev_data().next() feed_dict_dev = { model.inputs_pl_list[0]: inputs, model.att_labels_pl_list[0]: att_labels_dev, model.inputs_seq_len_pl_list[0]: inputs_seq_len, model.att_labels_seq_len_pl_list[0]: att_labels_seq_len, model.ctc_labels_pl_list[0]: list2sparsetensor( ctc_labels, padded_value=dev_data.ctc_padded_value), model.keep_prob_input_pl_list[0]: 1.0, model.keep_prob_hidden_pl_list[0]: 1.0, model.keep_prob_output_pl_list[0]: 1.0 } # Compute loss loss_train = sess.run(loss_op, feed_dict=feed_dict_train) loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev) csv_steps.append(step) csv_loss_train.append(loss_train) csv_loss_dev.append(loss_dev) # Change to evaluation mode feed_dict_train[model.keep_prob_input_pl_list[0]] = 1.0 feed_dict_train[model.keep_prob_hidden_pl_list[0]] = 1.0 feed_dict_train[model.keep_prob_output_pl_list[0]] = 1.0 # Predict class ids & update event files predicted_ids_train, summary_str_train = sess.run( [decode_op_infer, summary_train], feed_dict=feed_dict_train) predicted_ids_dev, summary_str_dev = sess.run( [decode_op_infer, summary_dev], feed_dict=feed_dict_dev) summary_writer.add_summary(summary_str_train, step + 1) summary_writer.add_summary(summary_str_dev, step + 1) summary_writer.flush() # Convert to sparsetensor to compute LER feed_dict_ler_train = { model.att_labels_true_st: list2sparsetensor(att_labels_train, padded_value=params['eos_index']), model.att_labels_st_pred_pl: list2sparsetensor(predicted_ids_train, padded_value=params['eos_index']) } feed_dict_ler_dev = { model.att_labels_true_st: list2sparsetensor(att_labels_dev, padded_value=params['eos_index']), model.att_labels_st_pred_pl: list2sparsetensor(predicted_ids_dev, padded_value=params['eos_index']) } # Compute accuracy ler_train = sess.run(ler_op, feed_dict=feed_dict_ler_train) ler_dev = sess.run(ler_op, feed_dict=feed_dict_ler_dev) csv_ler_train.append(ler_train) csv_ler_dev.append(ler_dev) duration_step = time.time() - start_time_step print( "Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" % (step + 1, train_data.epoch_detail, loss_train, loss_dev, ler_train, ler_dev, learning_rate, duration_step / 60)) # sys.stdout.flush() start_time_step = time.time() # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch print('-----EPOCH:%d (%.3f min)-----' % (train_data.epoch, duration_epoch / 60)) # Save fugure of loss & ler plot_loss(csv_loss_train, csv_loss_dev, csv_steps, 
save_path=model.save_path) plot_ler(csv_ler_train, csv_ler_dev, csv_steps, label_type=params['label_type'], save_path=model.save_path) if train_data.epoch >= params['eval_start_epoch']: start_time_eval = time.time() if 'char' in params['label_type']: print('=== Dev Data Evaluation ===') ler_dev_epoch = do_eval_cer( session=sess, decode_op=decode_op_infer, model=model, dataset=dev_data, eval_batch_size=1) print(' CER: %f %%' % (ler_dev_epoch * 100)) if ler_dev_epoch < ler_dev_best: ler_dev_best = ler_dev_epoch print('■■■ ↑Best Score (CER)↑ ■■■') # Save model only when best accuracy is # obtained (check point) checkpoint_file = join(model.save_path, 'model.ckpt') save_path = saver.save( sess, checkpoint_file, global_step=train_data.epoch) print("Model saved in file: %s" % save_path) print('=== Test Data Evaluation ===') ler_test = do_eval_cer( session=sess, decode_op=decode_op_infer, model=model, dataset=test_data, eval_batch_size=1) print(' CER: %f %%' % (ler_test * 100)) else: print('=== Dev Data Evaluation ===') ler_dev_epoch = do_eval_per( session=sess, decode_op=decode_op_infer, per_op=ler_op, model=model, dataset=dev_data, label_type=params['label_type'], eval_batch_size=1) print(' PER: %f %%' % (ler_dev_epoch * 100)) if ler_dev_epoch < ler_dev_best: ler_dev_best = ler_dev_epoch print('■■■ ↑Best Score (PER)↑ ■■■') # Save model only when best accuracy is # obtained (check point) checkpoint_file = join(model.save_path, 'model.ckpt') save_path = saver.save( sess, checkpoint_file, global_step=train_data.epoch) print("Model saved in file: %s" % save_path) print('=== Test Data Evaluation ===') ler_test = do_eval_per( session=sess, decode_op=decode_op_infer, per_op=ler_op, model=model, dataset=test_data, label_type=params['label_type'], eval_batch_size=1) print(' PER: %f %%' % (ler_test * 100)) duration_eval = time.time() - start_time_eval print('Evaluation time: %.3f min' % (duration_eval / 60)) # Update learning rate learning_rate = lr_controller.decay_lr( learning_rate=learning_rate, epoch=train_data.epoch, value=ler_dev_epoch) start_time_epoch = time.time() duration_train = time.time() - start_time_train print('Total time: %.3f hour' % (duration_train / 3600)) # Training was finished correctly with open(join(model.save_path, 'complete.txt'), 'w') as f: f.write('')
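# NOTE: The feed dictionaries above rely on a helper `list2sparsetensor` that
# converts padded label lists into a tf.SparseTensorValue for the CTC loss and
# LER ops. A minimal sketch, assuming the signature implied by its call sites
# (the repo's own helper may differ in detail):
import numpy as np
import tensorflow as tf


def list2sparsetensor(labels, padded_value=-1):
    """Convert padded label sequences to a tf.SparseTensorValue.
    Args:
        labels (list): list of 1D label sequences, padded with `padded_value`
        padded_value (int): the id used for padding
    Returns:
        tf.SparseTensorValue with dense shape [batch_size, max_label_len]
    """
    indices, values = [], []
    for i_batch, label_seq in enumerate(labels):
        for i_time, label in enumerate(label_seq):
            if label == padded_value:
                continue
            # Keep only the non-padded entries as sparse (row, col) -> value
            indices.append([i_batch, i_time])
            values.append(label)
    dense_shape = [len(labels), max(len(label_seq) for label_seq in labels)]
    return tf.SparseTensorValue(
        indices=np.array(indices, dtype=np.int64),
        values=np.array(values, dtype=np.int32),
        dense_shape=np.array(dense_shape, dtype=np.int64))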
def do_train(model, params, gpu_indices): """Run training. Args: model: the model to train params (dict): A dictionary of parameters gpu_indices (list): GPU indices """ if 'kanji' in params['label_type']: map_file_path = '../metrics/mapping_files/' + \ params['label_type'] + '_' + params['train_data_size'] + '.txt' elif 'kana' in params['label_type']: map_file_path = '../metrics/mapping_files/' + \ params['label_type'] + '.txt' # Load dataset train_data = Dataset( data_type='train', train_data_size=params['train_data_size'], label_type=params['label_type'], map_file_path=map_file_path, batch_size=params['batch_size'], max_epoch=params['num_epoch'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=True, sort_stop_epoch=params['sort_stop_epoch'], num_gpu=len(gpu_indices)) dev_data = Dataset( data_type='dev', train_data_size=params['train_data_size'], label_type=params['label_type'], map_file_path=map_file_path, batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False, num_gpu=len(gpu_indices)) # Tell TensorFlow that the model will be built into the default graph with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to track the global step global_step = tf.Variable(0, name='global_step', trainable=False) # Set optimizer learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate') optimizer = model._set_optimizer( params['optimizer'], learning_rate_pl) # Calculate the gradients for each model tower total_grads_and_vars, total_losses = [], [] decode_ops_infer, ler_ops = [], [] all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(len(gpu_indices))] # NOTE: /cpu:0 is prepared for evaluation with tf.variable_scope(tf.get_variable_scope()): for i_gpu in range(len(all_devices)): with tf.device(all_devices[i_gpu]): with tf.name_scope('tower_gpu%d' % i_gpu) as scope: # Define placeholders in each tower model.create_placeholders() # Calculate the total loss for the current tower of the # model. This function constructs the entire model but # shares the variables across all towers. 
tower_loss, tower_logits, tower_decoder_outputs_train, tower_decoder_outputs_infer = model.compute_loss( model.inputs_pl_list[i_gpu], model.labels_pl_list[i_gpu], model.inputs_seq_len_pl_list[i_gpu], model.labels_seq_len_pl_list[i_gpu], model.keep_prob_encoder_pl_list[i_gpu], model.keep_prob_decoder_pl_list[i_gpu], model.keep_prob_embedding_pl_list[i_gpu], scope) tower_loss = tf.expand_dims(tower_loss, axis=0) total_losses.append(tower_loss) # Reuse variables for the next tower tf.get_variable_scope().reuse_variables() # Calculate the gradients for the batch of data on this # tower tower_grads_and_vars = optimizer.compute_gradients( tower_loss) # Gradient clipping tower_grads_and_vars = model._clip_gradients( tower_grads_and_vars) # TODO: Optionally add gradient noise # Keep track of the gradients across all towers total_grads_and_vars.append(tower_grads_and_vars) # Add to the graph each operation per tower _, decode_op_tower_infer = model.decode( tower_decoder_outputs_train, tower_decoder_outputs_infer) decode_ops_infer.append(decode_op_tower_infer) # ler_op_tower = model.compute_ler( # decode_op_tower, model.labels_pl_list[i_gpu]) ler_op_tower = model.compute_ler( model.labels_st_true_pl_list[i_gpu], model.labels_st_pred_pl_list[i_gpu]) ler_op_tower = tf.expand_dims(ler_op_tower, axis=0) ler_ops.append(ler_op_tower) # Aggregate losses, then calculate average loss total_losses = tf.concat(axis=0, values=total_losses) loss_op = tf.reduce_mean(total_losses, axis=0) ler_ops = tf.concat(axis=0, values=ler_ops) ler_op = tf.reduce_mean(ler_ops, axis=0) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers average_grads_and_vars = average_gradients(total_grads_and_vars) # Apply the gradients to adjust the shared variables. train_op = optimizer.apply_gradients(average_grads_and_vars, global_step=global_step) # Define learning rate controller lr_controller = Controller( learning_rate_init=params['learning_rate'], decay_start_epoch=params['decay_start_epoch'], decay_rate=params['decay_rate'], decay_patient_epoch=params['decay_patient_epoch'], lower_better=True) # Build the summary tensor based on the TensorFlow collection of # summaries summary_train = tf.summary.merge(model.summaries_train) summary_dev = tf.summary.merge(model.summaries_dev) # Add the variable initializer operation init_op = tf.global_variables_initializer() # Create a saver for writing training checkpoints saver = tf.train.Saver(max_to_keep=None) # Count total parameters parameters_dict, total_parameters = count_total_parameters( tf.trainable_variables()) for parameter_name in sorted(parameters_dict.keys()): print("%s %d" % (parameter_name, parameters_dict[parameter_name])) print("Total %d variables, %s M parameters" % (len(parameters_dict.keys()), "{:,}".format(total_parameters / 1000000))) csv_steps, csv_loss_train, csv_loss_dev = [], [], [] csv_ler_train, csv_ler_dev = [], [] # Create a session for running operation on the graph # NOTE: Start running operations on the Graph. allow_soft_placement # must be set to True to build towers on GPU, as some of the ops do not # have GPU implementations. 
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: # Instantiate a SummaryWriter to output summaries and the graph summary_writer = tf.summary.FileWriter( model.save_path, sess.graph) # Initialize param sess.run(init_op) # Train model start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() cer_dev_best = 1 not_improved_epoch = 0 learning_rate = float(params['learning_rate']) for step, (data, is_new_epoch) in enumerate(train_data): # Create feed dictionary for next mini batch (train) inputs, labels_train, inputs_seq_len, labels_seq_len, _ = data feed_dict_train = {} for i_gpu in range(len(gpu_indices)): feed_dict_train[model.inputs_pl_list[i_gpu] ] = inputs[i_gpu] feed_dict_train[model.labels_pl_list[i_gpu] ] = labels_train[i_gpu] feed_dict_train[model.inputs_seq_len_pl_list[i_gpu] ] = inputs_seq_len[i_gpu] feed_dict_train[model.labels_seq_len_pl_list[i_gpu] ] = labels_seq_len[i_gpu] feed_dict_train[model.keep_prob_encoder_pl_list[i_gpu] ] = 1 - float(params['dropout_encoder']) feed_dict_train[model.keep_prob_decoder_pl_list[i_gpu] ] = 1 - float(params['dropout_decoder']) feed_dict_train[model.keep_prob_embedding_pl_list[i_gpu] ] = 1 - float(params['dropout_embedding']) feed_dict_train[learning_rate_pl] = learning_rate # Update parameters sess.run(train_op, feed_dict=feed_dict_train) if (step + 1) % int(params['print_step'] / len(gpu_indices)) == 0: # Create feed dictionary for next mini batch (dev) inputs, labels_dev, inputs_seq_len, labels_seq_len, _ = dev_data.next()[ 0] feed_dict_dev = {} for i_gpu in range(len(gpu_indices)): feed_dict_dev[model.inputs_pl_list[i_gpu] ] = inputs[i_gpu] feed_dict_dev[model.labels_pl_list[i_gpu] ] = labels_dev[i_gpu] feed_dict_dev[model.inputs_seq_len_pl_list[i_gpu] ] = inputs_seq_len[i_gpu] feed_dict_dev[model.labels_seq_len_pl_list[i_gpu] ] = labels_seq_len[i_gpu] feed_dict_dev[model.keep_prob_encoder_pl_list[i_gpu] ] = 1.0 feed_dict_dev[model.keep_prob_decoder_pl_list[i_gpu] ] = 1.0 feed_dict_dev[model.keep_prob_embedding_pl_list[i_gpu] ] = 1.0 # Compute loss loss_train = sess.run( loss_op, feed_dict=feed_dict_train) loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev) csv_steps.append(step) csv_loss_train.append(loss_train) csv_loss_dev.append(loss_dev) # Change to evaluation mode for i_gpu in range(len(gpu_indices)): feed_dict_train[model.keep_prob_encoder_pl_list[i_gpu]] = 1.0 feed_dict_train[model.keep_prob_decoder_pl_list[i_gpu]] = 1.0 feed_dict_train[model.keep_prob_embedding_pl_list[i_gpu]] = 1.0 # Predict class ids predicted_ids_train_list, summary_str_train = sess.run( [decode_ops_infer, summary_train], feed_dict=feed_dict_train) predicted_ids_dev_list, summary_str_dev = sess.run( [decode_ops_infer, summary_dev], feed_dict=feed_dict_dev) # Convert to sparsetensor to compute LER feed_dict_ler_train = {} for i_gpu in range(len(gpu_indices)): feed_dict_ler_train[model.labels_st_true_pl_list[i_gpu]] = list2sparsetensor( labels_train[i_gpu], padded_value=train_data.padded_value), feed_dict_ler_train[model.labels_st_pred_pl_list[i_gpu]] = list2sparsetensor( predicted_ids_train_list[i_gpu], padded_value=train_data.padded_value) feed_dict_ler_dev = {} for i_gpu in range(len(gpu_indices)): feed_dict_ler_dev[model.labels_st_true_pl_list[i_gpu]] = list2sparsetensor( labels_dev[i_gpu], padded_value=dev_data.padded_value), feed_dict_ler_dev[model.labels_st_pred_pl_list[i_gpu]] = list2sparsetensor( predicted_ids_dev_list[i_gpu], padded_value=dev_data.padded_value) # 
Compute accuracy # ler_train = sess.run(ler_op, feed_dict=feed_dict_ler_train) # ler_dev = sess.run(ler_op, feed_dict=feed_dict_ler_dev) ler_train = 1 ler_dev = 1 csv_ler_train.append(ler_train) csv_ler_dev.append(ler_dev) # TODO: fix this # Update even files summary_writer.add_summary(summary_str_train, step + 1) summary_writer.add_summary(summary_str_dev, step + 1) summary_writer.flush() duration_step = time.time() - start_time_step print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" % (step + 1, train_data.epoch_detail, loss_train, loss_dev, ler_train, ler_dev, learning_rate, duration_step / 60)) sys.stdout.flush() start_time_step = time.time() # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch print('-----EPOCH:%d (%.3f min)-----' % (train_data.epoch, duration_epoch / 60)) # Save fugure of loss & ler plot_loss(csv_loss_train, csv_loss_dev, csv_steps, save_path=model.save_path) plot_ler(csv_ler_train, csv_ler_dev, csv_steps, label_type=params['label_type'], save_path=model.save_path) if train_data.epoch >= params['eval_start_epoch']: start_time_eval = time.time() print('=== Dev Data Evaluation ===') cer_dev_epoch = do_eval_cer( session=sess, decode_ops=decode_ops_infer, model=model, dataset=dev_data, label_type=params['label_type'], train_data_size=params['train_data_size'], eval_batch_size=1) print(' CER: %f %%' % (cer_dev_epoch * 100)) if cer_dev_epoch < cer_dev_best: cer_dev_best = cer_dev_epoch print('■■■ ↑Best Score (CER)↑ ■■■') # Save model (check point) checkpoint_file = join( model.save_path, 'model.ckpt') save_path = saver.save( sess, checkpoint_file, global_step=train_data.epoch) print("Model saved in file: %s" % save_path) else: not_improved_epoch += 1 duration_eval = time.time() - start_time_eval print('Evaluation time: %.3f min' % (duration_eval / 60)) # Early stopping if not_improved_epoch == params['not_improved_patient_epoch']: break # Update learning rate learning_rate = lr_controller.decay_lr( learning_rate=learning_rate, epoch=train_data.epoch, value=cer_dev_epoch) start_time_epoch = time.time() duration_train = time.time() - start_time_train print('Total time: %.3f hour' % (duration_train / 3600)) # Training was finished correctly with open(join(model.save_path, 'complete.txt'), 'w') as f: f.write('')
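# NOTE: `average_gradients` above is the synchronization point of the
# multi-tower training. A minimal sketch following the standard TensorFlow
# multi-GPU pattern (one (grad, var) list per tower in, one averaged list out);
# the project's own helper may additionally handle None gradients:
import tensorflow as tf


def average_gradients(tower_grads_and_vars):
    """Average gradients across towers.
    Args:
        tower_grads_and_vars (list): one list of (grad, var) pairs per tower
    Returns:
        list of (averaged_grad, var) pairs shared over all towers
    """
    average_grads_and_vars = []
    for grads_and_vars_per_var in zip(*tower_grads_and_vars):
        # grads_and_vars_per_var: ((grad_gpu0, var), (grad_gpu1, var), ...)
        grads = [tf.expand_dims(grad, axis=0)
                 for grad, _ in grads_and_vars_per_var]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # Variables are shared across towers, so take the one from tower 0
        average_grads_and_vars.append((grad, grads_and_vars_per_var[0][1]))
    return average_grads_and_vars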
def do_train(model, params): """Run training. Args: model: the model to train params (dict): A dictionary of parameters """ # Load dataset train_data = Dataset( data_type='train', label_type=params['label_type'], batch_size=params['batch_size'], max_epoch=params['num_epoch'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True) dev_data = Dataset( data_type='dev', label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=False) test_data = Dataset( data_type='dev', label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=False) # Tell TensorFlow that the model will be built into the default graph with tf.Graph().as_default(): # Define placeholders model.create_placeholders() learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate') # Add to the graph each operation (including model definition) loss_op, logits = model.compute_loss( model.inputs_pl_list[0], model.labels_pl_list[0], model.inputs_seq_len_pl_list[0], model.keep_prob_pl_list[0]) train_op = model.train( loss_op, optimizer=params['optimizer'], learning_rate=learning_rate_pl) decode_op = model.decoder(logits, model.inputs_seq_len_pl_list[0], beam_width=params['beam_width']) ler_op = model.compute_ler(decode_op, model.labels_pl_list[0]) posteriors_op = model.posteriors(logits, blank_prior=1) # Define learning rate controller lr_controller = Controller( learning_rate_init=params['learning_rate'], decay_start_epoch=params['decay_start_epoch'], decay_rate=params['decay_rate'], decay_patient_epoch=params['decay_patient_epoch'], lower_better=False) # Build the summary tensor based on the TensorFlow collection of # summaries summary_train = tf.summary.merge(model.summaries_train) summary_dev = tf.summary.merge(model.summaries_dev) # Add the variable initializer operation init_op = tf.global_variables_initializer() # Create a saver for writing training checkpoints saver = tf.train.Saver(max_to_keep=None) # Count total parameters parameters_dict, total_parameters = count_total_parameters( tf.trainable_variables()) for parameter_name in sorted(parameters_dict.keys()): print("%s %d" % (parameter_name, parameters_dict[parameter_name])) print("Total %d variables, %s M parameters" % (len(parameters_dict.keys()), "{:,}".format(total_parameters / 1000000))) csv_steps, csv_loss_train, csv_loss_dev = [], [], [] csv_ler_train, csv_ler_dev = [], [] # Create a session for running operation on the graph with tf.Session() as sess: # Instantiate a SummaryWriter to output summaries and the graph summary_writer = tf.summary.FileWriter( model.save_path, sess.graph) # Initialize parameters sess.run(init_op) # Train model start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() fmean_dev_best = 0 fmean_time_dev_best = 0 learning_rate = float(params['learning_rate']) for step, (data, is_new_epoch) in enumerate(train_data): # Create feed dictionary for next mini batch (train) inputs, labels, inputs_seq_len, _ = data feed_dict_train = { model.inputs_pl_list[0]: inputs[0], model.labels_pl_list[0]: list2sparsetensor( labels[0], padded_value=train_data.padded_value), model.inputs_seq_len_pl_list[0]: inputs_seq_len[0], model.keep_prob_pl_list[0]: 1 - float(params['dropout']), learning_rate_pl: learning_rate } # Update parameters sess.run(train_op, feed_dict=feed_dict_train) if 
(step + 1) % params['print_step'] == 0: # Create feed dictionary for next mini batch (dev) (inputs, labels, inputs_seq_len, _), _ = dev_data.next() feed_dict_dev = { model.inputs_pl_list[0]: inputs[0], model.labels_pl_list[0]: list2sparsetensor( labels[0], padded_value=dev_data.padded_value), model.inputs_seq_len_pl_list[0]: inputs_seq_len[0], model.keep_prob_pl_list[0]: 1.0 } # Compute loss loss_train = sess.run(loss_op, feed_dict=feed_dict_train) loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev) csv_steps.append(step) csv_loss_train.append(loss_train) csv_loss_dev.append(loss_dev) # Change to evaluation mode feed_dict_train[model.keep_prob_pl_list[0]] = 1.0 # Compute accuracy & update event files ler_train, summary_str_train = sess.run( [ler_op, summary_train], feed_dict=feed_dict_train) ler_dev, summary_str_dev = sess.run( [ler_op, summary_dev], feed_dict=feed_dict_dev) csv_ler_train.append(ler_train) csv_ler_dev.append(ler_dev) summary_writer.add_summary(summary_str_train, step + 1) summary_writer.add_summary(summary_str_dev, step + 1) summary_writer.flush() duration_step = time.time() - start_time_step print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" % (step + 1, train_data.epoch_detail, loss_train, loss_dev, ler_train, ler_dev, learning_rate, duration_step / 60)) sys.stdout.flush() start_time_step = time.time() # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch print('-----EPOCH:%d (%.3f min)-----' % (train_data.epoch, duration_epoch / 60)) # Save fugure of loss & ler plot_loss(csv_loss_train, csv_loss_dev, csv_steps, save_path=model.save_path) plot_ler(csv_ler_train, csv_ler_dev, csv_steps, label_type=params['label_type'], save_path=model.save_path) if train_data.epoch >= params['eval_start_epoch']: start_time_eval = time.time() print('=== Dev Data Evaluation ===') fmean_dev_epoch, df_acc = do_eval_fmeasure( session=sess, decode_op=decode_op, model=model, dataset=dev_data, eval_batch_size=params['batch_size']) print(df_acc) print(' F-measure: %f %%' % (fmean_dev_epoch)) if fmean_dev_epoch > fmean_dev_best: fmean_dev_best = fmean_dev_epoch print('■■■ ↑Best Score (F-measure)↑ ■■■') # Save model only when best accuracy is # obtained (check point) checkpoint_file = join( model.save_path, 'model.ckpt') save_path = saver.save( sess, checkpoint_file, global_step=train_data.epoch) print("Model saved in file: %s" % save_path) print('=== Test Data Evaluation ===') fmean_test_epoch, df_acc = do_eval_fmeasure( session=sess, decode_op=decode_op, model=model, dataset=test_data, eval_batch_size=params['batch_size']) print(df_acc) print(' F-measure: %f %%' % (fmean_test_epoch)) # fmean_time_dev_epoch, df_acc = do_eval_fmeasure_time( # session=sess, # decode_op=decode_op, # posteriors_op=posteriors_op, # model=model, # dataset=dev_data, # eval_batch_size=params['batch_size']) # print(df_acc) # print(' Time F-measure: %f %%' % # (fmean_time_dev_epoch)) # if fmean_time_dev_best < fmean_time_dev_epoch: # fmean_time_dev_best = fmean_time_dev_epoch # print('■■■ ↑Best Score (Time F-measure)↑ ■■■') # fmean_time_test_epoch, df_acc = do_eval_fmeasure_time( # session=sess, # decode_op=decode_op, # posteriors_op=posteriors_op, # model=model, # dataset=test_data, # eval_batch_size=params['batch_size']) # print(df_acc) # print(' Time F-measure: %f %%' % # (fmean_time_test_epoch)) duration_eval = time.time() - start_time_eval print('Evaluation time: %.3f min' % (duration_eval / 60)) # Update learning rate 
learning_rate = lr_controller.decay_lr( learning_rate=learning_rate, epoch=train_data.epoch, value=fmean_dev_epoch) start_time_epoch = time.time() duration_train = time.time() - start_time_train print('Total time: %.3f hour' % (duration_train / 3600)) # Training was finished correctly with open(join(model.save_path, 'complete.txt'), 'w') as f: f.write('')
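# NOTE: every script above drives its schedule through
# Controller.decay_lr(learning_rate, epoch, value). A minimal sketch of the
# assumed behaviour: hold the rate until `decay_start_epoch`, then multiply by
# `decay_rate` once the monitored metric has not improved for
# `decay_patient_epoch` epochs. The real controller (which in the PyTorch
# script also receives the optimizer and a `backend` argument) may differ:
class Controller(object):

    def __init__(self, learning_rate_init, decay_start_epoch, decay_rate,
                 decay_patient_epoch=1, lower_better=True):
        self.learning_rate_init = learning_rate_init
        self.decay_start_epoch = decay_start_epoch
        self.decay_rate = decay_rate
        self.decay_patient_epoch = decay_patient_epoch
        self.lower_better = lower_better
        self.best_value = float('inf') if lower_better else -float('inf')
        self.not_improved_epoch = 0

    def decay_lr(self, learning_rate, epoch, value):
        """Return the (possibly decayed) learning rate for the next epoch."""
        improved = (value < self.best_value) if self.lower_better \
            else (value > self.best_value)
        if improved:
            self.best_value = value
            self.not_improved_epoch = 0
            return learning_rate
        self.not_improved_epoch += 1
        if epoch < self.decay_start_epoch:
            return learning_rate
        if self.not_improved_epoch >= self.decay_patient_epoch:
            self.not_improved_epoch = 0
            return learning_rate * self.decay_rate
        return learning_rate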
def main(): args = parser.parse_args() ################################################## # DATSET ################################################## if args.model_save_path is not None: # Load a config file (.yml) params = load_config(args.config_path) # NOTE: Retrain the saved model from the last checkpoint elif args.saved_model_path is not None: params = load_config(os.path.join(args.saved_model_path, 'config.yml')) else: raise ValueError("Set model_save_path or saved_model_path.") # Load dataset train_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='train', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], max_epoch=params['num_epoch'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=True, sort_stop_epoch=params['sort_stop_epoch'], tool=params['tool'], num_enque=None, dynamic_batching=params['dynamic_batching']) dev_clean_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='dev_clean', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, tool=params['tool']) dev_other_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='dev_other', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], shuffle=True, tool=params['tool']) test_clean_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='test_clean', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], tool=params['tool']) test_other_data = Dataset(data_save_path=args.data_save_path, backend=params['backend'], input_channel=params['input_channel'], use_delta=params['use_delta'], use_double_delta=params['use_double_delta'], data_type='test_other', data_size=params['data_size'], label_type=params['label_type'], batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], tool=params['tool']) params['num_classes'] = train_data.num_classes ################################################## # MODEL ################################################## # Model setting model = load(model_type=params['model_type'], params=params, backend=params['backend']) if args.model_save_path is not None: # Set save path save_path = mkdir_join(args.model_save_path, params['backend'], params['model_type'], params['label_type'], params['data_size'], model.name) model.set_save_path(save_path) # Save config file save_config(config_path=args.config_path, save_path=model.save_path) # Setting for logging logger = set_logger(model.save_path) if os.path.isdir(params['char_init']): # NOTE: Start training from the pre-trained character 
model model.load_checkpoint(save_path=params['char_init'], epoch=-1, load_pretrained_model=True) # Count total parameters for name in sorted(list(model.num_params_dict.keys())): num_params = model.num_params_dict[name] logger.info("%s %d" % (name, num_params)) logger.info("Total %.3f M parameters" % (model.total_parameters / 1000000)) # Define optimizer model.set_optimizer(optimizer=params['optimizer'], learning_rate_init=float(params['learning_rate']), weight_decay=float(params['weight_decay']), clip_grad_norm=params['clip_grad_norm'], lr_schedule=False, factor=params['decay_rate'], patience_epoch=params['decay_patient_epoch']) epoch, step = 1, 0 learning_rate = float(params['learning_rate']) metric_dev_best = 1 # NOTE: Retrain the saved model from the last checkpoint elif args.saved_model_path is not None: # Set save path model.save_path = args.saved_model_path # Setting for logging logger = set_logger(model.save_path, restart=True) # Define optimizer model.set_optimizer( optimizer=params['optimizer'], learning_rate_init=float(params['learning_rate']), # on-the-fly weight_decay=float(params['weight_decay']), clip_grad_norm=params['clip_grad_norm'], lr_schedule=False, factor=params['decay_rate'], patience_epoch=params['decay_patient_epoch']) # Restore the last saved model epoch, step, learning_rate, metric_dev_best = model.load_checkpoint( save_path=args.saved_model_path, epoch=-1, restart=True) else: raise ValueError("Set model_save_path or saved_model_path.") train_data.epoch = epoch - 1 # GPU setting model.set_cuda(deterministic=False, benchmark=True) logger.info('PID: %s' % os.getpid()) logger.info('USERNAME: %s' % os.uname()[1]) # Set process name setproctitle('libri_' + params['backend'] + '_' + params['model_type'] + '_' + params['label_type'] + '_' + params['data_size']) ################################################## # TRAINING LOOP ################################################## # Define learning rate controller lr_controller = Controller( learning_rate_init=learning_rate, backend=params['backend'], decay_start_epoch=params['decay_start_epoch'], decay_rate=params['decay_rate'], decay_patient_epoch=params['decay_patient_epoch'], lower_better=True) # Setting for tensorboard if params['backend'] == 'pytorch': tf_writer = SummaryWriter(model.save_path) # Train model csv_steps, csv_loss_train, csv_loss_dev = [], [], [] start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() not_improved_epoch = 0 best_model = model loss_train_mean = 0. 
pbar_epoch = tqdm(total=len(train_data)) while True: # Compute loss in the training set (including parameter update) batch_train, is_new_epoch = train_data.next() model, loss_train_val = train_step(model, batch_train, params['clip_grad_norm'], backend=params['backend']) loss_train_mean += loss_train_val pbar_epoch.update(len(batch_train['xs'])) if (step + 1) % params['print_step'] == 0: # Compute loss in the dev set batch_dev = dev_clean_data.next()[0] loss_dev = model(batch_dev['xs'], batch_dev['ys'], batch_dev['x_lens'], batch_dev['y_lens'], is_eval=True) loss_train_mean /= params['print_step'] csv_steps.append(step) csv_loss_train.append(loss_train_mean) csv_loss_dev.append(loss_dev) # Logging by tensorboard if params['backend'] == 'pytorch': tf_writer.add_scalar('train/loss', loss_train_mean, step + 1) tf_writer.add_scalar('dev/loss', loss_dev, step + 1) for name, param in model.named_parameters(): name = name.replace('.', '/') tf_writer.add_histogram(name, param.data.cpu().numpy(), step + 1) tf_writer.add_histogram(name + '/grad', param.grad.data.cpu().numpy(), step + 1) duration_step = time.time() - start_time_step logger.info( "...Step:%d(epoch:%.3f) loss:%.3f(%.3f)/lr:%.5f/batch:%d/x_lens:%d (%.3f min)" % (step + 1, train_data.epoch_detail, loss_train_mean, loss_dev, learning_rate, train_data.current_batch_size, max(batch_train['x_lens']) * params['num_stack'], duration_step / 60)) start_time_step = time.time() loss_train_mean = 0. step += 1 # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch logger.info('===== EPOCH:%d (%.3f min) =====' % (epoch, duration_epoch / 60)) # Save fugure of loss plot_loss(csv_loss_train, csv_loss_dev, csv_steps, save_path=model.save_path) if epoch < params['eval_start_epoch']: # Save the model model.save_checkpoint(model.save_path, epoch, step, learning_rate, metric_dev_best) else: start_time_eval = time.time() # dev if 'word' in params['label_type']: metric_dev_epoch, _ = do_eval_wer( models=[model], dataset=dev_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (dev-clean): %.3f %%' % (metric_dev_epoch * 100)) else: metric_dev_epoch, wer_dev_clean_epoch, _ = do_eval_cer( models=[model], dataset=dev_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info(' CER / WER (dev-clean): %.3f %% / %.3f %%' % ((metric_dev_epoch * 100), (wer_dev_clean_epoch * 100))) if metric_dev_epoch < metric_dev_best: metric_dev_best = metric_dev_epoch not_improved_epoch = 0 best_model = copy.deepcopy(model) logger.info('||||| Best Score |||||') # Save the model model.save_checkpoint(model.save_path, epoch, step, learning_rate, metric_dev_best) # dev-other & test if 'word' in params['label_type']: metric_dev_other_epoch, _ = do_eval_wer( models=[model], dataset=dev_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (dev-other): %.3f %%' % (metric_dev_other_epoch * 100)) wer_test_clean, _ = do_eval_wer( models=[model], dataset=test_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (test-clean): %.3f %%' % (wer_test_clean * 100)) wer_test_other, _ = do_eval_wer( models=[model], dataset=test_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_WORD, eval_batch_size=1) logger.info(' WER (test-other): %.3f %%' % (wer_test_other * 100)) logger.info( ' WER (test-mean): %.3f %%' % ((wer_test_clean + wer_test_other) * 100 / 2)) else: 
metric_dev_other_epoch, wer_dev_other_epoch, _ = do_eval_cer( models=[model], dataset=dev_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info( ' CER / WER (dev-other): %.3f %% / %.3f %%' % ((metric_dev_other_epoch * 100), (wer_dev_other_epoch * 100))) cer_test_clean, wer_test_clean, _ = do_eval_cer( models=[model], dataset=test_clean_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info( ' CER / WER (test-clean): %.3f %% / %.3f %%' % ((cer_test_clean * 100), (wer_test_clean * 100))) cer_test_other, wer_test_other, _ = do_eval_cer( models=[model], dataset=test_other_data, beam_width=1, max_decode_len=MAX_DECODE_LEN_CHAR, eval_batch_size=1) logger.info( ' CER / WER (test-other): %.3f %% / %.3f %%' % ((cer_test_other * 100), (wer_test_other * 100))) logger.info( ' CER / WER (test-mean): %.3f %% / %.3f %%' % (((cer_test_clean + cer_test_other) * 100 / 2), ((wer_test_clean + wer_test_other) * 100 / 2))) else: not_improved_epoch += 1 duration_eval = time.time() - start_time_eval logger.info('Evaluation time: %.3f min' % (duration_eval / 60)) # Early stopping if not_improved_epoch == params['not_improved_patient_epoch']: break # Update learning rate model.optimizer, learning_rate = lr_controller.decay_lr( optimizer=model.optimizer, learning_rate=learning_rate, epoch=epoch, value=metric_dev_epoch) if epoch == params['convert_to_sgd_epoch']: # Convert to fine-tuning stage model.set_optimizer( 'sgd', learning_rate_init=learning_rate, weight_decay=float(params['weight_decay']), clip_grad_norm=params['clip_grad_norm'], lr_schedule=False, factor=params['decay_rate'], patience_epoch=params['decay_patient_epoch']) logger.info('========== Convert to SGD ==========') # Inject Gaussian noise to all parameters if float(params['weight_noise_std']) > 0: model.weight_noise_injection = True pbar_epoch = tqdm(total=len(train_data)) print('========== EPOCH:%d (%.3f min) ==========' % (epoch, duration_epoch / 60)) if epoch == params['num_epoch']: break start_time_step = time.time() start_time_epoch = time.time() epoch += 1 # TODO: evaluate the best model by beam search here duration_train = time.time() - start_time_train logger.info('Total time: %.3f hour' % (duration_train / 3600)) if params['backend'] == 'pytorch': tf_writer.close() # Training was finished correctly with open(os.path.join(model.save_path, 'COMPLETE'), 'w') as f: f.write('')
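# NOTE: `train_step` in the loop above wraps one forward/backward pass and
# parameter update. A minimal sketch for the pytorch backend only, assuming
# the model exposes `optimizer` (as used by lr_controller.decay_lr above) and
# a loss-returning __call__(xs, ys, x_lens, y_lens) as in the dev-loss call;
# the real helper also covers other backends and error handling:
import torch


def train_step(model, batch, clip_grad_norm, backend='pytorch'):
    """Run a single training step and return (model, float loss)."""
    assert backend == 'pytorch', 'only the pytorch backend is sketched here'
    model.optimizer.zero_grad()
    loss = model(batch['xs'], batch['ys'], batch['x_lens'], batch['y_lens'])
    loss.backward()
    if clip_grad_norm > 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
    model.optimizer.step()
    return model, loss.item()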
def do_train(model, params, gpu_indices): """Run training. Args: model: the model to train params (dict): A dictionary of parameters gpu_indices (list): GPU indices """ if 'kanji' in params['label_type']: map_file_path = '../metrics/mapping_files/' + \ params['label_type'] + '_' + params['train_data_size'] + '.txt' elif 'kana' in params['label_type']: map_file_path = '../metrics/mapping_files/' + \ params['label_type'] + '.txt' # Load dataset train_data = Dataset(data_type='train', train_data_size=params['train_data_size'], label_type=params['label_type'], map_file_path=map_file_path, batch_size=params['batch_size'], max_epoch=params['num_epoch'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=True, sort_stop_epoch=params['sort_stop_epoch'], num_gpu=len(gpu_indices)) dev_data = Dataset(data_type='dev', train_data_size=params['train_data_size'], label_type=params['label_type'], map_file_path=map_file_path, batch_size=params['batch_size'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False, num_gpu=len(gpu_indices)) # Tell TensorFlow that the model will be built into the default graph with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to track the global step global_step = tf.Variable(0, name='global_step', trainable=False) # Set optimizer learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate') optimizer = model._set_optimizer(params['optimizer'], learning_rate_pl) # Calculate the gradients for each model tower total_grads_and_vars, total_losses = [], [] decode_ops_infer, ler_ops = [], [] all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(len(gpu_indices))] # NOTE: /cpu:0 is prepared for evaluation with tf.variable_scope(tf.get_variable_scope()): for i_gpu in range(len(all_devices)): with tf.device(all_devices[i_gpu]): with tf.name_scope('tower_gpu%d' % i_gpu) as scope: # Define placeholders in each tower model.create_placeholders() # Calculate the total loss for the current tower of the # model. This function constructs the entire model but # shares the variables across all towers. 
tower_loss, tower_logits, tower_decoder_outputs_train, tower_decoder_outputs_infer = model.compute_loss( model.inputs_pl_list[i_gpu], model.labels_pl_list[i_gpu], model.inputs_seq_len_pl_list[i_gpu], model.labels_seq_len_pl_list[i_gpu], model.keep_prob_encoder_pl_list[i_gpu], model.keep_prob_decoder_pl_list[i_gpu], model.keep_prob_embedding_pl_list[i_gpu], scope) tower_loss = tf.expand_dims(tower_loss, axis=0) total_losses.append(tower_loss) # Reuse variables for the next tower tf.get_variable_scope().reuse_variables() # Calculate the gradients for the batch of data on this # tower tower_grads_and_vars = optimizer.compute_gradients( tower_loss) # Gradient clipping tower_grads_and_vars = model._clip_gradients( tower_grads_and_vars) # TODO: Optionally add gradient noise # Keep track of the gradients across all towers total_grads_and_vars.append(tower_grads_and_vars) # Add to the graph each operation per tower _, decode_op_tower_infer = model.decode( tower_decoder_outputs_train, tower_decoder_outputs_infer) decode_ops_infer.append(decode_op_tower_infer) # ler_op_tower = model.compute_ler( # decode_op_tower, model.labels_pl_list[i_gpu]) ler_op_tower = model.compute_ler( model.labels_st_true_pl_list[i_gpu], model.labels_st_pred_pl_list[i_gpu]) ler_op_tower = tf.expand_dims(ler_op_tower, axis=0) ler_ops.append(ler_op_tower) # Aggregate losses, then calculate average loss total_losses = tf.concat(axis=0, values=total_losses) loss_op = tf.reduce_mean(total_losses, axis=0) ler_ops = tf.concat(axis=0, values=ler_ops) ler_op = tf.reduce_mean(ler_ops, axis=0) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers average_grads_and_vars = average_gradients(total_grads_and_vars) # Apply the gradients to adjust the shared variables. train_op = optimizer.apply_gradients(average_grads_and_vars, global_step=global_step) # Define learning rate controller lr_controller = Controller( learning_rate_init=params['learning_rate'], decay_start_epoch=params['decay_start_epoch'], decay_rate=params['decay_rate'], decay_patient_epoch=params['decay_patient_epoch'], lower_better=True) # Build the summary tensor based on the TensorFlow collection of # summaries summary_train = tf.summary.merge(model.summaries_train) summary_dev = tf.summary.merge(model.summaries_dev) # Add the variable initializer operation init_op = tf.global_variables_initializer() # Create a saver for writing training checkpoints saver = tf.train.Saver(max_to_keep=None) # Count total parameters parameters_dict, total_parameters = count_total_parameters( tf.trainable_variables()) for parameter_name in sorted(parameters_dict.keys()): print("%s %d" % (parameter_name, parameters_dict[parameter_name])) print("Total %d variables, %s M parameters" % (len(parameters_dict.keys()), "{:,}".format( total_parameters / 1000000))) csv_steps, csv_loss_train, csv_loss_dev = [], [], [] csv_ler_train, csv_ler_dev = [], [] # Create a session for running operation on the graph # NOTE: Start running operations on the Graph. allow_soft_placement # must be set to True to build towers on GPU, as some of the ops do not # have GPU implementations. 
with tf.Session( config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: # Instantiate a SummaryWriter to output summaries and the graph summary_writer = tf.summary.FileWriter(model.save_path, sess.graph) # Initialize param sess.run(init_op) # Train model start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() cer_dev_best = 1 not_improved_epoch = 0 learning_rate = float(params['learning_rate']) for step, (data, is_new_epoch) in enumerate(train_data): # Create feed dictionary for next mini batch (train) inputs, labels_train, inputs_seq_len, labels_seq_len, _ = data feed_dict_train = {} for i_gpu in range(len(gpu_indices)): feed_dict_train[ model.inputs_pl_list[i_gpu]] = inputs[i_gpu] feed_dict_train[ model.labels_pl_list[i_gpu]] = labels_train[i_gpu] feed_dict_train[model.inputs_seq_len_pl_list[ i_gpu]] = inputs_seq_len[i_gpu] feed_dict_train[model.labels_seq_len_pl_list[ i_gpu]] = labels_seq_len[i_gpu] feed_dict_train[ model.keep_prob_encoder_pl_list[i_gpu]] = 1 - float( params['dropout_encoder']) feed_dict_train[ model.keep_prob_decoder_pl_list[i_gpu]] = 1 - float( params['dropout_decoder']) feed_dict_train[ model.keep_prob_embedding_pl_list[i_gpu]] = 1 - float( params['dropout_embedding']) feed_dict_train[learning_rate_pl] = learning_rate # Update parameters sess.run(train_op, feed_dict=feed_dict_train) if (step + 1) % int( params['print_step'] / len(gpu_indices)) == 0: # Create feed dictionary for next mini batch (dev) inputs, labels_dev, inputs_seq_len, labels_seq_len, _ = dev_data.next( )[0] feed_dict_dev = {} for i_gpu in range(len(gpu_indices)): feed_dict_dev[ model.inputs_pl_list[i_gpu]] = inputs[i_gpu] feed_dict_dev[ model.labels_pl_list[i_gpu]] = labels_dev[i_gpu] feed_dict_dev[model.inputs_seq_len_pl_list[ i_gpu]] = inputs_seq_len[i_gpu] feed_dict_dev[model.labels_seq_len_pl_list[ i_gpu]] = labels_seq_len[i_gpu] feed_dict_dev[ model.keep_prob_encoder_pl_list[i_gpu]] = 1.0 feed_dict_dev[ model.keep_prob_decoder_pl_list[i_gpu]] = 1.0 feed_dict_dev[ model.keep_prob_embedding_pl_list[i_gpu]] = 1.0 # Compute loss loss_train = sess.run(loss_op, feed_dict=feed_dict_train) loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev) csv_steps.append(step) csv_loss_train.append(loss_train) csv_loss_dev.append(loss_dev) # Change to evaluation mode for i_gpu in range(len(gpu_indices)): feed_dict_train[ model.keep_prob_encoder_pl_list[i_gpu]] = 1.0 feed_dict_train[ model.keep_prob_decoder_pl_list[i_gpu]] = 1.0 feed_dict_train[ model.keep_prob_embedding_pl_list[i_gpu]] = 1.0 # Predict class ids predicted_ids_train_list, summary_str_train = sess.run( [decode_ops_infer, summary_train], feed_dict=feed_dict_train) predicted_ids_dev_list, summary_str_dev = sess.run( [decode_ops_infer, summary_dev], feed_dict=feed_dict_dev) # Convert to sparsetensor to compute LER feed_dict_ler_train = {} for i_gpu in range(len(gpu_indices)): feed_dict_ler_train[model.labels_st_true_pl_list[ i_gpu]] = list2sparsetensor( labels_train[i_gpu], padded_value=train_data.padded_value), feed_dict_ler_train[model.labels_st_pred_pl_list[ i_gpu]] = list2sparsetensor( predicted_ids_train_list[i_gpu], padded_value=train_data.padded_value) feed_dict_ler_dev = {} for i_gpu in range(len(gpu_indices)): feed_dict_ler_dev[model.labels_st_true_pl_list[ i_gpu]] = list2sparsetensor( labels_dev[i_gpu], padded_value=dev_data.padded_value), feed_dict_ler_dev[model.labels_st_pred_pl_list[ i_gpu]] = list2sparsetensor( predicted_ids_dev_list[i_gpu], 
padded_value=dev_data.padded_value) # Compute accuracy # ler_train = sess.run(ler_op, feed_dict=feed_dict_ler_train) # ler_dev = sess.run(ler_op, feed_dict=feed_dict_ler_dev) ler_train = 1 ler_dev = 1 csv_ler_train.append(ler_train) csv_ler_dev.append(ler_dev) # TODO: fix this # Update even files summary_writer.add_summary(summary_str_train, step + 1) summary_writer.add_summary(summary_str_dev, step + 1) summary_writer.flush() duration_step = time.time() - start_time_step print( "Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" % (step + 1, train_data.epoch_detail, loss_train, loss_dev, ler_train, ler_dev, learning_rate, duration_step / 60)) sys.stdout.flush() start_time_step = time.time() # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch print('-----EPOCH:%d (%.3f min)-----' % (train_data.epoch, duration_epoch / 60)) # Save fugure of loss & ler plot_loss(csv_loss_train, csv_loss_dev, csv_steps, save_path=model.save_path) plot_ler(csv_ler_train, csv_ler_dev, csv_steps, label_type=params['label_type'], save_path=model.save_path) if train_data.epoch >= params['eval_start_epoch']: start_time_eval = time.time() print('=== Dev Data Evaluation ===') cer_dev_epoch = do_eval_cer( session=sess, decode_ops=decode_ops_infer, model=model, dataset=dev_data, label_type=params['label_type'], train_data_size=params['train_data_size'], eval_batch_size=1) print(' CER: %f %%' % (cer_dev_epoch * 100)) if cer_dev_epoch < cer_dev_best: cer_dev_best = cer_dev_epoch print('■■■ ↑Best Score (CER)↑ ■■■') # Save model (check point) checkpoint_file = join(model.save_path, 'model.ckpt') save_path = saver.save( sess, checkpoint_file, global_step=train_data.epoch) print("Model saved in file: %s" % save_path) else: not_improved_epoch += 1 duration_eval = time.time() - start_time_eval print('Evaluation time: %.3f min' % (duration_eval / 60)) # Early stopping if not_improved_epoch == params[ 'not_improved_patient_epoch']: break # Update learning rate learning_rate = lr_controller.decay_lr( learning_rate=learning_rate, epoch=train_data.epoch, value=cer_dev_epoch) start_time_epoch = time.time() duration_train = time.time() - start_time_train print('Total time: %.3f hour' % (duration_train / 3600)) # Training was finished correctly with open(join(model.save_path, 'complete.txt'), 'w') as f: f.write('')
def do_train(model, params): """Run training. If target labels are phone, the model is evaluated by PER with 39 phones. Args: model: the model to train params (dict): A dictionary of parameters """ map_file_path_train = '../metrics/mapping_files/' + \ params['label_type'] + '.txt' if 'phone' in params['label_type']: map_file_path_eval = '../metrics/mapping_files/phone39.txt' else: map_file_path_eval = '../metrics/mapping_files/' + \ params['label_type'] + '.txt' # Load dataset train_data = Dataset( data_type='train', label_type=params['label_type'], batch_size=params['batch_size'], map_file_path=map_file_path_train, max_epoch=params['num_epoch'], splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=True, sort_stop_epoch=params['sort_stop_epoch']) dev_data = Dataset( data_type='dev', label_type=params['label_type'], batch_size=params['batch_size'], map_file_path=map_file_path_train, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False) if 'char' in params['label_type']: test_data = Dataset( data_type='test', label_type=params['label_type'], batch_size=1, map_file_path=map_file_path_eval, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False) else: test_data = Dataset( data_type='test', label_type='phone39', batch_size=1, map_file_path=map_file_path_eval, splice=params['splice'], num_stack=params['num_stack'], num_skip=params['num_skip'], sort_utt=False) # Tell TensorFlow that the model will be built into the default graph with tf.Graph().as_default(): # Define placeholders model.create_placeholders() learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate') # Add to the graph each operation (including model definition) loss_op, logits, decoder_outputs_train, decoder_outputs_infer = model.compute_loss( model.inputs_pl_list[0], model.labels_pl_list[0], model.inputs_seq_len_pl_list[0], model.labels_seq_len_pl_list[0], model.keep_prob_encoder_pl_list[0], model.keep_prob_decoder_pl_list[0], model.keep_prob_embedding_pl_list[0]) train_op = model.train(loss_op, optimizer=params['optimizer'], learning_rate=learning_rate_pl) _, decode_op_infer = model.decode( decoder_outputs_train, decoder_outputs_infer) ler_op = model.compute_ler(model.labels_st_true_pl, model.labels_st_pred_pl) # Define learning rate controller lr_controller = Controller( learning_rate_init=params['learning_rate'], decay_start_epoch=params['decay_start_epoch'], decay_rate=params['decay_rate'], decay_patient_epoch=params['decay_patient_epoch'], lower_better=True) # Build the summary tensor based on the TensorFlow collection of # summaries summary_train = tf.summary.merge(model.summaries_train) summary_dev = tf.summary.merge(model.summaries_dev) # Add the variable initializer operation init_op = tf.global_variables_initializer() # Create a saver for writing training checkpoints saver = tf.train.Saver(max_to_keep=None) # Count total param parameters_dict, total_parameters = count_total_parameters( tf.trainable_variables()) for parameter_name in sorted(parameters_dict.keys()): print("%s %d" % (parameter_name, parameters_dict[parameter_name])) print("Total %d variables, %s M param" % (len(parameters_dict.keys()), "{:,}".format(total_parameters / 1000000))) csv_steps, csv_loss_train, csv_loss_dev = [], [], [] csv_ler_train, csv_ler_dev = [], [] # Create a session for running operation on the graph with tf.Session() as sess: # Instantiate a SummaryWriter to output summaries and the graph 
summary_writer = tf.summary.FileWriter( model.save_path, sess.graph) # Initialize param sess.run(init_op) # Train model start_time_train = time.time() start_time_epoch = time.time() start_time_step = time.time() ler_dev_best = 1 learning_rate = float(params['learning_rate']) for step, (data, is_new_epoch) in enumerate(train_data): # Create feed dictionary for next mini batch (train) inputs, labels_train, inputs_seq_len, labels_seq_len, _ = data feed_dict_train = { model.inputs_pl_list[0]: inputs[0], model.labels_pl_list[0]: labels_train[0], model.inputs_seq_len_pl_list[0]: inputs_seq_len[0], model.labels_seq_len_pl_list[0]: labels_seq_len[0], model.keep_prob_encoder_pl_list[0]: 1 - float(params['dropout_encoder']), model.keep_prob_decoder_pl_list[0]: 1 - float(params['dropout_decoder']), model.keep_prob_embedding_pl_list[0]: 1 - float(params['dropout_embedding']), learning_rate_pl: learning_rate } # Update parameters sess.run(train_op, feed_dict=feed_dict_train) if (step + 1) % params['print_step'] == 0: # Create feed dictionary for next mini batch (dev) (inputs, labels_dev, inputs_seq_len, labels_seq_len, _), _ = dev_data.next() feed_dict_dev = { model.inputs_pl_list[0]: inputs[0], model.labels_pl_list[0]: labels_dev[0], model.inputs_seq_len_pl_list[0]: inputs_seq_len[0], model.labels_seq_len_pl_list[0]: labels_seq_len[0], model.keep_prob_encoder_pl_list[0]: 1.0, model.keep_prob_decoder_pl_list[0]: 1.0, model.keep_prob_embedding_pl_list[0]: 1.0 } # Compute loss loss_train = sess.run(loss_op, feed_dict=feed_dict_train) loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev) csv_steps.append(step) csv_loss_train.append(loss_train) csv_loss_dev.append(loss_dev) # Change to evaluation mode feed_dict_train[model.keep_prob_encoder_pl_list[0]] = 1.0 feed_dict_train[model.keep_prob_decoder_pl_list[0]] = 1.0 feed_dict_train[model.keep_prob_embedding_pl_list[0]] = 1.0 # Predict class ids & update even files predicted_ids_train, summary_str_train = sess.run( [decode_op_infer, summary_train], feed_dict=feed_dict_train) predicted_ids_dev, summary_str_dev = sess.run( [decode_op_infer, summary_dev], feed_dict=feed_dict_dev) summary_writer.add_summary(summary_str_train, step + 1) summary_writer.add_summary(summary_str_dev, step + 1) summary_writer.flush() # Convert to sparsetensor to compute LER feed_dict_ler_train = { model.labels_st_true_pl: list2sparsetensor( labels_train[0], padded_value=train_data.padded_value), model.labels_st_pred_pl: list2sparsetensor( predicted_ids_train, padded_value=train_data.padded_value) } feed_dict_ler_dev = { model.labels_st_true_pl: list2sparsetensor( labels_dev[0], padded_value=dev_data.padded_value), model.labels_st_pred_pl: list2sparsetensor( predicted_ids_dev, padded_value=dev_data.padded_value) } # Compute accuracy ler_train = sess.run(ler_op, feed_dict=feed_dict_ler_train) ler_dev = sess.run(ler_op, feed_dict=feed_dict_ler_dev) csv_ler_train.append(ler_train) csv_ler_dev.append(ler_dev) duration_step = time.time() - start_time_step print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" % (step + 1, train_data.epoch_detail, loss_train, loss_dev, ler_train, ler_dev, learning_rate, duration_step / 60)) sys.stdout.flush() start_time_step = time.time() # Save checkpoint and evaluate model per epoch if is_new_epoch: duration_epoch = time.time() - start_time_epoch print('-----EPOCH:%d (%.3f min)-----' % (train_data.epoch, duration_epoch / 60)) # Save fugure of loss & ler plot_loss(csv_loss_train, csv_loss_dev, csv_steps, 
save_path=model.save_path) plot_ler(csv_ler_train, csv_ler_dev, csv_steps, label_type=params['label_type'], save_path=model.save_path) if train_data.epoch >= params['eval_start_epoch']: start_time_eval = time.time() if 'char' in params['label_type']: print('=== Dev Data Evaluation ===') ler_dev_epoch, wer_dev_epoch = do_eval_cer( session=sess, decode_op=decode_op_infer, model=model, dataset=dev_data, label_type=params['label_type'], eval_batch_size=1) print(' CER: %f %%' % (ler_dev_epoch * 100)) print(' WER: %f %%' % (wer_dev_epoch * 100)) if ler_dev_epoch < ler_dev_best: ler_dev_best = ler_dev_epoch print('■■■ ↑Best Score (CER)↑ ■■■') # Save model only when best accuracy is # obtained (check point) checkpoint_file = join( model.save_path, 'model.ckpt') save_path = saver.save( sess, checkpoint_file, global_step=train_data.epoch) print("Model saved in file: %s" % save_path) print('=== Test Data Evaluation ===') ler_test, wer_test = do_eval_cer( session=sess, decode_op=decode_op_infer, model=model, dataset=test_data, label_type=params['label_type'], is_test=True, eval_batch_size=1) print(' CER: %f %%' % (ler_test * 100)) print(' WER: %f %%' % (wer_test * 100)) else: print('=== Dev Data Evaluation ===') ler_dev_epoch = do_eval_per( session=sess, decode_op=decode_op_infer, per_op=ler_op, model=model, dataset=dev_data, label_type=params['label_type'], eval_batch_size=1) print(' PER: %f %%' % (ler_dev_epoch * 100)) if ler_dev_epoch < ler_dev_best: ler_dev_best = ler_dev_epoch print('■■■ ↑Best Score (PER)↑ ■■■') # Save model only when best accuracy is # obtained (check point) checkpoint_file = join( model.save_path, 'model.ckpt') save_path = saver.save( sess, checkpoint_file, global_step=train_data.epoch) print("Model saved in file: %s" % save_path) print('=== Test Data Evaluation ===') ler_test = do_eval_per( session=sess, decode_op=decode_op_infer, per_op=ler_op, model=model, dataset=test_data, label_type=params['label_type'], is_test=True, eval_batch_size=1) print(' PER: %f %%' % (ler_test * 100)) duration_eval = time.time() - start_time_eval print('Evaluation time: %.3f min' % (duration_eval / 60)) # Update learning rate learning_rate = lr_controller.decay_lr( learning_rate=learning_rate, epoch=train_data.epoch, value=ler_dev_epoch) start_time_step = time.time() start_time_epoch = time.time() duration_train = time.time() - start_time_train print('Total time: %.3f hour' % (duration_train / 3600)) # Training was finished correctly with open(join(model.save_path, 'complete.txt'), 'w') as f: f.write('')
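# NOTE: each TensorFlow script above reports model size via
# `count_total_parameters(tf.trainable_variables())`. A minimal sketch of what
# such a helper is assumed to do (per-variable sizes plus the total):
import tensorflow as tf


def count_total_parameters(trainable_variables):
    """Count parameters per variable and in total.
    Args:
        trainable_variables (list): e.g. the result of tf.trainable_variables()
    Returns:
        parameters_dict (dict): variable name -> number of parameters
        total_parameters (int): total number of parameters
    """
    parameters_dict = {}
    total_parameters = 0
    for var in trainable_variables:
        # Multiply the static shape dimensions of each variable
        num_params = 1
        for dim in var.get_shape():
            num_params *= int(dim)
        parameters_dict[var.name] = num_params
        total_parameters += num_params
    return parameters_dict, total_parameters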