def check_training(self):
    print('----- multitask -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels_true_char_st, labels_true_phone_st, inputs_seq_len = generate_data(
            label_type='multitask',
            model='ctc',
            batch_size=batch_size)

        # Define placeholders
        inputs_pl = tf.placeholder(tf.float32,
                                   shape=[None, None, inputs.shape[-1]],
                                   name='input')
        indices_pl = tf.placeholder(tf.int64, name='indices')
        values_pl = tf.placeholder(tf.int32, name='values')
        shape_pl = tf.placeholder(tf.int64, name='shape')
        labels_pl = tf.SparseTensor(indices_pl, values_pl, shape_pl)
        indices_second_pl = tf.placeholder(tf.int64, name='indices_second')
        values_second_pl = tf.placeholder(tf.int32, name='values_second')
        shape_second_pl = tf.placeholder(tf.int64, name='shape_second')
        labels_second_pl = tf.SparseTensor(indices_second_pl,
                                           values_second_pl,
                                           shape_second_pl)
        inputs_seq_len_pl = tf.placeholder(tf.int64,
                                           shape=[None],
                                           name='inputs_seq_len')
        keep_prob_input_pl = tf.placeholder(tf.float32,
                                            name='keep_prob_input')
        keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                             name='keep_prob_hidden')

        # Define model graph
        output_size_main = 26
        output_size_second = 61
        network = Multitask_BLSTM_CTC(
            batch_size=batch_size,
            input_size=inputs[0].shape[1],
            num_unit=256,
            num_layer_main=2,
            num_layer_second=1,
            output_size_main=output_size_main,
            output_size_second=output_size_second,
            main_task_weight=0.8,
            parameter_init=0.1,
            clip_grad=5.0,
            clip_activation=50,
            dropout_ratio_input=1.0,
            dropout_ratio_hidden=1.0,
            num_proj=None,
            weight_decay=1e-6)

        # Add to the graph each operation
        loss_op, logits_main, logits_second = network.compute_loss(
            inputs_pl,
            labels_pl,
            labels_second_pl,
            inputs_seq_len_pl,
            keep_prob_input_pl,
            keep_prob_hidden_pl)
        learning_rate = 1e-3
        train_op = network.train(loss_op,
                                 optimizer='rmsprop',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op_main, decode_op_second = network.decoder(
            logits_main,
            logits_second,
            inputs_seq_len_pl,
            decode_type='beam_search',
            beam_width=20)
        ler_op_main, ler_op_second = network.compute_ler(
            decode_op_main, decode_op_second,
            labels_pl, labels_second_pl)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict
        feed_dict = {
            inputs_pl: inputs,
            labels_pl: labels_true_char_st,
            labels_second_pl: labels_true_phone_st,
            inputs_seq_len_pl: inputs_seq_len,
            keep_prob_input_pl: network.dropout_ratio_input,
            keep_prob_hidden_pl: network.dropout_ratio_hidden,
            network.lr: learning_rate
        }

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_char_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode
                    feed_dict[keep_prob_input_pl] = 1.0
                    feed_dict[keep_prob_hidden_pl] = 1.0

                    # Compute accuracy
                    ler_train_char, ler_train_phone = sess.run(
                        [ler_op_main, ler_op_second], feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / cer = %.4f / per = %.4f (%.3f sec)\n' %
                          (step + 1, loss_train, ler_train_char,
                           ler_train_phone, duration_step))
                    start_time_step = time.time()

                    # Visualize
                    labels_pred_char_st, labels_pred_phone_st = sess.run(
                        [decode_op_main, decode_op_second],
                        feed_dict=feed_dict)
                    labels_true_char = sparsetensor2list(
                        labels_true_char_st, batch_size=batch_size)
                    labels_true_phone = sparsetensor2list(
                        labels_true_phone_st, batch_size=batch_size)
                    labels_pred_char = sparsetensor2list(
                        labels_pred_char_st, batch_size=batch_size)
                    labels_pred_phone = sparsetensor2list(
                        labels_pred_phone_st, batch_size=batch_size)

                    print('Character')
                    print('  True: %s' % num2alpha(labels_true_char[0]))
                    print('  Pred: %s' % num2alpha(labels_pred_char[0]))
                    print('Phone')
                    print('  True: %s' % num2phone(labels_true_phone[0]))
                    print('  Pred: %s' % num2phone(labels_pred_phone[0]))
                    print('----------------------------------------')

                    # Stop when the character LER has not improved for 5
                    # consecutive evaluations
                    if ler_train_char >= ler_train_char_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 5:
                        print('Model is converged.')
                        break
                    ler_train_char_pre = ler_train_char

                    # Change back to training mode (restore dropout, which the
                    # original code left disabled after the first evaluation)
                    feed_dict[keep_prob_input_pl] = network.dropout_ratio_input
                    feed_dict[keep_prob_hidden_pl] = network.dropout_ratio_hidden
                    network.is_training = True

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
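
# For reference, a minimal sketch of the `sparsetensor2list` helper called in
# the visualization block above. The real implementation is imported from the
# repo's utils (not shown in this section); the `_sketch` name marks this as a
# hypothetical reimplementation. It assumes the argument is a
# tf.SparseTensorValue whose indices are `[batch_index, time_index]` pairs, as
# produced by the CTC decoders.
def _sparsetensor2list_sketch(labels_st, batch_size):
    """Convert a tf.SparseTensorValue into a list of per-utterance label id lists."""
    indices = labels_st.indices
    values = labels_st.values
    labels = []
    for i_batch in range(batch_size):
        # Gather the values whose batch index equals i_batch
        labels.append(values[indices[:, 0] == i_batch].tolist())
    return labels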
def check_training(self, model_type, label_type):
    print('----- ' + model_type + ', ' + label_type + ' -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
            label_type=label_type,
            model='attention',
            batch_size=batch_size)

        # Define placeholders
        inputs_pl = tf.placeholder(tf.float32,
                                   shape=[batch_size, None, inputs.shape[-1]],
                                   name='input')
        # `[batch_size, max_time]`
        labels_pl = tf.placeholder(tf.int32,
                                   shape=[None, None],
                                   name='label')

        # These are prepared for computing LER
        indices_true_pl = tf.placeholder(tf.int64, name='indices_true')
        values_true_pl = tf.placeholder(tf.int32, name='values_true')
        shape_true_pl = tf.placeholder(tf.int64, name='shape_true')
        labels_st_true_pl = tf.SparseTensor(indices_true_pl,
                                            values_true_pl,
                                            shape_true_pl)
        indices_pred_pl = tf.placeholder(tf.int64, name='indices_pred')
        values_pred_pl = tf.placeholder(tf.int32, name='values_pred')
        shape_pred_pl = tf.placeholder(tf.int64, name='shape_pred')
        labels_st_pred_pl = tf.SparseTensor(indices_pred_pl,
                                            values_pred_pl,
                                            shape_pred_pl)
        inputs_seq_len_pl = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name='inputs_seq_len')
        labels_seq_len_pl = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name='labels_seq_len')
        keep_prob_input_pl = tf.placeholder(tf.float32,
                                            name='keep_prob_input')
        keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                             name='keep_prob_hidden')

        # Define model graph (+2 for the <SOS> and <EOS> tokens)
        output_size = 26 + 2 if label_type == 'character' else 61 + 2
        # model = load(model_type=model_type)
        network = BLSTMAttetion(batch_size=batch_size,
                                input_size=inputs[0].shape[1],
                                encoder_num_unit=256,
                                encoder_num_layer=2,
                                attention_dim=128,
                                decoder_num_unit=256,
                                decoder_num_layer=1,
                                embedding_dim=20,
                                output_size=output_size,
                                sos_index=output_size - 2,
                                eos_index=output_size - 1,
                                max_decode_length=50,
                                attention_weights_tempareture=1,
                                logits_tempareture=1,
                                parameter_init=0.1,
                                clip_grad=5.0,
                                clip_activation_encoder=50,
                                clip_activation_decoder=50,
                                dropout_ratio_input=1.0,
                                dropout_ratio_hidden=1.0,
                                weight_decay=0,
                                beam_width=0,
                                time_major=False)

        # Add to the graph each operation
        loss_op, logits, decoder_outputs_train, decoder_outputs_infer = network.compute_loss(
            inputs_pl,
            labels_pl,
            inputs_seq_len_pl,
            labels_seq_len_pl,
            keep_prob_input_pl,
            keep_prob_hidden_pl)
        learning_rate = 1e-3
        train_op = network.train(loss_op,
                                 optimizer='rmsprop',
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op_train, decode_op_infer = network.decoder(
            decoder_outputs_train,
            decoder_outputs_infer,
            decode_type='greedy',
            beam_width=1)
        ler_op = network.compute_ler(labels_st_true_pl, labels_st_pred_pl)
        attention_weights = decoder_outputs_infer.attention_scores

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict
        feed_dict = {
            inputs_pl: inputs,
            labels_pl: labels,
            inputs_seq_len_pl: inputs_seq_len,
            labels_seq_len_pl: labels_seq_len,
            keep_prob_input_pl: network.dropout_ratio_input,
            keep_prob_hidden_pl: network.dropout_ratio_hidden,
            network.lr: learning_rate
        }

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode
                    feed_dict[keep_prob_input_pl] = 1.0
                    feed_dict[keep_prob_hidden_pl] = 1.0

                    # Predict class ids
                    predicted_ids_train, predicted_ids_infer = sess.run(
                        [decode_op_train, decode_op_infer],
                        feed_dict=feed_dict)

                    # Compute accuracy
                    feed_dict_ler = {
                        labels_st_true_pl: list2sparsetensor(labels),
                        labels_st_pred_pl: list2sparsetensor(predicted_ids_infer)
                    }
                    ler_train = sess.run(ler_op, feed_dict=feed_dict_ler)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / ler = %.4f (%.3f sec)' %
                          (step + 1, loss_train, ler_train, duration_step))
                    start_time_step = time.time()

                    # Visualize
                    if label_type == 'character':
                        print('True            : %s' %
                              num2alpha(labels[0]))
                        print('Pred (Training) : <%s' %
                              num2alpha(predicted_ids_train[0]))
                        print('Pred (Inference): <%s' %
                              num2alpha(predicted_ids_infer[0]))
                    else:
                        print('True            : %s' %
                              num2phone(labels[0]))
                        print('Pred (Training) : < %s' %
                              num2phone(predicted_ids_train[0]))
                        print('Pred (Inference): < %s' %
                              num2phone(predicted_ids_infer[0]))

                    # Stop when the LER has not improved for 5 consecutive
                    # evaluations
                    if ler_train >= ler_train_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 5:
                        print('Model is converged.')
                        break
                    ler_train_pre = ler_train

                    # Change back to training mode (restore dropout, which the
                    # original code left disabled after the first evaluation)
                    feed_dict[keep_prob_input_pl] = network.dropout_ratio_input
                    feed_dict[keep_prob_hidden_pl] = network.dropout_ratio_hidden

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
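
# For reference, a minimal sketch of the `list2sparsetensor` helper used to
# build the LER feed dict above. The real helper is imported from the repo's
# utils; the `_sketch` name marks this as a hypothetical reimplementation, and
# the padding convention (entries equal to `padded_value` are skipped) is an
# assumption based on how it is called in do_train below.
import numpy as np


def _list2sparsetensor_sketch(labels, padded_value=-1):
    """Pack a batch of label id sequences into a tf.SparseTensorValue."""
    indices, values = [], []
    max_len = 0
    for i_batch, label_seq in enumerate(labels):
        for i_time, label in enumerate(label_seq):
            if label == padded_value:
                continue  # skip right-padding entries
            indices.append([i_batch, i_time])
            values.append(label)
        max_len = max(max_len, len(label_seq))
    # A SparseTensor placeholder can be fed with a tf.SparseTensorValue
    return tf.SparseTensorValue(np.array(indices, dtype=np.int64),
                                np.array(values, dtype=np.int32),
                                np.array([len(labels), max_len],
                                         dtype=np.int64))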
def do_train(network, param):
    """Run training.
    Args:
        network: network to train
        param: A dictionary of parameters
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         label_type=param['label_type'],
                         train_data_size=param['train_data_size'],
                         batch_size=param['batch_size'],
                         num_stack=param['num_stack'],
                         num_skip=param['num_skip'],
                         is_sorted=True)
    dev_data_step = Dataset(data_type='dev',
                            label_type=param['label_type'],
                            train_data_size=param['train_data_size'],
                            batch_size=param['batch_size'],
                            num_stack=param['num_stack'],
                            num_skip=param['num_skip'],
                            is_sorted=False)
    dev_data_epoch = Dataset(data_type='dev',
                             label_type=param['label_type'],
                             train_data_size=param['train_data_size'],
                             batch_size=param['batch_size'],
                             num_stack=param['num_stack'],
                             num_skip=param['num_skip'],
                             is_sorted=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():
        # Define placeholders
        network.inputs = tf.placeholder(tf.float32,
                                        shape=[None, None, network.input_size],
                                        name='input')
        indices_pl = tf.placeholder(tf.int64, name='indices')
        values_pl = tf.placeholder(tf.int32, name='values')
        shape_pl = tf.placeholder(tf.int64, name='shape')
        network.labels = tf.SparseTensor(indices_pl, values_pl, shape_pl)
        network.inputs_seq_len = tf.placeholder(tf.int64,
                                                shape=[None],
                                                name='inputs_seq_len')
        network.keep_prob_input = tf.placeholder(tf.float32,
                                                 name='keep_prob_input')
        network.keep_prob_hidden = tf.placeholder(tf.float32,
                                                  name='keep_prob_hidden')

        # Add to the graph each operation (including model definition)
        loss_op, logits = network.compute_loss(network.inputs,
                                               network.labels,
                                               network.inputs_seq_len,
                                               network.keep_prob_input,
                                               network.keep_prob_hidden)
        train_op = network.train(loss_op,
                                 optimizer=param['optimizer'],
                                 learning_rate_init=float(param['learning_rate']),
                                 is_scheduled=False)
        decode_op = network.decoder(logits,
                                    network.inputs_seq_len,
                                    decode_type='beam_search',
                                    beam_width=20)
        ler_op = network.compute_ler(decode_op, network.labels)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_train_loss, csv_dev_loss = [], [], []
        csv_ler_train, csv_ler_dev = [], []

        # Create a session for running operation on the graph
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(network.model_dir,
                                                   sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Make mini-batch generator
            mini_batch_train = train_data.next_batch()
            mini_batch_dev = dev_data_step.next_batch()

            # Train model
            iter_per_epoch = int(train_data.data_num / param['batch_size'])
            train_step = train_data.data_num / param['batch_size']
            if train_step != int(train_step):
                # Round up when the last batch is smaller than batch_size
                iter_per_epoch += 1
            max_steps = iter_per_epoch * param['num_epoch']
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            error_best = 1
            for step in range(max_steps):
                # Create feed dictionary for next mini-batch (train)
                with tf.device('/cpu:0'):
                    inputs, labels, inputs_seq_len, _ = mini_batch_train.__next__()
                feed_dict_train = {
                    network.inputs: inputs,
                    network.labels: list2sparsetensor(labels, padded_value=-1),
                    network.inputs_seq_len: inputs_seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden,
                    network.lr: float(param['learning_rate'])
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % 200 == 0:
                    # Create feed dictionary for next mini-batch (dev)
                    with tf.device('/cpu:0'):
                        inputs, labels, inputs_seq_len, _ = mini_batch_dev.__next__()
                    feed_dict_dev = {
                        network.inputs: inputs,
                        network.labels: list2sparsetensor(labels,
                                                          padded_value=-1),
                        network.inputs_seq_len: inputs_seq_len,
                        network.keep_prob_input: network.dropout_ratio_input,
                        network.keep_prob_hidden: network.dropout_ratio_hidden
                    }

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_train_loss.append(loss_train)
                    csv_dev_loss.append(loss_dev)

                    # Change to evaluation mode
                    feed_dict_train[network.keep_prob_input] = 1.0
                    feed_dict_train[network.keep_prob_hidden] = 1.0
                    feed_dict_dev[network.keep_prob_input] = 1.0
                    feed_dict_dev[network.keep_prob_hidden] = 1.0

                    # Compute accuracy & update event file
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)' %
                          (step + 1, loss_train, loss_dev,
                           ler_train, ler_dev, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (check point)
                    checkpoint_file = join(network.model_dir, 'model.ckpt')
                    save_path = saver.save(sess, checkpoint_file,
                                           global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    if epoch >= 5:
                        start_time_eval = time.time()
                        print('=== Dev Evaluation ===')
                        cer_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_op=decode_op,
                            network=network,
                            dataset=dev_data_epoch,
                            label_type=param['label_type'],
                            eval_batch_size=param['batch_size'])
                        if param['label_type'] in ['kana', 'kanji']:
                            print('  CER: %f %%' % (cer_dev_epoch * 100))
                        else:
                            print('  PER: %f %%' % (cer_dev_epoch * 100))

                        if cer_dev_epoch < error_best:
                            error_best = cer_dev_epoch
                            print('■■■ ↑Best Score↑ ■■■')

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                    start_time_epoch = time.time()
                    start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss, ler
            save_loss(csv_steps, csv_train_loss, csv_dev_loss,
                      save_path=network.model_dir)
            save_ler(csv_steps, csv_ler_train, csv_ler_dev,
                     save_path=network.model_dir)

            # Training was finished correctly
            with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
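
# For reference, a minimal sketch of the `count_total_parameters` helper used
# throughout this section. The real helper is imported from the repo's utils;
# this `_sketch` reimplementation is an assumption based on how its return
# values are consumed above (a {variable_name: #params} dict plus the total).
def _count_total_parameters_sketch(trainable_variables):
    """Count parameters per variable and in total, e.g. for
    _count_total_parameters_sketch(tf.trainable_variables())."""
    parameters_dict = {}
    total_parameters = 0
    for var in trainable_variables:
        # Multiply out the static shape to get the element count
        num_params = 1
        for dim in var.get_shape():
            num_params *= int(dim)
        parameters_dict[var.name] = num_params
        total_parameters += num_params
    return parameters_dict, total_parameters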