def do_train(network, param):
    """Run training.

    Builds the training graph for ``network``, runs the optimisation loop
    and writes checkpoints, summaries, the loss / LER curves and a
    ``complete.txt`` marker into ``network.model_dir``.

    Every 200 steps the loss and the label error rate are computed on the
    current training mini-batch and on one dev mini-batch. At the end of
    every epoch a checkpoint is saved and, from epoch 5 onwards, the dev
    set is evaluated with ``do_eval_cer``.

    Args:
        network: network to train
        param: A dictionary of parameters. Keys read here: 'label_type',
            'train_data_size', 'batch_size', 'num_stack', 'num_skip',
            'optimizer', 'learning_rate' and 'num_epoch'.
    """
    # Load dataset (settings shared by the train and dev sets)
    dataset_kwargs = dict(label_type=param['label_type'],
                          train_data_size=param['train_data_size'],
                          batch_size=param['batch_size'],
                          num_stack=param['num_stack'],
                          num_skip=param['num_skip'])
    train_data = Dataset(data_type='train', is_sorted=True, **dataset_kwargs)
    # dev_data_step feeds the periodic per-step monitoring below, while
    # dev_data_epoch is handed to do_eval_cer at the end of an epoch.
    dev_data_step = Dataset(data_type='dev', is_sorted=False,
                            **dataset_kwargs)
    dev_data_epoch = Dataset(data_type='dev', is_sorted=False,
                             **dataset_kwargs)

    learning_rate = float(param['learning_rate'])

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        network.inputs = tf.placeholder(tf.float32,
                                        shape=[None, None, network.input_size],
                                        name='input')
        indices_pl = tf.placeholder(tf.int64, name='indices')
        values_pl = tf.placeholder(tf.int32, name='values')
        shape_pl = tf.placeholder(tf.int64, name='shape')
        network.labels = tf.SparseTensor(indices_pl, values_pl, shape_pl)
        network.inputs_seq_len = tf.placeholder(tf.int64,
                                                shape=[None],
                                                name='inputs_seq_len')
        network.keep_prob_input = tf.placeholder(tf.float32,
                                                 name='keep_prob_input')
        network.keep_prob_hidden = tf.placeholder(tf.float32,
                                                  name='keep_prob_hidden')

        # Add to the graph each operation (including model definition)
        loss_op, logits = network.compute_loss(network.inputs, network.labels,
                                               network.inputs_seq_len,
                                               network.keep_prob_input,
                                               network.keep_prob_hidden)
        train_op = network.train(loss_op,
                                 optimizer=param['optimizer'],
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op = network.decoder(logits,
                                    network.inputs_seq_len,
                                    decode_type='beam_search',
                                    beam_width=20)
        ler_op = network.compute_ler(decode_op, network.labels)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        # (max_to_keep=None: the checkpoint of every epoch is kept)
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict), "{:,}".format(
                  total_parameters / 1000000)))

        csv_steps, csv_train_loss, csv_dev_loss = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(network.model_dir,
                                                   sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Make mini-batch generator
            mini_batch_train = train_data.next_batch()
            mini_batch_dev = dev_data_step.next_batch()

            # Number of steps per epoch, rounded up so that a final partial
            # mini-batch still counts as one step
            steps_exact = train_data.data_num / param['batch_size']
            iter_per_epoch = int(steps_exact)
            if steps_exact != iter_per_epoch:
                iter_per_epoch += 1
            max_steps = iter_per_epoch * param['num_epoch']

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            error_best = 1
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train)
                with tf.device('/cpu:0'):
                    inputs, labels, inputs_seq_len, _ = next(mini_batch_train)
                feed_dict_train = {
                    network.inputs: inputs,
                    network.labels: list2sparsetensor(labels, padded_value=-1),
                    network.inputs_seq_len: inputs_seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden,
                    network.lr: learning_rate
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % 200 == 0:

                    # Create feed dictionary for next mini batch (dev)
                    with tf.device('/cpu:0'):
                        inputs, labels, inputs_seq_len, _ = next(
                            mini_batch_dev)
                    feed_dict_dev = {
                        network.inputs: inputs,
                        network.labels: list2sparsetensor(labels,
                                                          padded_value=-1),
                        network.inputs_seq_len: inputs_seq_len,
                        network.keep_prob_input: network.dropout_ratio_input,
                        network.keep_prob_hidden: network.dropout_ratio_hidden
                    }

                    # Compute loss
                    # NOTE(review): both losses are computed while the feed
                    # dicts still hold the training dropout settings; they
                    # are only switched to 1.0 below, before the LER is
                    # computed. Confirm this is intended.
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_train_loss.append(loss_train)
                    csv_dev_loss.append(loss_dev)

                    # Change to evaluation mode
                    for feed_dict in (feed_dict_train, feed_dict_dev):
                        feed_dict[network.keep_prob_input] = 1.0
                        feed_dict[network.keep_prob_hidden] = 1.0

                    # Compute accuracy & update event file
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print(
                        'Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)'
                        % (step + 1, loss_train, loss_dev, ler_train, ler_dev,
                           duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (check point)
                    checkpoint_file = join(network.model_dir, 'model.ckpt')
                    save_path = saver.save(sess,
                                           checkpoint_file,
                                           global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    if epoch >= 5:
                        start_time_eval = time.time()
                        print('=== Dev Evaluation ===')
                        cer_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_op=decode_op,
                            network=network,
                            dataset=dev_data_epoch,
                            label_type=param['label_type'],
                            eval_batch_size=param['batch_size'])
                        if param['label_type'] in ['kana', 'kanji']:
                            print('  CER: %f %%' % (cer_dev_epoch * 100))
                        else:
                            print('  PER: %f %%' % (cer_dev_epoch * 100))

                        # Track the best dev score for every label type,
                        # not only on the PER branch
                        if cer_dev_epoch < error_best:
                            error_best = cer_dev_epoch
                            print('■■■ ↑Best Score↑ ■■■')

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                    # Restart the timers after every epoch (also before
                    # epoch 5), so each reported duration covers a single
                    # epoch and excludes checkpoint / evaluation time
                    start_time_epoch = time.time()
                    start_time_step = time.time()

            # Release the event file once no more summaries are written
            summary_writer.close()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss, ler
            save_loss(csv_steps,
                      csv_train_loss,
                      csv_dev_loss,
                      save_path=network.model_dir)
            save_ler(csv_steps,
                     csv_ler_train,
                     csv_ler_dev,
                     save_path=network.model_dir)

            # Training was finished correctly
            with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
# Example no. 2
# 0
def do_eval(network, param, epoch=None):
    """Evaluate the model.

    Builds the inference graph, restores a checkpoint from
    ``network.model_dir`` and prints the error rate on the eval1, eval2
    and eval3 sets followed by their mean. ``do_eval_cer`` (reported as
    CER) is used when ``param['label_type']`` is 'kana' or 'kanji';
    otherwise ``do_eval_per`` (reported as PER) is used.

    Args:
        network: model to restore
        param: A dictionary of parameters. Keys read here: 'label_type',
            'train_data_size', 'num_stack' and 'num_skip'.
        epoch: int, the epoch to restore. If None, the checkpoint path
            recorded in the checkpoint state is used.
    Raises:
        ValueError: if no checkpoint state is found in
            ``network.model_dir``
    """
    # Load dataset (one utterance per mini-batch)
    data_types = ('eval1', 'eval2', 'eval3')
    eval_datasets = [Dataset(data_type=data_type,
                             label_type=param['label_type'],
                             batch_size=1,
                             train_data_size=param['train_data_size'],
                             num_stack=param['num_stack'],
                             num_skip=param['num_skip'],
                             is_sorted=False,
                             is_progressbar=True)
                     for data_type in data_types]

    # Define placeholders
    network.inputs = tf.placeholder(tf.float32,
                                    shape=[None, None, network.input_size],
                                    name='input')
    indices_pl = tf.placeholder(tf.int64, name='indices')
    values_pl = tf.placeholder(tf.int32, name='values')
    shape_pl = tf.placeholder(tf.int64, name='shape')
    network.labels = tf.SparseTensor(indices_pl, values_pl, shape_pl)
    network.inputs_seq_len = tf.placeholder(tf.int64,
                                            shape=[None],
                                            name='inputs_seq_len')
    network.keep_prob_input = tf.placeholder(tf.float32,
                                             name='keep_prob_input')
    network.keep_prob_hidden = tf.placeholder(tf.float32,
                                              name='keep_prob_hidden')

    # Add to the graph each operation (including model definition)
    _, logits = network.compute_loss(network.inputs, network.labels,
                                     network.inputs_seq_len,
                                     network.keep_prob_input,
                                     network.keep_prob_hidden)
    decode_op = network.decoder(logits,
                                network.inputs_seq_len,
                                decode_type='beam_search',
                                beam_width=20)
    per_op = network.compute_ler(decode_op, network.labels)

    # Create a saver (used here to restore the trained variables)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(network.model_dir)
        if not ckpt:
            raise ValueError('There are not any checkpoints.')

        # Use last saved model unless a specific epoch was requested
        model_path = ckpt.model_checkpoint_path
        if epoch is not None:
            # Swap the file name for the checkpoint of the requested epoch
            model_path = model_path.split('/')[:-1]
            model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
        saver.restore(sess, model_path)
        print("Model restored: " + model_path)

        is_character_level = param['label_type'] in ['kana', 'kanji']
        metric = 'CER' if is_character_level else 'PER'

        error_rates = []
        for data_type, dataset in zip(data_types, eval_datasets):
            print('=== %s Evaluation ===' % data_type)
            if is_character_level:
                error_rate = do_eval_cer(session=sess,
                                         decode_op=decode_op,
                                         network=network,
                                         dataset=dataset,
                                         label_type=param['label_type'],
                                         is_test=True,
                                         eval_batch_size=1,
                                         is_progressbar=True)
            else:
                error_rate = do_eval_per(session=sess,
                                         per_op=per_op,
                                         network=network,
                                         dataset=dataset,
                                         eval_batch_size=1,
                                         is_progressbar=True)
            print('  %s: %f %%' % (metric, error_rate * 100))
            error_rates.append(error_rate)

        print('=== Mean ===')
        error_mean = sum(error_rates) / len(error_rates)
        print('  %s: %f %%' % (metric, error_mean * 100))
def do_eval(model, params, epoch, eval_batch_size, beam_width):
    """Restore a trained model and report its CER on eval1/eval2/eval3.

    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore. -1 restores the checkpoint path
            recorded in the checkpoint state.
        eval_batch_size (int): the size of mini-batch when evaluation.
            -1 falls back to params['batch_size'].
        beam_width (int): beam width for beam search.
            1 disables beam search, which means greedy decoding.
    Raises:
        ValueError: if no checkpoint state is found in model.save_path
    """
    # Load the three evaluation sets, keeping each one next to its tag
    set_names = ['eval1', 'eval2', 'eval3']
    eval_sets = [
        (set_name,
         Dataset(data_type=set_name,
                 train_data_size=params['train_data_size'],
                 label_type=params['label_type'],
                 batch_size=(params['batch_size']
                             if eval_batch_size == -1 else eval_batch_size),
                 splice=params['splice'],
                 num_stack=params['num_stack'],
                 num_skip=params['num_skip'],
                 shuffle=False))
        for set_name in set_names
    ]

    with tf.name_scope('tower_gpu0'):
        # Placeholders first, then the model and the decoder on top of them
        model.create_placeholders()

        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        decode_op = model.decoder(
            logits,
            model.inputs_seq_len_pl_list[0],
            beam_width=beam_width)

    # The saver is only used to restore the trained variables
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)
        if not ckpt:
            raise ValueError('There are not any checkpoints.')

        model_path = ckpt.model_checkpoint_path
        if epoch != -1:
            # Point at the checkpoint file of the requested epoch instead
            checkpoint_dir = '/'.join(model_path.split('/')[:-1])
            model_path = checkpoint_dir + '/model.ckpt-' + str(epoch)
        saver.restore(sess, model_path)
        print("Model restored: " + model_path)

        print('Test Data Evaluation:')
        scores = []
        for set_name, dataset in eval_sets:
            score = do_eval_cer(
                session=sess,
                decode_ops=[decode_op],
                model=model,
                dataset=dataset,
                label_type=params['label_type'],
                train_data_size=params['train_data_size'],
                is_test=True,
                progressbar=True)
            print('  CER (%s): %f %%' % (set_name, score * 100))
            scores.append(score)

        cer_mean = (scores[0] + scores[1] + scores[2]) / 3.
        print('  CER (mean): %f %%' % (cer_mean * 100))
def do_eval(model, params, epoch, eval_batch_size, beam_width):
    """Evaluate a restored model on the eval1, eval2 and eval3 sets.

    Prints the CER of each set and the mean of the three.

    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore; with -1 the checkpoint path
            recorded in the checkpoint state is used as is
        eval_batch_size (int): the size of mini-batch when evaluation;
            with -1 params['batch_size'] is used instead
        beam_width (int): beam width for beam search.
            1 disables beam search, which means greedy decoding.
    Raises:
        ValueError: if no checkpoint state is found in model.save_path
    """
    # Settings that are identical for all three evaluation sets
    shared_settings = dict(
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=(eval_batch_size
                    if eval_batch_size != -1 else params['batch_size']),
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        shuffle=False)

    # Load dataset
    eval1_data = Dataset(data_type='eval1', **shared_settings)
    eval2_data = Dataset(data_type='eval2', **shared_settings)
    eval3_data = Dataset(data_type='eval3', **shared_settings)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()
        inputs_pl = model.inputs_pl_list[0]
        labels_pl = model.labels_pl_list[0]
        inputs_seq_len_pl = model.inputs_seq_len_pl_list[0]
        keep_prob_pl = model.keep_prob_pl_list[0]

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(inputs_pl, labels_pl,
                                       inputs_seq_len_pl, keep_prob_pl)
        decode_op = model.decoder(logits, inputs_seq_len_pl,
                                  beam_width=beam_width)

    # Saver used to load the trained variables into the session
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # Without a checkpoint state there is nothing to evaluate
        if not ckpt:
            raise ValueError('There are not any checkpoints.')

        model_path = ckpt.model_checkpoint_path
        if epoch != -1:
            path_parts = model_path.split('/')[:-1]
            model_path = '/'.join(path_parts) + '/model.ckpt-' + str(epoch)
        saver.restore(sess, model_path)
        print("Model restored: " + model_path)

        def _compute_cer(dataset):
            # Same evaluation call for every set; only the dataset differs
            return do_eval_cer(session=sess,
                               decode_ops=[decode_op],
                               model=model,
                               dataset=dataset,
                               label_type=params['label_type'],
                               train_data_size=params['train_data_size'],
                               is_test=True,
                               progressbar=True)

        print('Test Data Evaluation:')
        cer_eval1 = _compute_cer(eval1_data)
        print('  CER (eval1): %f %%' % (cer_eval1 * 100))

        cer_eval2 = _compute_cer(eval2_data)
        print('  CER (eval2): %f %%' % (cer_eval2 * 100))

        cer_eval3 = _compute_cer(eval3_data)
        print('  CER (eval3): %f %%' % (cer_eval3 * 100))

        cer_mean = (cer_eval1 + cer_eval2 + cer_eval3) / 3.
        print('  CER (mean): %f %%' % (cer_mean * 100))
def do_train(model, params, gpu_indices):
    """Run multi-GPU training with per-epoch evaluation on the dev set.

    One model tower is built per entry of ``gpu_indices``; the tower
    gradients are combined by ``average_gradients`` and applied by a single
    optimizer. Only the number of entries is used: towers are placed on
    ``/gpu:0`` ... ``/gpu:N-1`` regardless of the index values.

    Periodically the loss and LER of the current training batch and of one
    dev batch are computed, logged and written as summaries. At every epoch
    boundary the loss/LER curves are plotted and, from
    ``params['eval_start_epoch']`` on, the dev CER is evaluated. A
    checkpoint is saved only when that CER improves; training stops early
    after ``params['not_improved_patient_epoch']`` evaluated epochs without
    improvement. An empty ``complete.txt`` is written to ``model.model_dir``
    when the loop ends.

    Args:
        model: the model to train
        params (dict): A dictionary of parameters. Keys read here:
            train_data_size, label_type, batch_size, num_epoch, splice,
            num_stack, num_skip, sort_stop_epoch, optimizer, beam_width,
            learning_rate, decay_start_epoch, decay_rate,
            decay_patient_epoch, dropout, print_step, eval_start_epoch,
            not_improved_patient_epoch
        gpu_indices (list): GPU indices
    Raises:
        ValueError: if ``gpu_indices`` is empty
    """
    num_gpu = len(gpu_indices)
    if num_gpu == 0:
        # Without a tower there is nothing to concatenate or average below;
        # fail before the datasets are loaded.
        raise ValueError('gpu_indices must contain at least one GPU index.')

    def build_feed_dict(batch, padded_value, keep_prob):
        """Map one mini batch onto the placeholders of every tower."""
        inputs, labels, inputs_seq_len, _ = batch
        feed_dict = {}
        for i_gpu in range(num_gpu):
            feed_dict[model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
            feed_dict[model.labels_pl_list[i_gpu]] = list2sparsetensor(
                labels[i_gpu], padded_value=padded_value)
            feed_dict[model.inputs_seq_len_pl_list[i_gpu]] = \
                inputs_seq_len[i_gpu]
            feed_dict[model.keep_prob_pl_list[i_gpu]] = keep_prob
        return feed_dict

    # Load dataset
    train_data = Dataset(
        data_type='train', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'], max_epoch=params['num_epoch'],
        splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=True, sort_stop_epoch=params['sort_stop_epoch'],
        num_gpu=num_gpu)
    # NOTE(review): dev_data serves both the periodic `next()` call and the
    # per-epoch `do_eval_cer`; confirm that do_eval_cer tolerates a partially
    # consumed iterator.
    dev_data = Dataset(
        data_type='dev', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=False, num_gpu=num_gpu)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to track the global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set optimizer. The learning rate is fed at every step so that it
        # can be decayed from Python between epochs.
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = model._set_optimizer(
            params['optimizer'], learning_rate_pl)

        # Calculate the gradients for each model tower
        total_grads_and_vars, total_losses = [], []
        decode_ops, ler_ops = [], []
        all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(num_gpu)]
        # NOTE: /cpu:0 is prepared for evaluation
        with tf.variable_scope(tf.get_variable_scope()):
            for i_gpu in range(len(all_devices)):
                with tf.device(all_devices[i_gpu]):
                    with tf.name_scope('tower_gpu%d' % i_gpu) as scope:

                        # Define placeholders in each tower
                        model.create_placeholders()

                        # Calculate the total loss for the current tower of the
                        # model. This function constructs the entire model but
                        # shares the variables across all towers.
                        tower_loss, tower_logits = model.compute_loss(
                            model.inputs_pl_list[i_gpu],
                            model.labels_pl_list[i_gpu],
                            model.inputs_seq_len_pl_list[i_gpu],
                            model.keep_prob_pl_list[i_gpu],
                            scope)
                        # Add a leading axis so the per-tower losses can be
                        # concatenated and averaged below.
                        tower_loss = tf.expand_dims(tower_loss, axis=0)
                        total_losses.append(tower_loss)

                        # Reuse variables for the next tower
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on this
                        # tower
                        tower_grads_and_vars = optimizer.compute_gradients(
                            tower_loss)

                        # Gradient clipping
                        tower_grads_and_vars = model._clip_gradients(
                            tower_grads_and_vars)

                        # TODO: Optionally add gradient noise

                        # Keep track of the gradients across all towers
                        total_grads_and_vars.append(tower_grads_and_vars)

                        # Add to the graph each operation per tower
                        decode_op_tower = model.decoder(
                            tower_logits,
                            model.inputs_seq_len_pl_list[i_gpu],
                            beam_width=params['beam_width'])
                        decode_ops.append(decode_op_tower)
                        ler_op_tower = model.compute_ler(
                            decode_op_tower, model.labels_pl_list[i_gpu])
                        ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
                        ler_ops.append(ler_op_tower)

        # Aggregate losses, then calculate average loss
        total_losses = tf.concat(axis=0, values=total_losses)
        loss_op = tf.reduce_mean(total_losses, axis=0)
        ler_ops = tf.concat(axis=0, values=ler_ops)
        ler_op = tf.reduce_mean(ler_ops, axis=0)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers
        average_grads_and_vars = average_gradients(total_grads_and_vars)

        # Apply the gradients to adjust the shared variables.
        train_op = optimizer.apply_gradients(average_grads_and_vars,
                                             global_step=global_step)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        # (max_to_keep=None: never delete older checkpoints)
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Histories used to plot the loss / LER curves at each epoch
        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []

        # Values that do not change during training. The interval is clamped
        # to 1 because int(print_step / num_gpu) is 0 whenever
        # print_step < num_gpu, which would make the modulo in the loop
        # raise ZeroDivisionError.
        print_interval = max(1, int(params['print_step'] / num_gpu))
        keep_prob_train = 1 - float(params['dropout'])

        # Create a session for running operation on the graph
        # NOTE: Start running operations on the Graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the ops do not
        # have GPU implementations.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False)) as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            cer_dev_best = 1
            not_improved_epoch = 0
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                feed_dict_train = build_feed_dict(
                    data, train_data.padded_value, keep_prob_train)
                feed_dict_train[learning_rate_pl] = learning_rate

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % print_interval == 0:

                    # Create feed dictionary for next mini batch (dev);
                    # dropout is disabled (keep_prob = 1.0)
                    feed_dict_dev = build_feed_dict(
                        dev_data.next()[0], dev_data.padded_value, 1.0)

                    # Compute loss (the train feed still uses the training
                    # keep probability at this point)
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    for i_gpu in range(num_gpu):
                        feed_dict_train[model.keep_prob_pl_list[i_gpu]] = 1.0

                    # Compute accuracy & update event files
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail,
                           loss_train, loss_dev, ler_train, ler_dev,
                           learning_rate, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save figure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        print('=== Dev Data Evaluation ===')
                        cer_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_ops=decode_ops,
                            model=model,
                            dataset=dev_data,
                            label_type=params['label_type'],
                            train_data_size=params['train_data_size'],
                            eval_batch_size=1)
                        print('  CER: %f %%' % (cer_dev_epoch * 100))

                        if cer_dev_epoch < cer_dev_best:
                            cer_dev_best = cer_dev_epoch
                            not_improved_epoch = 0
                            print('■■■ ↑Best Score (CER)↑ ■■■')

                            # Save model only (check point); the epoch
                            # number becomes the checkpoint suffix
                            checkpoint_file = join(
                                model.save_path, 'model.ckpt')
                            save_path = saver.save(
                                sess, checkpoint_file,
                                global_step=train_data.epoch)
                            print("Model saved in file: %s" % save_path)
                        else:
                            not_improved_epoch += 1

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Early stopping
                        if not_improved_epoch == params['not_improved_patient_epoch']:
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=cer_dev_epoch)

                    start_time_epoch = time.time()

            # No more summaries will be written: flush and release the
            # event file.
            summary_writer.close()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
def do_train(model, params, gpu_indices):
    """Run multi-GPU training with per-epoch evaluation on the dev set.

    One model tower is built per entry of ``gpu_indices``; the tower
    gradients are combined by ``average_gradients`` and applied by a single
    optimizer. Only the number of entries is used: towers are placed on
    ``/gpu:0`` ... ``/gpu:N-1`` regardless of the index values.

    Periodically the loss and LER of the current training batch and of one
    dev batch are computed, logged and written as summaries. At every epoch
    boundary the loss/LER curves are plotted and, from
    ``params['eval_start_epoch']`` on, the dev CER is evaluated with
    ``eval_batch_size=params['batch_size']``. A checkpoint is saved only
    when that CER improves; training stops early after
    ``params['not_improved_patient_epoch']`` evaluated epochs without
    improvement. An empty ``complete.txt`` is written to ``model.model_dir``
    when the loop ends.

    Args:
        model: the model to train
        params (dict): A dictionary of parameters. Keys read here:
            train_data_size, label_type, batch_size, num_epoch, splice,
            num_stack, num_skip, sort_stop_epoch, optimizer, beam_width,
            learning_rate, decay_start_epoch, decay_rate,
            decay_patient_epoch, dropout, print_step, eval_start_epoch,
            not_improved_patient_epoch
        gpu_indices (list): GPU indices
    Raises:
        ValueError: if ``gpu_indices`` is empty
    """
    num_gpu = len(gpu_indices)
    if num_gpu == 0:
        # Without a tower there is nothing to concatenate or average below;
        # fail before the datasets are loaded.
        raise ValueError('gpu_indices must contain at least one GPU index.')

    def build_feed_dict(batch, padded_value, keep_prob):
        """Map one mini batch onto the placeholders of every tower."""
        inputs, labels, inputs_seq_len, _ = batch
        feed_dict = {}
        for i_gpu in range(num_gpu):
            feed_dict[model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
            feed_dict[model.labels_pl_list[i_gpu]] = list2sparsetensor(
                labels[i_gpu], padded_value=padded_value)
            feed_dict[model.inputs_seq_len_pl_list[i_gpu]] = \
                inputs_seq_len[i_gpu]
            feed_dict[model.keep_prob_pl_list[i_gpu]] = keep_prob
        return feed_dict

    # Load dataset
    train_data = Dataset(data_type='train',
                         train_data_size=params['train_data_size'],
                         label_type=params['label_type'],
                         batch_size=params['batch_size'],
                         max_epoch=params['num_epoch'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True,
                         sort_stop_epoch=params['sort_stop_epoch'],
                         num_gpu=num_gpu)
    # NOTE(review): dev_data serves both the periodic `next()` call and the
    # per-epoch `do_eval_cer`; confirm that do_eval_cer tolerates a partially
    # consumed iterator.
    dev_data = Dataset(data_type='dev',
                       train_data_size=params['train_data_size'],
                       label_type=params['label_type'],
                       batch_size=params['batch_size'],
                       splice=params['splice'],
                       num_stack=params['num_stack'],
                       num_skip=params['num_skip'],
                       sort_utt=False,
                       num_gpu=num_gpu)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to track the global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set optimizer. The learning rate is fed at every step so that it
        # can be decayed from Python between epochs.
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = model._set_optimizer(params['optimizer'], learning_rate_pl)

        # Calculate the gradients for each model tower
        total_grads_and_vars, total_losses = [], []
        decode_ops, ler_ops = [], []
        all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(num_gpu)]
        # NOTE: /cpu:0 is prepared for evaluation
        with tf.variable_scope(tf.get_variable_scope()):
            for i_gpu in range(len(all_devices)):
                with tf.device(all_devices[i_gpu]):
                    with tf.name_scope('tower_gpu%d' % i_gpu) as scope:

                        # Define placeholders in each tower
                        model.create_placeholders()

                        # Calculate the total loss for the current tower of the
                        # model. This function constructs the entire model but
                        # shares the variables across all towers.
                        tower_loss, tower_logits = model.compute_loss(
                            model.inputs_pl_list[i_gpu],
                            model.labels_pl_list[i_gpu],
                            model.inputs_seq_len_pl_list[i_gpu],
                            model.keep_prob_pl_list[i_gpu], scope)
                        # Add a leading axis so the per-tower losses can be
                        # concatenated and averaged below.
                        tower_loss = tf.expand_dims(tower_loss, axis=0)
                        total_losses.append(tower_loss)

                        # Reuse variables for the next tower
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on this
                        # tower
                        tower_grads_and_vars = optimizer.compute_gradients(
                            tower_loss)

                        # Gradient clipping
                        tower_grads_and_vars = model._clip_gradients(
                            tower_grads_and_vars)

                        # TODO: Optionally add gradient noise

                        # Keep track of the gradients across all towers
                        total_grads_and_vars.append(tower_grads_and_vars)

                        # Add to the graph each operation per tower
                        decode_op_tower = model.decoder(
                            tower_logits,
                            model.inputs_seq_len_pl_list[i_gpu],
                            beam_width=params['beam_width'])
                        decode_ops.append(decode_op_tower)
                        ler_op_tower = model.compute_ler(
                            decode_op_tower, model.labels_pl_list[i_gpu])
                        ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
                        ler_ops.append(ler_op_tower)

        # Aggregate losses, then calculate average loss
        total_losses = tf.concat(axis=0, values=total_losses)
        loss_op = tf.reduce_mean(total_losses, axis=0)
        ler_ops = tf.concat(axis=0, values=ler_ops)
        ler_op = tf.reduce_mean(ler_ops, axis=0)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers
        average_grads_and_vars = average_gradients(total_grads_and_vars)

        # Apply the gradients to adjust the shared variables.
        train_op = optimizer.apply_gradients(average_grads_and_vars,
                                             global_step=global_step)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        # (max_to_keep=None: never delete older checkpoints)
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()), "{:,}".format(
                  total_parameters / 1000000)))

        # Histories used to plot the loss / LER curves at each epoch
        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []

        # Values that do not change during training. The interval is clamped
        # to 1 because int(print_step / num_gpu) is 0 whenever
        # print_step < num_gpu, which would make the modulo in the loop
        # raise ZeroDivisionError.
        print_interval = max(1, int(params['print_step'] / num_gpu))
        keep_prob_train = 1 - float(params['dropout'])

        # Create a session for running operation on the graph
        # NOTE: Start running operations on the Graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the ops do not
        # have GPU implementations.
        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_dev_best = 1
            not_improved_epoch = 0
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                feed_dict_train = build_feed_dict(
                    data, train_data.padded_value, keep_prob_train)
                feed_dict_train[learning_rate_pl] = learning_rate

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % print_interval == 0:

                    # Create feed dictionary for next mini batch (dev);
                    # dropout is disabled (keep_prob = 1.0)
                    dev_batch, _ = dev_data.next()
                    feed_dict_dev = build_feed_dict(
                        dev_batch, dev_data.padded_value, 1.0)

                    # Compute loss (the train feed still uses the training
                    # keep probability at this point)
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    for i_gpu in range(num_gpu):
                        feed_dict_train[model.keep_prob_pl_list[i_gpu]] = 1.0

                    # Compute accuracy & update event files
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print(
                        "Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)"
                        % (step + 1, train_data.epoch_detail, loss_train,
                           loss_dev, ler_train, ler_dev, learning_rate,
                           duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save figure of loss & ler
                    plot_loss(csv_loss_train,
                              csv_loss_dev,
                              csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train,
                             csv_ler_dev,
                             csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        print('=== Dev Data Evaluation ===')
                        ler_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_ops=decode_ops,
                            model=model,
                            dataset=dev_data,
                            label_type=params['label_type'],
                            train_data_size=params['train_data_size'],
                            eval_batch_size=params['batch_size'])
                        print('  CER: %f %%' % (ler_dev_epoch * 100))

                        if ler_dev_epoch < ler_dev_best:
                            ler_dev_best = ler_dev_epoch
                            not_improved_epoch = 0
                            print('■■■ ↑Best Score (CER)↑ ■■■')

                            # Save model (check point); the epoch number
                            # becomes the checkpoint suffix
                            checkpoint_file = join(model.save_path,
                                                   'model.ckpt')
                            save_path = saver.save(
                                sess,
                                checkpoint_file,
                                global_step=train_data.epoch)
                            print("Model saved in file: %s" % save_path)
                        else:
                            not_improved_epoch += 1

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Early stopping
                        if not_improved_epoch == params[
                                'not_improved_patient_epoch']:
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=ler_dev_epoch)

                    start_time_epoch = time.time()

            # No more summaries will be written: flush and release the
            # event file.
            summary_writer.close()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
def do_train(model, params):
    """Run multitask training (main + sub label types).

    Builds the graph for `model`, trains it for `params['num_epoch']` epochs,
    reports loss / label error rate (LER) on one train and one dev mini-batch
    every 200 steps, saves a checkpoint at the end of every epoch, evaluates
    the character error rate (CER) on the dev set from epoch 5 onward, and
    updates the learning rate with the controller after each evaluation.
    When training finishes, the loss / LER curves and an empty
    `complete.txt` marker are written to `model.save_path`.

    Args:
        model: model to train. This function uses its `create_placeholders`,
            `compute_loss`, `train`, `decoder` and `compute_ler` methods, its
            `*_pl_list` placeholder lists, `summaries_train`,
            `summaries_dev`, `dropout_ratio_*` and `save_path`.
        params: A dictionary of parameters. Keys read here:
            `label_type_main`, `label_type_sub`, `train_data_size`,
            `batch_size`, `num_stack`, `num_skip`, `optimizer`,
            `learning_rate`, `decay_start_epoch`, `decay_rate`, `num_epoch`.
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         label_type_main=params['label_type_main'],
                         label_type_sub=params['label_type_sub'],
                         train_data_size=params['train_data_size'],
                         batch_size=params['batch_size'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True)
    # Two separate dev datasets: one feeds the per-step monitoring generator,
    # the other is handed to the per-epoch full evaluation.
    dev_data_step = Dataset(data_type='dev',
                            label_type_main=params['label_type_main'],
                            label_type_sub=params['label_type_sub'],
                            train_data_size=params['train_data_size'],
                            batch_size=params['batch_size'],
                            num_stack=params['num_stack'],
                            num_skip=params['num_skip'],
                            sort_utt=False)
    dev_data_epoch = Dataset(data_type='dev',
                             label_type_main=params['label_type_main'],
                             label_type_sub=params['label_type_sub'],
                             train_data_size=params['train_data_size'],
                             batch_size=params['batch_size'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             sort_utt=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        model.create_placeholders(gpu_index=0)

        # Add to the graph each operation
        loss_op, logits_main, logits_sub = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.labels_sub_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0])
        train_op = model.train(loss_op,
                               optimizer=params['optimizer'],
                               learning_rate=model.learning_rate_pl_list[0])
        decode_op_main, decode_op_sub = model.decoder(
            logits_main,
            logits_sub,
            model.inputs_seq_len_pl_list[0],
            decode_type='beam_search',
            beam_width=20)
        ler_op_main, ler_op_sub = model.compute_ler(
            decode_op_main, decode_op_sub,
            model.labels_pl_list[0], model.labels_sub_pl_list[0])

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=1,
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Per-report histories, written out once training has finished
        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_main_train, csv_ler_main_dev = [], []
        csv_ler_sub_train, csv_ler_sub_dev = [], []

        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Make mini-batch generator
            mini_batch_train = train_data.next_batch()
            mini_batch_dev = dev_data_step.next_batch()

            # Number of steps per epoch, rounded up so that a final partial
            # mini-batch still counts as one step
            train_step = train_data.data_num / params['batch_size']
            iter_per_epoch = int(train_step)
            if train_step != iter_per_epoch:
                iter_per_epoch += 1
            max_steps = iter_per_epoch * params['num_epoch']

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_main_dev_best = 1
            learning_rate = float(params['learning_rate'])
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train)
                inputs, labels_main, labels_sub, inputs_seq_len, _ = next(
                    mini_batch_train)
                feed_dict_train = {
                    model.inputs_pl_list[0]: inputs,
                    model.labels_pl_list[0]: list2sparsetensor(
                        labels_main, padded_value=-1),
                    model.labels_sub_pl_list[0]: list2sparsetensor(
                        labels_sub, padded_value=-1),
                    model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                    model.keep_prob_input_pl_list[0]:
                        model.dropout_ratio_input,
                    model.keep_prob_hidden_pl_list[0]:
                        model.dropout_ratio_hidden,
                    model.keep_prob_output_pl_list[0]:
                        model.dropout_ratio_output,
                    model.learning_rate_pl_list[0]: learning_rate
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % 200 == 0:

                    # Create feed dictionary for next mini batch (dev)
                    inputs, labels_main, labels_sub, inputs_seq_len, _ = next(
                        mini_batch_dev)
                    feed_dict_dev = {
                        model.inputs_pl_list[0]: inputs,
                        model.labels_pl_list[0]: list2sparsetensor(
                            labels_main, padded_value=-1),
                        model.labels_sub_pl_list[0]: list2sparsetensor(
                            labels_sub, padded_value=-1),
                        model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                        model.keep_prob_input_pl_list[0]: 1.0,
                        model.keep_prob_hidden_pl_list[0]: 1.0,
                        model.keep_prob_output_pl_list[0]: 1.0
                    }

                    # Compute loss (the train loss is still evaluated with
                    # the training keep-prob values at this point)
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    feed_dict_train[model.keep_prob_input_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_hidden_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_output_pl_list[0]] = 1.0

                    # Compute accuracy & update event file
                    ler_main_train, ler_sub_train, summary_str_train = sess.run(
                        [ler_op_main, ler_op_sub, summary_train],
                        feed_dict=feed_dict_train)
                    ler_main_dev, ler_sub_dev, summary_str_dev = sess.run(
                        [ler_op_main, ler_op_sub, summary_dev],
                        feed_dict=feed_dict_dev)
                    csv_ler_main_train.append(ler_main_train)
                    csv_ler_main_dev.append(ler_main_dev)
                    csv_ler_sub_train.append(ler_sub_train)
                    csv_ler_sub_dev.append(ler_sub_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f (%.3f) / ler_main = %.4f (%.4f) / ler_sub = %.4f (%.4f) (%.3f min)' %
                          (step + 1, loss_train, loss_dev,
                           ler_main_train, ler_main_dev,
                           ler_sub_train, ler_sub_dev, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (check point)
                    checkpoint_file = join(model.save_path, 'model.ckpt')
                    save_path = saver.save(
                        sess, checkpoint_file, global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    # The full dev-set evaluation is skipped for the first
                    # four epochs
                    if epoch >= 5:
                        start_time_eval = time.time()
                        print('=== Dev Evaluation ===')
                        ler_main_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_op=decode_op_main,
                            model=model,
                            dataset=dev_data_epoch,
                            label_type=params['label_type_main'],
                            eval_batch_size=params['batch_size'],
                            is_multitask=True,
                            is_main=True)
                        print('  CER (main): %f %%' %
                              (ler_main_dev_epoch * 100))

                        ler_sub_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_op=decode_op_sub,
                            model=model,
                            dataset=dev_data_epoch,
                            label_type=params['label_type_sub'],
                            eval_batch_size=params['batch_size'],
                            is_multitask=True,
                            is_main=False)
                        print('  CER (sub): %f %%' %
                              (ler_sub_dev_epoch * 100))

                        if ler_main_dev_epoch < ler_main_dev_best:
                            ler_main_dev_best = ler_main_dev_epoch
                            print('■■■ ↑Best Score (CER main)↑ ■■■')

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Update learning rate (driven by the main-task CER)
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=epoch,
                            value=ler_main_dev_epoch)

                    start_time_epoch = time.time()
                    start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss, ler
            save_loss(csv_steps, csv_loss_train, csv_loss_dev,
                      save_path=model.save_path)
            # Main-task curve: pair the main train series with the main dev
            # series (previously the sub-task dev series was passed here).
            # NOTE(review): both save_ler calls target the same save_path; if
            # save_ler writes a fixed file name the second call overwrites the
            # first -- confirm against save_ler.
            save_ler(csv_steps, csv_ler_main_train, csv_ler_main_dev,
                     save_path=model.save_path)
            save_ler(csv_steps, csv_ler_sub_train, csv_ler_sub_dev,
                     save_path=model.save_path)

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')