def generate_data(label_type, model, batch_size=1):
    """
    Args:
        label_type: character or phone or multitask
        model: ctc or attention
    Returns:
        inputs: `[batch_size, max_time, feature_dim]`
        labels: `[batch_size]`
        inputs_seq_len: `[batch_size, frame_num]`
        labels_seq_len: `[batch_size]` (if model is attention)
    """
    # Make input data
    inputs, inputs_seq_len = read_wav('./sample/LDC93S1.wav',
                                      feature_type='logmelfbank',
                                      batch_size=batch_size)
    # inputs, inputs_seq_len = read_wav('../sample/LDC93S1.wav',
    #                            feature_type='mfcc',
    #                            batch_size=batch_size)

    if model == 'ctc':
        if label_type == 'character':
            transcript = read_text('./sample/LDC93S1.txt')
            transcript = ' ' + transcript.replace('.', '') + ' '
            labels = [alpha2num(transcript)] * batch_size

            # Convert to SparseTensor
            labels = list2sparsetensor(labels, padded_value=-1)
            return inputs, labels, inputs_seq_len

        elif label_type == 'phone':
            transcript = read_phone('./sample/LDC93S1.phn')
            labels = [phone2num(transcript)] * batch_size

            # Convert to SparseTensor
            labels = list2sparsetensor(labels, padded_value=-1)
            return inputs, labels, inputs_seq_len

        elif label_type == 'multitask':
            transcript_char = read_text('./sample/LDC93S1.txt')
            transcript_phone = read_phone('./sample/LDC93S1.phn')
            transcript_char = ' ' + transcript_char.replace('.', '') + ' '
            labels_char = [alpha2num(transcript_char)] * batch_size
            labels_phone = [phone2num(transcript_phone)] * batch_size

            # Convert to SparseTensor
            labels_char = list2sparsetensor(labels_char, padded_value=-1)
            labels_phone = list2sparsetensor(labels_phone, padded_value=-1)
            return inputs, labels_char, labels_phone, inputs_seq_len

    elif model == 'attention':
        if label_type == 'character':
            transcript = read_text('./sample/LDC93S1.txt')
            transcript = '<' + transcript.replace('.', '') + '>'
            labels = [alpha2num(transcript)] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size
            return inputs, labels, inputs_seq_len, labels_seq_len

        elif label_type == 'phone':
            transcript = read_phone('./sample/LDC93S1.phn')
            transcript = '< ' + transcript + ' >'
            labels = [phone2num(transcript)] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size
            return inputs, labels, inputs_seq_len, labels_seq_len

        elif label_type == 'multitask':
            transcript_char = read_text('./sample/LDC93S1.txt')
            transcript_phone = read_phone('./sample/LDC93S1.phn')
            transcript_char = '<' + transcript_char.replace('.', '') + '>'
            transcript_phone = '< ' + transcript_phone + ' >'
            labels_char = [alpha2num(transcript_char)] * batch_size
            labels_phone = [phone2num(transcript_phone)] * batch_size
            target_len_char = [len(labels_char[0])] * batch_size
            target_len_phone = [len(labels_phone[0])] * batch_size
            return (inputs, labels_char, labels_phone, inputs_seq_len,
                    target_len_char, target_len_phone)

    elif model == 'joint_ctc_attention':
        if label_type == 'character':
            transcript = read_text('./sample/LDC93S1.txt')
            att_transcript = '<' + transcript.replace('.', '') + '>'
            ctc_transcript = ' ' + transcript.replace('.', '') + ' '
            att_labels = [alpha2num(att_transcript)] * batch_size
            labels_seq_len = [len(att_labels[0])] * batch_size
            ctc_labels = [alpha2num(ctc_transcript)] * batch_size

            # Convert to SparseTensor
            ctc_labels = list2sparsetensor(ctc_labels, padded_value=-1)
            return inputs, att_labels, inputs_seq_len, labels_seq_len, ctc_labels

        elif label_type == 'phone':
            transcript = read_phone('./sample/LDC93S1.phn')
            att_transcript = '< ' + transcript + ' >'
            att_labels = [phone2num(att_transcript)] * batch_size
            labels_seq_len = [len(att_labels[0])] * batch_size
            ctc_labels = [phone2num(transcript)] * batch_size

            # Convert to SparseTensor
            ctc_labels = list2sparsetensor(ctc_labels, padded_value=-1)

            return inputs, att_labels, inputs_seq_len, labels_seq_len, ctc_labels
Exemplo n.º 2
0
    def check_training(self, model_type, label_type):
        print('----- ' + model_type + ', ' + label_type + ' -----')
        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 4
            inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
                label_type=label_type,
                model='attention',
                batch_size=batch_size)

            # Define placeholders
            inputs_pl = tf.placeholder(
                tf.float32,
                shape=[batch_size, None, inputs.shape[-1]],
                name='input')

            # `[batch_size, max_time]`
            labels_pl = tf.placeholder(tf.int32,
                                       shape=[None, None],
                                       name='label')

            # These are prepared for computing LER
            indices_true_pl = tf.placeholder(tf.int64, name='indices')
            values_true_pl = tf.placeholder(tf.int32, name='values')
            shape_true_pl = tf.placeholder(tf.int64, name='shape')
            labels_st_true_pl = tf.SparseTensor(indices_true_pl,
                                                values_true_pl, shape_true_pl)
            indices_pred_pl = tf.placeholder(tf.int64, name='indices')
            values_pred_pl = tf.placeholder(tf.int32, name='values')
            shape_pred_pl = tf.placeholder(tf.int64, name='shape')
            labels_st_pred_pl = tf.SparseTensor(indices_pred_pl,
                                                values_pred_pl, shape_pred_pl)
            inputs_seq_len_pl = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name='inputs_seq_len')
            labels_seq_len_pl = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name='labels_seq_len')
            keep_prob_input_pl = tf.placeholder(tf.float32,
                                                name='keep_prob_input')
            keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                                 name='keep_prob_hidden')

            # Define model graph
            output_size = 26 + 2 if label_type == 'character' else 61 + 2
            # model = load(model_type=model_type)
            network = BLSTMAttetion(batch_size=batch_size,
                                    input_size=inputs[0].shape[1],
                                    encoder_num_unit=256,
                                    encoder_num_layer=2,
                                    attention_dim=128,
                                    decoder_num_unit=256,
                                    decoder_num_layer=1,
                                    embedding_dim=20,
                                    output_size=output_size,
                                    sos_index=output_size - 2,
                                    eos_index=output_size - 1,
                                    max_decode_length=50,
                                    attention_weights_tempareture=1,
                                    logits_tempareture=1,
                                    parameter_init=0.1,
                                    clip_grad=5.0,
                                    clip_activation_encoder=50,
                                    clip_activation_decoder=50,
                                    dropout_ratio_input=1.0,
                                    dropout_ratio_hidden=1.0,
                                    weight_decay=0,
                                    beam_width=0,
                                    time_major=False)

            # Add to the graph each operation
            loss_op, logits, decoder_outputs_train, decoder_outputs_infer = network.compute_loss(
                inputs_pl, labels_pl, inputs_seq_len_pl, labels_seq_len_pl,
                keep_prob_input_pl, keep_prob_hidden_pl)
            learning_rate = 1e-3
            train_op = network.train(loss_op,
                                     optimizer='rmsprop',
                                     learning_rate_init=learning_rate,
                                     is_scheduled=False)
            decode_op_train, decode_op_infer = network.decoder(
                decoder_outputs_train,
                decoder_outputs_infer,
                decode_type='greedy',
                beam_width=1)
            ler_op = network.compute_ler(labels_st_true_pl, labels_st_pred_pl)
            attention_weights = decoder_outputs_infer.attention_scores

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()), "{:,}".format(
                      total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                inputs_pl: inputs,
                labels_pl: labels,
                inputs_seq_len_pl: inputs_seq_len,
                labels_seq_len_pl: labels_seq_len,
                keep_prob_input_pl: network.dropout_ratio_input,
                keep_prob_hidden_pl: network.dropout_ratio_hidden,
                network.lr: learning_rate
            }

            with tf.Session() as sess:

                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 400
                start_time_global = time.time()
                start_time_step = time.time()
                ler_train_pre = 1
                not_improved_count = 0
                for step in range(max_steps):

                    # Compute loss
                    _, loss_train = sess.run([train_op, loss_op],
                                             feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(network.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[keep_prob_input_pl] = 1.0
                        feed_dict[keep_prob_hidden_pl] = 1.0

                        # Predict class ids
                        predicted_ids_train, predicted_ids_infer = sess.run(
                            [decode_op_train, decode_op_infer],
                            feed_dict=feed_dict)

                        # Compute accuracy
                        feed_dict_ler = {
                            labels_st_true_pl:
                            list2sparsetensor(labels),
                            labels_st_pred_pl:
                            list2sparsetensor(predicted_ids_infer)
                        }
                        ler_train = sess.run(ler_op, feed_dict=feed_dict_ler)

                        duration_step = time.time() - start_time_step
                        print('Step %d: loss = %.3f / ler = %.4f (%.3f sec)' %
                              (step + 1, loss_train, ler_train, duration_step))
                        start_time_step = time.time()

                        # Visualize
                        if label_type == 'character':
                            print('True            : %s' %
                                  num2alpha(labels[0]))
                            print('Pred (Training) : <%s' %
                                  num2alpha(predicted_ids_train[0]))
                            print('Pred (Inference): <%s' %
                                  num2alpha(predicted_ids_infer[0]))
                        else:
                            print('True            : %s' %
                                  num2phone(labels[0]))
                            print('Pred (Training) : < %s' %
                                  num2phone(predicted_ids_train[0]))
                            print('Pred (Inference): < %s' %
                                  num2phone(predicted_ids_infer[0]))

                        if ler_train >= ler_train_pre:
                            not_improved_count += 1
                        else:
                            not_improved_count = 0
                        if not_improved_count >= 5:
                            print('Model is Converged.')
                            break
                        ler_train_pre = ler_train

                duration_global = time.time() - start_time_global
                print('Total time: %.3f sec' % (duration_global))
def do_eval_per(session,
                decode_op,
                per_op,
                network,
                dataset,
                label_type,
                eos_index,
                eval_batch_size=None,
                is_progressbar=False):
    """Evaluate trained model by Phone Error Rate.
    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate
        network: network to evaluate
        dataset: An instance of a `Dataset' class
        label_type: string, phone39 or phone48 or phone61
        eos_index: int, the index of <EOS> class
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
    Returns:
        per_global: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    train_label_type = label_type
    data_label_type = dataset.label_type

    num_examples = dataset.data_num
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1
    per_global = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    phone2num_map_file_path = '../metrics/mapping_files/attention/phone2num_' + \
        train_label_type[5:7] + '.txt'
    phone2num_39_map_file_path = '../metrics/mapping_files/attention/phone2num_39.txt'
    phone2phone_map_file_path = '../metrics/mapping_files/phone2phone.txt'
    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini-batch
        inputs, att_labels_true, _, inputs_seq_len, _, _ = mini_batch.__next__(
        )

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        if False:
            # Evaluate by 61 phones
            per_local = session.run(per_op, feed_dict=feed_dict)
            per_global += per_local * batch_size_each

        else:
            # Evaluate by 39 phones
            predicted_ids = session.run(decode_op, feed_dict=feed_dict)
            predicted_ids_phone39 = []
            labels_true_phone39 = []
            for i_batch in range(batch_size_each):
                # Convert from num to phone (-> list of phone strings)
                phone_pred_seq = num2phone(predicted_ids[i_batch],
                                           phone2num_map_file_path)
                phone_pred_list = phone_pred_seq.split(' ')

                # Mapping to 39 phones (-> list of phone strings)
                phone_pred_list = map_to_39phone(phone_pred_list,
                                                 train_label_type,
                                                 phone2phone_map_file_path)

                # Convert from phone to num (-> list of phone indices)
                phone_pred_list = phone2num(phone_pred_list,
                                            phone2num_39_map_file_path)
                predicted_ids_phone39.append(phone_pred_list)

                if data_label_type != 'phone39':
                    # Convert from num to phone (-> list of phone strings)
                    phone_true_seq = num2phone(att_labels_true[i_batch],
                                               phone2num_map_file_path)
                    phone_true_list = phone_true_seq.split(' ')

                    # Mapping to 39 phones (-> list of phone strings)
                    phone_true_list = map_to_39phone(
                        phone_true_list, train_label_type,
                        phone2phone_map_file_path)

                    # Convert from phone to num (-> list of phone indices)
                    phone_true_list = phone2num(phone_true_list,
                                                phone2num_39_map_file_path)
                    labels_true_phone39.append(phone_true_list)
                else:
                    labels_true_phone39 = att_labels_true

            # Compute edit distance
            labels_true_st = list2sparsetensor(labels_true_phone39,
                                               padded_value=eos_index)
            labels_pred_st = list2sparsetensor(predicted_ids_phone39,
                                               padded_value=eos_index)
            per_local = compute_edit_distance(session, labels_true_st,
                                              labels_pred_st)
            per_global += per_local * batch_size_each

    per_global /= dataset.data_num

    return per_global
    def next_batch(self, batch_size=None, session=None):
        """Make mini-batch.
        Args:
            batch_size: int, the size of mini-batch
            session:
        Returns:
            inputs: list of input data, size `[batch_size]`
            att_labels: list of target labels, size `[batch_size]`
            ctc_labels_st: list of SparseTensor of taret labels
            inputs_seq_len: list of length of inputs of size `[batch_size]`
            att_labels_seq_len: list of length of target labels of size
                `[batch_size]`
            input_names: list of file name of input data of size `[batch_size]`

            If num_gpu > 1, each return is divide into list of size `[num_gpu]`.
        """
        if session is None and self.num_gpu != 1:
            raise ValueError('Set session when using multiple GPUs.')

        if batch_size is None:
            batch_size = self.batch_size

        next_epoch_flag = False
        ctc_padded_value = -1

        while True:
            # sorted dataset
            if self.is_sorted:
                if len(self.rest) > batch_size:
                    data_indices = list(self.rest)[:batch_size]
                    self.rest -= set(data_indices)
                else:
                    data_indices = list(self.rest)
                    self.rest = set(range(0, self.data_num, 1))
                    next_epoch_flag = True
                    if self.data_type == 'train':
                        print('---Next epoch---')

            # not sorted dataset
            else:
                if len(self.rest) > batch_size:
                    # Randomly sample mini-batch
                    data_indices = random.sample(list(self.rest), batch_size)
                    self.rest -= set(data_indices)
                else:
                    data_indices = list(self.rest)
                    self.rest = set(range(0, self.data_num, 1))
                    next_epoch_flag = True
                    if self.data_type == 'train':
                        print('---Next epoch---')

                    # Shuffle selected mini-batch
                    random.shuffle(data_indices)

            # Compute max frame num in mini-batch
            max_frame_num = max(map(lambda x: x.shape[0],
                                    self.input_list[data_indices]))

            # Compute max target label length in mini-batch
            att_max_seq_len = max(
                map(len, self.att_label_list[data_indices]))
            ctc_max_seq_len = max(
                map(len, self.ctc_label_list[data_indices]))

            # Initialization
            inputs = np.zeros(
                (len(data_indices), max_frame_num, self.input_size),
                dtype=np.int32)
            # Padding with <EOS>
            att_labels = np.array([[self.eos_index] * att_max_seq_len]
                                  * len(data_indices), dtype=np.int32)
            ctc_labels = np.array([[ctc_padded_value] * ctc_max_seq_len]
                                  * len(data_indices), dtype=np.int32)
            inputs_seq_len = np.zeros((len(data_indices),), dtype=np.int32)
            att_labels_seq_len = np.zeros(
                (len(data_indices),), dtype=np.int32)
            input_names = list(
                map(lambda path: basename(path).split('.')[0],
                    np.take(self.input_paths, data_indices, axis=0)))

            # Set values of each data in mini-batch
            for i_batch, x in enumerate(data_indices):
                data_i = self.input_list[x]
                frame_num = data_i.shape[0]
                inputs[i_batch, :frame_num, :] = data_i
                att_labels[i_batch, :len(self.att_label_list[x])
                           ] = self.att_label_list[x]
                ctc_labels[i_batch, :len(
                    self.ctc_label_list[x])] = self.ctc_label_list[x]
                inputs_seq_len[i_batch] = frame_num
                att_labels_seq_len[i_batch] = len(self.att_label_list[x])

            ##########
            # GPU
            ##########
            if self.num_gpu > 1:
                divide_num = self.num_gpu
                if next_epoch_flag:
                    for i in range(self.num_gpu, 0, -1):
                        if len(self.rest) % i == 0:
                            divide_num = i
                            break
                    next_epoch_flag = False

                # Now we split the mini-batch data by num_gpu
                inputs = tf.split(inputs, divide_num, axis=0)
                att_labels = tf.split(att_labels, divide_num, axis=0)
                ctc_labels = tf.split(ctc_labels, divide_num, axis=0)
                inputs_seq_len = tf.split(inputs_seq_len, divide_num, axis=0)
                att_labels_seq_len = tf.split(
                    att_labels_seq_len, divide_num, axis=0)
                input_names = tf.split(input_names, divide_num, axis=0)

                # Convert from SparseTensor to numpy.ndarray
                inputs = list(map(session.run, inputs))
                att_labels = list(map(session.run, att_labels))
                ctc_labels = list(map(session.run, ctc_labels))
                ctc_labels_st = list(
                    map(list2sparsetensor,
                        ctc_labels, [ctc_padded_value] * len(ctc_labels)))
                inputs_seq_len = list(map(session.run, inputs_seq_len))
                att_labels_seq_len = list(map(session.run, att_labels_seq_len))
                input_names = list(map(session.run, input_names))
            else:
                ctc_labels_st = list2sparsetensor(ctc_labels,
                                                  padded_value=ctc_padded_value)

            yield (inputs, att_labels, ctc_labels_st, inputs_seq_len,
                   att_labels_seq_len, input_names)
def do_train(network, param):
    """Run training.
    Args:
        network: network to train
        param: A dictionary of parameters
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         label_type=param['label_type'],
                         train_data_size=param['train_data_size'],
                         batch_size=param['batch_size'],
                         num_stack=param['num_stack'],
                         num_skip=param['num_skip'],
                         is_sorted=True)
    dev_data_step = Dataset(data_type='dev',
                            label_type=param['label_type'],
                            train_data_size=param['train_data_size'],
                            batch_size=param['batch_size'],
                            num_stack=param['num_stack'],
                            num_skip=param['num_skip'],
                            is_sorted=False)
    dev_data_epoch = Dataset(data_type='dev',
                             label_type=param['label_type'],
                             train_data_size=param['train_data_size'],
                             batch_size=param['batch_size'],
                             num_stack=param['num_stack'],
                             num_skip=param['num_skip'],
                             is_sorted=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        network.inputs = tf.placeholder(tf.float32,
                                        shape=[None, None, network.input_size],
                                        name='input')
        indices_pl = tf.placeholder(tf.int64, name='indices')
        values_pl = tf.placeholder(tf.int32, name='values')
        shape_pl = tf.placeholder(tf.int64, name='shape')
        network.labels = tf.SparseTensor(indices_pl, values_pl, shape_pl)
        network.inputs_seq_len = tf.placeholder(tf.int64,
                                                shape=[None],
                                                name='inputs_seq_len')
        network.keep_prob_input = tf.placeholder(tf.float32,
                                                 name='keep_prob_input')
        network.keep_prob_hidden = tf.placeholder(tf.float32,
                                                  name='keep_prob_hidden')

        # Add to the graph each operation (including model definition)
        loss_op, logits = network.compute_loss(network.inputs, network.labels,
                                               network.inputs_seq_len,
                                               network.keep_prob_input,
                                               network.keep_prob_hidden)
        train_op = network.train(loss_op,
                                 optimizer=param['optimizer'],
                                 learning_rate_init=float(
                                     param['learning_rate']),
                                 is_scheduled=False)
        decode_op = network.decoder(logits,
                                    network.inputs_seq_len,
                                    decode_type='beam_search',
                                    beam_width=20)
        ler_op = network.compute_ler(decode_op, network.labels)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()), "{:,}".format(
                  total_parameters / 1000000)))

        csv_steps, csv_train_loss, csv_dev_loss = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(network.model_dir,
                                                   sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Make mini-batch generator
            mini_batch_train = train_data.next_batch()
            mini_batch_dev = dev_data_step.next_batch()

            # Train model
            iter_per_epoch = int(train_data.data_num / param['batch_size'])
            train_step = train_data.data_num / param['batch_size']
            if (train_step) != int(train_step):
                iter_per_epoch += 1
            max_steps = iter_per_epoch * param['num_epoch']
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            error_best = 1
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train)
                with tf.device('/cpu:0'):
                    inputs, labels, inputs_seq_len, _ = mini_batch_train.__next__(
                    )
                feed_dict_train = {
                    network.inputs: inputs,
                    network.labels: list2sparsetensor(labels, padded_value=-1),
                    network.inputs_seq_len: inputs_seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden,
                    network.lr: float(param['learning_rate'])
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % 200 == 0:

                    # Create feed dictionary for next mini batch (dev)
                    with tf.device('/cpu:0'):
                        inputs, labels, inputs_seq_len, _ = mini_batch_dev.__next__(
                        )
                    feed_dict_dev = {
                        network.inputs: inputs,
                        network.labels: list2sparsetensor(labels,
                                                          padded_value=-1),
                        network.inputs_seq_len: inputs_seq_len,
                        network.keep_prob_input: network.dropout_ratio_input,
                        network.keep_prob_hidden: network.dropout_ratio_hidden
                    }

                    # Compute loss_
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_train_loss.append(loss_train)
                    csv_dev_loss.append(loss_dev)

                    # Change to evaluation mode
                    feed_dict_train[network.keep_prob_input] = 1.0
                    feed_dict_train[network.keep_prob_hidden] = 1.0
                    feed_dict_dev[network.keep_prob_input] = 1.0
                    feed_dict_dev[network.keep_prob_hidden] = 1.0

                    # Compute accuracy & update event file
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print(
                        'Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)'
                        % (step + 1, loss_train, loss_dev, ler_train, ler_dev,
                           duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (check point)
                    checkpoint_file = join(network.model_dir, 'model.ckpt')
                    save_path = saver.save(sess,
                                           checkpoint_file,
                                           global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    if epoch >= 5:
                        start_time_eval = time.time()
                        print('=== Dev Evaluation ===')
                        cer_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_op=decode_op,
                            network=network,
                            dataset=dev_data_epoch,
                            label_type=param['label_type'],
                            eval_batch_size=param['batch_size'])
                        if param['label_type'] in ['kana', 'kanji']:
                            print('  CER: %f %%' % (cer_dev_epoch * 100))
                        else:
                            print('  PER: %f %%' % (cer_dev_epoch * 100))

                            if cer_dev_epoch < error_best:
                                error_best = cer_dev_epoch
                                print('■■■ ↑Best Score↑ ■■■')

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        start_time_epoch = time.time()
                        start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss, ler
            save_loss(csv_steps,
                      csv_train_loss,
                      csv_dev_loss,
                      save_path=network.model_dir)
            save_ler(csv_steps,
                     csv_ler_train,
                     csv_ler_dev,
                     save_path=network.model_dir)

            # Training was finished correctly
            with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
def do_eval_per(session,
                decode_op,
                per_op,
                network,
                dataset,
                label_type,
                eos_index,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Phone Error Rate.
    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate
        network: network to evaluate
        dataset: An instance of a `Dataset' class
        label_type: string, phone39 or phone48 or phone61
        eos_index: int, the index of <EOS> class
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        per_mean: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    train_label_type = label_type
    eval_label_type = dataset.label_type

    num_examples = dataset.data_num
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1
    per_mean = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    train_phone2num_map_file_path = '../metrics/mapping_files/ctc/' + \
        train_label_type + '_to_num.txt'
    eval_phone2num_map_file_path = '../metrics/mapping_files/ctc/' + \
        train_label_type + '_to_num.txt'
    phone2num_39_map_file_path = '../metrics/mapping_files/ctc/phone39_to_num.txt'
    phone2phone_map_file_path = '../metrics/mapping_files/phone2phone.txt'
    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini-batch
        if not is_multitask:
            inputs, labels_true, inputs_seq_len, _, _ = mini_batch.__next__()
        else:
            inputs, _, labels_true, inputs_seq_len, _, _ = mini_batch.__next__(
            )

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        # Evaluate by 39 phones
        predicted_ids = session.run(decode_op, feed_dict=feed_dict)

        labels_pred_mapped, labels_true_mapped = [], []
        for i_batch in range(batch_size_each):
            ###############
            # Hypothesis
            ###############
            # Convert from num to phone (-> list of phone strings)
            phone_pred_list = num2phone(
                predicted_ids[i_batch],
                train_phone2num_map_file_path).split(' ')

            # Mapping to 39 phones (-> list of phone strings)
            phone_pred_list = map_to_39phone(phone_pred_list, train_label_type,
                                             phone2phone_map_file_path)

            # Convert from phone to num (-> list of phone indices)
            phone_pred_list = phone2num(phone_pred_list,
                                        phone2num_39_map_file_path)
            labels_pred_mapped.append(phone_pred_list)

            ###############
            # Reference
            ###############
            # Convert from num to phone (-> list of phone strings)
            phone_true_list = num2phone(
                labels_true[i_batch], eval_phone2num_map_file_path).split(' ')

            # Mapping to 39 phones (-> list of phone strings)
            phone_true_list = map_to_39phone(phone_true_list, eval_label_type,
                                             phone2phone_map_file_path)

            # Convert from phone to num (-> list of phone indices)
            phone_true_list = phone2num(phone_true_list,
                                        phone2num_39_map_file_path)
            labels_true_mapped.append(phone_true_list)

        # Compute edit distance
        labels_true_st = list2sparsetensor(labels_true_mapped,
                                           padded_value=eos_index)
        labels_pred_st = list2sparsetensor(labels_pred_mapped,
                                           padded_value=eos_index)
        per_each = compute_edit_distance(session, labels_true_st,
                                         labels_pred_st)
        per_mean += per_each * batch_size_each

    per_mean /= dataset.data_num

    return per_mean