Example #1
def evaluate(logger, device, model, criterion, dev_data_loader):
    """
    验证集评估函数,分别计算f1、precision、recall
    """
    model.eval()
    start_time = time.time()
    loss_sum = 0.0
    correct_preds = 0
    all_predicts = []
    all_labels = []
    with torch.no_grad():
        for step, (batch_ids, batch_masks, batch_segments, batch_labels) in enumerate(tqdm(dev_data_loader)):
            ids, masks, segments, labels = batch_ids.to(device), batch_masks.to(device), batch_segments.to(
                device), batch_labels.to(device)
            logits, probabilities = model(ids, masks, segments)
            loss = criterion(logits, labels)
            loss_sum += loss.item()
            correct_preds += correct_predictions(probabilities, labels)
            predicts = torch.argmax(probabilities, dim=1)
            all_predicts.extend(predicts.cpu())
            all_labels.extend(batch_labels.cpu())
    val_time = (time.time() - start_time) / 60
    val_loss = loss_sum / len(dev_data_loader)
    val_accuracy = correct_preds / len(dev_data_loader.dataset)
    val_measures = cal_metrics(all_predicts, all_labels)
    val_measures['accuracy'] = val_accuracy
    # Print the metrics on the validation set
    res_str = ''
    for k, v in val_measures.items():
        res_str += (k + ': %.3f ' % v)
    logger.info('loss: %.5f, %s' % (val_loss, res_str))
    logger.info('time consumption of evaluating:%.2f(min)' % val_time)
    return val_measures, all_predicts
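Note: `correct_predictions` is a project helper that is not shown in this example; a minimal PyTorch sketch, assuming it simply counts the batch samples whose argmax class matches the gold label, could look like this:

import torch

def correct_predictions(probabilities, labels):
    # Count how many argmax predictions in this batch equal the gold labels.
    predictions = torch.argmax(probabilities, dim=1)
    return (predictions == labels).sum().item()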
Example #2
def train(configs, data_manager, logger):
    domain_classes = data_manager.domain_class_number
    intent_classes = data_manager.intent_class_number
    slot_classes = data_manager.slot_class_number
    id2slot = data_manager.id2slot
    learning_rate = configs.learning_rate
    epoch = configs.epoch
    batch_size = configs.batch_size

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    bert_model = TFBertModel.from_pretrained('bert-base-chinese')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    X_train, att_mask_train, domain_train, intent_train, slot_train, \
    X_val, att_mask_val, domain_val, intent_val, slot_val = data_manager.get_training_set()

    bilstm_crf_model = BiLSTM_CRFModel(configs, slot_classes)
    domain_model = DomainClassificationModel(configs, domain_classes)
    intent_model = IntentClassificationModel(configs, intent_classes)

    num_iterations = int(math.ceil(1.0 * len(X_train) / batch_size))
    num_val_iterations = int(math.ceil(1.0 * len(X_val) / batch_size))
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))

    for i in range(epoch):
        start_time = time.time()
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for iteration in tqdm(range(num_iterations)):
            X_train_batch, att_mask_train_batch, domain_train_batch, intent_train_batch, slot_train_batch \
                = data_manager.next_batch(X_train, att_mask_train, domain_train, intent_train, slot_train,
                                          start_index=iteration * batch_size)
            inputs_length = tf.math.count_nonzero(X_train_batch, 1)
            # Get the BERT encoder outputs
            bert_model_inputs = bert_model(X_train_batch, attention_mask=att_mask_train_batch)[0]
            with tf.GradientTape() as tape:
                # Slot-filling model (BiLSTM-CRF) forward pass
                slot_logits, slot_log_likelihood, slot_transition_params = bilstm_crf_model.call(
                    inputs=bert_model_inputs, inputs_length=inputs_length, targets=slot_train_batch, training=1)
                slot_loss = -tf.reduce_mean(slot_log_likelihood)
                # Domain classification on the [CLS] representation
                domain_logits = domain_model.call(inputs=bert_model_inputs[:, 0, :], training=1)
                domain_loss_vec = tf.keras.losses.sparse_categorical_crossentropy(y_pred=domain_logits,
                                                                                  y_true=domain_train_batch)
                domain_loss = tf.reduce_mean(domain_loss_vec)
                # Intent classification on the [CLS] representation
                intent_logits = intent_model.call(inputs=bert_model_inputs[:, 0, :], training=1)
                intent_loss_vec = tf.keras.losses.sparse_categorical_crossentropy(y_pred=intent_logits,
                                                                                  y_true=intent_train_batch)
                intent_loss = tf.reduce_mean(intent_loss_vec)
                total_loss = domain_loss + intent_loss + 2 * slot_loss
            # Collect all trainable variables
            trainable_variables = bilstm_crf_model.trainable_variables + domain_model.trainable_variables + intent_model.trainable_variables
            # Compute gradients of the joint loss w.r.t. the trainable variables
            gradients = tape.gradient(total_loss, trainable_variables)
            # Apply the gradients (back-propagation via automatic differentiation)
            optimizer.apply_gradients(zip(gradients, trainable_variables))

            if iteration % configs.print_per_batch == 0 and iteration != 0:
                domain_predictions = tf.argmax(domain_logits, axis=-1)
                intent_predictions = tf.argmax(intent_logits, axis=-1)
                domain_measures = cal_metrics(y_true=domain_train_batch, y_pred=domain_predictions)
                intent_measures = cal_metrics(y_true=intent_train_batch, y_pred=intent_predictions)
                batch_pred_sequence, _ = crf_decode(slot_logits, slot_transition_params, inputs_length)
                slot_measures = cal_slots_metrics(X_train_batch, slot_train_batch, batch_pred_sequence, id2slot, tokenizer)
                domain_str = ''
                for k, v in domain_measures.items():
                    domain_str += (k + ': %.3f ' % v)
                logger.info('training batch: {}'.format(iteration))
                logger.info('domain_loss: %.5f, %s' % (domain_loss, domain_str))
                intent_str = ''
                for k, v in intent_measures.items():
                    intent_str += (k + ': %.3f ' % v)
                logger.info('intent_loss: %.5f, %s' % (intent_loss, intent_str))
                slot_str = ''
                for k, v in slot_measures.items():
                    slot_str += (k + ': %.3f ' % v)
                logger.info('slot_loss: %.5f, %s' % (slot_loss, slot_str))
        # validation
        logger.info('start evaluate engines...')
        slot_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        domain_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        intent_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        for iteration in tqdm(range(num_val_iterations)):
            X_val_batch, att_mask_val_batch, domain_val_batch, intent_val_batch, slot_val_batch \
                = data_manager.next_batch(X_val, att_mask_val, domain_val, intent_val, slot_val,
                                          start_index=iteration * batch_size)
            inputs_length = tf.math.count_nonzero(X_val_batch, 1)
            # Get the BERT encoder outputs
            bert_model_inputs = bert_model(X_val_batch, attention_mask=att_mask_val_batch)[0]
            # Slot model prediction
            slot_logits, slot_log_likelihood, slot_transition_params = bilstm_crf_model.call(
                inputs=bert_model_inputs, inputs_length=inputs_length, targets=slot_val_batch)
            batch_pred_sequence, _ = crf_decode(slot_logits, slot_transition_params, inputs_length)
            slot_measures = cal_slots_metrics(X_val_batch, slot_val_batch, batch_pred_sequence, id2slot, tokenizer)
            # Domain model prediction
            domain_logits = domain_model.call(inputs=bert_model_inputs[:, 0, :])
            domain_predictions = tf.argmax(domain_logits, axis=-1)
            domain_measures = cal_metrics(y_true=domain_val_batch, y_pred=domain_predictions)
            # Intent model prediction
            intent_logits = intent_model.call(inputs=bert_model_inputs[:, 0, :])
            intent_predictions = tf.argmax(intent_logits, axis=-1)
            intent_measures = cal_metrics(y_true=intent_val_batch, y_pred=intent_predictions)

            for k, v in slot_measures.items():
                slot_val_results[k] += v
            for k, v in domain_measures.items():
                domain_val_results[k] += v
            for k, v in intent_measures.items():
                intent_val_results[k] += v

        time_span = (time.time() - start_time) / 60
        val_slot_str = ''
        val_domain_str = ''
        val_intent_str = ''
        for k, v in slot_val_results.items():
            slot_val_results[k] /= num_val_iterations
            val_slot_str += (k + ': %.3f ' % slot_val_results[k])
        for k, v in domain_val_results.items():
            domain_val_results[k] /= num_val_iterations
            val_domain_str += (k + ': %.3f ' % domain_val_results[k])
        for k, v in intent_val_results.items():
            intent_val_results[k] /= num_val_iterations
            val_intent_str += (k + ': %.3f ' % intent_val_results[k])
        logger.info('slot: {}'.format(val_slot_str))
        logger.info('domain: {}'.format(val_domain_str))
        logger.info('intent: {}'.format(val_intent_str))
        logger.info('time consumption:%.2f(min)' % time_span)
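Note: `cal_metrics` is not defined in this example; a plausible scikit-learn based sketch (macro averaging is an assumption) that matches the keys logged above:

from sklearn.metrics import f1_score, precision_score, recall_score

def cal_metrics(y_true, y_pred):
    # Macro-averaged precision, recall and f1 over all classes.
    return {
        'precision': precision_score(y_true, y_pred, average='macro', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='macro', zero_division=0),
        'f1': f1_score(y_true, y_pred, average='macro', zero_division=0),
    }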
Example #3
def train(data_manager, logger):
    embedding_dim = data_manager.embedding_dim
    num_classes = data_manager.max_label_number
    seq_length = data_manager.max_sequence_length

    train_file = classifier_config['train_file']
    dev_file = classifier_config['dev_file']
    train_df = pd.read_csv(train_file).sample(frac=1)

    if dev_file == '':
        # split the data into train and validation set
        train_df, dev_df = train_df[:int(len(train_df) * 0.9)], train_df[int(len(train_df) * 0.9):]
    else:
        dev_df = pd.read_csv(dev_file).sample(frac=1)

    train_dataset = data_manager.get_dataset(train_df, step='train')
    dev_dataset = data_manager.get_dataset(dev_df)

    vocab_size = data_manager.vocab_size

    embedding_method = classifier_config['embedding_method']
    if embedding_method == 'Bert':
        from transformers import TFBertModel
        bert_model = TFBertModel.from_pretrained(
            'bert-base-multilingual-cased')
    else:
        bert_model = None
    checkpoints_dir = classifier_config['checkpoints_dir']
    checkpoint_name = classifier_config['checkpoint_name']
    num_filters = classifier_config['num_filters']
    learning_rate = classifier_config['learning_rate']
    epoch = classifier_config['epoch']
    max_to_keep = classifier_config['max_to_keep']
    print_per_batch = classifier_config['print_per_batch']
    is_early_stop = classifier_config['is_early_stop']
    patient = classifier_config['patient']
    hidden_dim = classifier_config['hidden_dim']
    classifier = classifier_config['classifier']

    reverse_classes = {
        str(class_id): class_name
        for class_name, class_id in data_manager.class_id.items()
    }

    best_f1_val = 0.0
    best_at_epoch = 0
    unprocessed = 0
    batch_size = data_manager.batch_size
    very_start_time = time.time()
    loss_obj = FocalLoss() if classifier_config['use_focal_loss'] else None
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Load the configured model
    if classifier == 'textcnn':
        from engines.models.textcnn import TextCNN
        model = TextCNN(seq_length, num_filters, num_classes, embedding_dim,
                        vocab_size)
    elif classifier == 'textrcnn':
        from engines.models.textrcnn import TextRCNN
        model = TextRCNN(seq_length, num_classes, hidden_dim, embedding_dim,
                         vocab_size)
    elif classifier == 'textrnn':
        from engines.models.textrnn import TextRNN
        model = TextRNN(seq_length, num_classes, hidden_dim, embedding_dim,
                        vocab_size)
    else:
        raise Exception('the configured classifier does not exist')
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint,
        directory=checkpoints_dir,
        checkpoint_name=checkpoint_name,
        max_to_keep=max_to_keep)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for step, batch in tqdm(
                train_dataset.shuffle(
                    len(train_dataset)).batch(batch_size).enumerate()):
            if embedding_method == 'Bert':
                X_train_batch, y_train_batch = batch
                X_train_batch = bert_model(X_train_batch)[0]
            else:
                X_train_batch, y_train_batch = batch

            with tf.GradientTape() as tape:
                logits = model(X_train_batch, training=1)
                if classifier_config['use_focal_loss']:
                    loss_vec = loss_obj.call(y_true=y_train_batch,
                                             y_pred=logits)
                else:
                    loss_vec = tf.keras.losses.categorical_crossentropy(
                        y_true=y_train_batch, y_pred=logits)
                loss = tf.reduce_mean(loss_vec)
            # Compute gradients of the loss w.r.t. the trainable parameters
            gradients = tape.gradient(loss, model.trainable_variables)
            # Apply the gradients (back-propagation via automatic differentiation)
            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))
            if step % print_per_batch == 0 and step != 0:
                predictions = tf.argmax(logits, axis=-1).numpy()
                y_train_batch = tf.argmax(y_train_batch, axis=-1).numpy()
                measures, _ = cal_metrics(y_true=y_train_batch,
                                          y_pred=predictions)
                res_str = ''
                for k, v in measures.items():
                    res_str += (k + ': %.3f ' % v)
                logger.info('training batch: %5d, loss: %.5f, %s' %
                            (step, loss, res_str))

        # validation
        logger.info('start evaluate engines...')
        y_true, y_pred = np.array([]), np.array([])
        loss_values = []

        for dev_batch in tqdm(dev_dataset.batch(batch_size)):
            if embedding_method == 'Bert':
                X_val_batch, y_val_batch = dev_batch
                X_val_batch = bert_model(X_val_batch)[0]
            else:
                X_val_batch, y_val_batch = dev_batch

            logits = model(X_val_batch)
            val_loss_vec = tf.keras.losses.categorical_crossentropy(
                y_true=y_val_batch, y_pred=logits)
            val_loss = tf.reduce_mean(val_loss_vec)
            predictions = tf.argmax(logits, axis=-1)
            y_val_batch = tf.argmax(y_val_batch, axis=-1)
            y_true = np.append(y_true, y_val_batch)
            y_pred = np.append(y_pred, predictions)
            loss_values.append(val_loss)

        measures, each_classes = cal_metrics(y_true=y_true, y_pred=y_pred)

        # Print per-class metrics
        classes_val_str = ''
        for k, v in each_classes.items():
            if k in reverse_classes:
                classes_val_str += ('\n' + reverse_classes[k] + ': ' +
                                    str(each_classes[k]))
        logger.info(classes_val_str)
        # Print the validation loss
        val_res_str = 'loss: %.3f ' % np.mean(loss_values)
        for k, v in measures.items():
            val_res_str += (k + ': %.3f ' % measures[k])

        time_span = (time.time() - start_time) / 60

        logger.info('time consumption:%.2f(min), %s' %
                    (time_span, val_res_str))
        if measures['f1'] > best_f1_val:
            unprocessed = 0
            best_f1_val = measures['f1']
            best_at_epoch = i + 1
            checkpoint_manager.save()
            logger.info('saved the new best model with f1: %.3f' % best_f1_val)
        else:
            unprocessed += 1

        if is_early_stop:
            if unprocessed >= patient:
                logger.info(
                    'early stopped, no progress obtained within {} epochs'.
                    format(patient))
                logger.info('overall best f1 is {} at {} epoch'.format(
                    best_f1_val, best_at_epoch))
                logger.info('total training time consumption: %.3f(min)' %
                            ((time.time() - very_start_time) / 60))
                return
    logger.info('overall best f1 is {} at {} epoch'.format(
        best_f1_val, best_at_epoch))
    logger.info('total training time consumption: %.3f(min)' %
                ((time.time() - very_start_time) / 60))
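Note: `FocalLoss` is imported from the project's engines package and is not shown; a compact TensorFlow sketch of the standard multi-class focal loss it presumably implements (the gamma and epsilon defaults are assumptions):

import tensorflow as tf

class FocalLoss(tf.keras.losses.Loss):
    # Multi-class focal loss on one-hot labels and softmax probabilities.
    def __init__(self, gamma=2.0, epsilon=1e-9):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.epsilon = epsilon

    def call(self, y_true, y_pred):
        # Down-weight easy examples by (1 - p)^gamma before the cross-entropy term.
        y_pred = tf.clip_by_value(y_pred, self.epsilon, 1.0 - self.epsilon)
        cross_entropy = -tf.cast(y_true, y_pred.dtype) * tf.math.log(y_pred)
        weight = tf.pow(1.0 - y_pred, self.gamma)
        return tf.reduce_sum(weight * cross_entropy, axis=-1)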
Example #4
def train(device, logger):
    # Hyperparameters
    batch_size = 128
    epoch = 10
    learning_rate = 0.0004
    patience = 3
    print_per_batch = 40
    folds = 5
    test_predicts_folds = [0] * folds

    # Load the training corpus
    train_query_file = 'datasets/train/train.query.tsv'
    train_reply_file = 'datasets/train/train.reply.tsv'
    train_left = pd.read_csv(train_query_file, sep='\t', header=None)
    train_left.columns = ['id', 'query']
    train_right = pd.read_csv(train_reply_file, sep='\t', header=None)
    train_right.columns = ['id', 'id_sub', 'reply', 'label']
    train_data = train_left.merge(train_right, how='left')
    train_data['reply'] = train_data['reply'].fillna('好的')

    oof = np.zeros((len(train_data), 1))

    # Load the test corpus
    test_query_file = 'datasets/test/test.query.tsv'
    test_reply_file = 'datasets/test/test.reply.tsv'
    test_left = pd.read_csv(test_query_file, sep='\t', header=None, encoding='gbk')
    test_left.columns = ['id', 'query']
    test_right = pd.read_csv(test_reply_file, sep='\t', header=None, encoding='gbk')
    test_right.columns = ['id', 'id_sub', 'reply']
    test_data = test_left.merge(test_right, how='left')
    test_data['label'] = 666

    test_data_manger = DataPrecessForSentence(test_data, logger)
    logger.info('test_data_length:{}\n'.format(len(test_data_manger)))
    test_loader = DataLoader(test_data_manger, shuffle=False, batch_size=batch_size)

    # Cross-entropy loss
    criterion = torch.nn.CrossEntropyLoss()

    # N-fold grouped cross-validation
    gkf = GroupKFold(n_splits=5).split(X=train_data.reply, groups=train_data.id)

    for fold, (train_idx, valid_idx) in enumerate(gkf):
        best_f1 = 0.0
        patience_counter = 0
        logger.info('fold:{}/{}'.format(fold + 1, folds))
        train_data_manger = DataPrecessForSentence(train_data.iloc[train_idx], logger)
        logger.info('train_data_length:{}\n'.format(len(train_data_manger)))
        train_loader = DataLoader(train_data_manger, shuffle=True, batch_size=batch_size)

        val_data_manger = DataPrecessForSentence(train_data.iloc[valid_idx], logger)
        logger.info('dev_data_length:{}\n'.format(len(val_data_manger)))
        val_loader = DataLoader(val_data_manger, shuffle=False, batch_size=batch_size)

        model = BertwwmModel(device).to(device)
        params = list(model.parameters())
        optimizer = AdamW(params, lr=learning_rate)
        # Learning-rate schedule: halve the lr when the monitored metric plateaus
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=0)
        for i in range(epoch):
            train_start = time.time()
            logger.info('epoch:{}/{}'.format(i + 1, epoch))
            loss, loss_sum = 0.0, 0.0
            correct_preds = 0
            model.train()
            for step, (batch_ids, batch_masks, batch_segments, batch_labels) in enumerate(tqdm(train_loader)):
                ids, masks, segments, labels = batch_ids.to(device), batch_masks.to(device), batch_segments.to(
                    device), batch_labels.to(device)
                optimizer.zero_grad()
                logits, probabilities = model(ids, masks, segments)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()
                loss_sum += loss.item()
                correct_preds += correct_predictions(probabilities, labels)
                # Periodically print training metrics
                if step % print_per_batch == 0 and step != 0:
                    predicts = torch.argmax(probabilities, dim=1)
                    measures = cal_metrics(predicts.cpu(), labels.cpu())
                    res_str = ''
                    for k, v in measures.items():
                        res_str += (k + ': %.3f ' % v)
                    logger.info('training step: %5d, loss: %.5f, %s' % (step, loss, res_str))
            train_time = (time.time() - train_start) / 60
            train_accuracy = correct_preds / len(train_loader.dataset)
            scheduler.step(train_accuracy)
            logger.info('time consumption of training:%.2f(min)' % train_time)
            logger.info('start evaluate model...')
            val_measures, val_label_results = evaluate(logger, device, model, criterion, val_loader)

            if val_measures['f1'] >= best_f1 and val_measures['f1'] > 0.70:
                logger.info('found the new best model with f1 in fold %d: %.3f' % (fold + 1, val_measures['f1']))
                patience_counter = 0
                best_f1 = val_measures['f1']
                logger.info('start test model...')
                test_label_results = test(logger, device, model, test_loader)
                # Record this fold's test-set predictions in test_predicts_folds
                test_predicts_folds[fold] = test_label_results
                # Record this fold's out-of-fold validation predictions in oof
                oof[valid_idx] = [[i] for i in val_label_results]
            else:
                patience_counter += 1
            if patience_counter >= patience:
                logger.info('Early stopping: patience limit reached, stopping...')
                break

    outputs = compute_output_arrays(train_data, 'label')
    best_f1, best_threshold = search_f1(outputs, oof)
    logger.info('best_f1 is %.3f, best_threshold is %.3f' % (best_f1, best_threshold))
    sub_predicts = np.average(test_predicts_folds, axis=0)
    sub_predicts = sub_predicts > best_threshold

    test_data['label'] = sub_predicts.astype(int)
    test_data[['id', 'id_sub', 'label']].to_csv('./submission_file/submission_bertwwm_esim_fgm.csv', index=False,
                                                header=None, sep='\t')
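Note: `compute_output_arrays` and `search_f1` come from the project's utilities and are not shown; a typical sketch (the threshold grid is an assumption):

import numpy as np
from sklearn.metrics import f1_score

def compute_output_arrays(df, column):
    # Pull the gold labels out of the dataframe as a 1-D numpy array.
    return np.asarray(df[column])

def search_f1(y_true, y_prob):
    # Scan candidate thresholds and keep the one that maximises F1.
    y_prob = np.asarray(y_prob).ravel()
    best_f1, best_threshold = 0.0, 0.5
    for threshold in np.arange(0.1, 0.9, 0.01):
        f1 = f1_score(y_true, (y_prob > threshold).astype(int))
        if f1 > best_f1:
            best_f1, best_threshold = f1, threshold
    return best_f1, best_threshold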
Example #5
def train(data_manager, logger):
    embedding_dim = data_manager.embedding_dim
    num_classes = data_manager.max_label_number
    seq_length = data_manager.max_sequence_length

    checkpoints_dir = classifier_config['checkpoints_dir']
    checkpoint_name = classifier_config['checkpoint_name']
    num_filters = classifier_config['num_filters']
    learning_rate = classifier_config['learning_rate']
    epoch = classifier_config['epoch']
    max_to_keep = classifier_config['max_to_keep']
    print_per_batch = classifier_config['print_per_batch']
    is_early_stop = classifier_config['is_early_stop']
    patient = classifier_config['patient']
    hidden_dim = classifier_config['hidden_dim']
    classifier = classifier_config['classifier']

    best_f1_val = 0.0
    best_at_epoch = 0
    unprocessed = 0
    batch_size = data_manager.batch_size
    very_start_time = time.time()
    loss_obj = FocalLoss() if classifier_config['use_focal_loss'] else None
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    X_train, y_train, X_val, y_val = data_manager.get_training_set()
    # Load the configured model
    if classifier == 'textcnn':
        from engines.models.textcnn import TextCNN
        model = TextCNN(seq_length, num_filters, num_classes, embedding_dim)
    elif classifier == 'textrcnn':
        from engines.models.textrcnn import TextRCNN
        model = TextRCNN(seq_length, num_classes, hidden_dim, embedding_dim)
    else:
        raise Exception('the configured classifier does not exist')
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint,
        directory=checkpoints_dir,
        checkpoint_name=checkpoint_name,
        max_to_keep=max_to_keep)
    num_iterations = int(math.ceil(1.0 * len(X_train) / batch_size))
    num_val_iterations = int(math.ceil(1.0 * len(X_val) / batch_size))
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        # shuffle train at each epoch
        sh_index = np.arange(len(X_train))
        np.random.shuffle(sh_index)
        X_train = X_train[sh_index]
        y_train = y_train[sh_index]
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for iteration in tqdm(range(num_iterations)):
            X_train_batch, y_train_batch = data_manager.next_batch(
                X_train, y_train, start_index=iteration * batch_size)
            with tf.GradientTape() as tape:
                logits = model.call(X_train_batch, training=1)
                if classifier_config['use_focal_loss']:
                    loss_vec = loss_obj.call(y_true=y_train_batch,
                                             y_pred=logits)
                else:
                    loss_vec = tf.keras.losses.categorical_crossentropy(
                        y_true=y_train_batch, y_pred=logits)
                loss = tf.reduce_mean(loss_vec)
            # Compute gradients of the loss w.r.t. the trainable parameters
            gradients = tape.gradient(loss, model.trainable_variables)
            # Apply the gradients (back-propagation via automatic differentiation)
            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))
            if iteration % print_per_batch == 0 and iteration != 0:
                predictions = tf.argmax(logits, axis=-1)
                y_train_batch = tf.argmax(y_train_batch, axis=-1)
                measures = cal_metrics(y_true=y_train_batch,
                                       y_pred=predictions)
                res_str = ''
                for k, v in measures.items():
                    res_str += (k + ': %.3f ' % v)
                logger.info('training batch: %5d, loss: %.5f, %s' %
                            (iteration, loss, res_str))

        # validation
        logger.info('start evaluate engines...')
        val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        for iteration in tqdm(range(num_val_iterations)):
            X_val_batch, y_val_batch = data_manager.next_batch(
                X_val, y_val, iteration * batch_size)
            logits = model.call(X_val_batch)
            predictions = tf.argmax(logits, axis=-1)
            y_val_batch = tf.argmax(y_val_batch, axis=-1)
            measures = cal_metrics(y_true=y_val_batch, y_pred=predictions)
            for k, v in measures.items():
                val_results[k] += v

        time_span = (time.time() - start_time) / 60
        val_res_str = ''
        dev_f1_avg = 0
        for k, v in val_results.items():
            val_results[k] /= num_val_iterations
            val_res_str += (k + ': %.3f ' % val_results[k])
            if k == 'f1':
                dev_f1_avg = val_results[k]
        logger.info('time consumption:%.2f(min), %s' %
                    (time_span, val_res_str))

        if dev_f1_avg > best_f1_val:
            unprocessed = 0
            best_f1_val = dev_f1_avg
            best_at_epoch = i + 1
            checkpoint_manager.save()
            logger.info('saved the new best model with f1: %.3f' % best_f1_val)
        else:
            unprocessed += 1

        if is_early_stop:
            if unprocessed >= patient:
                logger.info(
                    'early stopped, no progress obtained within {} epochs'.
                    format(patient))
                logger.info('overall best f1 is {} at {} epoch'.format(
                    best_f1_val, best_at_epoch))
                logger.info('total training time consumption: %.3f(min)' %
                            ((time.time() - very_start_time) / 60))
                return
    logger.info('overall best f1 is {} at {} epoch'.format(
        best_f1_val, best_at_epoch))
    logger.info('total training time consumption: %.3f(min)' %
                ((time.time() - very_start_time) / 60))
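Note: `data_manager.next_batch` is assumed to slice out one batch starting at `start_index`; a minimal sketch of that method (the signature is an assumption), matching how it is called above:

def next_batch(self, X, y, start_index):
    # Return one batch of at most self.batch_size samples, starting at start_index.
    end_index = min(start_index + self.batch_size, len(X))
    return X[start_index:end_index], y[start_index:end_index]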