def main():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]
    # Prepare output directory
    if not args.do_eval:
        args.output_dir = os.path.join(
            args.output_dir,
            list(filter(None,
                        args.model.strip().split("/")))[-1] + "-" +
            datetime.now().strftime("%Y%m%d_%H%M%S"))
        os.mkdir(args.output_dir)
    logger = init_logger("souhu-text-match-2021", args.output_dir)
    logger.info(f"Output dir: {args.output_dir}")

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")
    logger.info(f"Training arguments: {args}")

    set_seed(args)
    train_dataloader = create_batch_iter(args, "train", logger)
    valid_dataloader = create_batch_iter(args, "valid", logger)

    model_dir = "/home/zhuminghao/work/model/pt/longformer/"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    config = AutoConfig.from_pretrained(model_dir,
                                        num_labels=2,
                                        return_dict=True)
    model = LongformerForClassification(config, model_dir)
    model.to(device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # param_optimizer = list(model.named_parameters())
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    # ]

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    # scheduler = lr_scheduler.StepLR(optimizer, 2)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='min',
                                               factor=0.1,
                                               patience=2)
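    # ReduceLROnPlateau watches the validation loss passed to scheduler.step()
    # below and multiplies the learning rate by 0.1 once it has not improved
    # for 2 consecutive evaluations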
    pgd = PGD(model)
    K = 3
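    # PGD adversarial training: every batch gets K extra forward/backward passes
    # with an adversarial perturbation applied to the embedding weights. The PGD
    # helper is assumed to follow the common embedding-perturbation recipe:
    # backup_grad()/restore_grad() save and restore the clean gradients,
    # attack() perturbs the embeddings (backing them up on the first attack),
    # and restore() puts the original embedding weights back afterwards.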

    # Train and evaluate
    global_step = 0
    best_dev_f1, best_epoch = float("-inf"), float("-inf")
    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

    train_loss2plot = []
    train_acc2plot = []
    train_f1_2plot = []
    eval_loss2plot = []
    eval_acc2plot = []
    eval_f1_2plot = []
    for epoch_ in trange(int(args.num_train_epochs), desc="Epoch", ascii=True):
        tr_loss = 0.
        train_logits = []
        train_labels = []

        model.train()

        # try:
        #     with tqdm(train_dataloader, desc=f"Epoch {epoch_ + 1} iteration", ascii=True, position=0) as tq:
        # tqdm did not refresh on a single line; of the two workarounds found
        # below, the current fix is to pass ascii=True
        # https://blog.csdn.net/martinkeith/article/details/115668425
        # https://blog.csdn.net/weixin_42138078/article/details/81215207
        for step, batch in enumerate(
                tqdm(train_dataloader,
                     desc=f"Epoch {epoch_ + 1} iteration",
                     ascii=True)):
            # for step, batch in enumerate(tq):
            sources, targets, labels = batch
            inputs = list(zip(sources, targets))
            labels = torch.tensor([int(label) for label in labels],
                                  dtype=torch.long)
            pt_batch = tokenizer(inputs,
                                 padding=True,
                                 truncation="longest_first",
                                 max_length=args.max_seq_length,
                                 return_tensors="pt")
            pt_batch = pt_batch.to(device)
            labels = labels.to(device)

            outputs = model(**pt_batch, labels=labels, return_dict=True)
            train_logits.append(outputs.logits)
            train_labels.append(labels)

            loss = outputs.loss

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
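                # dividing here keeps the accumulated gradient equal to the
                # average over the virtual (accumulated) batch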

            loss.backward()  # backward pass to obtain the normal (clean) gradients

            if args.do_adversarial:
                # adversarial training (PGD)
                pgd.backup_grad()

                for t in range(K):
                    pgd.attack(is_first_attack=(
                        t == 0))  # add adversarial perturbation to the embeddings; back up param.data on the first attack
                    if t != K - 1:
                        model.zero_grad()
                    else:
                        pgd.restore_grad()
                    adv_outputs = model(**pt_batch,
                                        labels=labels,
                                        return_dict=True)
                    adv_loss = adv_outputs.loss
                    if args.n_gpu > 1:
                        adv_loss = adv_loss.mean()
                    adv_loss.backward()  # backward pass: accumulate the adversarial gradients on top of the normal gradients
                pgd.restore()  # restore the original embedding parameters

            # gradient step: update the parameters
            optimizer.step()
            optimizer.zero_grad()

            tr_loss += loss.item()
            global_step += 1

            if (step + 1) % args.gradient_accumulation_steps == 0:
                pass
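                # NOTE: optimizer.step() above already runs on every iteration,
                # so this gradient-accumulation boundary is currently a no-op
                # placeholder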

            if (global_step + 1) % args.eval_step == 0:
                logger.info("***** Running evaluation *****")
                logger.info("  Process = {} iter {} step".format(
                    epoch_, global_step))
                logger.info("  Batch size = %d", args.eval_batch_size)
                logger.info(
                    f"next step learning rate = {optimizer.param_groups[0]['lr']:.8f}"
                )

                all_train_logits = torch.cat(train_logits, dim=0).cpu()
                all_train_labels = torch.cat(train_labels, dim=0).cpu()
                acc, prf = evaluate(all_train_logits, all_train_labels)

                train_loss2plot.append(loss.item())
                train_acc2plot.append(acc)
                train_f1_2plot.append(prf[2])

                avg_train_loss = tr_loss / (step + 1)

                result = do_eval(args, model, tokenizer, valid_dataloader,
                                 device, epoch_, args.num_train_epochs, "eval",
                                 logger)
                scheduler.step(result["eval_loss"])
                eval_loss2plot.append(result["eval_loss"])
                eval_acc2plot.append(result["eval_acc"])
                eval_f1_2plot.append(result["eval_f1"])

                result['global_step'] = global_step
                result['train_loss'] = avg_train_loss

                result_to_file(result, output_eval_file, logger)

                save_model = False
                if not args.do_eval and result['eval_f1'] > best_dev_f1:
                    best_dev_f1 = result['eval_f1']
                    best_epoch = epoch_ + 1
                    save_model = True

                if save_model:
                    logger.info("***** Save model *****")
                    model_to_save = model.module if hasattr(
                        model, 'module') else model

                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    output_config_file = os.path.join(args.output_dir,
                                                      "config.json")

                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)
        # except KeyboardInterrupt:
        #     tq.close()
        #     raise
        # tq.close()

    logger.info(f"best epoch: {best_epoch}, best eval f1:{best_dev_f1:.4f}")

    loss_acc_plot([
        train_loss2plot, train_acc2plot, train_f1_2plot, eval_loss2plot,
        eval_acc2plot, eval_f1_2plot
    ], os.path.join(args.output_dir, "loss_acc_f1.png"))
    logger.info(f"output dir: {args.output_dir}")
Example #2
import os

import torch
from tqdm import tqdm
from transformers import HfArgumentParser

from Io.data_loader import create_batch_iter
from config.arguments_utils import TrainingArguments
from preprocessing.data_processor import InputExample
from tools.Logginger import init_logger
from tools.model_util import set_seed, load_model

logger = init_logger("dureader2021-test", "output/logs/test/")


def predict_k_fold():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    logger.info(f"Training arguments: {args}")

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")

    set_seed(args)

    if "_" not in args.output_dir:
Example #3
import time
from tools.plot_util import loss_acc_f1_plot

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

import config.args as args
from evaluate.acc_f1 import evaluate, evaluate_pos
from evaluate.loss import loss_fn
from tools.Logginger import init_logger
from tools.model_util import save_model
from tools.model_util import set_seed

logger = init_logger("torch", logging_path=args.log_path)

import warnings

warnings.filterwarnings('ignore')


def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x
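# warmup_linear above implements the classic BERT schedule: x is the fraction of
# total training steps completed, the multiplier ramps linearly from 0 to 1 over
# the first `warmup` fraction of training and then decays linearly towards 0.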


def fit(model, training_iter, eval_iter, num_epoch, pbar, num_train_steps, verbose=1):
    # ------------------ select the CUDA / CPU device ----------------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()  # multi-GPU
def main():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    # Prepare output directory
    if not args.do_eval:
        args.output_dir = os.path.join(
            args.output_dir,
            list(filter(None,
                        args.model.strip().split("/")))[-1] + "-" +
            datetime.now().strftime("%Y%m%d_%H%M%S"))
        os.mkdir(args.output_dir)
    logger = init_logger("souhu-text-match-2021", args.output_dir)
    logger.info(f"Output dir: {args.output_dir}")

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")

    set_seed(args)

    logger.info(f"Training arguments: {args}")

    if not args.do_eval:
        train_dataloader, num_train_steps = create_batch_iter(
            args, "train", logger)
    eval_dataloader, _ = create_batch_iter(args, "dev", logger)

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # bert_config = AutoConfig.from_pretrained(args.model, return_dict=True)
    model = BertForSequenceClassification.from_pretrained(args.model)
    model.to(device)

    if args.do_eval:
        # model.eval()
        # result = do_eval(model, eval_dataloader, device, -1, -1)
        # logger.info("***** Eval results *****")
        # for key in sorted(result.keys()):
        #     logger.info("  %s = %s", key, str(result[key]))
        pass
    else:
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
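        # standard BERT fine-tuning practice: apply weight decay to all weights
        # except biases and LayerNorm parameters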
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        # scheduler = lr_scheduler.StepLR(optimizer, 2)
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   mode='min',
                                                   factor=0.1,
                                                   patience=2)
        pgd = PGD(model)
        K = 3

        # Train and evaluate
        global_step = 0
        best_dev_f1, best_epoch = float("-inf"), float("-inf")
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

        train_loss2plot = []
        train_acc2plot = []
        train_f1_2plot = []
        eval_loss2plot = []
        eval_acc2plot = []
        eval_f1_2plot = []
        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0.
            train_logits = []
            train_labels = []

            model.train()

            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc=f"Epoch {epoch_ + 1} iteration",
                         ascii=True)):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, labels = batch

                outputs = model(input_ids,
                                input_mask,
                                segment_ids,
                                labels=labels,
                                return_dict=True)
                train_logits.append(outputs.logits)
                train_labels.append(labels)

                loss = outputs.loss

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()  # backward pass to obtain the normal (clean) gradients

                if args.do_adversarial:
                    # adversarial training (PGD)
                    pgd.backup_grad()

                    for t in range(K):
                        pgd.attack(is_first_attack=(
                            t == 0
                        ))  # add adversarial perturbation to the embeddings; back up param.data on the first attack
                        if t != K - 1:
                            model.zero_grad()
                        else:
                            pgd.restore_grad()
                        adv_outputs = model(input_ids,
                                            input_mask,
                                            segment_ids,
                                            labels=labels,
                                            return_dict=True)
                        adv_loss = adv_outputs.loss
                        if args.n_gpu > 1:
                            adv_loss = adv_loss.mean()
                        adv_loss.backward()  # backward pass: accumulate the adversarial gradients on top of the normal gradients
                    pgd.restore()  # restore the original embedding parameters

                # gradient step: update the parameters
                optimizer.step()
                optimizer.zero_grad()

                tr_loss += loss.item()
                global_step += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    pass

                if (global_step + 1) % args.eval_step == 0:
                    logger.info("***** Running evaluation *****")
                    logger.info("  Process = {} iter {} step".format(
                        epoch_, global_step))
                    logger.info("  Batch size = %d", args.eval_batch_size)
                    logger.info(
                        f"next step learning rate = {optimizer.param_groups[0]['lr']:.8f}"
                    )

                    all_train_logits = torch.cat(train_logits, dim=0).cpu()
                    all_train_labels = torch.cat(train_labels, dim=0).cpu()
                    acc, prf = evaluate(all_train_logits, all_train_labels)

                    train_loss2plot.append(loss.item())
                    train_acc2plot.append(acc)
                    train_f1_2plot.append(prf[2])

                    avg_train_loss = tr_loss / (step + 1)

                    result = do_eval(args, model, eval_dataloader, device,
                                     epoch_, args.num_train_epochs, "eval",
                                     logger)
                    scheduler.step(result["eval_loss"])
                    eval_loss2plot.append(result["eval_loss"])
                    eval_acc2plot.append(result["eval_acc"])
                    eval_f1_2plot.append(result["eval_f1"])

                    result['global_step'] = global_step
                    result['train_loss'] = avg_train_loss

                    result_to_file(result, output_eval_file, logger)

                    save_model = False
                    if not args.do_eval and result['eval_f1'] > best_dev_f1:
                        best_dev_f1 = result['eval_f1']
                        best_epoch = epoch_ + 1
                        save_model = True

                    if save_model:
                        logger.info("***** Save model *****")
                        model_to_save = model.module if hasattr(
                            model, 'module') else model

                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        output_config_file = os.path.join(
                            args.output_dir, "config.json")

                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)

        logger.info(
            f"best epoch: {best_epoch}, best eval f1:{best_dev_f1:.4f}")

        loss_acc_plot([
            train_loss2plot, train_acc2plot, train_f1_2plot, eval_loss2plot,
            eval_acc2plot, eval_f1_2plot
        ], os.path.join(args.output_dir, "loss_acc_f1.png"))
        logger.info(f"output dir: {args.output_dir}")
Example #5
def predict():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    logger = init_logger("souhu-text-match-2021", "output/logs/")
    logger.info(f"!!!!!!Test arguments: {args}")

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")

    set_seed(args)
    test_dataloader = create_batch_iter(args, "test", logger)

    args.output_dir = os.path.join(
        args.output_dir,
        sorted(os.listdir(args.output_dir))[-1])  # most recent training run
    logger.info(f"model {args.output_dir} used for prediction")

    tokenizer = RoFormerTokenizer.from_pretrained(
        "/home/zhuminghao/work/model/pt/chinese_roformer_base")  # 没保存,所以用原始一样
    model = RoFormerForSequenceClassification.from_pretrained(args.output_dir)
    model.to(device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    with torch.no_grad():
        test_logits = []
        ids = []
        for step, batch in enumerate(
                tqdm(test_dataloader, desc="test", ascii=True)):
            sources, targets, bt_ids = batch
            inputs = list(zip(sources, targets))
            ids.extend(bt_ids)  # flatten so each id lines up with one prediction below

            pt_batch = tokenizer(inputs,
                                 padding=True,
                                 truncation="longest_first",
                                 max_length=args.max_seq_length,
                                 return_tensors="pt")
            pt_batch = pt_batch.to(device)

            outputs = model(**pt_batch, return_dict=True)

            logits = torch.max(outputs.logits, dim=1)[1]
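            # torch.max(..., dim=1)[1] is the argmax over the two classes,
            # i.e. the predicted label id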
            if device.type == "cuda":
                logits = logits.cpu().numpy().astype(int)
            else:
                logits = logits.numpy()
            test_logits.extend(logits.tolist())

        output_path = args.output_dir + "/test.csv"
        with open(output_path, "w", encoding="utf-8") as fw:
            for _id, label in zip(ids, test_logits):
                fw.write(",".join([str(_id), str(label)]) + "\n")
        logger.info(f"output path: {output_path}")
Example #6
def predict():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]

    logger = init_logger("souhu-text-match-2021", "output/logs/")
    logger.info(f"!!!!!!Test arguments: {args}")

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")

    set_seed(args)

    # if "_" not in args.output_dir:
    args.output_dir = os.path.join(
        args.output_dir,
        sorted(os.listdir(args.output_dir))[-1])  # most recent training run
    logger.info(f"model {args.output_dir} used for prediction")

    test_dataloader, examples = create_batch_iter(args, "test", logger)

    model = BertForSequenceClassification.from_pretrained(args.output_dir)
    # model = load_model(args, device)
    model.to(device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    with torch.no_grad():
        test_logits = []
        for step, batch in enumerate(
                tqdm(test_dataloader, desc="test", ascii=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids = batch

            outputs = model(input_ids,
                            input_mask,
                            segment_ids,
                            return_dict=True)
            logits = torch.max(outputs.logits, dim=1)[1]
            if device.type == "cuda":
                logits = logits.cpu().numpy().astype(int)
            else:
                logits = logits.numpy()
            test_logits.extend(logits.tolist())

        pred_dt = {}
        output_path = args.output_dir + "/test.csv"
        with open(output_path, "w", encoding="utf-8") as fw:
            for exp, label in zip(examples, test_logits):
                exp: InputExample = exp
                _id = exp.label
                question = exp.text_a
                context = exp.text_b
                pred_dt[_id] = label
                fw.write(",".join([_id, str(label)]) + "\n")
        logger.info(f"output path: {output_path}")
Example #7
def main():
    parser = HfArgumentParser(TrainingArguments)
    args: TrainingArguments = parser.parse_args_into_dataclasses()[0]
    # Prepare output directory
    if not args.do_eval:
        args.output_dir = os.path.join(
            args.output_dir,
            list(filter(None,
                        args.model.strip().split("/")))[-1] + "-" +
            datetime.now().strftime("%Y%m%d_%H%M%S"))
        os.mkdir(args.output_dir)
    logger = init_logger("souhu-text-match-2021", args.output_dir)
    logger.info(f"Output dir: {args.output_dir}")

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.n_gpu = 1  # override device_count: run single-GPU, so DataParallel is skipped below

    logger.info(f"device: {device}, n_gpu: {args.n_gpu}")
    logger.info(f"Training arguments: {args}")

    set_seed(args)
    train_dataloader = create_batch_iter(args, "train", logger)
    valid_dataloader = create_batch_iter(args, "valid", logger)

    model_dir = "C:\dh\model\pt\chinese_roformer_base"
    tokenizer = RoFormerTokenizer.from_pretrained(model_dir)
    model = RoFormerForSequenceClassification.from_pretrained(model_dir)
    model.to(device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    # scheduler = lr_scheduler.StepLR(optimizer, 2)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='min',
                                               factor=0.1,
                                               patience=2)
    loss_func = torch.nn.CrossEntropyLoss()  # note: unused below; the model's built-in loss (outputs.loss) is used instead
    pgd = PGD(model)
    K = 3

    # Train and evaluate
    global_step = 0
    best_dev_f1, best_epoch = float("-inf"), float("-inf")
    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

    train_loss2plot = []
    train_acc2plot = []
    train_f1_2plot = []
    eval_loss2plot = []
    eval_acc2plot = []
    eval_f1_2plot = []
    for epoch_ in trange(int(args.num_train_epochs), desc="Epoch", ascii=True):
        tr_loss = 0.
        train_logits = []
        train_labels = []

        model.train()
        for step, batch in enumerate(
                tqdm(train_dataloader,
                     desc=f"Epoch {epoch_ + 1} iteration",
                     ascii=True)):
            sources, targets, labels = batch
            inputs = list(zip(sources, targets))
            labels = torch.tensor([int(label) for label in labels],
                                  dtype=torch.long)
            pt_batch = tokenizer(inputs,
                                 padding=True,
                                 truncation="longest_first",
                                 max_length=args.max_seq_length,
                                 return_tensors="pt")
            pt_batch = pt_batch.to(device)
            labels = labels.to(device)

            outputs = model(**pt_batch, labels=labels, return_dict=True)
            logits = outputs.logits
            loss = outputs.loss

            train_logits.append(logits)
            train_labels.append(labels)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # if args.gradient_accumulation_steps > 1:
            #     loss = loss / args.gradient_accumulation_steps

            loss.backward()  # backward pass to obtain the normal (clean) gradients

            if args.do_adversarial:
                # adversarial training (PGD)
                pgd.backup_grad()

                for t in range(K):
                    pgd.attack(is_first_attack=(
                        t == 0))  # add adversarial perturbation to the embeddings; back up param.data on the first attack
                    if t != K - 1:
                        model.zero_grad()
                    else:
                        pgd.restore_grad()
                    adv_outputs = model(**pt_batch,
                                        labels=labels,
                                        return_dict=True)
                    adv_loss = adv_outputs.loss
                    if args.n_gpu > 1:
                        adv_loss = adv_loss.mean()
                    adv_loss.backward()  # backward pass: accumulate the adversarial gradients on top of the normal gradients
                pgd.restore()  # restore the original embedding parameters

            # gradient step: update the parameters
            optimizer.step()
            optimizer.zero_grad()

            tr_loss += loss.item()
            global_step += 1

            if (step + 1) % args.gradient_accumulation_steps == 0:
                pass

            if (global_step + 1) % args.eval_step == 0:
                logger.info("***** Running evaluation *****")
                logger.info("  Process = {} iter {} step".format(
                    epoch_, global_step))
                logger.info("  Batch size = %d", args.eval_batch_size)
                logger.info(
                    f"next step learning rate = {optimizer.param_groups[0]['lr']:.8f}"
                )

                all_train_logits = torch.cat(train_logits, dim=0).cpu()
                all_train_labels = torch.cat(train_labels, dim=0).cpu()
                acc, prf = evaluate(all_train_logits, all_train_labels)

                train_loss2plot.append(loss.item())
                train_acc2plot.append(acc)
                train_f1_2plot.append(prf[2])

                avg_train_loss = tr_loss / (step + 1)

                result = do_eval(args, model, tokenizer, valid_dataloader,
                                 device, epoch_, args.num_train_epochs, "eval",
                                 logger)
                scheduler.step(result["eval_loss"])
                eval_loss2plot.append(result["eval_loss"])
                eval_acc2plot.append(result["eval_acc"])
                eval_f1_2plot.append(result["eval_f1"])

                result['global_step'] = global_step
                result['train_loss'] = avg_train_loss

                result_to_file(result, output_eval_file, logger)

                save_model = False
                if not args.do_eval and result['eval_f1'] > best_dev_f1:
                    best_dev_f1 = result['eval_f1']
                    best_epoch = epoch_ + 1
                    save_model = True

                if save_model:
                    logger.info("***** Save model *****")
                    model_to_save = model.module if hasattr(
                        model, 'module') else model

                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    output_config_file = os.path.join(args.output_dir,
                                                      "config.json")

                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    # tokenizer.save_vocabulary(args.output_dir)

    logger.info(f"best epoch: {best_epoch}, best eval f1:{best_dev_f1:.4f}")

    loss_acc_plot([
        train_loss2plot, train_acc2plot, train_f1_2plot, eval_loss2plot,
        eval_acc2plot, eval_f1_2plot
    ], os.path.join(args.output_dir, "loss_acc_f1.png"))
    logger.info(f"output dir: {args.output_dir}")