Example No. 1
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master: wandb.init(project="transformer-evolution")

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = transformer.QA(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank} load state dict from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)
    if master: wandb.watch(model)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(vocab, "KorQuAD_v1.0_train.json", args, shuffle=True)
    test_loader, _ = data.build_data_loader(vocab, "KorQuAD_v1.0_dev.json", args, shuffle=False)  # evaluate on the dev split, not the train file

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
        score = eval_epoch(config, rank, model, test_loader)
        if master: wandb.log({"loss": loss, "accuracy": score})

        if master and best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
            if isinstance(model, DistributedDataParallel):
                model.module.save(best_epoch, best_loss, best_score, args.save)
            else:
                model.save(best_epoch, best_loss, best_score, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}, socre={best_score:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
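
This example (and most of those below) calls init_process_group(rank, world_size) and destroy_process_group() as project-local helpers; the torch.distributed functions of the same name take different arguments. A minimal sketch of what such wrappers typically look like, assuming the NCCL backend and a rendezvous address passed through environment variables (the address and port values are placeholder assumptions):

import os
import torch.distributed as dist

def init_process_group(rank, world_size):
    # Point every worker at the same rendezvous address, then join the
    # process group under this worker's rank.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

def destroy_process_group():
    # Tear the group down so each worker exits cleanly.
    dist.destroy_process_group()
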
Example No. 2
def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(),
                                  lr=self.learning_rate,
                                  betas=(0.9, 0.999),
                                  eps=1e-8)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer,
                                                   self.n_warmup_steps,
                                                   self.n_training_steps)
    return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler}
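
configure_optimizers here is the PyTorch Lightning hook. Returned in this bare form, Lightning steps the scheduler once per epoch, while get_linear_schedule_with_warmup counts optimizer steps; the scheduler-dict form with "interval": "step" reconciles the two. A sketch under that assumption:

def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(),
                                  lr=self.learning_rate,
                                  betas=(0.9, 0.999),
                                  eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                self.n_warmup_steps,
                                                self.n_training_steps)
    # "interval": "step" makes Lightning call scheduler.step() after every
    # optimizer step instead of once per epoch.
    return {'optimizer': optimizer,
            'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}}
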
Example No. 3
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    # Check whether a GPU is available.
    config.device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    """학습 실행"""
    # BERTPretrain을 생성합니다.
    model = bert.BERTPretrain(config)
    # 기존에 학습된 pretrain 값이 있다면 이를 로드 합니다.
    if os.path.isfile(args.save):
        best_epoch, best_loss = model.bert.load(args.save)
        print(
            f"rank: {rank} load pretrain from: {args.save}, epoch={best_epoch}, loss={best_loss}"
        )
        best_epoch += 1
    # Run BERTPretrain on the GPU if available, otherwise on the CPU.
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    # Declare the MLM loss (criterion_lm) and NSP loss (criterion_cls) functions.
    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader = data.build_pretrain_loader(vocab,
                                              args,
                                              epoch=best_epoch,
                                              shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # Declare the optimizer.
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.learning_rate,
                            eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    offset = best_epoch
    losses = []
    for step in trange(args.epoch, desc="Epoch"):
        epoch = step + offset
        # Build a fresh train_loader for each epoch.
        # Skip step 0 because the loader was already created above.
        if 0 < step:
            del train_loader
            train_loader = data.build_pretrain_loader(vocab,
                                                      args,
                                                      epoch=epoch,
                                                      shuffle=True)

        # Train for one epoch.
        loss = train_epoch(config, rank, epoch, model, criterion_lm,
                           criterion_cls, optimizer, scheduler, train_loader)
        losses.append(loss)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.bert.save(best_epoch, best_loss, args.save)
            else:
                model.bert.save(best_epoch, best_loss, args.save)
            print(
                f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}"
            )

    print(f">>>> rank: {rank} losses: {losses}")
    if 1 < args.n_gpu:
        destroy_process_group()
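
The no_decay grouping above appears in nearly every example here: biases and LayerNorm weights are excluded from weight decay, a convention AdamW fine-tuning recipes inherited from BERT. A small helper capturing the pattern for any torch.nn.Module (the helper name is illustrative, not from the source):

def grouped_parameters(model, weight_decay, no_decay=("bias", "LayerNorm.weight")):
    # Partition parameters by name into a decayed and an undecayed group.
    decay, no_dec = [], []
    for name, param in model.named_parameters():
        (no_dec if any(nd in name for nd in no_decay) else decay).append(param)
    return [{"params": decay, "weight_decay": weight_decay},
            {"params": no_dec, "weight_decay": 0.0}]
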
Example No. 4
def train_model(rank, world_size, args):
    """ 모델 학습 """
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    config.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu"
    print(config)

    best_epoch, best_loss = 0, 0
    model = ALBERTPretrain(config)  # renamed from train_model, which shadowed this function's name
    if os.path.isfile(args.pretrain_save):
        try:
            best_epoch, best_loss = model.albert.load(args.pretrain_save)
            print(
                f"load pretrain from: {os.path.basename(args.pretrain_save)}, epoch={best_epoch}, loss={best_loss:.4f}"
            )
        except Exception as e:
            print(f'load {os.path.basename(args.pretrain_save)} failed: {e}')

    if 1 < args.n_gpu:
        model.to(config.device)
        # noinspection PyArgumentList
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    if master and args.wandb:
        wandb.watch(model)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader: DataLoader = data.build_pretrain_loader(vocab,
                                                          args,
                                                          shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=t_total)

    start_epoch = best_epoch + 1
    losses = []
    with trange(args.epoch, desc="Epoch", position=0) as pbar:
        pbar.set_postfix_str(
            f"best epoch: {best_epoch}, loss: {best_loss:.4f}")
        for step in pbar:
            epoch = step + start_epoch

            loss = train_epoch(config, rank, model, criterion_lm,
                               criterion_cls, optimizer, scheduler,
                               train_loader)
            losses.append(loss)
            if master and args.wandb:
                wandb.log({"loss": loss})

            if master:
                best_epoch, best_loss = epoch, loss
                if isinstance(model, DistributedDataParallel):
                    model.module.albert.save(best_epoch, best_loss,
                                             args.pretrain_save)
                else:
                    model.albert.save(best_epoch, best_loss,
                                      args.pretrain_save)

                pbar.set_postfix_str(
                    f"best epoch: {best_epoch}, loss: {best_loss:.4f}")

    if 1 < args.n_gpu:
        destroy_process_group()
Example No. 5
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    model = albert.ALBERTPretrain(config)
    if os.path.isfile(args.save):
        model.albert.load(args.save)
        print(f"rank: {rank} load pretrain from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_pretrain_loader(vocab,
                                                             args,
                                                             shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion_lm,
                           criterion_cls, optimizer, scheduler, train_loader)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.albert.save(best_epoch, best_loss, args.save)
            else:
                model.albert.save(best_epoch, best_loss, args.save)
            print(
                f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}"
            )

    if 1 < args.n_gpu:
        destroy_process_group()
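
These trainers all size the schedule as t_total = len(train_loader) * args.epoch, one scheduler step per optimizer step. get_linear_schedule_with_warmup then scales the base learning rate by a factor that ramps linearly from 0 to 1 across the warmup steps and decays linearly back to 0 at t_total; a self-contained sketch of that multiplier:

def linear_warmup_factor(step, warmup_steps, total_steps):
    # Learning-rate multiplier at a given optimizer step: linear ramp
    # 0 -> 1 during warmup, then linear decay 1 -> 0 until total_steps.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))
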
Example No. 6
def main(args):
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logger.info("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except OSError:
            logger.info("Failed to create the output directory")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # use bert to aug train_examples
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)

    if args.double_ori == 0:
        num_train_optimization_steps = int(
            len(ori_train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
    else:
        num_train_optimization_steps = int(
            len(ori_train_examples) * 2 / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    config_class, tokenizer_class = (RobertaConfig, RobertaTokenizer)

    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.use_saved == 1:
        bert_saved_dir = args.ckpt
        if args.co_training:
            model_class = RobertaForNSP_co
            model = model_class.from_pretrained(bert_saved_dir, args=args)

        elif args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(bert_saved_dir)
            tokenizer = tokenizer_class.from_pretrained(bert_saved_dir)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(bert_saved_dir, args=args)

    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=task_name,
            cache_dir=args.cache_dir if args.cache_dir else None)
        if args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None,
                args=args)

    model.cuda()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    n_params_m = sum(np.prod(v.size())
                     for _, v in model.named_parameters()) / 1e6
    logger.info("model parameters: %.2fM", n_params_m)

    if args.do_first_eval:
        args.do_train = False
        res_file = os.path.join(args.output_dir, "first_test.tsv")
        eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
            do_evaluate(args, processor, label_list, tokenizer, model, 0, output_mode, num_labels, task_name,
                    eval_examples, type="dev")

        eval_res.update(res_parts)
        for key in sorted(eval_res.keys()):
            logger.info("first evaluation:  %s = %s", key, str(eval_res[key]))

        idx, preds = do_test(args, label_list, task_name, processor, tokenizer,
                             output_mode, model)

        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
        dataframe.to_csv(res_file, index=False, sep='\t')
        logger.info("  Num test length = %d", idx)
        logger.info("  Done ")

        # write mm test results
        if task_name == "mnli":
            res_file = os.path.join(args.output_dir, "first_test_mm.tsv")

            idx, preds = do_test(args,
                                 label_list,
                                 task_name,
                                 processor,
                                 tokenizer,
                                 output_mode,
                                 model,
                                 do_mm=True)

            dataframe = pd.DataFrame({
                'index': range(idx),
                'prediction': preds
            })
            dataframe.to_csv(res_file, index=False, sep='\t')
            logger.info("  Num test length = %d", idx)
            logger.info("  Done write mm")

    if args.do_train:
        # Prepare optimizer
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_rate *
                                 num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info("***** Running training *****")
        logger.info("  Num original examples = %d", len(ori_train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            if args.only_bert:
                train_features = convert_examples_to_features(
                    ori_train_examples,
                    label_list,
                    args.max_seq_length,
                    tokenizer,
                    num_show=args.num_show,
                    output_mode=output_mode,
                    args=args,
                    pad_token=tokenizer.convert_tokens_to_ids(
                        [tokenizer.pad_token])[0],
                    do_roberta=1)
            else:
                logger.info("epoch=%d,  aug_ratio = %f,  aug_seed=%d", epoch,
                            aug_ratio, aug_seed)
                train_examples = Aug_each_ckpt(
                    ori_train_examples,
                    label_list,
                    model,
                    tokenizer,
                    args=args,
                    num_show=args.num_show,
                    output_mode=output_mode,
                    seed=aug_seed,
                    aug_ratio=aug_ratio,
                    use_bert=False,
                    do_roberta=1,
                    ssa_roberta=1,
                    pad_token=tokenizer.convert_tokens_to_ids(
                        [tokenizer.pad_token])[0])
                if aug_ratio + args.aug_ratio_each < 1.0:
                    aug_ratio += args.aug_ratio_each
                aug_seed += 1

                train_features = convert_examples_to_features(
                    train_examples,
                    label_list,
                    args.max_seq_length,
                    tokenizer,
                    num_show=args.num_show,
                    output_mode=output_mode,
                    args=args,
                    pad_token=tokenizer.convert_tokens_to_ids(
                        [tokenizer.pad_token])[0],
                    do_roberta=1)

            logger.info("Done convert features")
            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float)

            token_real_label = torch.tensor(
                [f.token_real_label for f in train_features], dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids,
                                       token_real_label)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            logger.info("begin training")
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                if args.only_bert:
                    outputs = model(input_ids, input_mask)
                    seq_logits = outputs[0]
                else:
                    seq_logits, aug_logits, aug_loss = model(
                        input_ids,
                        input_mask,
                        labels=None,
                        token_real_label=token_real_label)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels),
                                        label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1),
                                        label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                w = args.aug_loss_weight
                if args.only_bert:
                    loss = seq_loss
                else:
                    loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                total_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                tr_seq_loss += seq_loss.mean().item()
                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                if args.only_bert == 0:
                    aug_logits = aug_logits.detach().cpu().numpy()
                    tmp_train_aug_accuracy, tmp_tokens = accuracy(
                        aug_logits, token_real_label, type="aug")
                    train_aug_accuracy += tmp_train_aug_accuracy
                    nb_tr_tokens += tmp_tokens
                    tr_aug_loss += aug_loss.mean().item()

                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps
                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)

                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0
                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " lr={:<9.7f}".format(scheduler.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    #log_string += " tr_seq_acc={:<9.7f}".format(seq_avg)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(
                        float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += "  " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps

            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank()
                                 == 0) and epoch % 1 == 0:
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
                    do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, eval_examples, type="dev")

                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"

                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model itself
                    output_model_dir = os.path.join(args.output_dir,
                                                    "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    model_to_save.save_pretrained(output_model_dir)
                    tokenizer.save_pretrained(output_model_dir)
                    output_model_file = os.path.join(output_model_dir,
                                                     'pytorch_model.bin')
                    torch.save(model_to_save.state_dict(), output_model_file)

                    result = {
                        'eval_total_loss': eval_loss,
                        'eval_seq_loss': eval_seq_loss,
                        'eval_aug_loss': eval_aug_loss,
                        'eval_aug_accuracy': eval_aug_accuracy,
                        'global_step': global_step,
                        'train_loss': train_loss,
                        'best_epoch': epoch,
                        'train_batch_size': args.train_batch_size,
                        'args': args
                    }

                    result.update(eval_res)
                    result.update(res_parts)

                    output_eval_file = os.path.join(
                        args.output_dir,
                        dev_test + "_results_" + str(tmp_acc) + ".txt")
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Test results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    # write test results
                    if args.do_test:
                        res_file = os.path.join(
                            args.output_dir, "test_" + str(tmp_acc) + ".tsv")

                        idx, preds = do_test(args, label_list, task_name,
                                             processor, tokenizer, output_mode,
                                             model)

                        dataframe = pd.DataFrame({
                            'index': range(idx),
                            'prediction': preds
                        })
                        dataframe.to_csv(res_file, index=False, sep='\t')
                        logger.info("  Num test length = %d", idx)
                        logger.info("  Done ")

                        # write mm test results
                        if task_name == "mnli":
                            res_file = os.path.join(
                                args.output_dir, "mm_roberta_results_b_" +
                                str(tmp_acc) + ".tsv")

                            idx, preds = do_test(args,
                                                 label_list,
                                                 task_name,
                                                 processor,
                                                 tokenizer,
                                                 output_mode,
                                                 model,
                                                 do_mm=True)

                            dataframe = pd.DataFrame({
                                'index': range(idx),
                                'prediction': preds
                            })
                            dataframe.to_csv(res_file, index=False, sep='\t')
                            logger.info("  Num test length = %d", idx)
                            logger.info("  Done write mm")

                else:
                    logger.info("  tmp_val_acc = %f", tmp_acc)
Example No. 7
def train_model(rank, world_size, args):
    """ 모델 학습 """
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project, resume=args.name, tags=args.tags)

    if 1 < args.n_gpu:
        init_process_group(rank, world_size)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model: MovieClassification = transformer.MovieClassification(config)
    if args.resume and os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank}, last epoch: {best_epoch} load state dict from: {os.path.basename(args.save)}")
    model.to(config.device)

    if master and args.wandb:
        wandb.watch(model)

    if 1 < args.n_gpu:
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_train.json")), args, shuffle=True)
    test_loader, test_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_test.json")), args, shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optimization.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optimization.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, last_epoch=best_epoch)

    print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')
    with tqdm(initial=best_epoch + 1, total=args.epoch, position=0) as pbar:
        for epoch in range(best_epoch + 1, args.epoch + 1):
            if train_sampler:
                train_sampler.set_epoch(epoch)

            train_loss = train_epoch(args, config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
            test_loss, test_accuracy = eval_epoch(config, rank, model, test_loader, test_sampler)
            if master and args.wandb:
                wandb.config.update(args)
                wandb.log({"train loss": train_loss, "accuracy": test_accuracy}, step=epoch)

            if master:
                if best_score < test_accuracy:
                    best_epoch, best_loss, best_score = epoch, train_loss, test_accuracy
                    pbar.set_description(f'Best (score={best_score:.3f}, epoch={best_epoch})')
                    if isinstance(model, DistributedDataParallel):
                        model.module.save(best_epoch, best_loss, best_score, args.save)
                    else:
                        model.save(best_epoch, best_loss, best_score, args.save)
                else:
                    if best_epoch + 5 < epoch:  # early stop
                        break

            pbar.update()
        print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')

    if master and args.wandb:
        wandb.save(args.name)
    if 1 < args.n_gpu:
        destroy_process_group()
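
One caveat in this example: get_linear_schedule_with_warmup(..., last_epoch=best_epoch) only works when every optimizer param group already carries an 'initial_lr' entry; PyTorch's LambdaLR raises a KeyError otherwise. Resumable trainers therefore usually persist the scheduler state instead, as Example No. 8 below does. A sketch of that approach, with assumed file names:

# At checkpoint time:
torch.save(optimizer.state_dict(), "optimizer.pt")
torch.save(scheduler.state_dict(), "scheduler.pt")

# At resume time, after rebuilding the optimizer and scheduler:
optimizer.load_state_dict(torch.load("optimizer.pt"))
scheduler.load_state_dict(torch.load("scheduler.pt"))
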
Example No. 8
def train(config):
    # Create the ELECTRA config object
    electra_config = ElectraConfig.from_pretrained(config["train_model_path"], num_labels=config["num_labels"], cache_dir=config["cache_dir_path"])
    
    # Create the ELECTRA tokenizer object
    electra_tokenizer = ElectraTokenizer.from_pretrained(config["train_model_path"], do_lower_case=False, cache_dir=config["cache_dir_path"])

    # Create the ELECTRA model object
    electra_model = ElectraForSequenceClassification.from_pretrained(config["train_model_path"], config=electra_config, cache_dir=config["cache_dir_path"])

    # electra_model.cuda()

    # Read the training data
    train_datas = read_data(file_path=config["train_data_path"])

    # Preprocess the training data
    train_dataset = convert_data2dataset(datas=train_datas, tokenizer=electra_tokenizer, max_length=config["max_length"])

    # DataLoader object to draw the training data in batches
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config["batch_size"])

    # Read the evaluation data
    test_datas = read_data(file_path=config["test_data_path"])

    # Preprocess the evaluation data
    test_dataset = convert_data2dataset(datas=test_datas, tokenizer=electra_tokenizer, max_length=config["max_length"])

    # DataLoader object to draw the evaluation data in batches
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    # Total number of training steps (counted in optimizer updates)
    t_total = len(train_dataloader) // config["gradient_accumulation_steps"] * config["epoch"]

    # Optimizer for training the model
    optimizer = AdamW(electra_model.parameters(), lr=config["learning_rate"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config["warmup_steps"], num_training_steps=t_total)

    if os.path.isfile(os.path.join(config["model_dir_path"], "optimizer.pt")) and os.path.isfile(
            os.path.join(config["model_dir_path"], "scheduler.pt")):
        # Load the previously saved optimizer and scheduler state
        optimizer.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "scheduler.pt")))

    global_step = 0
    electra_model.zero_grad()
    max_test_accuracy = 0
    for epoch in range(config["epoch"]):
        electra_model.train()

        # Accuracy and average loss on the training data
        train_accuracy, average_loss, global_step = do_train(config=config, electra_model=electra_model,
                                                             optimizer=optimizer, scheduler=scheduler,
                                                             train_dataloader=train_dataloader,
                                                             epoch=epoch+1, global_step=global_step)

        print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4)))

        electra_model.eval()

        # Accuracy on the evaluation data
        test_accuracy = do_evaluate(electra_model=electra_model, test_dataloader=test_dataloader, mode=config["mode"])

        print("test_accuracy : {}\n".format(round(test_accuracy, 4)))

        # If the current accuracy beats the best so far, save the model
        if max_test_accuracy < test_accuracy:
            max_test_accuracy = test_accuracy

            output_dir = os.path.join(config["model_dir_path"], "checkpoint-{}".format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            print("save model in checkpoint-{}\n".format(global_step))
            
            electra_config.save_pretrained(output_dir)
            electra_tokenizer.save_pretrained(output_dir)
            electra_model.save_pretrained(output_dir)
            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
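
do_train itself is not shown, but t_total above counts optimizer updates rather than batches, which fixes the accumulation pattern it must follow: step the optimizer and scheduler once every gradient_accumulation_steps batches. A minimal sketch, assuming the DataLoader yields keyword-ready dicts (input_ids, attention_mask, labels) and a transformers version whose outputs expose .loss:

for step, batch in enumerate(train_dataloader):
    loss = electra_model(**batch).loss
    # Scale the loss so gradients average over the accumulation window.
    (loss / config["gradient_accumulation_steps"]).backward()
    if (step + 1) % config["gradient_accumulation_steps"] == 0:
        optimizer.step()
        scheduler.step()  # one scheduler step per optimizer update
        electra_model.zero_grad()
        global_step += 1
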