Example #1
 def configure_optimizers(self):
     # Prepare optimizer
     param_optimizer = list(self.model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
         {'params': [p for n, p in param_optimizer if not any(
             nd in n for nd in no_decay)], 'weight_decay': 0.01},
         {'params': [p for n, p in param_optimizer if any(
             nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters,
                       lr=self.hparams.lr, correct_bias=False)
     # warm up lr
     num_workers = (self.hparams.gpus if self.hparams.gpus is not None else 1) * (self.hparams.num_nodes if self.hparams.num_nodes is not None else 1)
     data_len = len(self.train_dataloader().dataset)
     logging.info(f'number of workers {num_workers}, data length {data_len}')
     num_train_steps = int(data_len / (self.hparams.batch_size * num_workers) * self.hparams.max_epochs)
     logging.info(f'num_train_steps : {num_train_steps}')
     num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
     logging.info(f'num_warmup_steps : {num_warmup_steps}')
     scheduler = get_cosine_schedule_with_warmup(
         optimizer,
         num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
     lr_scheduler = {'scheduler': scheduler, 
                     'monitor': 'loss', 'interval': 'step',
                     'frequency': 1}
     return [optimizer], [lr_scheduler]
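Note: a minimal, self-contained sketch of what this scheduler actually produces. The tiny nn.Linear model and the step counts below are placeholders, not taken from the example; it only assumes torch and transformers are installed. The learning rate rises linearly for num_warmup_steps and then follows a cosine decay toward zero over num_training_steps.

import torch
from transformers import get_cosine_schedule_with_warmup

model = torch.nn.Linear(4, 2)                      # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_training_steps = 100                           # placeholder totals
num_warmup_steps = 10
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps)

for step in range(num_training_steps):
    optimizer.step()                               # a real loop would call loss.backward() first
    scheduler.step()                               # advance the schedule once per optimizer step
    if step % 10 == 0:
        print(step, scheduler.get_last_lr()[0])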
Example #2
def get_cosine_schedule_with_warmup(optimizer, epochs, batch_size, n_samples):
    warmup_proportion = 0.3
    n_steps = int(np.ceil(n_samples / batch_size))
    num_training_steps = n_steps * epochs
    num_warmup_steps = int(warmup_proportion * num_training_steps)
    sch = optimization.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    return sch
Example #3
    def set_train(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [
                    param for name, param in self.model.named_parameters()
                    if not any(nd in name for nd in no_decay)
                ],
                'weight_decay':
                0.01
            },
            {
                'params': [
                    param for name, param in self.model.named_parameters()
                    if any(nd in name for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]
        num_total_train = len(self.train_loader) * self.num_epochs
        warmup_step = int(num_total_train * self.warmup_ratio)

        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.learning_rate)
        self.criterion = torch.nn.CrossEntropyLoss()
        self.scheduler = get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_step,
            num_training_steps=num_total_train)
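Note: the decay/no-decay split above keys on substrings of parameter names, so it only excludes LayerNorm weights when the module attribute is actually named LayerNorm (as in Hugging Face BERT-style models). A small sanity-check sketch with a toy module (the Toy class is an assumption, not part of the example) verifying that the two groups partition all parameters:

import torch

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)   # attribute named as in HF BERT layers

model = Toy()
no_decay = ['bias', 'LayerNorm.weight']

decay_names = [n for n, _ in model.named_parameters()
               if not any(nd in n for nd in no_decay)]
no_decay_names = [n for n, _ in model.named_parameters()
                  if any(nd in n for nd in no_decay)]

print(decay_names)      # ['dense.weight']
print(no_decay_names)   # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias']
assert len(decay_names) + len(no_decay_names) == len(list(model.named_parameters()))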
Example #4
    def configure_optimizers(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.lr,
                          correct_bias=False)

        num_train_steps = len(
            self.train_dataloader()) * self.hparams.max_epochs
        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps)
        lr_scheduler = {
            'scheduler': scheduler,
            'name': 'cosine_schedule_with_warmup',
            'monitor': 'loss',
            'interval': 'step',
            'frequency': 1
        }
        return [optimizer], [lr_scheduler]
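Note: Example #4 counts steps per epoch as len(self.train_dataloader()), i.e. the number of batches, while Example #1 divides the dataset length by the effective batch size. For a single process without gradient accumulation the two agree; the sketch below checks that with a placeholder TensorDataset (all sizes are made up):

import math
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(1000, 8))      # placeholder data
loader = DataLoader(dataset, batch_size=32)
max_epochs = 3

steps_from_loader = len(loader) * max_epochs                     # batches per epoch * epochs
steps_from_dataset = math.ceil(len(dataset) / 32) * max_epochs   # ceil(1000 / 32) = 32

assert steps_from_loader == steps_from_dataset == 96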
Example #5
 def init_fn(optimizer, epochs, batch_size, n_samples, warmup_proportion=0.3):
     # `warmup_proportion` was read from an enclosing scope in the original snippet;
     # it is taken as a parameter here (0.3 mirrors Example #2) so the function stands alone.
     n_steps = int(np.ceil(n_samples / batch_size))
     num_training_steps = n_steps * epochs
     num_warmup_steps = int(warmup_proportion * num_training_steps)
     sch = optimization.get_cosine_schedule_with_warmup(
         optimizer, num_warmup_steps, num_training_steps)
     update_in_batch, update_in_epoch = True, False
     return sch, update_in_batch, update_in_epoch
Example #6
File: run.py  Project: kelvincjr/myRepo
def run():
    """train the model"""
    # set the logger
    utils.set_logger(config.log_dir)
    logging.info("device: {}".format(config.device))
    # process the data: split text and labels
    processor = Processor(config)
    processor.process()
    logging.info("--------Process Done!--------")
    # split out the dev set
    word_train, word_dev, label_train, label_dev = load_dev('train')
    # build dataset
    train_dataset = NERDataset(word_train, label_train, config)
    dev_dataset = NERDataset(word_dev, label_dev, config)
    logging.info("--------Dataset Build!--------")
    # get dataset size
    train_size = len(train_dataset)
    # build data_loader
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True, collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
                            shuffle=True, collate_fn=dev_dataset.collate_fn)
    logging.info("--------Get Dataloader!--------")
    # Prepare model
    device = config.device
    model = BertNER.from_pretrained(config.roberta_model, num_labels=len(config.label2id))
    model.to(device)
    # Prepare optimizer
    if config.full_fine_tuning:
        # model.named_parameters(): [bert, classifier, crf]
        bert_optimizer = list(model.bert.named_parameters())
        classifier_optimizer = list(model.classifier.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
            {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay},
            {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': 0.0},
            {'params': model.crf.parameters(), 'lr': config.learning_rate * 5}
        ]
    # only fine-tune the head classifier
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, correct_bias=False)
    train_steps_per_epoch = train_size // config.batch_size
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch,
                                                num_training_steps=config.epoch_num * train_steps_per_epoch)

    # Train the model
    logging.info("--------Start Training!--------")
    train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)
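Note: in the grouping above, entries that carry their own 'lr' key override the optimizer-level default, which is how the classifier and CRF parameters end up training at five times the BERT learning rate. A minimal sketch (toy modules and made-up rates, not from run.py) showing that per-group 'lr' values take precedence:

import torch
from torch.optim import AdamW

backbone = torch.nn.Linear(8, 8)      # stands in for the BERT encoder
head = torch.nn.Linear(8, 3)          # stands in for the classifier head
base_lr = 3e-5

optimizer = AdamW([
    {'params': backbone.parameters()},                 # falls back to the default lr
    {'params': head.parameters(), 'lr': base_lr * 5},  # per-group override
], lr=base_lr)

print([group['lr'] for group in optimizer.param_groups])   # [3e-05, 0.00015]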
Example #7
File: nsmc.py  Project: kanggo-sw/KoBART
 def configure_optimizers(self):
     # Prepare optimizer
     param_optimizer = list(self.model.named_parameters())
     no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in param_optimizer
                 if not any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0.01,
         },
         {
             "params": [
                 p for n, p in param_optimizer
                 if any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0.0,
         },
     ]
     optimizer = AdamW(optimizer_grouped_parameters,
                       lr=self.hparams.lr,
                       correct_bias=False)
     # warm up lr
     num_workers = (self.hparams.gpus if self.hparams.gpus is not None else
                    1) * (self.hparams.num_nodes
                          if self.hparams.num_nodes is not None else 1)
     data_len = len(self.train_dataloader().dataset)
     logging.info(
         f"number of workers {num_workers}, data length {data_len}")
     num_train_steps = int(data_len /
                           (self.hparams.batch_size * num_workers *
                            self.hparams.accumulate_grad_batches) *
                           self.hparams.max_epochs)
     logging.info(f"num_train_steps : {num_train_steps}")
     num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
     logging.info(f"num_warmup_steps : {num_warmup_steps}")
     scheduler = get_cosine_schedule_with_warmup(
         optimizer,
         num_warmup_steps=num_warmup_steps,
         num_training_steps=num_train_steps,
     )
     lr_scheduler = {
         "scheduler": scheduler,
         "monitor": "loss",
         "interval": "step",
         "frequency": 1,
     }
     return [optimizer], [lr_scheduler]
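Note: unlike Example #1, this version also divides by accumulate_grad_batches, because the scheduler is stepped once per optimizer step rather than once per batch. A hedged arithmetic sketch with made-up numbers (none of them come from the KoBART config):

data_len = 150_000              # corpus size
batch_size = 32
num_workers = 2                 # gpus * num_nodes in the code above
accumulate_grad_batches = 4
max_epochs = 3
warmup_ratio = 0.1

effective_batch = batch_size * num_workers * accumulate_grad_batches   # 256
num_train_steps = int(data_len / effective_batch * max_epochs)          # 1757
num_warmup_steps = int(num_train_steps * warmup_ratio)                  # 175

print(num_train_steps, num_warmup_steps)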
Example #8
File: train.py  Project: FFTYYY/Poem
def train(model , train_data , test_data):
	train_iter = DataSetIter(train_data , batch_size = C.batch_size)
	test_iter  = DataSetIter(test_data  , batch_size = C.batch_size)

	loss_func = nn.CrossEntropyLoss(ignore_index = 0)
	optim = tc.optim.Adam(params = model.parameters() , lr = C.lr , weight_decay = C.weight_decay)	
	scheduler = get_cosine_schedule_with_warmup(
		optim , 
		num_warmup_steps = C.warmup ,
		num_training_steps = train_iter.num_batches * C.epoch_number , 
	)

	best_test_loss 	= -1
	best_test_epoch = -1
	best_step 		= -1
	try:
		for epoch_n in range(C.epoch_number):
			tra_loss = run(model , train_iter , loss_func , epoch_n , optim , scheduler , True)
			tes_loss = run(model , test_iter , loss_func , epoch_n , None , None , False)

			logger.log ("Epoch %d ended. Train loss = %.4f , Valid loss = %.4f" % (
				epoch_n , tra_loss , tes_loss ,
			))
			fitlog.add_metric(
				tes_loss , 
				step = train_iter.num_batches * (epoch_n + 1) , 
				epoch = epoch_n , 
				name = "valid loss"
			)

			if best_test_epoch < 0 or tes_loss < best_test_loss:
				best_test_loss = tes_loss
				best_test_epoch = epoch_n
				best_step = fitlog_loss_step["train loss"]

				fitlog.add_best_metric(best_test_loss , name = "loss")
				with open(C.model_save , "wb") as fil:  # temporarily save the best model so far
					pickle.dump(model , fil)
				fitlog.add_hyper(name = "best_step" , value =  "%d / %d" % (
					best_step ,
					train_iter.num_batches * C.epoch_number , 
				))

	except KeyboardInterrupt:  # manual early stop
		pass

	logger.log ("Train end.")
	logger.log ("Got best valid loss %.4f in epoch %d" % (best_test_loss , best_test_epoch))

	return model
Example #9
 def __init__(
     self,
     optimizer: Optimizer,
     num_warmup_steps: int,
     num_training_steps: int,
     num_cycles: float = 0.5,
     last_epoch: int = -1,
 ) -> None:
     lr_scheduler = get_cosine_schedule_with_warmup(
         optimizer=optimizer,
         num_warmup_steps=num_warmup_steps,
         num_training_steps=num_training_steps,
         num_cycles=num_cycles,
         last_epoch=last_epoch,
     )
     super().__init__(lr_scheduler)
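Note: num_cycles=0.5 (the default) means half a cosine period after warmup, so the rate decays from the peak to zero and never rises again, while num_cycles=1.0 dips to zero mid-training and climbs back up. A quick sketch (the toy SGD optimizer and step counts are assumptions) that prints both traces so the shapes are easy to compare:

import torch
from transformers import get_cosine_schedule_with_warmup

def lr_trace(num_cycles, steps=40, warmup=4):
    model = torch.nn.Linear(2, 2)
    opt = torch.optim.SGD(model.parameters(), lr=1.0)   # lr=1.0 so the schedule factors print directly
    sch = get_cosine_schedule_with_warmup(opt, warmup, steps, num_cycles=num_cycles)
    trace = []
    for _ in range(steps):
        opt.step()
        sch.step()
        trace.append(round(sch.get_last_lr()[0], 3))
    return trace

print(lr_trace(0.5))   # decays monotonically toward 0 after warmup
print(lr_trace(1.0))   # reaches 0 halfway, then rises back toward 1.0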
Example #10
    def _create_lr_scheduler(self) -> Dict:
        """Returns one of three default schedulers

        Possibilities: constant/linear/cosine schedule with or without warmup
        """
        steps_per_epoch = math.ceil(
            len(self._train_instances) / self._trainer_config.batch_size
        )
        try:
            training_steps = min(
                self._trainer_config.max_steps,
                self._trainer_config.max_epochs * steps_per_epoch,
            )
        # One or both of the max_* is None:
        except TypeError:
            training_steps = (
                self._trainer_config.max_steps
                # 1000 is the default of the lightning trainer
                or (self._trainer_config.max_epochs or 1000) * steps_per_epoch
            )

        if self._trainer_config.lr_decay == "linear":
            scheduler = get_linear_schedule_with_warmup(
                optimizer=self._pipeline.model.optimizer,
                num_warmup_steps=self._trainer_config.warmup_steps,
                num_training_steps=training_steps,
            )
        elif self._trainer_config.lr_decay == "cosine":
            scheduler = get_cosine_schedule_with_warmup(
                optimizer=self._pipeline.model.optimizer,
                num_warmup_steps=self._trainer_config.warmup_steps,
                num_training_steps=training_steps,
            )
        else:
            scheduler = get_constant_schedule_with_warmup(
                optimizer=self._pipeline.model.optimizer,
                num_warmup_steps=self._trainer_config.warmup_steps,
            )

        return {
            "scheduler": scheduler,
            "interval": "step",
            "name": "learning_rate",
        }
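Note: the try/except works because min() raises TypeError as soon as either limit is None, and the fallback expression then relies on `or` short-circuiting to pick whichever limit is set. A tiny sketch of that pattern with placeholder numbers (the function name and defaults are illustrative only):

def training_steps(max_steps, max_epochs, steps_per_epoch=100):
    try:
        return min(max_steps, max_epochs * steps_per_epoch)
    except TypeError:                # one or both limits are None
        return max_steps or (max_epochs or 1000) * steps_per_epoch

print(training_steps(500, 10))       # 500    (capped by max_steps)
print(training_steps(None, 10))      # 1000   (10 epochs * 100 steps per epoch)
print(training_steps(None, None))    # 100000 (falls back to lightning's default 1000 epochs)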
Example #11
    def train(
        self,
        train_dataset,
        output_dir,
        show_running_loss=True,
        eval_data=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        model = self.model
        args = self.args
        device = self.device

        tb_writer = SummaryWriter(logdir=args.tensorboard_dir)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            sampler=train_sampler,
            batch_size=args.train_batch_size,
            num_workers=self.args.dataloader_num_workers,
        )

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = (
                args.max_steps
                // (len(train_dataloader) // args.gradient_accumulation_steps)
                + 1
            )
        else:
            t_total = (
                len(train_dataloader)
                // args.gradient_accumulation_steps
                * args.num_train_epochs
            )

        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = []
        custom_parameter_names = set()
        for group in self.args.custom_parameter_groups:
            params = group.pop("params")
            custom_parameter_names.update(params)
            param_group = {**group}
            param_group["params"] = [
                p for n, p in model.named_parameters() if n in params
            ]
            optimizer_grouped_parameters.append(param_group)

        for group in self.args.custom_layer_parameters:
            layer_number = group.pop("layer")
            layer = f"layer.{layer_number}."
            group_d = {**group}
            group_nd = {**group}
            group_nd["weight_decay"] = 0.0
            params_d = []
            params_nd = []
            for n, p in model.named_parameters():
                if n not in custom_parameter_names and layer in n:
                    if any(nd in n for nd in no_decay):
                        params_nd.append(p)
                    else:
                        params_d.append(p)
                    custom_parameter_names.add(n)
            group_d["params"] = params_d
            group_nd["params"] = params_nd

            optimizer_grouped_parameters.append(group_d)
            optimizer_grouped_parameters.append(group_nd)

        if not self.args.train_custom_parameters_only:
            optimizer_grouped_parameters.extend(
                [
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names
                            and not any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": args.weight_decay,
                    },
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names
                            and any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": 0.0,
                    },
                ]
            )

        warmup_steps = math.ceil(t_total * args.warmup_ratio)
        args.warmup_steps = (
            warmup_steps if args.warmup_steps == 0 else args.warmup_steps
        )

        if args.optimizer == "AdamW":
            optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                eps=args.adam_epsilon,
            )
        elif args.optimizer == "Adafactor":
            optimizer = Adafactor(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                eps=args.adafactor_eps,
                clip_threshold=args.adafactor_clip_threshold,
                decay_rate=args.adafactor_decay_rate,
                beta1=args.adafactor_beta1,
                weight_decay=args.weight_decay,
                scale_parameter=args.adafactor_scale_parameter,
                relative_step=args.adafactor_relative_step,
                warmup_init=args.adafactor_warmup_init,
            )
            print("Using Adafactor for T5")
        else:
            raise ValueError(
                "{} is not a valid optimizer class. Please use one of ('AdamW', 'Adafactor') instead.".format(
                    args.optimizer
                )
            )

        if args.scheduler == "constant_schedule":
            scheduler = get_constant_schedule(optimizer)

        elif args.scheduler == "constant_schedule_with_warmup":
            scheduler = get_constant_schedule_with_warmup(
                optimizer, num_warmup_steps=args.warmup_steps
            )

        elif args.scheduler == "linear_schedule_with_warmup":
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
            )

        elif args.scheduler == "cosine_schedule_with_warmup":
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup":
            scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "polynomial_decay_schedule_with_warmup":
            scheduler = get_polynomial_decay_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                lr_end=args.polynomial_decay_schedule_lr_end,
                power=args.polynomial_decay_schedule_power,
            )

        else:
            raise ValueError("{} is not a valid scheduler.".format(args.scheduler))

        if (
            args.model_name
            and os.path.isfile(os.path.join(args.model_name, "optimizer.pt"))
            and os.path.isfile(os.path.join(args.model_name, "scheduler.pt"))
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(args.model_name, "optimizer.pt"))
            )
            scheduler.load_state_dict(
                torch.load(os.path.join(args.model_name, "scheduler.pt"))
            )

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        logger.info(" Training started")

        global_step = 0
        training_progress_scores = None
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(
            int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0
        )
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0
        steps_trained_in_current_epoch = 0
        epochs_trained = 0

        if args.model_name and os.path.exists(args.model_name):
            try:
                # set global_step to gobal_step of last saved checkpoint from model path
                checkpoint_suffix = args.model_name.split("/")[-1].split("-")
                if len(checkpoint_suffix) > 2:
                    checkpoint_suffix = checkpoint_suffix[1]
                else:
                    checkpoint_suffix = checkpoint_suffix[-1]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (
                    len(train_dataloader) // args.gradient_accumulation_steps
                )
                steps_trained_in_current_epoch = global_step % (
                    len(train_dataloader) // args.gradient_accumulation_steps
                )

                logger.info(
                    "   Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("   Continuing training from epoch %d", epochs_trained)
                logger.info("   Continuing training from global step %d", global_step)
                logger.info(
                    "   Will skip the first %d steps in the current epoch",
                    steps_trained_in_current_epoch,
                )
            except ValueError:
                logger.info("   Starting fine-tuning.")

        if args.evaluate_during_training:
            training_progress_scores = self._create_training_progress_scores(**kwargs)

        if args.wandb_project:
            wandb.init(
                project=args.wandb_project,
                config={**asdict(args)},
                **args.wandb_kwargs,
            )
            wandb.run._label(repo="simpletransformers")
            wandb.watch(self.model)

        if args.fp16:
            from torch.cuda import amp

            scaler = amp.GradScaler()

        for current_epoch in train_iterator:
            model.train()
            if epochs_trained > 0:
                epochs_trained -= 1
                continue
            train_iterator.set_description(
                f"Epoch {epoch_number + 1} of {args.num_train_epochs}"
            )
            batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
            for step, batch in enumerate(batch_iterator):
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                inputs = self._get_inputs_dict(batch)
                if args.fp16:
                    with amp.autocast():
                        outputs = model(**inputs)
                        # model outputs are always tuple in pytorch-transformers (see doc)
                        loss = outputs[0]
                else:
                    outputs = model(**inputs)
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = outputs[0]

                if args.n_gpu > 1:
                    loss = (
                        loss.mean()
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    batch_iterator.set_description(
                        f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}"
                    )

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scaler.unscale_(optimizer)
                    if args.optimizer == "AdamW":
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(), args.max_grad_norm
                        )

                    if args.fp16:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        tb_writer.add_scalar(
                            "lr", scheduler.get_last_lr()[0], global_step
                        )
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / args.logging_steps,
                            global_step,
                        )
                        logging_loss = tr_loss
                        if args.wandb_project or self.is_sweeping:
                            wandb.log(
                                {
                                    "Training loss": current_loss,
                                    "lr": scheduler.get_last_lr()[0],
                                    "global_step": global_step,
                                }
                            )

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step)
                        )

                        self.save_model(
                            output_dir_current, optimizer, scheduler, model=model
                        )

                    if args.evaluate_during_training and (
                        args.evaluate_during_training_steps > 0
                        and global_step % args.evaluate_during_training_steps == 0
                    ):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = self.eval_model(
                            eval_data,
                            verbose=verbose and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent,
                            **kwargs,
                        )
                        for key, value in results.items():
                            try:
                                tb_writer.add_scalar(
                                    "eval_{}".format(key), value, global_step
                                )
                            except (NotImplementedError, AssertionError):
                                pass

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step)
                        )

                        if args.save_eval_checkpoints:
                            self.save_model(
                                output_dir_current,
                                optimizer,
                                scheduler,
                                model=model,
                                results=results,
                            )

                        training_progress_scores["global_step"].append(global_step)
                        training_progress_scores["train_loss"].append(current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(
                                args.output_dir, "training_progress_scores.csv"
                            ),
                            index=False,
                        )

                        if args.wandb_project or self.is_sweeping:
                            wandb.log(self._get_last_metrics(training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[args.early_stopping_metric]
                            self.save_model(
                                args.best_model_dir,
                                optimizer,
                                scheduler,
                                model=model,
                                results=results,
                            )
                        if best_eval_metric and args.early_stopping_metric_minimize:
                            if (
                                results[args.early_stopping_metric] - best_eval_metric
                                < args.early_stopping_delta
                            ):
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(
                                    args.best_model_dir,
                                    optimizer,
                                    scheduler,
                                    model=model,
                                    results=results,
                                )
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if (
                                        early_stopping_counter
                                        < args.early_stopping_patience
                                    ):
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )
                        else:
                            if (
                                results[args.early_stopping_metric] - best_eval_metric
                                > args.early_stopping_delta
                            ):
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(
                                    args.best_model_dir,
                                    optimizer,
                                    scheduler,
                                    model=model,
                                    results=results,
                                )
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if (
                                        early_stopping_counter
                                        < args.early_stopping_patience
                                    ):
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )
                        model.train()

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)
            )

            if args.save_model_every_epoch or args.evaluate_during_training:
                os.makedirs(output_dir_current, exist_ok=True)

            if args.save_model_every_epoch:
                self.save_model(output_dir_current, optimizer, scheduler, model=model)

            if args.evaluate_during_training and args.evaluate_each_epoch:
                results = self.eval_model(
                    eval_data,
                    verbose=verbose and args.evaluate_during_training_verbose,
                    silent=args.evaluate_during_training_silent,
                    **kwargs,
                )

                if args.save_eval_checkpoints:
                    self.save_model(
                        output_dir_current, optimizer, scheduler, results=results
                    )

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(
                    os.path.join(args.output_dir, "training_progress_scores.csv"),
                    index=False,
                )

                if args.wandb_project or self.is_sweeping:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args.early_stopping_metric]
                    self.save_model(
                        args.best_model_dir,
                        optimizer,
                        scheduler,
                        model=model,
                        results=results,
                    )
                if best_eval_metric and args.early_stopping_metric_minimize:
                    if (
                        results[args.early_stopping_metric] - best_eval_metric
                        < args.early_stopping_delta
                    ):
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(
                            args.best_model_dir,
                            optimizer,
                            scheduler,
                            model=model,
                            results=results,
                        )
                        early_stopping_counter = 0
                    else:
                        if (
                            args.use_early_stopping
                            and args.early_stopping_consider_epochs
                        ):
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )
                else:
                    if (
                        results[args.early_stopping_metric] - best_eval_metric
                        > args.early_stopping_delta
                    ):
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(
                            args.best_model_dir,
                            optimizer,
                            scheduler,
                            model=model,
                            results=results,
                        )
                        early_stopping_counter = 0
                    else:
                        if (
                            args.use_early_stopping
                            and args.early_stopping_consider_epochs
                        ):
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )

        return (
            global_step,
            tr_loss / global_step
            if not self.args.evaluate_during_training
            else training_progress_scores,
        )
Example #12
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':
    0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(len(train_dataloader) * 0.1),
    num_training_steps=len(train_dataloader))


def F1_scores(preds, golds, eps=1e-9):
    tp = (preds * golds).sum()
    tn = ((1 - golds) * (1 - preds)).sum()
    fp = ((1 - golds) * preds).sum()
    fn = (golds * (1 - preds)).sum()

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)

    return 2 * (precision * recall) / (precision + recall + eps)

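Note: a quick, hand-checkable use of F1_scores above, assuming the function is in scope and that preds and golds are binary float tensors of the same shape (the values below are made up):

import torch

preds = torch.tensor([1., 1., 0., 0., 1.])
golds = torch.tensor([1., 0., 0., 1., 1.])

# tp = 2, fp = 1, fn = 1  ->  precision = recall = 2/3  ->  F1 = 2/3
print(F1_scores(preds, golds))   # tensor(0.6667), up to the eps smoothing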
Example #13
    def train(self):

        wandb.init(entity='samjkwong', project='gmt')

        train_loader, val_loader, test_loader = self.load_dataloader()

        # Load Model & Optimizer
        self.model = self.load_model()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.args.lr, weight_decay = self.args.weight_decay)

        self.cls_criterion = torch.nn.BCEWithLogitsLoss()
        self.reg_criterion = torch.nn.MSELoss()

        if self.args.lr_schedule:
            self.scheduler = get_cosine_schedule_with_warmup(self.optimizer, self.args.patience * len(train_loader), self.args.num_epochs * len(train_loader))

        logger, t_start = self.set_log()

        for epoch in trange(0, (self.args.num_epochs), desc = '[Epoch]', position = 1):

            self.model.train()
            total_loss = 0

            for _, data in enumerate(tqdm(train_loader, desc="[Iteration]")):

                if data.x.shape[0] == 1 or data.batch[-1] == 0:
                    continue  # skip degenerate batches (the original `pass` here was a no-op)

                self.optimizer.zero_grad()
                data = data.to(self.args.device)
                out = self.model(data)

                is_labeled = data.y == data.y

                if "classification" in self.args.task_type: 
                    loss = self.cls_criterion(out.to(torch.float32)[is_labeled], data.y.to(torch.float32)[is_labeled])
                else:
                    loss = self.reg_criterion(out.to(torch.float32)[is_labeled], data.y.to(torch.float32)[is_labeled])

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.grad_norm)
                total_loss += loss.item() * num_graphs(data)
                self.optimizer.step()

                if self.args.lr_schedule:
                    self.scheduler.step()

            total_loss = total_loss / len(train_loader.dataset)

            train_perf, valid_perf, test_perf = self.eval(train_loader), self.eval(val_loader), self.eval(test_loader)
            self.organize_log(logger, train_perf, valid_perf, test_perf, total_loss, epoch)

            # WANDB logging
            wandb.log({
                'Epoch': epoch,
                'Train Loss': total_loss,
                'Train ROC-AUC': train_perf,
                'Val ROC-AUC': valid_perf,
                'Test ROC-AUC': test_perf
            })

        t_end = time.perf_counter()

        if 'classification' in self.dataset.task_type:
            best_val_epoch = np.argmax(np.array(self.valid_curve))
            best_train = max(self.train_curve)
        else:
            best_val_epoch = np.argmin(np.array(self.valid_curve))
            best_train = min(self.train_curve)

        best_val = self.valid_curve[best_val_epoch]
        test_score = self.test_curve[best_val_epoch]

        logger.log("Train: {} Valid: {} Test: {} with Time: {}".format(best_train, best_val, test_score, (t_end - t_start)))

        result_file = "./results/{}/{}-results.txt".format(self.log_folder_name, self.exp_name)
        with open(result_file, 'a+') as f:
            f.write("{}: {} {} {} {}\n".format(self.args.seed, best_train, self.train_curve[best_val_epoch], best_val, test_score))

        torch.save({
            'model_state_dict': self.model.state_dict(),
            'Val': best_val,
            'Train': self.train_curve[best_val_epoch],
            'Test': test_score,
            'BestTrain': best_train
            }, './checkpoints/{}/best-model_{}.pth'.format(self.log_folder_name, self.args.seed))
    def train(self,
              training_data,
              student_model,
              student_optimizer,
              student_scheduler=None,
              tensorboard=None,
              num_epochs=20,
              log_interval=1e2,
              checkpoint_interval=1e5,
              iterations=0):
        # Parameters
        eta = 0.95  #scaling constant from MLE to RL loss
        learning_rate = 6e-4

        # Variables
        total_loss = 0.

        current_epoch, model, optimizer, scheduler = self.from_checkpoint_if_exists(
            student_model, student_optimizer, student_scheduler)
        if model is not None:
            student_model = model
        if optimizer is not None:
            student_optimizer = optimizer
        if scheduler is not None:
            student_scheduler = scheduler
        student_model.train()

        for epoch in range(current_epoch, num_epochs):
            total_mle_loss = 0.0
            num_chars_total = 0.0
            num_chars_correct = 0.0
            all_rewards = []

            optimizer = AdamW(student_model.parameters(), lr=learning_rate)

            if self.use_mle or self.use_rl:
                scheduler = get_cosine_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=4e4,
                    num_training_steps=len(training_data))

            for batch_idx, batch in enumerate(
                    tqdm(training_data, mininterval=2, leave=False)):
                batch_qs, batch_as = map(lambda x: x.to(self.device), batch)
                student_optimizer.zero_grad()

                if not self.use_mle:
                    policy_losses, batch_rewards = self.policy_batch_loss(
                        batch_qs, batch_as, student_model, gamma=0.9)
                if not self.use_rl:
                    mle_loss, num_correct, num_chars = self.mle_batch_loss(
                        batch_qs, batch_as, student_model.action_transformer)

                if self.use_mle:
                    loss = mle_loss
                elif self.use_rl:
                    loss = policy_losses
                else:
                    #TODO: why is there an /2 in the next line?
                    eta_linear_decay = eta - eta * (
                        iterations /
                        (float(len(training_data) * num_epochs) / 2))
                    loss = (1 - eta_linear_decay
                            ) * policy_losses + eta_linear_decay * mle_loss
                    iterations += batch_qs.shape[0]
                total_loss += loss
                loss.backward()
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(student_model.parameters(), 0.1)
                student_optimizer.step()
                if scheduler and self.use_mle:
                    scheduler.step()

                if not self.use_rl:
                    num_chars_total += num_chars
                    num_chars_correct += num_correct
                    total_mle_loss += mle_loss
                if not self.use_mle:
                    all_rewards.append(batch_rewards.cpu().numpy())

                if tensorboard is not None and batch_idx % log_interval == 0:
                    if self.use_mle:
                        self.tb_mle_batch(tensorboard, total_mle_loss,
                                          num_chars_total, num_chars_correct,
                                          epoch, batch_idx, len(training_data))
                    # TODO: Fix missing value_losses
                    #elif self.use_rl:
                    #    self.tb_policy_batch(tensorboard, batch_rewards, value_losses, epoch, batch_idx, len(training_data))
                    else:
                        self.tb_mle_policy_batch(tensorboard, total_mle_loss,
                                                 num_chars_total,
                                                 num_chars_correct,
                                                 batch_rewards, epoch,
                                                 batch_idx, len(training_data))
    def train(
        self, train_dataloader, output_dir, show_running_loss=True, eval_dataloader=None, verbose=True, **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args.tensorboard_dir)

        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = []
        custom_parameter_names = set()
        for group in self.args.custom_parameter_groups:
            params = group.pop("params")
            custom_parameter_names.update(params)
            param_group = {**group}
            param_group["params"] = [p for n, p in model.named_parameters() if n in params]
            optimizer_grouped_parameters.append(param_group)

        for group in self.args.custom_layer_parameters:
            layer_number = group.pop("layer")
            layer = f"layer.{layer_number}."
            group_d = {**group}
            group_nd = {**group}
            group_nd["weight_decay"] = 0.0
            params_d = []
            params_nd = []
            for n, p in model.named_parameters():
                if n not in custom_parameter_names and layer in n:
                    if any(nd in n for nd in no_decay):
                        params_nd.append(p)
                    else:
                        params_d.append(p)
                    custom_parameter_names.add(n)
            group_d["params"] = params_d
            group_nd["params"] = params_nd

            optimizer_grouped_parameters.append(group_d)
            optimizer_grouped_parameters.append(group_nd)

        if not self.args.train_custom_parameters_only:
            optimizer_grouped_parameters.extend(
                [
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names and not any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": args.weight_decay,
                    },
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names and any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": 0.0,
                    },
                ]
            )

        warmup_steps = math.ceil(t_total * args.warmup_ratio)
        args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps

        if args.optimizer == "AdamW":
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        elif args.optimizer == "Adafactor":
            optimizer = Adafactor(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                eps=args.adafactor_eps,
                clip_threshold=args.adafactor_clip_threshold,
                decay_rate=args.adafactor_decay_rate,
                beta1=args.adafactor_beta1,
                weight_decay=args.weight_decay,
                scale_parameter=args.adafactor_scale_parameter,
                relative_step=args.adafactor_relative_step,
                warmup_init=args.adafactor_warmup_init,
            )
            print("Using Adafactor for T5")
        else:
            raise ValueError(
                "{} is not a valid optimizer class. Please use one of ('AdamW', 'Adafactor') instead.".format(
                    args.optimizer
                )
            )

        if args.scheduler == "constant_schedule":
            scheduler = get_constant_schedule(optimizer)

        elif args.scheduler == "constant_schedule_with_warmup":
            scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps)

        elif args.scheduler == "linear_schedule_with_warmup":
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
            )

        elif args.scheduler == "cosine_schedule_with_warmup":
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup":
            scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "polynomial_decay_schedule_with_warmup":
            scheduler = get_polynomial_decay_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                lr_end=args.polynomial_decay_schedule_lr_end,
                power=args.polynomial_decay_schedule_power,
            )

        else:
            raise ValueError("{} is not a valid scheduler.".format(args.scheduler))

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        training_progress_scores = None
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0

        if args.evaluate_during_training:
            training_progress_scores = self._create_training_progress_scores(**kwargs)

        if args.wandb_project:
            wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
            wandb.watch(self.model)

        if args.fp16:
            from torch.cuda import amp

            scaler = amp.GradScaler()

        for _ in train_iterator:
            model.train()
            train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}")
            batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
            for step, batch in enumerate(batch_iterator):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, labels, mc_labels, token_type_ids = batch

                if args.fp16:
                    with amp.autocast():
                        outputs = model(
                            input_ids,
                            token_type_ids=token_type_ids,
                            mc_token_ids=mc_token_ids,
                            mc_labels=mc_labels,
                            labels=labels,
                        )

                        lm_loss, mc_loss = outputs[:2]
                        # model outputs are always tuple in pytorch-transformers (see doc)
                        loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef
                else:
                    outputs = model(
                        input_ids,
                        token_type_ids=token_type_ids,
                        mc_token_ids=mc_token_ids,
                        mc_labels=mc_labels,
                        labels=labels,
                    )

                    lm_loss, mc_loss = outputs[:2]
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % current_loss, end="")

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scaler.unscale_(optimizer)
                    if args.optimizer == "AdamW":
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                    if args.fp16:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                        logging_loss = tr_loss
                        if args.wandb_project or self.is_sweeping:
                            wandb.log(
                                {
                                    "Training loss": current_loss,
                                    "lr": scheduler.get_last_lr()[0],
                                    "global_step": global_step,
                                }
                            )

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        self.save_model(output_dir_current, model=model)

                    if args.evaluate_during_training and (
                        args.evaluate_during_training_steps > 0
                        and global_step % args.evaluate_during_training_steps == 0
                    ):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(
                            eval_dataloader,
                            verbose=verbose and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                        output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        if args.save_eval_checkpoints:
                            self.save_model(output_dir_current, model=model, results=results)

                        training_progress_scores["global_step"].append(global_step)
                        training_progress_scores["train_loss"].append(current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args.output_dir, "training_progress_scores.csv"), index=False,
                        )

                        if args.wandb_project or self.is_sweeping:
                            wandb.log(self._get_last_metrics(training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[args.early_stopping_metric]
                            self.save_model(args.best_model_dir, model=model, results=results)
                        if best_eval_metric and args.early_stopping_metric_minimize:
                            if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(args.best_model_dir, model=model, results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(f" No improvement in {args.early_stopping_metric}")
                                            logger.info(f" Current step: {early_stopping_counter}")
                                            logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                                    else:
                                        if verbose:
                                            logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )
                        else:
                            if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(args.best_model_dir, model=model, results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(f" No improvement in {args.early_stopping_metric}")
                                            logger.info(f" Current step: {early_stopping_counter}")
                                            logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                                    else:
                                        if verbose:
                                            logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )

            epoch_number += 1
            output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args.save_model_every_epoch or args.evaluate_during_training:
                os.makedirs(output_dir_current, exist_ok=True)

            if args.save_model_every_epoch:
                self.save_model(output_dir_current, model=model)

            if args.evaluate_during_training and args.evaluate_each_epoch:
                results, _, _ = self.eval_model(
                    eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=True, **kwargs,
                )

                self.save_model(output_dir_current, results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False)

                if args.wandb_project or self.is_sweeping:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args.early_stopping_metric]
                    self.save_model(args.best_model_dir, model=model, results=results)
                if best_eval_metric and args.early_stopping_metric_minimize:
                    if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir, model=model, results=results)
                        early_stopping_counter = 0
                else:
                    if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir, model=model, results=results)
                        early_stopping_counter = 0
                model.train()

        return (
            global_step,
            tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores,
        )
Example #16
0
def bert_train(opt):
    device = torch.device('cuda:{}'.format(opt.device))

    bertmodel, vocab = get_pytorch_kobert_model()

    dataset_train = nlp.data.TSVDataset('{}'.format(opt.source),
                                        field_indices=[1, 2],
                                        num_discard_samples=1)
    # dataset_test = nlp.data.TSVDataset('/content/tst.txt', field_indices=[1,2], num_discard_samples=1)

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 256  # tokens beyond this length are truncated, so BERT never sees them
    batch_size = opt.batch
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5

    data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)

    # Use the standard PyTorch DataLoader
    train_dataloader = torch.utils.data.DataLoader(data_train,
                                                   batch_size=batch_size,
                                                   num_workers=5)
    # test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

    model = BERTClassifier(bertmodel, dr_rate=0.2).to(device)
    # model = torch.load('weights/last_kobert_pytorch_model_big2s.pt')
    # if torch.cuda.device_count() > 1:
    # model = nn.DataParallel(model)
    model = nn.DataParallel(model, device_ids=[0, 1])  # wrap for multi-GPU training on GPUs 0 and 1
    # model.to(device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
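    # Bias and LayerNorm parameters are excluded from weight decay; all other parameters use 0.01.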

    # Declare the optimizer
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    # Softmax-based loss function; also works for binary classification
    loss_fn = nn.CrossEntropyLoss()
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)
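    # With warmup_ratio = 0.1, the learning rate rises linearly from 0 to learning_rate over the
    # first 10% of all batches (t_total counts every batch across all epochs) and then follows a
    # half-cosine decay back toward 0, so scheduler.step() must be called once per batch, as below.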

    # lr_scheduler = optims.lr_scheduler.CosineAnnealingLR(optimizer,T_max=0.1,eta_min=0.0001)

    # Compute accuracy (the training metric): the fraction of predictions that match the targets
    def calc_accuracy(X, Y):
        max_vals, max_indices = torch.max(X, 1)
        train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
        return train_acc

    # Start model training
    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        best_acc = 0.0
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids,
                       label) in enumerate(tqdm_notebook(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_grad_norm)  # gradient clipping
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            # lr_scheduler.step()
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(
                    e + 1, batch_id + 1,
                    loss.data.cpu().numpy(), train_acc / (batch_id + 1)))
            if train_acc > best_acc:
                best_acc = train_acc
                torch.save(model, '{}.pt'.format(opt.save_weights_name))
        print("epoch {} train acc {}".format(e + 1,
                                             train_acc / (batch_id + 1)))

    torch.save(model, '{}.pt'.format(opt.save_weights_name))
Example #17
0
    def train(self):

        self.overall_results = {
            'val_loss': [],
            'val_acc': [],
            'test_loss': [],
            'test_acc': [],
            'durations': []
        }

        train_fold_iter = tqdm(range(1, 11), desc='Training')
        val_fold_iter = [i for i in range(1, 11)]

        for fold_number in train_fold_iter:

            val_fold_number = val_fold_iter[fold_number - 2]

            train_loader, val_loader, test_loader = self.load_dataloader(
                fold_number, val_fold_number)

            # Load Model & Optimizer
            self.model = self.load_model()
            self.optimizer = torch.optim.Adam(
                self.model.parameters(),
                lr=self.args.lr,
                weight_decay=self.args.weight_decay)

            if self.args.lr_schedule:
                self.scheduler = get_cosine_schedule_with_warmup(
                    self.optimizer, self.args.patience * len(train_loader),
                    self.args.num_epochs * len(train_loader))
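                # Warmup lasts `patience` epochs' worth of batches, and the cosine decay is spread
                # over all num_epochs * len(train_loader) batches; the scheduler is stepped once
                # per batch in the loop below.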

            logger, t_start = self.set_log(fold_number)

            # K-Fold Training
            for epoch in trange(0, (self.args.num_epochs),
                                desc='[Epoch]',
                                position=1):

                self.model.train()
                total_loss = 0

                for _, data in enumerate(train_loader):

                    self.optimizer.zero_grad()
                    data = data.to(self.args.device)
                    out = self.model(data)
                    loss = F.nll_loss(out, data.y)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.args.grad_norm)
                    total_loss += loss.item() * num_graphs(data)
                    self.optimizer.step()

                    if self.args.lr_schedule:
                        self.scheduler.step()

                total_loss = total_loss / len(train_loader.dataset)

                # Validation
                val_acc, val_loss = self.eval(val_loader)
                self.organize_val_log(logger, total_loss, val_loss, val_acc,
                                      fold_number, epoch)

                train_fold_iter.set_description(
                    '[Fold %d] TrL: %.2f VaL: %.2f VaAcc: %.2f%%' %
                    (fold_number, total_loss, val_loss, val_acc))
                train_fold_iter.refresh()

                if self.patience > self.args.patience:
                    break

            t_end = time.perf_counter()

            checkpoint = torch.load(
                './checkpoints/{}/experiment-{}_fold-{}_seed-{}_best-model.pth'
                .format(self.log_folder_name, self.exp_name, fold_number,
                        self.args.seed))
            self.model.load_state_dict(checkpoint)

            test_acc, test_loss = self.eval(test_loader)
            self.organize_test_log(logger, test_loss, test_acc, t_start, t_end,
                                   fold_number)

        final_result_file = "./results/{}/{}-total_results.txt".format(
            self.log_folder_name, self.exp_name)
        with open(final_result_file, 'a+') as f:
            f.write("{}: {} {} {} {}\n".format(
                self.args.seed,
                np.array(self.overall_results['val_acc']).mean(),
                np.array(self.overall_results['val_acc']).std(),
                np.array(self.overall_results['test_acc']).mean(),
                np.array(self.overall_results['test_acc']).std()))
Example #18
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(args.epochs) + \
            '-' + args.criterion + '-' + args.scheduler + '-' + args.optimizer + '-' + args.dataset + '-' + args.tokenize

    if args.name:
        s_dir += '-' + args.name
    save_dir = increment_path(os.path.join(model_dir, s_dir))

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print("This notebook use [%s]." % (device))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameter
    bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                          config=bert_config)
    model.dropout = nn.Dropout(p=args.drop)
    model.to(device)

    summary(model)

    # loss & optimizer
    if args.criterion == 'f1' or args.criterion == 'label_smoothing' or args.criterion == 'f1cross':
        criterion = create_criterion(args.criterion,
                                     classes=args.num_labels,
                                     smoothing=0.1)
    else:
        criterion = create_criterion(args.criterion)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    if args.optimizer == 'AdamP':
        optimizer = AdamP(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=args.lr,
                          betas=(0.9, 0.999),
                          weight_decay=args.weight_decay)
    else:
        opt_module = getattr(import_module("torch.optim"),
                             args.optimizer)  # default: SGD
        optimizer = opt_module(
            optimizer_grouped_parameters,
            lr=args.lr,
        )

    # logging
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, 'config.json'), 'w',
              encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    set_neptune(save_dir, args)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # train, val split
    kfold = StratifiedKFold(n_splits=5)

    for train_idx, val_idx in kfold.split(dataset, labels):
        train_dataset, val_dataset = dataset.loc[train_idx], dataset.loc[
            val_idx]
        break

    tok_module = getattr(import_module("load_data"), args.tokenize)

    train_tokenized = tok_module(train_dataset,
                                 tokenizer,
                                 max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(
        train_tokenized, train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        num_workers=4,
        shuffle=True,
        pin_memory=use_cuda,
    )

    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=12,
        num_workers=1,
        shuffle=False,
        pin_memory=use_cuda,
    )

    if args.scheduler == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6)
    elif args.scheduler == 'reduce':
        scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
    elif args.scheduler == 'step':
        scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)
    elif args.scheduler == 'cosine_warmup':
        t_total = len(train_loader) * args.epochs
        warmup_step = int(t_total * args.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_step,
            num_training_steps=t_total)
    else:
        scheduler = None
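    # Note: whichever scheduler is chosen is stepped once per batch in the loop below. That is
    # correct for the warmup schedules, but ReduceLROnPlateau expects scheduler.step(metric)
    # once per epoch, so the 'reduce' option would need its own code path to work as intended.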

    print("Training Start!!!")

    best_val_acc = 0
    best_val_loss = np.inf

    for epoch in range(args.epochs):
        # train loop
        model.train()

        train_loss, train_acc = AverageMeter(), AverageMeter()

        for idx, train_batch in enumerate(train_loader):
            optimizer.zero_grad()

            try:
                inputs, token_types, attention_mask, labels = train_batch.values(
                )
                inputs = inputs.to(device)
                token_types = token_types.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outs = model(input_ids=inputs,
                             token_type_ids=token_types,
                             attention_mask=attention_mask)
            except (ValueError, TypeError):  # batch without token_type_ids
                inputs, attention_mask, labels = train_batch.values()
                inputs = inputs.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outs = model(input_ids=inputs, attention_mask=attention_mask)

            preds = torch.argmax(outs.logits, dim=-1)
            loss = criterion(outs.logits, labels)
            acc = (preds == labels).sum().item() / len(labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7)
            optimizer.step()

            if scheduler:
                scheduler.step()

            neptune.log_metric('learning_rate', get_lr(optimizer))

            train_loss.update(loss.item(), len(labels))
            train_acc.update(acc, len(labels))

            if (idx + 1) % args.log_interval == 0:
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss.avg,
                                  epoch * len(train_loader) + idx)
                logger.add_scalar("Train/accuracy", train_acc.avg,
                                  epoch * len(train_loader) + idx)

        neptune.log_metric('Train_loss', train_loss.avg)
        neptune.log_metric('Train_avg', train_acc.avg)
        neptune.log_metric('learning_rate', get_lr(optimizer))

        val_loss, val_acc = AverageMeter(), AverageMeter()
        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()

            for val_batch in val_loader:
                try:
                    inputs, token_types, attention_mask, labels = val_batch.values(
                    )
                    inputs = inputs.to(device)
                    token_types = token_types.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    outs = model(input_ids=inputs,
                                 token_type_ids=token_types,
                                 attention_mask=attention_mask)
                except (ValueError, TypeError):  # batch without token_type_ids
                    inputs, attention_mask, labels = val_batch.values()
                    inputs = inputs.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    outs = model(input_ids=inputs,
                                 attention_mask=attention_mask)

                preds = torch.argmax(outs.logits, dim=-1)
                loss = criterion(outs.logits, labels)
                acc = (preds == labels).sum().item() / len(labels)

                val_loss.update(loss.item(), len(labels))
                val_acc.update(acc, len(labels))

            if val_acc.avg > best_val_acc:
                print(
                    f"New best model for val acc : {val_acc.avg:4.2%}! saving the best model.."
                )
                torch.save(model.state_dict(), f"{save_dir}/best.pth")
                best_val_acc = val_acc.avg
                best_val_loss = min(best_val_loss, val_loss.avg)

            print(
                f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || "
                f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}"
            )
            logger.add_scalar("Val/loss", val_loss.avg, epoch)
            logger.add_scalar("Val/accuracy", val_acc.avg, epoch)
            neptune.log_metric('Val_loss', val_loss.avg)
            neptune.log_metric('Val_avg', val_acc.avg)

            print()
Example #19
0
def run_KoBERT():
    torch.multiprocessing.freeze_support()

    device = None
    # Use GPU if available
    if torch.cuda.is_available():
        print("Using GPU...")
        device = torch.device("cuda")
    # Otherwise fall back to CPU
    else:
        print("Using CPU...")
        device = torch.device("cpu")

    bertmodel, vocab = get_pytorch_kobert_model()

    # Load the train and test text datasets
    dataset_train = nlp.data.TSVDataset("txt/alertMsg_train_top10.txt", field_indices=[1, 2], num_discard_samples=1)
    dataset_test = nlp.data.TSVDataset("txt/alertMsg_test_top10.txt", field_indices=[1, 2], num_discard_samples=1)

    # Use the default BERT tokenizer
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    # Set hyperparameters
    max_len = 64        # maximum number of tokens per sentence
    batch_size = 64     # batch size
    warmup_ratio = 0.1
    num_epochs = 2      # number of training epochs
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5

    data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

    model = BERTClassifier(bertmodel, dr_rate=0.1).to(device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
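    # The first 10% of all optimizer steps (t_total = batches per epoch * num_epochs) warm the
    # learning rate up linearly; the remaining steps follow a cosine decay down to 0.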

    # Run torch.nn.DataParallel when multiple GPUs are available
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        print("Using DataParallel...")
        model = DataParallel(model)

    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        model.train()
        # torch.cuda.empty_cache()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(e + 1, batch_id + 1, loss.data.cpu().numpy(),
                                                                         train_acc / (batch_id + 1)))
        print("epoch {} train acc {}".format(e + 1, train_acc / (batch_id + 1)))

        # torch.save(model, "model_covid-classification.pt")
        # Unwrap DataParallel (when used) so the saved state_dict keys carry no "module." prefix
        state_dict = model.module.state_dict() if isinstance(model, DataParallel) else model.state_dict()
        torch.save(state_dict, "model/kobert_model-classification_state-dict8.pt")
        # torch.cuda.empty_cache()
        model.eval()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e + 1, test_acc / (batch_id + 1)))
Example #20
0
  def train(self, training_data, model, optimizer, scheduler=None, tb=None, epochs=20, log_interval=100, checkpoint_interval=10000):
    
    curr_epoch, model, optimizer, scheduler = self.from_checkpoint_if_exists(model, optimizer, scheduler)
    model.train()
    # ignore_index = model.action_transformer.trg_pad_idx
    eta = 0.95
    iterations = 0

    for epoch in range(curr_epoch, epochs):
      total_mle_loss = 0.0
      n_char_total = 0.0
      n_char_correct = 0.0
      all_rewards = []
      all_value_losses = []

      optimizer = AdamW(model.parameters(), lr=6e-4)

      if self.use_mle or self.use_rl:
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=40000, num_training_steps=len(training_data))
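        # Note: the optimizer and this scheduler are re-created at the start of every epoch, so
        # the 40000-step warmup restarts each epoch and num_training_steps covers only one
        # epoch's batches (len(training_data)), not the whole run.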
      
      for batch_idx, batch in enumerate(tqdm(training_data, mininterval=2, leave=False)):
        batch_qs, batch_as = map(lambda x: x.to(self.device), batch)
        optimizer.zero_grad()

        if not self.use_mle:
          # policy_losses, value_losses, batch_rewards = self.policy_batch_loss(batch_qs, batch_as, model, gamma=0.9)
          policy_losses, batch_rewards = self.policy_batch_loss(batch_qs, batch_as, model, gamma=0.9)

        if not self.use_rl:
          mle_loss, n_correct, n_char = self.mle_batch_loss(batch_qs, batch_as, model.action_transformer)

        if self.use_mle:
          loss = mle_loss
        elif self.use_rl:
          # loss = policy_losses + value_losses
          loss = policy_losses
        else:
          # eta linear decay
          eta_ld = eta - eta * (iterations / (float(len(training_data) * epochs) / 2))
          # loss = (1-eta_ld)* (policy_losses + value_losses) + eta_ld*mle_loss
          loss = (1 - eta_ld) * policy_losses + eta_ld * mle_loss
          iterations += batch_qs.shape[0]

        loss.backward()
        # clipping gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        if scheduler and self.use_mle:
            scheduler.step()
        
        if not self.use_rl:
          n_char_total += n_char
          n_char_correct += n_correct
          total_mle_loss += mle_loss
        if not self.use_mle:
          all_rewards.append(batch_rewards.cpu().numpy())
          # all_value_losses.append(value_losses)

        if tb is not None and batch_idx % log_interval == 0:
          if self.use_mle:
            self.tb_mle_batch(tb, total_mle_loss, n_char_total, n_char_correct, epoch, batch_idx, len(training_data))
          elif self.use_rl:
            self.tb_policy_batch(tb, batch_rewards, value_losses, epoch, batch_idx, len(training_data))
          else:
            # self.tb_mle_policy_batch(tb, total_mle_loss, n_char_total, n_char_correct, batch_rewards, value_losses, epoch, batch_idx, len(training_data))
            self.tb_mle_policy_batch(tb, total_mle_loss, n_char_total, n_char_correct, batch_rewards, epoch, batch_idx, len(training_data))
      if batch_idx != 0 and epoch % checkpoint_interval == 0:
        self.save_checkpoint(epoch, model, optimizer, scheduler, suffix=str(epoch) + "-ml_rle")
      
      print("average rewards " + str(all_rewards))  
      loss_per_char = total_mle_loss / n_char_total
      accuracy = n_char_correct / n_char_total

      if self.use_rl:
        average_rewards = np.mean(all_rewards)
        # average_value_loss = np.mean(all_value_losses)
      
      if tb is not None:
        if self.use_mle:
          self.tb_mle_epoch(tb, loss_per_char, accuracy, epoch)
        elif self.use_rl:
          self.tb_policy_epoch(tb, average_rewards, average_value_loss, epoch)
        else:
          # self.tb_mle_policy_epoch(tb, loss_per_char, accuracy, average_rewards, average_value_loss, epoch)
          self.tb_mle_policy_epoch(tb, loss_per_char, accuracy, average_rewards, epoch)
Example #21
0
def train(_type, config, load='tmp_vocab.pt'):
    dev_id = 0
    device = torch.device(dev_id)
    config['g2t']['device'] = device
    config['t2g']['device'] = device
    pool, vocab = prep_data(config['main'], load=load)
    model_g2t, model_t2g = prep_model(config, vocab)
    model_g2t.to(device)
    model_t2g.to(device)

    optimizerG2T = torch.optim.Adam(model_g2t.parameters(), lr=config['g2t']['lr'], weight_decay=config['g2t']['weight_decay'])
    schedulerG2T = get_cosine_schedule_with_warmup(
        optimizer=optimizerG2T,
        num_warmup_steps=400,
        num_training_steps=800 * config['main']['epoch'],
    )
    optimizerT2G = torch.optim.Adam(model_t2g.parameters(), lr=config['t2g']['lr'], weight_decay=config['t2g']['weight_decay'])
    schedulerT2G = get_cosine_schedule_with_warmup(
        optimizer=optimizerT2G,
        num_warmup_steps=400,
        num_training_steps=800 * config['main']['epoch'],
    )
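    # Both schedules warm up over the first 400 batches and then decay along a cosine curve;
    # the hard-coded 800 * epoch total presumably assumes roughly 800 batches per epoch.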
    loss_t2g, loss_g2t = [], []
    best_g2t, best_t2g = 0., 0.
    
    t2g_weight = [vocab['relation'].wf.get(x, 0) for x in vocab['relation'].i2s]
    t2g_weight[0] = 0
    max_w = max(t2g_weight)
    t2g_weight = np.array(t2g_weight).astype('float32')
    t2g_weight = (max_w + 1000) / (t2g_weight + 1000)

    for i in range(0, config['main']['epoch']):
        _data_g2t = list(pool.draw_with_type(config['main']['batch_size'], True, _type + '_g2t'))
        _data_t2g = list(pool.draw_with_type(config['main']['batch_size'], True, _type + '_t2g'))

        data_list = list(zip(_data_g2t, _data_t2g))
        _data = data_list
        with tqdm.tqdm(_data, disable=not config['main']['display']) as tqb:
            for j, (batch_g2t, batch_t2g) in enumerate(tqb):
                if i < config['main']['pre_epoch'] and config['main']['mode'] == 'warm_unsup':
                    _loss1, _loss2 = warmup_step1(
                        batch_g2t,
                        batch_t2g,
                        model_g2t,
                        model_t2g,
                        optimizerG2T,
                        optimizerT2G,
                        config,
                        t2g_weight,
                        vocab
                    )
                if i == config['main']['pre_epoch'] + 1 and config['main']['mode'] == 'warm_unsup':
                    _loss1, _loss2 = warmup_step2(
                        batch_g2t, 
                        batch_t2g, 
                        model_g2t, 
                        model_t2g,
                        optimizerG2T, 
                        optimizerT2G, 
                        config, 
                        t2g_weight, 
                        vocab
                    )
                if config['main']['mode'] == 'sup':
                    _loss1, _loss2 = supervise(
                        batch_g2t, 
                        batch_t2g, 
                        model_g2t,
                        model_t2g,
                        optimizerG2T,
                        optimizerT2G, 
                        config, 
                        t2g_weight, 
                        vocab
                    )
                if (i >= config['main']['pre_epoch'] + 1 and config['main']['mode'] == 'warm_unsup') or (config['main']['mode'] == 'cold_unsup'):
                    _loss1, _loss2 = back_translation(
                        batch_g2t, 
                        batch_t2g, 
                        model_g2t, 
                        model_t2g, 
                        optimizerG2T, 
                        optimizerT2G, 
                        config, 
                        t2g_weight, 
                        vocab
                    )
                loss_t2g.append(_loss1)
                schedulerT2G.step()
                loss_g2t.append(_loss2)
                schedulerG2T.step()
                tqb.set_postfix({'t2g loss': np.mean(loss_t2g), 'g2t loss': np.mean(loss_g2t)})

        logging.info('Epoch '+str(i))
        if i % 5 == 0:
            if i < config['main']['pre_epoch'] and config['main']['mode'] == 'warm_unsup':
                model_g2t.blind, model_t2g.blind = True, True
            else:
                model_g2t.blind, model_t2g.blind = False, False
            if model_t2g.blind:
                e = eval_t2g(pool, 'dev_t2g_blind', vocab, model_t2g, config['t2g'], display=config['main']['display'])
            else:
                e = eval_t2g(pool, 'dev', vocab, model_t2g, config['t2g'], display=config['main']['display'])
            if e > best_t2g:
                best_t2g = max(best_t2g, e)
                torch.save(model_t2g.state_dict(), config['t2g']['save'] + 'X' + 'best')
            e = eval_g2t(pool, 'dev', vocab, model_g2t, config['g2t'], display=config['main']['display'])
            if e > best_g2t:
                best_g2t = max(best_g2t, e)
                torch.save(model_g2t.state_dict(), config['g2t']['save']+'X'+'best')
            if i == config['main']['pre_epoch']:
                torch.save(model_t2g.state_dict(), config['t2g']['save']+'X'+'mid')
                torch.save(model_g2t.state_dict(), config['g2t']['save']+'X'+'mid')
    model_g2t.load_state_dict(torch.load(config['g2t']['save']+'X'+'best'))
    model_t2g.load_state_dict(torch.load(config['t2g']['save']+'X'+'best'))
    logging.info('Final Test mode {0:}'.format(config['main']['mode']))
    e = eval_t2g(pool, 'test', vocab, model_t2g, config['t2g'])
    e = eval_g2t(pool, 'test', vocab, model_g2t, config['g2t'])
Example #22
0
def train():

    seed_everything(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # setting model hyperparameter
    # The config alone carries no trained weights, so load them with from_pretrained

    # bert_config = BertConfig.from_pretrained(MODEL_NAME)
    # bert_config.num_labels = 42
    # model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)

    # Auto
    model_config = XLMRobertaConfig.from_pretrained(args.model_name)
    model_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        args.model_name, config=model_config)

    # load model and tokenizer
    # MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    # roberta: https://huggingface.co/transformers/model_doc/xlmroberta.html
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # label = dataset['label'].values

    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=0.2,
                                                  random_state=args.seed)
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_val = tokenized_dataset(val_dataset, tokenizer)

    tokenized_train_label = train_dataset['label'].values
    tokenized_val_label = val_dataset['label'].values

    # train_datasets = TokenDataset(train_dataset, tokenizer)
    # val_datasets = TokenDataset(val_dataset, tokenizer)
    RE_train_dataset = RE_Dataset(tokenized_train, tokenized_train_label)
    RE_val_dataset = RE_Dataset(tokenized_val, tokenized_val_label)

    # print(model.parameters)
    model.to(device)
    model = torch.nn.DataParallel(model)

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        pin_memory=torch.cuda.is_available(),
        shuffle=True,
    )
    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
    )

    optimizer = AdamW(model.parameters(),
                      lr=args.lr,
                      weight_decay=args.weight_decay)
    loss_fn = LabelSmoothingLoss(smoothing=0.5)
    # loss_fn = nn.CrossEntropyLoss()

    # t_total = len(train_loader) * args.epoch
    t_total = args.epoch
    warmup_step = int(t_total * args.warmup_steps)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)
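    # Here the schedule is stepped once per epoch (scheduler.step() runs at the end of each
    # epoch), so num_training_steps is the epoch count; the commented-out t_total above would
    # instead require stepping the scheduler once per batch.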

    log_dir = ""
    log_list = glob("./logs/*")
    if len(log_list) == 0:
        log_dir = "./logs/exp1"
    else:
        log_list = [int(log[-1]) for log in log_list]
        log_dir = "./logs/exp" + str(max(log_list) + 1)

    logger = SummaryWriter(log_dir=log_dir)

    scaler = GradScaler()
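    # Mixed-precision training: autocast() wraps the forward pass, and GradScaler scales the loss
    # before backward; gradients are then unscaled before clipping and scaler.step(optimizer).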

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    best_acc = 0.0  # track the best validation accuracy across all epochs

    for epoch in tqdm(range(args.epoch)):
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0
        model.train()
        for batch_id, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            with autocast():
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)

            # loss.backward()
            # optimizer.step()

            scaler.scale(loss).backward()

            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            scaler.step(optimizer)
            scaler.update()

            train_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
            train_loss += loss.item()  # accumulate as a float so the computation graph is not retained

            if (batch_id + 1) % args.logging_steps == 0:
                print(
                    f"[Train] epoch {epoch + 1} | batch_id {batch_id + 1} | loss {train_loss / args.logging_steps:.4f} | train_acc {train_acc / args.logging_steps:.4f}"
                )
                logger.add_scalar("Train/loss",
                                  train_loss / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                logger.add_scalar("Train/acc", train_acc / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                train_acc = 0.0
                train_loss = 0.0

        # scheduler.step()

        print("\nStart Validation Step!")
        with torch.no_grad():
            model.eval()
            for batch_id, batch in enumerate(tqdm(val_loader)):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)
                val_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
                val_loss += loss

            print(
                f"[Val] epoch {epoch + 1} | val_acc {val_acc / (batch_id + 1):.4f}"
            )
            logger.add_scalar("Val/loss", val_loss / (batch_id + 1), epoch)
            logger.add_scalar("Val/acc", val_acc / (batch_id + 1), epoch)

            if val_acc >= best_acc:
                best_acc = val_acc
                # torch.save(model.state_dict(), os.path.join(args.output_dir, "saved_" + str(epoch) + ".pth"))
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "best.pth"))
                print("Saved best acc model...")

        scheduler.step()

    torch.save(model.state_dict(), os.path.join(args.output_dir, "last.pth"))
Example #23
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriterP(args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if p.requires_grad and not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if p.requires_grad and any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    warmup_steps = args.warmup_samples // args.train_batch_size
    if args.lr_decay:
        scheduler = tfopt.get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, t_total)
    else:
        scheduler = tfopt.get_constant_schedule(optimizer)
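    # The warmup length is given in samples and converted to steps by dividing by the batch size
    # (gradient accumulation is not accounted for); with args.lr_decay the learning rate then
    # follows a cosine decay over t_total steps, otherwise it stays constant after warmup.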

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    try:
        with open(os.path.join(args.model_name_or_path, 'step.txt'), 'r') as c:
            global_step = int(c.readline())
    except OSError:
        global_step = 0

    tr_loss, logging_loss = 0.0, 0.0
    moving_loss = MovingLoss(10000 // args.logging_steps)
    model.zero_grad()

    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    try:
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader,
                                  desc="Iteration",
                                  disable=args.local_rank not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):
                inputs, labels = mask_tokens(
                    batch, tokenizer, args) if args.mlm else (batch, batch)
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)
                loss = outputs[
                    0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                moving_loss.add(loss.item())
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training and global_step % args.eval_steps == 0:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer,
                                           f"checkpoint-{global_step}")
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)

                    if args.local_rank in [
                            -1, 0
                    ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        logging_loss = tr_loss
                        epoch_iterator.set_postfix(
                            MovingLoss=f'{moving_loss.loss:.2f}',
                            Perplexity=
                            f'{torch.exp(torch.tensor(moving_loss.loss)):.2f}')

                    if args.local_rank in [
                            -1, 0
                    ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        save_state(args, model, tokenizer, global_step)

                if args.max_steps > 0 and global_step > args.max_steps:
                    epoch_iterator.close()
                    break
            print_sample(model, tokenizer, args.device, args)
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break
    except (KeyboardInterrupt, SystemExit):
        save_state(args, model, tokenizer, global_step)
        raise

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
# Prepare optimizer and schedule (linear warmup and cosine decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Declare the optimizer
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()  # loss function for softmax outputs; also usable for binary classification

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
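# One scheduler step is taken per training batch, so the cosine schedule spans
# len(train_dataloader) * num_epochs optimizer updates.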

# Compute accuracy, the training evaluation metric: the fraction of targets predicted correctly
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc
  
# Start training the model
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
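        # The remaining training step, as a minimal sketch: it assumes the usual
        # KoBERT classifier forward(token_ids, valid_length, segment_ids) and a
        # max_grad_norm hyperparameter defined elsewhere in the script.
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # max_grad_norm: assumed hyperparameter
        optimizer.step()
        scheduler.step()
        train_acc += calc_accuracy(out, label)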
示例#25
0
def train(**kwargs):
    global id_map, label_map

    if kwargs["dryrun"]:
        os.environ["WANDB_MODE"] = "dryrun"

    wandb.init(project="APAuT", name=kwargs["name"])

    train_texts, train_labels = read_mgb(
        kwargs["train_set"],
        kwargs["train_set_start_index"],
        kwargs["train_set_length"],
        kwargs["train_set_max_words"],
    )
    test_texts, test_labels = read_mgb(kwargs["test_set"])

    label_types = np.unique(np.array(flatten(train_labels)))
    print(label_types)
    encode = lambda l: [np.where(label_types == item)[0][0] + 1 for item in l]
    encode_all = lambda l: [encode(item) for item in l]
    join_all = lambda l: [" ".join(item) for item in l]

    train_labels = encode_all(train_labels)
    test_labels = encode_all(test_labels)
    train_texts = join_all(train_texts)
    test_texts = join_all(test_texts)

    tokenizer = AutoTokenizer.from_pretrained(kwargs["model"], use_fast=True)

    train_encodings = tokenizer(
        train_texts,
        truncation=True,
        padding=True,
        pad_to_multiple_of=kwargs["pad_length"],
    )
    test_encodings = tokenizer(
        test_texts,
        truncation=True,
        padding=True,
        pad_to_multiple_of=kwargs["pad_length"],
    )

    if len(train_encodings["input_ids"][0]) != len(test_encodings["input_ids"][0]):
        raise ValueError(
            f"train length with padding is {len(train_encodings['input_ids'][0])} "
            f"while test length is {len(test_encodings['input_ids'][0])}")
    length = len(train_encodings["input_ids"][0])

    train_dataset = MGBDataset(train_encodings, train_labels, length)
    test_dataset = MGBDataset(test_encodings, test_labels, length)

    kwargs["effective_batch_size"] = (kwargs["gradient_accumulation_steps"] *
                                      kwargs["batch_size"])

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=kwargs["epochs"],
        per_device_train_batch_size=kwargs["batch_size"],
        per_device_eval_batch_size=kwargs["batch_size"],
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=100,  # 50_000 // kwargs["effective_batch_size"],
        gradient_accumulation_steps=kwargs["gradient_accumulation_steps"],
        fp16=kwargs["fp16"],
    )

    label_map = {i + 1: label for i, label in enumerate(label_types)}
    label_map[0] = "<pad>"
    id_map = {label: i + 1 for i, label in enumerate(label_types)}
    id_map["<pad>"] = 0

    config = AutoConfig.from_pretrained(
        kwargs["model"],
        num_labels=len(label_types),
        id2label=label_map,
        label2id=id_map,
    )

    model = AutoModelForTokenClassification.from_pretrained(kwargs["model"],
                                                            config=config)

    optimizer = AdamW(
        [
            {"params": model.base_model.parameters()},
            # a separate learning rate for the classifier head (e.g. 'lr': 1e-3) could be set here
            {"params": model.classifier.parameters()},
        ],
        lr=kwargs["learning_rate"],
        weight_decay=kwargs["weight_decay"],
    )

    total_steps = len(train_dataset) // kwargs["effective_batch_size"]
    total_steps = total_steps * kwargs["epochs"]
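    # total_steps approximates the number of optimizer updates:
    # floor(len(train_dataset) / (batch_size * gradient_accumulation_steps)) per epoch, times epochs.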
    schedule = get_cosine_schedule_with_warmup(optimizer,
                                               kwargs["warmup_steps"],
                                               total_steps)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, schedule),
    )
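    # Passing optimizers=(optimizer, schedule) makes Trainer use this AdamW + cosine
    # schedule instead of creating its default optimizer and linear warmup schedule.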

    wandb.config.update(kwargs)
    trainer.train()

    if kwargs["save"]:
        model_path = f'./models/{kwargs["name"]}'
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

    trainer.evaluate()
示例#26
0
 def __new__(cls, optimizer, *args, **kwargs):
     return get_cosine_schedule_with_warmup(optimizer, *args, **kwargs)
示例#27
0
def train(_type, config, load="tmp_vocab.pt"):
    dev_id = 0
    device = torch.device(dev_id) if torch.cuda.is_available() else torch.device('cpu')
    config["g2t"]["device"] = device
    config["t2g"]["device"] = device
    pool, vocab = prep_data(config["main"], load=load)
    model_g2t, model_t2g = prep_model(config, vocab)
    model_g2t.to(device)
    model_t2g.to(device)

    from transformers.optimization import (
        get_cosine_schedule_with_warmup,
        get_linear_schedule_with_warmup,
    )

    optimizerG2T = torch.optim.Adam(
        model_g2t.parameters(),
        lr=config["g2t"]["lr"],
        weight_decay=config["g2t"]["weight_decay"],
    )
    schedulerG2T = get_cosine_schedule_with_warmup(
        optimizer=optimizerG2T,
        num_warmup_steps=400,
        num_training_steps=800 * config["main"]["epoch"],
    )
    optimizerT2G = torch.optim.Adam(
        model_t2g.parameters(),
        lr=config["t2g"]["lr"],
        weight_decay=config["t2g"]["weight_decay"],
    )
    schedulerT2G = get_cosine_schedule_with_warmup(
        optimizer=optimizerT2G,
        num_warmup_steps=400,
        num_training_steps=800 * config["main"]["epoch"],
    )
    loss_t2g, loss_g2t = [], []
    best_g2t, best_t2g = 0.0, 0.0
    klds = []

    t2g_weight = [
        vocab["relation"].wf.get(x, 0) for x in vocab["relation"].i2s
    ]
    t2g_weight[0] = 0
    max_w = max(t2g_weight)
    t2g_weight = np.array(t2g_weight).astype("float32")
    t2g_weight = (max_w + 1000) / (t2g_weight + 1000)
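    # Turn relation counts into smoothed inverse-frequency weights for the t2g loss:
    # weight = (max_count + 1000) / (count + 1000), so frequent relations are down-weighted.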

    for i in range(0, config["main"]["epoch"]):
        _data_g2t = list(
            pool.draw_with_type(config["main"]["batch_size"], True,
                                _type + "_g2t"))
        _data_t2g = list(
            pool.draw_with_type(config["main"]["batch_size"], True,
                                _type + "_t2g"))

        data_list = list(zip(_data_g2t, _data_t2g))
        _data = data_list
        with tqdm.tqdm(_data, disable=not config["main"]["display"]) as tqb:
            for j, (batch_g2t, batch_t2g) in enumerate(tqb):
                if (i < config["main"]["pre_epoch"]
                        and config["main"]["mode"] == "warm_unsup"):
                    _loss1, _loss2, kld = warmup_step1(
                        batch_g2t,
                        batch_t2g,
                        model_g2t,
                        model_t2g,
                        optimizerG2T,
                        optimizerT2G,
                        config,
                        t2g_weight,
                        vocab,
                    )
                if (i == config["main"]["pre_epoch"] + 1
                        and config["main"]["mode"] == "warm_unsup"):
                    _loss1, _loss2, kld = warmup_step2(
                        batch_g2t,
                        batch_t2g,
                        model_g2t,
                        model_t2g,
                        optimizerG2T,
                        optimizerT2G,
                        config,
                        t2g_weight,
                        vocab,
                    )
                if config["main"]["mode"] == "sup":
                    _loss1, _loss2, kld = supervise(
                        batch_g2t,
                        batch_t2g,
                        model_g2t,
                        model_t2g,
                        optimizerG2T,
                        optimizerT2G,
                        config,
                        t2g_weight,
                        vocab,
                    )
                if (i >= config["main"]["pre_epoch"] + 1
                        and config["main"]["mode"] == "warm_unsup") or (
                            config["main"]["mode"] == "cold_unsup"):
                    _loss1, _loss2, kld = back_translation(
                        batch_g2t,
                        batch_t2g,
                        model_g2t,
                        model_t2g,
                        optimizerG2T,
                        optimizerT2G,
                        config,
                        t2g_weight,
                        vocab,
                    )
                loss_t2g.append(_loss1)
                schedulerT2G.step()
                loss_g2t.append(_loss2)
                schedulerG2T.step()
                klds.append(kld)
                tqb.set_postfix({
                    "t2g loss": np.mean(loss_t2g),
                    "g2t loss": np.mean(loss_g2t),
                    "kld loss": np.mean(klds),
                })

        logging.info("Epoch " + str(i))
        if i % 1 == 0:
            if (i < config["main"]["pre_epoch"]
                    and config["main"]["mode"] == "warm_unsup"):
                model_g2t.blind, model_t2g.blind = True, True
            else:
                model_g2t.blind, model_t2g.blind = False, False
            if model_t2g.blind:
                e = eval_t2g(
                    pool,
                    "dev_t2g_blind",
                    vocab,
                    model_t2g,
                    config["t2g"],
                    display=config["main"]["display"],
                )
            else:
                e = eval_t2g(
                    pool,
                    "dev",
                    vocab,
                    model_t2g,
                    config["t2g"],
                    display=config["main"]["display"],
                )
            if e > best_t2g:
                best_t2g = max(best_t2g, e)
                torch.save(model_t2g.state_dict(),
                           config["t2g"]["save"] + "X" + "best")
            e = eval_g2t(
                pool,
                "dev",
                vocab,
                model_g2t,
                config["g2t"],
                display=config["main"]["display"],
            )
            if e > best_g2t:
                best_g2t = max(best_g2t, e)
                torch.save(model_g2t.state_dict(),
                           config["g2t"]["save"] + "X" + "best")
            if i == config["main"]["pre_epoch"]:
                torch.save(model_t2g.state_dict(),
                           config["t2g"]["save"] + "X" + "mid")
                torch.save(model_g2t.state_dict(),
                           config["g2t"]["save"] + "X" + "mid")
    model_g2t.load_state_dict(torch.load(config["g2t"]["save"] + "X" + "best"))
    model_t2g.load_state_dict(torch.load(config["t2g"]["save"] + "X" + "best"))
    logging.info("Final Test mode {0:}".format(config["main"]["mode"]))
    e = eval_t2g(pool, "test", vocab, model_t2g, config["t2g"])
    e = eval_g2t(pool, "test", vocab, model_g2t, config["g2t"])