def finetune_model(args, model, loader):
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS)
        input_ids, lm_labels, token_type_ids, nodes_ids, attention_mask = batch
        if not args.graph and not args.edge_list:
            nodes_ids = None
        if not args.unilm:
            attention_mask = None
        (lm_loss), *_ = model(input_ids=input_ids, token_type_ids=token_type_ids, labels=lm_labels,
                              nodes=nodes_ids, attention_mask=attention_mask)
        loss = lm_loss / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS)
            input_ids, lm_labels, token_type_ids, nodes_ids, attention_mask = batch
            if not args.graph and not args.edge_list:
                nodes_ids = None
            if not args.unilm:
                attention_mask = None
            # If we don't send labels to the model, it doesn't return losses
            lm_logits, *_ = model(input_ids=input_ids, token_type_ids=token_type_ids,
                                  nodes=nodes_ids, attention_mask=attention_mask)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,), (lm_labels_flat_shifted,)

    trainer = Engine(update)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: evaluate at the end of each epoch (and optionally before training starts)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(loader))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0][0], x[1][0]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    evaluator.add_event_handler(
        Events.COMPLETED,
        lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    trainer.run(loader, max_epochs=args.n_epochs)
    return model
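# Note on the shifted logits/labels in inference() above: position i of the LM logits
# predicts token i + 1, so the logits drop the last position and the labels drop the
# first one before the flattened cross-entropy is computed. A minimal self-contained
# sketch of that shift on random tensors (shapes below are illustrative only):
import torch

batch_size, seq_len, vocab_size = 2, 7, 11
lm_logits = torch.randn(batch_size, seq_len, vocab_size)
lm_labels = torch.randint(0, vocab_size, (batch_size, seq_len))

lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
nll = torch.nn.functional.cross_entropy(lm_logits_flat_shifted, lm_labels_flat_shifted)
print(nll.item())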
def train():
    config_file = "configs/train_daily_dialog_multihead_config.json"
    config = Config.from_json_file(config_file)
    ec_coef = 1
    sc_coef = 1

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", config.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    tokenizer_class = OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = OpenAIGPTMultiHeadModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        # input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch)
        input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, sc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(
            input_tensor.to(config.device) for input_tensor in batch)
        lm_loss, emotion_loss, sentence_loss = model(
            input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels,
            sc_labels, token_type_ids, token_emotion_ids, token_action_ids)
        loss = (lm_loss * config.lm_coef + emotion_loss * ec_coef +
                sentence_loss * sc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
            input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, \
                sc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids, ec_token_ids, sc_token_ids,
                                  token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids,
                                  token_action_ids=token_action_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[2]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, sc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1],
                  os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
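# The update() functions in these scripts share the same gradient-accumulation pattern:
# divide the loss by gradient_accumulation_steps, call backward() every iteration, and
# only step/zero the optimizer every N iterations. A minimal self-contained sketch on a
# toy model (names and sizes below are illustrative, not taken from the scripts above):
import torch

toy_model = torch.nn.Linear(4, 1)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1)
accumulation_steps, max_norm = 4, 1.0

for iteration in range(1, 9):
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(toy_model(x), y) / accumulation_steps
    loss.backward()  # gradients accumulate across iterations
    torch.nn.utils.clip_grad_norm_(toy_model.parameters(), max_norm)
    if iteration % accumulation_steps == 0:
        toy_optimizer.step()
        toy_optimizer.zero_grad()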
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # Can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels, lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # If we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
                  os.path.join(log_dir, WEIGHTS_NAME))
        tb_logger.close()
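# Sketch of the linear LR decay attached above: PiecewiseLinear drives the "lr" parameter
# group value from args.lr down to 0 over n_epochs * len(train_loader) iterations.
# Self-contained toy example (the import path assumes a recent pytorch-ignite release;
# older versions expose PiecewiseLinear under ignite.contrib.handlers):
import torch
from ignite.engine import Engine, Events
from ignite.handlers.param_scheduler import PiecewiseLinear

toy_optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=6.25e-5)
toy_trainer = Engine(lambda engine, batch: None)
lr_scheduler = PiecewiseLinear(toy_optimizer, "lr", [(0, 6.25e-5), (100, 0.0)])
toy_trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
toy_trainer.run(range(10), max_epochs=10)  # 100 iterations in total
print(toy_optimizer.param_groups[0]["lr"])  # close to 0.0 by the last iteration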
def run(tb, vb, lr, epochs, writer):
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    train_loader, train4val_loader, val_loader, num_of_images, mapping = get_dataloaders(tb, vb)
    # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb)
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images))
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained('efficientnet-b3', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics

    # class SoftCrossEntropyLoss(nn.Module):
    #     def __init__(self, weight=None):
    #         super(SoftCrossEntropyLoss, self).__init__()
    #         self.class_weights = weight
    #
    #     def forward(self, input, target):
    #         softmax = torch.exp(input) / torch.exp(input).sum(1)[:, None]
    #         onehot_labels = to_onehot(target, input.shape[1])
    #         soft_labels = torch.zeros_like(onehot_labels)
    #         soft_labels = torch.where(onehot_labels.cpu() == 1, torch.tensor([0.9]),
    #                                   torch.tensor([0.1/(input.shape[1]-1)])).to(device=device)
    #         if self.class_weights is not None:
    #             # print(soft_labels.shape, softmax.shape)
    #             loss = -torch.sum(torch.log(softmax) * soft_labels * self.class_weights * input.shape[1])
    #         else:
    #             loss = -torch.sum(torch.log(softmax) * soft_labels)
    #         return loss

    class LabelMSELoss(nn.Module):
        def __init__(self, weight=None):
            super(LabelMSELoss, self).__init__()
            self.class_weights = weight.to(device=device)

        def forward(self, input, target):
            target_onehot = to_onehot(target, num_classes=input.shape[1]).to(device=device)
            mse = (input - target_onehot) ** 2
            if self.class_weights is not None:
                weights = self.class_weights[target] * input.shape[1]
                return (mse.sum(1) * weights).sum()
            else:
                return mse.sum()

    class MixupLoss(nn.Module):
        def __init__(self, weight=None):
            super(MixupLoss, self).__init__()
            self.class_weights = weight.to(device=device)

        def forward(self, input, target):
            mse = LabelMSELoss(weight=self.class_weights)
            ce = nn.CrossEntropyLoss(weight=self.class_weights)
            return mse(input, target) + ce(input, target)

    class EntropyPrediction(metric.Metric):
        def __init__(self, threshold=0.5):
            super(EntropyPrediction, self).__init__()
            self.threshold = threshold
            self.prediction = torch.tensor([], dtype=torch.int)
            self.y = torch.tensor([], dtype=torch.int)

        def reset(self):
            # self.threshold = 0.5
            self.prediction = torch.tensor([])
            self.y = torch.tensor([])
            super(EntropyPrediction, self).reset()

        def update(self, output):
            y_pred, y = output
            softmax = torch.exp(y_pred) / torch.exp(y_pred).sum(1)[:, None]
            entropy_base = math.log(y_pred.shape[1])
            entropy = (-softmax * torch.log(softmax)).sum(1) / entropy_base
            values, inds = softmax.max(1)
            prediction = torch.where(entropy < self.threshold, inds, torch.tensor([-1]).to(device=device))
            self.prediction = torch.cat(
                (self.prediction.type(torch.LongTensor).to(device=device),
                 torch.tensor([mapping[x.item()] for x in prediction]).to(device=device)))
            self.y = torch.cat(
                (self.y.type(torch.LongTensor).to(device=device), y.to(device=device)))
            # return self.prediction, self.y

        def compute(self):
            return self.prediction, self.y

    train_metrics = {
        'accuracy': Accuracy(),
        'loss': Loss(MixupLoss(weight=weights)) + Loss(nn.CrossEntropyLoss(weight=weights)),
        'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(), Recall(), train_loader.dataset.classes),
        'cmatrix': MetricsLambda(CMatrixTable, ConfusionMatrix(INFO['dataset-info']['num-of-classes']),
                                 train_loader.dataset.classes)
    }
    val_metrics = {
        'accuracy': MetricsLambda(Labels2Acc, EntropyPrediction(1.0)),
        'precision_recall': MetricsLambda(Labels2PrecisionRecall, EntropyPrediction(1.0), val_loader.dataset.classes),
        'cmatrix': MetricsLambda(Labels2CMatrix, EntropyPrediction(1.0), val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model, optimizer, MixupLoss(weight=weights), device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model, metrics=train_metrics, device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks

    # Update the progress bar on each iteration completed.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_STARTED)
    def refresh_pbar(engine):
        pbar.refresh()
        pbar.n = pbar.last_print_n = 0

    # Compute metrics on train data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
        Training Results - Epoch: {}
        Avg accuracy: {:.4f}
        Avg loss: {:.4f}
        precision_recall: \n{}
        confusion matrix: \n{}
        """.format(engine.state.epoch, avg_accuracy, avg_loss, precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy}, engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss}, engine.state.epoch)

    # Compute metrics on val data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        pbar.clear()
        print('* - * - * - * - * - * - * - * - * - * - * - * - *')
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
        Validating Results - Epoch: {}
        Avg accuracy: {:.4f}
        precision_recall: \n{}
        confusion matrix: \n{}
        """.format(engine.state.epoch, avg_accuracy, precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy}, engine.state.epoch)
        writer.add_scalars('Aggregate/Score',
                           {'Val avg precision': precision_recall['data'][0, -1],
                            'Val avg recall': precision_recall['data'][1, -1]},
                           engine.state.epoch)

    # Save the model every N epochs.
    save_model_handler = ModelCheckpoint(os.environ['savedir'], '', save_interval=10, n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler, {'model': model})

    # Update the learning rate through the scheduler.
    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
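# Sketch of the scheduler wiring used in run() above: a plain torch scheduler is wrapped
# in ignite's LRScheduler and attached to EPOCH_STARTED. Self-contained toy example
# (the import path assumes a recent pytorch-ignite; older releases keep LRScheduler in
# ignite.contrib.handlers.param_scheduler):
import torch
from ignite.engine import Engine, Events
from ignite.handlers.param_scheduler import LRScheduler

toy_model = torch.nn.Linear(2, 2)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1, momentum=0.9)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(toy_optimizer, T_max=200)
toy_trainer = Engine(lambda engine, batch: None)
toy_trainer.add_event_handler(Events.EPOCH_STARTED, LRScheduler(cosine))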
def create_fbeta():
    return MetricsLambda(fbeta, Recall(average=True), Precision(average=True), 0.5, True)
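# Hypothetical usage sketch for the helper above: an F-beta score is a MetricsLambda
# composed from Recall and Precision, so computing it resolves the underlying metrics.
# The lambda below is an illustrative F0.5 formula, not ignite's internal fbeta function.
import torch
from ignite.metrics import MetricsLambda, Precision, Recall

precision, recall = Precision(average=True), Recall(average=True)
f05 = MetricsLambda(lambda r, p: (1 + 0.5 ** 2) * p * r / (0.5 ** 2 * p + r + 1e-15), recall, precision)

y_pred, y = torch.tensor([1, 0, 1, 1, 0]), torch.tensor([1, 0, 0, 1, 1])
precision.update((y_pred, y))
recall.update((y_pred, y))
print(f05.compute())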
def train():
    parser = ArgumentParser()
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO)
    logger.info("Arguments: %s", pformat(args))

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer
    tokenizer = tokenizer_class.from_pretrained("gpt2")
    model_class = GPT2DoubleHeadsModel
    model = model_class.from_pretrained("gpt2")
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)  # TODO: add our own special tokens
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)  # TODO: load data ourselves

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels, lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # If we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    evaluator.add_event_handler(
        Events.COMPLETED,
        lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    log_dir = make_logdir("gpt2")
    tb_logger = TensorboardLogger(log_dir)
    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                              {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

    torch.save(args, log_dir + '/model_training_args.bin')
    getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
    tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # Close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1],
                  os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
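# The "average_ppl" metric used throughout these scripts is just exp(average_nll),
# composed lazily with MetricsLambda. Tiny self-contained illustration on random logits
# (shapes are illustrative):
import math
import torch
from ignite.metrics import Loss, MetricsLambda

nll = Loss(torch.nn.CrossEntropyLoss())
ppl = MetricsLambda(math.exp, nll)
nll.update((torch.randn(4, 10), torch.randint(0, 10, (4,))))
print(ppl.compute())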
def test_metrics_lambda_update():
    """
    Test if the underlying metrics are updated
    """
    y_pred = torch.randint(0, 2, size=(15, 10, 4)).float()
    y = torch.randint(0, 2, size=(15, 10, 4)).long()

    precision = Precision(average=False)
    recall = Recall(average=False)

    def Fbeta(r, p, beta):
        return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item()

    F1 = MetricsLambda(Fbeta, recall, precision, 1)

    F1.update((y_pred, y))
    assert precision._updated
    assert recall._updated

    F1.reset()
    assert not precision._updated
    assert not recall._updated

    """
    Test multiple updates and if the inputs of the underlying metrics are updated multiple times
    """
    y_pred1 = torch.randint(0, 2, size=(15,))
    y1 = torch.randint(0, 2, size=(15,))
    y_pred2 = torch.randint(0, 2, size=(15,))
    y2 = torch.randint(0, 2, size=(15,))

    F1.update((y_pred1, y1))
    F1.update((y_pred2, y2))

    # Compute true_positives and positives for precision
    correct1 = y1 * y_pred1
    all_positives1 = y_pred1.sum(dim=0)
    if correct1.sum() == 0:
        true_positives1 = torch.zeros_like(all_positives1)
    else:
        true_positives1 = correct1.sum(dim=0)

    correct2 = y2 * y_pred2
    all_positives2 = y_pred2.sum(dim=0)
    if correct2.sum() == 0:
        true_positives2 = torch.zeros_like(all_positives2)
    else:
        true_positives2 = correct2.sum(dim=0)

    true_positives = true_positives1 + true_positives2
    positives = all_positives1 + all_positives2

    assert precision._type == "binary"
    assert precision._true_positives == true_positives
    assert precision._positives == positives

    # Computing positives for recall is different
    positives1 = y1.sum(dim=0)
    positives2 = y2.sum(dim=0)
    positives = positives1 + positives2

    assert recall._type == "binary"
    assert recall._true_positives == true_positives
    assert recall._positives == positives

    """
    Test compute
    """
    F1.reset()
    F1.update((y_pred1, y1))
    F1_metrics_lambda = F1.compute()
    F1_sklearn = f1_score(y1.numpy(), y_pred1.numpy())
    assert pytest.approx(F1_metrics_lambda) == F1_sklearn
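# The test above updates the composed metric by hand; attaching it to an engine does the
# same bookkeeping automatically, since attaching a MetricsLambda also attaches the metrics
# it depends on. Minimal self-contained sketch (the engine simply passes batches through
# as (y_pred, y); values are random and illustrative):
import torch
from ignite.engine import Engine
from ignite.metrics import MetricsLambda, Precision, Recall

evaluator = Engine(lambda engine, batch: batch)
precision, recall = Precision(average=False), Recall(average=False)
f1 = MetricsLambda(lambda r, p: torch.mean(2 * p * r / (p + r + 1e-15)).item(), recall, precision)
f1.attach(evaluator, "f1")

data = [(torch.randint(0, 2, (16,)), torch.randint(0, 2, (16,)))]
print(evaluator.run(data).metrics["f1"])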
def wrapper(*args, **kwargs):
    return MetricsLambda(fn, self, *args, **kwargs)
def __getitem__(self, index):
    from ignite.metrics import MetricsLambda
    return MetricsLambda(lambda x: x[index], self)
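# Usage sketch for the indexing support above: indexing a metric returns a MetricsLambda
# that selects one entry of the computed result, e.g. a single class of a non-averaged
# precision (ignite's Precision is assumed; the tensors below are illustrative values):
import torch
from ignite.metrics import Precision

precision = Precision(average=False)   # per-class precision vector
precision_class0 = precision[0]        # MetricsLambda(lambda x: x[0], precision)
precision.update((torch.tensor([[0.9, 0.1], [0.2, 0.8]]), torch.tensor([0, 1])))
print(precision_class0.compute())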
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default=DATA_FOLDER, help="Path of the dataset.")
    parser.add_argument("--image_path", type=str, default=IMG_FOLDER, help="Path of the images.")
    parser.add_argument("--images_feature_path", type=str, default=IMG_FEATURE_FOLDER, help="Path of the images.")
    parser.add_argument("--dataset_cache", type=str, default=DATA_CACHE, help="Path of the dataset cache_no_pretrained")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model")
    parser.add_argument('--dhead_gpt2', action='store_true', default=False, help="use double head gpt2")
    parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step")
    parser.add_argument('--pretrained', action='store_true', default=True, help="If False train from scratch")
    parser.add_argument("--num_candidates", type=int, default=1, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=3, help="Number of previous turns to keep in history")
    parser.add_argument("--max_length", type=int, default=256, help="Max length of input sentence")
    parser.add_argument("--train_batch_size", type=int, default=58, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=32, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=9, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--scheduler", type=str, default="linear", choices=['noam', 'linear'], help="method of optim")
    parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)")
    parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps")
    parser.add_argument("--lm_coef", type=float, default=2.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=50, help="Number of training epochs")
    parser.add_argument("--num_workers", type=int, default=0, help="Number of subprocesses for data loading")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="O1", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = BertTokenizer
    config_class = GPT2Config  # GPT2Config if "gpt2" in args.model_checkpoint else OpenAIGPTConfig
    model_class = GPT2LMHeadModel  # GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    if args.pretrained:
        tokenizer = tokenizer_class.from_pretrained(MODEL_CHECKPOINT, do_lower_case=False)
        # tokenizer = tokenizer_class(vocab_file=VOCAB_PATH, do_lower_case=True)
        model = model_class.from_pretrained(MODEL_CHECKPOINT)
    else:
        tokenizer = tokenizer_class(vocab_file=VOCAB_PATH, do_lower_case=False)
        tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
        config = config_class.from_json_file(CONFIG_PATH)
        model = model_class(config)
    model.to(args.device)
    # Add special tokens if they are not already added
    # add_special_tokens_(model, tokenizer)
    # optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    optimizer = AdamW([{'params': model.parameters(), 'initial_lr': args.lr}], lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = build_dataloader(args, tokenizer, logger)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(torch.tensor(input_data).to(args.device) if idx not in [2, 3] else input_data
                      for idx, input_data in enumerate(batch))
        input_ids, token_type_ids, input_images, image_ids, lm_labels, mc_token_ids, mc_labels = batch
        if args.dhead_gpt2:
            (lm_loss), (mc_loss), *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
                                             mc_labels=mc_labels, lm_labels=lm_labels)
            loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        else:
            (lm_loss), *_ = model(input_ids, labels=lm_labels, token_type_ids=token_type_ids,
                                  input_images=input_images, image_ids=image_ids)
            loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()  # , optimizer.param_groups[0]['lr']

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # If we don't send labels to the model, it doesn't return losses
            if args.dhead_gpt2:
                lm_logits, mc_logits, *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids)
                lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
                lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
                return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
            else:
                lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids)
                lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
                lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
                return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    # trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Decrease the learning rate: either a noam schedule with warmup, or a linear decay from lr to zero
    model_size = args.n_emd
    noam_lambda = lambda step: (
        model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * args.warmup_steps ** (-1.5)))
    noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
        #                                                       another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', n_saved=None)
        trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
                  os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
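# Sketch of the "noam" option above: an inverse-square-root schedule with linear warmup,
# expressed as a LambdaLR multiplier on top of the base lr. The model size and warmup
# steps below are illustrative values matching the --n_emd and --warmup_steps defaults.
import torch
from torch.optim.lr_scheduler import LambdaLR

model_size, warmup_steps = 768, 5000
noam_lambda = lambda step: model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * warmup_steps ** (-1.5))

toy_optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=6.25e-5)
noam_scheduler = LambdaLR(toy_optimizer, lr_lambda=noam_lambda)
for _ in range(3):
    toy_optimizer.step()
    noam_scheduler.step()
print(toy_optimizer.param_groups[0]["lr"])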
def main(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model", type=str, default="", help="Model type, one of: %s" % ', '.join(MODELS.keys())) parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of a pretrained model") parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training") parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history") parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient") parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient") parser.add_argument("--adv_coef", type=float, default=1.0, help="Adversarial dataset prediction loss coefficient") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") #parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") #parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") parser.add_argument( "--max_sequence_length", type=int, default=-1, help="If set, use this to manually restrict the sequence length. " "This might be helpful to save resources (memory). " "If not set, this is looked up from the model config (n_ctx value).") parser.add_argument( "--adversarial_dataset_prediction", action='store_true', help="Set to train with adversarial dataset prediction") parser.add_argument("--seed", type=int, default=None, help='set random seed') args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) if args.seed is not None: torch.manual_seed(args.seed) args.distributed = (args.local_rank != -1) logger.info("Prepare tokenizer and data") if not args.model: logger.warning( '"model" parameter is not set! This is deprecated. Please use one of: %s. ' 'To mimic deprecated behaviour, "model_checkpoint" will be used as "model"' % ', '.join(MODELS.keys())) args.model = args.model_checkpoint if args.model not in MODELS: raise NotImplementedError( 'model "%s" not implemented. use one of: %s' % (args.model, ', '.join(MODELS.keys()))) config_class, tokenizer_class, model_class, _ = MODELS[args.model] if not args.model_checkpoint: args.model_checkpoint = args.model model_config = config_class.from_pretrained(args.model_checkpoint) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) additional_special_tokens = [TYPE_BACKGROUND, TYPE_BOT, TYPE_USER] # for adversarial training (dataset prediction) dataset_labels = None if args.adversarial_dataset_prediction: dataset_labels = [ get_dataset_label(dataset_path) for dataset_path in args.dataset_path.split(',') ] #additional_special_tokens.extend(dataset_labels) #if model_class not in ADV_MODELS.values(): assert model_class in ADV_MODELS, f'no adversarial model implemented for model class: {model_class.__name__}' model_class = ADV_MODELS[model_class] if not hasattr(model_config, 'cls'): model_config.cls = {} if 'dataset_labels' in model_config.cls: assert all([dl in model_config.cls['dataset_labels']['labels'] for dl in dataset_labels]), \ f'loaded dataset_labels [{model_config.cls["dataset_labels"]["labels"]}] do not contain all ' \ f'current dataset_labels [{dataset_labels}]' dataset_labels = model_config.cls['dataset_labels']['labels'] else: model_config.cls['dataset_labels'] = { 'labels': dataset_labels, 'is_adversarial': True } model_input_names = [ "input_ids", "mc_token_ids", "lm_labels", "mc_labels", "dataset_labels", "token_type_ids" ] # not yet used model_output_names = [ "lm_loss", "mc_loss", "cl_loss_0", "lm_logits", "mc_logits", "cl_logits_0", "presents" ] else: model_input_names = [ "input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids" ] # not yet used model_output_names = [ "lm_loss", "mc_loss", "lm_logits", "mc_logits", "presents" ] tokenizer.add_special_tokens({ 'bos_token': TYPE_BOS, 'eos_token': TYPE_EOS, 'pad_token': TYPE_PAD, 'additional_special_tokens': additional_special_tokens }) logger.info("Prepare datasets") max_sequence_length = model_config.n_ctx if args.max_sequence_length <= 0 else args.max_sequence_length assert max_sequence_length <= model_config.n_ctx, 'max_sequence_length [%i] was set to a value higher than ' \ 'supported by the model (config.n_ctx [%i]). Please use a lower ' \ 'value or do not set it [-1] to use the highest supported one.' 
\ % (max_sequence_length, model_config.n_ctx) train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( args=args, tokenizer=tokenizer, model_input_names=model_input_names, max_sequence_length=max_sequence_length, dataset_labels=dataset_labels) logger.info( "Prepare pretrained model and optimizer - add special tokens for fine-tuning" ) # Initialize distributed training if needed # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Barrier to make sure only the first process in distributed training download model & vocab #model = model_class.from_pretrained(args.model_checkpoint, num_cl_labels=len(dataset_ids)) # for GPT2DoubleHeadsModelwithAdversarial model = model_class.from_pretrained(args.model_checkpoint, config=model_config) model.resize_token_embeddings(len(tokenizer)) model.to(args.device) if args.local_rank == 0: torch.distributed.barrier( ) # End of barrier to make sure only the first process in distributed training download model & vocab #################################################################################################################### # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] #optimizer = OpenAIAdam(model.parameters(), lr=args.lr) optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr) # scheduler is set below (see ignite) #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, # num_training_steps=len(train_loader) // args.train_batch_size + 1) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_checkpoint, 'optimizer.pt')) and os.path.isfile( os.path.join(args.model_checkpoint, 'scheduler.pt')): # Load in optimizer and scheduler states # TODO: this needs to be dumped somewhere optimizer.load_state_dict( torch.load(os.path.join(args.model_checkpoint, 'optimizer.pt'))) #scheduler.load_state_dict(torch.load(os.path.join(args.model_checkpoint, 'scheduler.pt'))) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Training function and trainer def update(engine, batch): model.train() batch = { model_input_names[i]: input_tensor.to(args.device) for i, input_tensor in enumerate(batch) } model_output = model(**batch) losses = model_output[: 3] if args.adversarial_dataset_prediction else model_output[: 2] if args.n_gpu > 1: # mean() to average on multi-gpu. losses = list(losses) for i in range(len(losses)): losses[i] = losses[i].mean() lm_loss, mc_loss = losses[0], losses[1] loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps # handle adversarial loss loss_wo_adv = loss.clone() if args.adversarial_dataset_prediction: adv_loss = model_output[2] loss += (adv_loss * args.adv_coef) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() #scheduler.step() # Update learning rate schedule # already DONE below! optimizer.zero_grad() return loss_wo_adv.item(), loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple( input_tensor.to(args.device) for input_tensor in batch) if args.adversarial_dataset_prediction: input_ids, mc_token_ids, lm_labels, mc_labels, dataset_labels, token_type_ids = batch else: input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch logger.debug( tokenizer.decode(input_ids[0, -1, :].tolist()).replace( TYPE_PAD, '')) model_outputs = model(input_ids=input_ids, mc_token_ids=mc_token_ids, token_type_ids=token_type_ids) lm_logits, mc_logits = model_outputs[0], model_outputs[ 1] # So we can also use GPT2 outputs lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero (scheduler) scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) 
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss") if args.adversarial_dataset_prediction: RunningAverage(output_transform=lambda x: x[1]).attach( trainer, "loss_w/_adv") RunningAverage(output_transform=lambda x: x[1] - x[0]).attach( trainer, "loss_only_adv") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=None) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) if args.adversarial_dataset_prediction: tb_logger.attach(trainer, log_handler=OutputHandler( tag="training", metric_names=["loss_w/_adv"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OutputHandler( tag="training", metric_names=["loss_only_adv"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) logger.info('save checkpoints to: %s' % tb_logger.writer.log_dir) checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" take care of distributed encapsulation torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file( os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) tokenizer.save_pretrained(tb_logger.writer.log_dir) #logger.debug("Saving optimizer and scheduler states to %s", tb_logger.writer.log_dir) #torch.save(optimizer.state_dict(), os.path.join(tb_logger.writer.log_dir, 'optimizer.pt')) #torch.save(scheduler.state_dict(), os.path.join(tb_logger.writer.log_dir, 'scheduler.pt')) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
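# Hedged sketch (not part of the scripts above; all names are illustrative): a minimal,
# self-contained example of the pattern used in the adversarial trainer, where the update
# function returns a tuple of scalars and RunningAverage picks individual components out of
# engine.state.output via output_transform.
import torch
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage

def _toy_update(engine, batch):
    lm_loss = torch.rand(1).item()          # stand-in for the task loss
    total_loss = lm_loss + 0.1              # stand-in for the loss with the adversarial term
    return lm_loss, total_loss

toy_trainer = Engine(_toy_update)
RunningAverage(output_transform=lambda out: out[0]).attach(toy_trainer, "loss")
RunningAverage(output_transform=lambda out: out[1] - out[0]).attach(toy_trainer, "loss_only_adv")

@toy_trainer.on(Events.EPOCH_COMPLETED)
def _print_running_averages(engine):
    print(engine.state.metrics)             # e.g. {'loss': ..., 'loss_only_adv': ...}

toy_trainer.run(range(10), max_epochs=1)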
def train(): parser = ArgumentParser() parser.add_argument("--model_checkpoint", type=str, default=PRETRAINED_MODEL_URL, help="Path to the pretrained model checkpoint") parser.add_argument("--dataset_path", type=str, default='trec', help="'imdb', 'trec' or a dict of splits paths.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache_fine_tune_trec', help="Path or url of the dataset cache") parser.add_argument("--finetuning_model_class", type=str, default="TransformerWithClfHead", help="Fine-tuning model class for the target task") parser.add_argument( "--num_classes", type=int, default=2, help="Number of classes for the target classification task") parser.add_argument( "--adapters_dim", type=int, default=-1, help="If >0 add adapters to the model wtih adapters_dim dimension") parser.add_argument("--clf_loss_coef", type=float, default=1, help="If >0 add a classification loss") parser.add_argument("--lm_loss_coef", type=float, default=-1, help="If >0 add a language modeling loss") parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=32, help="Batch size for validation") parser.add_argument("--lr", type=float, default=6e-5, help="Learning rate") parser.add_argument("--n_warmup", type=int, default=500, help="Number of warmup iterations") parser.add_argument("--max_norm", type=float, default=0.25, help="Clipping gradient norm") parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--eval_every", type=int, default=100, help="Evaluate every X steps (-1 => end of epoch)") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradient") parser.add_argument("--initializer_range", type=float, default=0.02, help="Normal initialization standard deviation") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log on main process only, logger.warning => log on all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat( args)) # This is a logger.info: only printed on the first process # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') # Loading tokenizer, pretrained model and optimizer logger.info("Prepare tokenizer, model and optimizer") tokenizer = BertTokenizer.from_pretrained( 'bert-base-cased', do_lower_case=False) # Let's use a pre-defined tokenizer logger.info("Create model from class %s and configuration %s", args.finetuning_model_class, os.path.join(args.model_checkpoint, CONFIG_NAME)) ModelClass = getattr(importlib.import_module("finetuning_model"), args.finetuning_model_class) pretraining_args = torch.load( cached_path(os.path.join(args.model_checkpoint, CONFIG_NAME))) model = ModelClass(config=pretraining_args, fine_tuning_config=args).to(args.device) logger.info("Load pretrained weigths from %s", os.path.join(args.model_checkpoint, WEIGHTS_NAME)) state_dict = torch.load(cached_path( os.path.join(args.model_checkpoint, WEIGHTS_NAME)), map_location='cpu') incompatible_keys = model.load_state_dict(state_dict, strict=False) logger.info("Parameters discarded from the pretrained model: %s", incompatible_keys.unexpected_keys) logger.info("Parameters added in the adaptation model: %s", incompatible_keys.missing_keys) model.tie_weights() optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad)) # Prepare model for distributed training if needed if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") loaders = get_data_loaders(args, tokenizer, pretraining_args.num_max_positions, clf_token=tokenizer.vocab['[CLS]']) train_loader, val_loader, train_sampler, valid_sampler = loaders # Training function and trainer def update(engine, batch): model.train() batch, labels = (t.to(args.device) for t in batch) inputs = batch.transpose( 0, 1).contiguous() # to shape [seq length, batch] _, (clf_loss, lm_loss) = model( inputs, clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']), clf_labels=labels, lm_labels=inputs, padding_mask=(batch == tokenizer.vocab['[PAD]'])) loss = (max(0, args.clf_loss_coef) * clf_loss + max( 0, args.lm_loss_coef) * lm_loss) / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch, labels = (t.to(args.device) for t in batch) inputs = batch.transpose( 0, 1).contiguous() # to shape [seq length, batch] _, clf_logits = model( inputs, clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']), padding_mask=(batch == tokenizer.vocab['[PAD]'])) return clf_logits, labels 
evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_every > 0: trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: evaluator.run(val_loader) if engine.state.iteration % args.eval_every == 0 else None) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Learning rate schedule: linearly warm-up to lr and then to zero scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 0.0), (args.n_warmup, args.lr), (len(train_loader) * args.n_epochs, 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we average distributed metrics using average_distributed_scalar metrics = {"accuracy": Accuracy()} metrics.update({ "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args) }) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train if args.local_rank in [-1, 0]: checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving( trainer, evaluator, metrics, model, optimizer, args, prefix="finetune_") # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint for easy re-loading if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
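# Hedged sketch (toy model and numbers, not the fine-tuning script itself): how PiecewiseLinear
# expresses the warm-up-then-linear-decay schedule used above as three (iteration, value)
# milestones, stepped on every ITERATION_STARTED event.
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import PiecewiseLinear

toy_model = torch.nn.Linear(4, 2)
toy_optimizer = torch.optim.Adam(toy_model.parameters(), lr=0.0)

n_warmup, total_iters, peak_lr = 100, 1000, 6e-5
lr_schedule = PiecewiseLinear(toy_optimizer, "lr",
                              [(0, 0.0), (n_warmup, peak_lr), (total_iters, 0.0)])

toy_trainer = Engine(lambda engine, batch: None)
toy_trainer.add_event_handler(Events.ITERATION_STARTED, lr_schedule)

@toy_trainer.on(Events.ITERATION_COMPLETED)
def _log_lr(engine):
    if engine.state.iteration % 250 == 0:
        print(engine.state.iteration, toy_optimizer.param_groups[0]["lr"])

toy_trainer.run(range(total_iters), max_epochs=1)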
def run(tb, vb, lr, epochs, writer): device = os.environ['main-device'] logging.info('Training program start!') logging.info('Configuration:') logging.info('\n'+json.dumps(INFO, indent=2)) # ------------------------------------ # 1. Define dataloader train_loader, train4val_loader, val_loader, num_of_images, mapping = get_dataloaders(tb, vb) weights = (1/num_of_images)/((1/num_of_images).sum().item()) # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images)) weights = weights.to(device=device) # ------------------------------------ # 2. Define model model = EfficientNet.from_pretrained('efficientnet-b5', num_classes=INFO['dataset-info']['num-of-classes']) model = carrier(model) # ------------------------------------ # 3. Define optimizer optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) ignite_scheduler = LRScheduler(scheduler) # ------------------------------------ # 4. Define metrics train_metrics = { 'accuracy': Accuracy(), 'loss': Loss(CrossEntropywithLS(weight=weights)), 'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(), Recall(), train_loader.dataset.classes), 'cmatrix': MetricsLambda(CMatrixTable, ConfusionMatrix(INFO['dataset-info']['num-of-classes']), train_loader.dataset.classes) } # ------------------------------------ # 5. Create trainer trainer = create_supervised_trainer(model, optimizer, CrossEntropywithLS(weight=weights), device=device) # ------------------------------------ # 6. Create evaluator train_evaluator = create_supervised_evaluator(model, metrics=train_metrics, device=device) desc = 'ITERATION - loss: {:.4f}' pbar = tqdm( initial=0, leave=False, total=len(train_loader), desc=desc.format(0) ) # ------------------------------------ # 7. Create event hooks # Update process bar on each iteration completed. @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): log_interval = 1 iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: pbar.desc = desc.format(engine.state.output) pbar.update(log_interval) # Refresh Process bar. @trainer.on(Events.EPOCH_COMPLETED) def refresh_pbar(engine): print ('Epoch {} completed!'.format(engine.state.epoch)) pbar.refresh() pbar.n = pbar.last_print_n = 0 # Compute metrics on train data on each epoch completed. 
# cpe = CustomPeriodicEvent(n_epochs=50) # cpe.attach(trainer) # @trainer.on(cpe.Events.EPOCHS_50_COMPLETED) def log_training_results(engine): pbar.refresh() print ('Checking on training set.') train_evaluator.run(train4val_loader) metrics = train_evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_loss = metrics['loss'] precision_recall = metrics['precision_recall'] cmatrix = metrics['cmatrix'] prompt = """ Id: {} Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f} precision_recall: \n{} confusion matrix: \n{} """.format(os.environ['run-id'],engine.state.epoch,avg_accuracy,avg_loss,precision_recall['pretty'],cmatrix['pretty']) tqdm.write(prompt) logging.info('\n'+prompt) writer.add_text(os.environ['run-id'], prompt, engine.state.epoch) writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy}, engine.state.epoch) writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss}, engine.state.epoch) # pbar.n = pbar.last_print_n = 0 cpe = CustomPeriodicEvent(n_epochs=50) cpe.attach(trainer) # @trainer.on(cpe.Events.EPOCHS_50_COMPLETED) trainer.add_event_handler(cpe.Events.EPOCHS_50_COMPLETED, log_training_results) trainer.add_event_handler(Events.STARTED, log_training_results) # Save model ever N epoch. save_model_handler = ModelCheckpoint(os.environ['savedir'], '', save_interval=10, n_saved=2) trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler, {'model': model}) # Update learning-rate due to scheduler. trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler) # ------------------------------------ # Run trainer.run(train_loader, max_epochs=epochs) pbar.close()
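# Hedged sketch (toy model, illustrative epoch counts): wrapping a plain torch scheduler with
# ignite's LRScheduler handler, as run() does above with CosineAnnealingLR, so the schedule is
# advanced once per EPOCH_STARTED event rather than manually inside the training loop.
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import LRScheduler

toy_model = torch.nn.Linear(8, 2)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1, momentum=0.9)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(toy_optimizer, T_max=10)
ignite_scheduler = LRScheduler(cosine)

toy_trainer = Engine(lambda engine, batch: None)
toy_trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

@toy_trainer.on(Events.EPOCH_COMPLETED)
def _log_lr(engine):
    print("epoch", engine.state.epoch, "lr", toy_optimizer.param_groups[0]["lr"])

toy_trainer.run(range(5), max_epochs=10)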
def train(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--model_checkpoint", type=str, default="t5-small", help="Path, url or short name of the model") parser.add_argument("--max_history", type=int, default=7, help="Number of previous exchanges to keep in history") parser.add_argument("--train_batch_size", type=int, default=10, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=10, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=12, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6e-4, help="Learning rate") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") parser.add_argument("--save_name", type=str, default="") parser.add_argument("--mask_ratio", type=float, default=0.15) parser.add_argument("--objective", type=str, default="span_denosing", help="response_generation, span_denosing, both") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer.") tokenizer = T5Tokenizer.from_pretrained(args.model_checkpoint) model = T5ForConditionalGeneration.from_pretrained(args.model_checkpoint) model.to(args.device) # Add special tokens if they are not already added add_special_tokens_(model, tokenizer) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) def collate_fn(data): batch = { "corrupted_context": [], "context": [], "target": [], "response": [] } padded_dataset = {} batch_size = len(data) resp_sos, context_sos = tokenizer.convert_tokens_to_ids([ "<go_r>", "<go_b>", ]) for x in data: corrupted_context = ["fill : "] target = [] length = len(x["context_words"]) mask_bool = random_spans_noise_mask(length=length, noise_density=args.mask_ratio, mean_noise_span_length=3.0) mask_id = 0 #print(mask_bool) for i in range(length): 
if mask_bool[i]: if i > 0 and mask_bool[i - 1]: target.append(x["context_words"][i]) else: target.append(f"<extra_id_{mask_id}>") target.append(x["context_words"][i]) corrupted_context.append(f"<extra_id_{mask_id}>") mask_id += 1 else: corrupted_context.append(x["context_words"][i]) target.append("<eos_b>") batch["context"].append( tokenizer.encode("response : " + " ".join(x["context_words"]))) batch["corrupted_context"].append( tokenizer.encode(" ".join(corrupted_context))) batch["target"].append(tokenizer.encode(" ".join(target))) batch["response"].append(tokenizer.encode(x["response"])) # print(" ".join(x["context_words"])) # print(" ".join(corrupted_context)) # print(" ".join(target)) # print("") # print(tokenizer.decode(batch["corrupted_context"][-1])) # print(tokenizer.decode(batch["target"][-1])) # print(tokenizer.decode(batch["response"][-1])) # print("") context_ids, context_masks = padInput(batch["context"]) input_ids, masks = padInput(batch["corrupted_context"]) target_ids, target_inputs = padOutput(batch["target"]) response_ids, response_inputs = padOutput(batch["response"]) #inputs padded_dataset["input_ids"] = torch.tensor(input_ids, dtype=torch.long) padded_dataset["masks"] = torch.tensor(masks, dtype=torch.long) padded_dataset["context_ids"] = torch.tensor(context_ids, dtype=torch.long) padded_dataset["context_masks"] = torch.tensor(context_masks, dtype=torch.long) padded_dataset["target_ids"] = torch.tensor(target_ids, dtype=torch.long) padded_dataset["response_ids"] = torch.tensor(response_ids, dtype=torch.long) padded_dataset["target_inputs"] = torch.tensor(np.concatenate((np.ones( (batch_size, 1)) * context_sos, target_inputs[:, :-1]), axis=1), dtype=torch.long) padded_dataset["response_inputs"] = torch.tensor(np.concatenate( (np.ones((batch_size, 1)) * resp_sos, response_inputs[:, :-1]), axis=1), dtype=torch.long) return padded_dataset logger.info("Prepare datasets") train_dataset, valid_dataset, train_sampler, valid_sampler = get_data( args, tokenizer) train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed), collate_fn=collate_fn, num_workers=4) val_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4) logger.info("Train dataset length: {}".format(len(train_dataset))) logger.info("Valid dataset length: {}".format(len(valid_dataset))) # for batch in train_loader: # #print(batch) # exit(0) # Training function and trainer def update(engine, batch): model.train() batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS) input_ids, masks, context_ids, context_masks, target_ids, target_inputs, response_ids, response_inputs = batch # print("input") # print(tokenizer.decode(input_ids[0, :].tolist())) # print("context_ids") # print(tokenizer.decode(context_ids[0, :].tolist())) # print("target") # print(tokenizer.decode(target_ids[0, :].tolist())) # print("target In") # print(tokenizer.decode(target_inputs[0, :].tolist())) # print("response_ids") # print(tokenizer.decode(response_ids[0, :].tolist())) # print("response_inputs") # print(tokenizer.decode(response_inputs[0, :].tolist())) #exit(0) outputs = model(input_ids, attention_mask=masks, decoder_input_ids=target_inputs, lm_labels=target_ids) context_loss = outputs[0] outputs = model(context_ids, attention_mask=context_masks, decoder_input_ids=response_inputs, lm_labels=response_ids) resp_loss = outputs[0] loss = (context_loss + resp_loss) / 
args.gradient_accumulation_steps loss = (context_loss) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS) input_ids, masks, context_ids, context_masks, target_ids, target_inputs, response_ids, response_inputs = batch outputs = model( input_ids, attention_mask=masks, decoder_input_ids=target_inputs #, lm_labels=target_ids ) context_logits = outputs[0] outputs = model( context_ids, attention_mask=context_masks, decoder_input_ids=response_inputs, #lm_labels=response_ids ) resp_logits = outputs[0] context_logits_flat_shifted = context_logits.view( -1, context_logits.size(-1)) context_labels_flat_shifted = target_ids.view(-1) resp_logits_flat_shifted = resp_logits.view( -1, resp_logits.size(-1)) resp_labels_flat_shifted = response_ids.view(-1) return (context_logits_flat_shifted, resp_logits_flat_shifted), (context_labels_flat_shifted, resp_labels_flat_shifted) #return (context_logits_flat_shifted, context_logits_flat_shifted), (context_labels_flat_shifted, context_labels_flat_shifted) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) # if args.eval_before_start: # trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = { "span": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])), "response": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][1], x[1][1])) } metrics.update({ "average_span": MetricsLambda(average_distributed_scalar, metrics["span"], args), "average_response": MetricsLambda(average_distributed_scalar, metrics["response"], args) }) metrics["average_response"] = MetricsLambda(math.exp, metrics["average_response"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) 
evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) if not os.path.exists(f"pretrained_model/{args.save_name}"): os.makedirs(f"pretrained_model/{args.save_name}") log_dir = f"pretrained_model/{args.save_name}" tb_logger = TensorboardLogger(log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" takes care of distributed encapsulation torch.save(args, log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) tokenizer.save_pretrained(log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
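# Hedged sketch (pure Python, hand-written mask instead of random_spans_noise_mask, and without
# the "fill : " prefix or the trailing <eos_b>): the core of the collate_fn above, turning a
# boolean noise mask over context words into a corrupted input with <extra_id_N> sentinels and
# the matching T5-style denoising target.
def corrupt_with_sentinels(words, mask):
    corrupted, target, sentinel = [], [], 0
    for i, word in enumerate(words):
        if mask[i]:
            if i > 0 and mask[i - 1]:
                target.append(word)                      # continue the current masked span
            else:
                target.append(f"<extra_id_{sentinel}>")  # open a new span in the target
                target.append(word)
                corrupted.append(f"<extra_id_{sentinel}>")
                sentinel += 1
        else:
            corrupted.append(word)
    return corrupted, target

words = ["the", "cat", "sat", "on", "the", "mat"]
mask = [False, True, True, False, True, False]
print(corrupt_with_sentinels(words, mask))
# (['the', '<extra_id_0>', 'on', '<extra_id_1>', 'mat'],
#  ['<extra_id_0>', 'cat', 'sat', '<extra_id_1>', 'the'])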
def train(): parser = ArgumentParser() parser.add_argument('--gpt2', action='store_true', help="use gpt2") parser.add_argument("--model_checkpoint", type=str, default="config/cgpt/", help="Path or URL of the model") parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step") parser.add_argument('--pretrained', action='store_true', help="If False train from scratch") parser.add_argument("--data_path", type=str, default="", help="Path or url of the dataset. ") parser.add_argument( "--train_path", type=str, default= "/Users/sunhongchao/Documents/craft/09_Dialogue/corpus/chitchat/gpt-chinese/toy_train.txt", help="Path of the train dataset for dist dataset. ") parser.add_argument( "--valid_path", type=str, default= "/Users/sunhongchao/Documents/craft/09_Dialogue/corpus/chitchat/gpt-chinese/toy_valid.txt", help="Path of the valid dataset for dist dataset. ") parser.add_argument("--dataset_cache", type=str, default="dataset_cache", help="Path or url of the dataset cache") parser.add_argument('--log_file', '-log_file', type=str, default="", help="Output logs to a file under this path") parser.add_argument("--num_workers", type=int, default=8, help="Number of subprocesses for data loading") parser.add_argument("--n_epochs", type=int, default=70, help="Number of training epochs") parser.add_argument("--train_batch_size", type=int, default=2, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=2, help="Batch size for validation") parser.add_argument("--max_history", type=int, default=15, help="Number of previous exchanges to keep in history") parser.add_argument("--scheduler", type=str, default="noam", choices=['noam', 'linear'], help="method of optim") parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)") parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps") parser.add_argument("--valid_steps", type=int, default=5000, help="Perfom validation every X steps") parser.add_argument("--gradient_accumulation_steps", type=int, default=64, help="Accumulate gradients on several steps") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
# logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", args.local_rank) logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config tokenizer_class = BertTokenizer if args.pretrained: tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint, do_lower_case=True) model = model_class.from_pretrained(args.model_checkpoint) else: tokenizer = tokenizer_class(os.path.join(args.model_checkpoint, "vocab.txt"), do_lower_case=True) config = config_class.from_json_file( os.path.join(args.model_checkpoint, CONFIG_NAME)) model = model_class(config) model.to(args.device) optimizer = AdamW([{ 'params': model.parameters(), 'initial_lr': args.lr }], lr=args.lr, correct_bias=True) logger.info("Prepare datasets") loader_class = build_dist_loaders if not args.data_path else build_dataloaders train_loader, val_loader, train_sampler, valid_sampler = loader_class( args, tokenizer, logger) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) # Training function and trainer def update(engine, batch): input_ids, token_type_ids, lm_labels = tuple( input_tensor.to(args.device) for input_tensor in batch) model.train() (lm_loss), *_ = model(input_ids, labels=lm_labels, token_type_ids=token_type_ids) loss = lm_loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item(), optimizer.param_groups[0]['lr'] trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): input_ids, token_type_ids, lm_labels = tuple( input_tensor.to(args.device) for input_tensor in batch) # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return lm_logits_flat_shifted, lm_labels_flat_shifted evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: 
evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Evaluation during training @trainer.on(Events.ITERATION_STARTED) def log_iterations(engine): # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0: if engine.state.iteration % args.valid_steps == 0: evaluator.run(val_loader) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # noam decrease the learning rate # model_size = model.config.n_embd model_size = args.n_emd noam_lambda = lambda step: (model_size**(-0.5) * min( (step + 1)**(-0.5), (step + 1) * args.warmup_steps**(-1.5))) noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step) scheduler = LRScheduler(noam_scheduler) if args.scheduler == "linear": scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss") RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0], x[1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints # And save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True, mininterval=2) pbar.attach(trainer, metric_names=["loss", "lr"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=None) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=3) # save model after evaluation evaluator.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" take care of distributed encapsulation torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file( os.path.join(tb_logger.writer.logdir, CONFIG_NAME)) tokenizer.save_vocabulary(tb_logger.writer.logdir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint # (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and 
args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
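# Hedged sketch (toy dimensions and step counts): the Noam schedule used above, written as a
# LambdaLR multiplier. The optimizer's base lr is scaled by noam_lambda(step), which grows
# linearly for warmup_steps iterations and then decays proportionally to step ** -0.5.
import torch
from torch.optim.lr_scheduler import LambdaLR

toy_model = torch.nn.Linear(4, 4)
base_lr, model_size, warmup_steps = 5e-5, 768, 100
toy_optimizer = torch.optim.AdamW(toy_model.parameters(), lr=base_lr)

noam_lambda = lambda step: model_size ** (-0.5) * min(
    (step + 1) ** (-0.5), (step + 1) * warmup_steps ** (-1.5))
noam_scheduler = LambdaLR(toy_optimizer, lr_lambda=noam_lambda)

for step in range(300):
    toy_optimizer.step()
    noam_scheduler.step()
    if step in (0, 99, 299):
        print(step, toy_optimizer.param_groups[0]["lr"])  # peaks around step == warmup_steps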
def __rmul__(self, other):
    from ignite.metrics import MetricsLambda
    return MetricsLambda(lambda x, y: x * y, other, self)
def create_evaluator(model, cfg):
    def _validation_step(_, batch):
        model.eval()
        with torch.no_grad():
            x_char, x_type, y_word, y_syllable = batch_to_tensor(batch, cfg)
            x_char, x_type, y_word, y_syllable = (t.to(cfg.device)
                                                  for t in [x_char, x_type, y_word, y_syllable])
            logits_word, logits_syllable = model(x_char, x_type)
            loss, word_loss, syllable_loss, align_loss = model.joint_loss(
                logits_word, y_word, logits_syllable, y_syllable)
            return ((logits_word > 0.5).long(), y_word,
                    (logits_syllable > 0.5).long(), y_syllable,
                    loss, word_loss, syllable_loss, align_loss)

    evaluator = Engine(_validation_step)

    # Accuracies on word and syllable predictions
    w_acc = Accuracy(lambda x: x[0:2])
    w_acc.attach(evaluator, 'w_acc')
    s_acc = Accuracy(lambda x: x[2:4])
    s_acc.attach(evaluator, 's_acc')

    # Average losses returned by the validation step
    Average(lambda x: x[4]).attach(evaluator, 'loss')
    Average(lambda x: x[5]).attach(evaluator, 'w_loss')
    Average(lambda x: x[6]).attach(evaluator, 's_loss')
    Average(lambda x: x[7]).attach(evaluator, 'a_loss')

    accuracy = Accuracy(lambda x: x[0:2])
    accuracy.attach(evaluator, "acc")

    # Per-class precision/recall plus their macro averages via MetricsLambda
    w_precision = Precision(lambda x: x[0:2])
    w_precision.attach(evaluator, 'WP')
    MetricsLambda(lambda t: torch.mean(t).item(), w_precision).attach(evaluator, "WMP")

    s_precision = Precision(lambda x: x[2:4])
    s_precision.attach(evaluator, 'SP')
    MetricsLambda(lambda t: torch.mean(t).item(), s_precision).attach(evaluator, "SMP")

    w_recall = Recall(lambda x: x[0:2])
    w_recall.attach(evaluator, 'WR')
    MetricsLambda(lambda t: torch.mean(t).item(), w_recall).attach(evaluator, "WMR")

    s_recall = Recall(lambda x: x[2:4])
    s_recall.attach(evaluator, 'SR')
    MetricsLambda(lambda t: torch.mean(t).item(), s_recall).attach(evaluator, "SMR")

    # F1 scores composed from the precision/recall metrics with metric arithmetic
    w_f1 = 2. * w_precision * w_recall / (w_precision + w_recall + 1e-20)
    w_f1 = MetricsLambda(lambda t: torch.mean(t).item(), w_f1)
    w_f1.attach(evaluator, "WF1")

    s_f1 = 2. * s_precision * s_recall / (s_precision + s_recall + 1e-20)
    s_f1 = MetricsLambda(lambda t: torch.mean(t).item(), s_f1)
    s_f1.attach(evaluator, "SF1")

    return evaluator
# Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0][0], x[1][0])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args, args.model_checkpoint) tb_logger = TensorboardLogger(log_dir)
def test_metrics_lambda_update_and_attach_together():
    y_pred = torch.randint(0, 2, size=(15, 10, 4)).float()
    y = torch.randint(0, 2, size=(15, 10, 4)).long()

    def update_fn(engine, batch):
        y_pred, y = batch
        return y_pred, y

    engine = Engine(update_fn)

    precision = Precision(average=False)
    recall = Recall(average=False)

    def Fbeta(r, p, beta):
        return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item()

    F1 = MetricsLambda(Fbeta, recall, precision, 1)
    F1.attach(engine, "f1")

    with pytest.raises(ValueError, match=r"MetricsLambda is already attached to an engine"):
        F1.update((y_pred, y))

    y_pred = torch.randint(0, 2, size=(15, 10, 4)).float()
    y = torch.randint(0, 2, size=(15, 10, 4)).long()

    F1 = MetricsLambda(Fbeta, recall, precision, 1)
    F1.update((y_pred, y))

    engine = Engine(update_fn)

    with pytest.raises(ValueError, match=r"The underlying metrics are already updated"):
        F1.attach(engine, "f1")

    F1.reset()
    F1.attach(engine, "f1")
def run(tb, vb, lr, epochs, writer): device = os.environ['main-device'] logging.info('Training program start!') logging.info('Configuration:') logging.info('\n'+json.dumps(INFO, indent=2)) # ------------------------------------ # 1. Define dataloader train_loader, train4val_loader, val_loader, num_of_images, mapping = get_dataloaders(tb, vb) # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb) weights = (1/num_of_images)/((1/num_of_images).sum().item()) # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images)) weights = weights.to(device=device) # ------------------------------------ # 2. Define model model = EfficientNet.from_pretrained('efficientnet-b3', num_classes=INFO['dataset-info']['num-of-classes']) model = carrier(model) # ------------------------------------ # 3. Define optimizer optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200) ignite_scheduler = LRScheduler(scheduler) # ------------------------------------ # 4. Define metrics class DOCLoss(nn.Module): def __init__(self, weight): super(DOCLoss, self).__init__() self.class_weights = weight def forward(self, input, target): sigmoid = 1 - 1 / (1 + torch.exp(-input)) sigmoid[range(0, sigmoid.shape[0]), target] = 1 - sigmoid[range(0, sigmoid.shape[0]), target] sigmoid = torch.log(sigmoid) if self.class_weights is not None: loss = -torch.sum(sigmoid * self.class_weights) else: loss = -torch.sum(sigmoid) return loss class DOCPrediction(metric.Metric): def __init__(self, threshold=torch.tensor([0.5]).repeat(len(train_loader.dataset.classes))): super(DOCPrediction, self).__init__() threshold = threshold.to(device=device) self.threshold = threshold self.prediction = torch.tensor([], dtype=torch.int) self.y = torch.tensor([], dtype=torch.int) def reset(self): self.threshold = torch.tensor([0.5]).repeat(len(train_loader.dataset.classes)).to(device=device) self.prediction = torch.tensor([]) self.y = torch.tensor([]) super(DOCPrediction, self).reset() def update(self, output): y_pred, y = output sigmoid = 1 / (1 + torch.exp(-y_pred)) values, inds = sigmoid.max(1) prediction = torch.where(values>self.threshold[inds], inds, torch.tensor([-1]).to(device=device)) self.prediction = torch.cat((self.prediction.type(torch.LongTensor).to(device=device), torch.tensor([mapping[x.item()] for x in prediction]).to(device=device))) self.y = torch.cat((self.y.type(torch.LongTensor).to(device=device), y.to(device=device))) # return self.prediction, self.y def compute(self): return self.prediction, self.y train_metrics = { 'accuracy': Accuracy(), 'loss': Loss(DOCLoss(weight=weights)), 'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(), Recall(), train_loader.dataset.classes), 'cmatrix': MetricsLambda(CMatrixTable, ConfusionMatrix(INFO['dataset-info']['num-of-classes']), train_loader.dataset.classes) } val_metrics = { 'accuracy': MetricsLambda(Labels2Acc, DOCPrediction()), 'precision_recall': MetricsLambda(Labels2PrecisionRecall, DOCPrediction(), val_loader.dataset.classes), 'cmatrix': MetricsLambda(Labels2CMatrix, DOCPrediction(), val_loader.dataset.classes) } # ------------------------------------ # 5. Create trainer trainer = create_supervised_trainer(model, optimizer, DOCLoss(weight=weights), device=device) # ------------------------------------ # 6. 
Create evaluator train_evaluator = create_supervised_evaluator(model, metrics=train_metrics, device=device) val_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device) desc = 'ITERATION - loss: {:.4f}' pbar = tqdm( initial=0, leave=False, total=len(train_loader), desc=desc.format(0) ) # ------------------------------------ # 7. Create event hooks # Update process bar on each iteration completed. @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): log_interval = 1 iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: pbar.desc = desc.format(engine.state.output) pbar.update(log_interval) # Compute metrics on train data on each epoch completed. @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): pbar.refresh() print ('Checking on training set.') train_evaluator.run(train4val_loader) metrics = train_evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_loss = metrics['loss'] precision_recall = metrics['precision_recall'] cmatrix = metrics['cmatrix'] prompt = """ Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f} precision_recall: \n{} confusion matrix: \n{} """.format(engine.state.epoch,avg_accuracy,avg_loss,precision_recall['pretty'],cmatrix['pretty']) tqdm.write(prompt) logging.info('\n'+prompt) writer.add_text(os.environ['run-id'], prompt, engine.state.epoch) writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy}, engine.state.epoch) writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss}, engine.state.epoch) # Compute metrics on val data on each epoch completed. @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): print ('Checking on validation set.') val_evaluator.run(val_loader) metrics = val_evaluator.state.metrics avg_accuracy = metrics['accuracy'] precision_recall = metrics['precision_recall'] cmatrix = metrics['cmatrix'] prompt = """ Validating Results - Epoch: {} Avg accuracy: {:.4f} precision_recall: \n{} confusion matrix: \n{} """.format(engine.state.epoch,avg_accuracy,precision_recall['pretty'],cmatrix['pretty']) tqdm.write(prompt) logging.info('\n'+prompt) writer.add_text(os.environ['run-id'], prompt, engine.state.epoch) writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy}, engine.state.epoch) writer.add_scalars('Aggregate/Score', {'Val avg precision': precision_recall['data'][0, -1], 'Val avg recall': precision_recall['data'][1, -1]}, engine.state.epoch) pbar.n = pbar.last_print_n = 0 # Save model ever N epoch. save_model_handler = ModelCheckpoint(os.environ['savedir'], '', save_interval=50, n_saved=2) trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler, {'model': model}) # Update learning-rate due to scheduler. trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler) # ------------------------------------ # Run trainer.run(train_loader, max_epochs=epochs) pbar.close()
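# Hedged sketch (toy logits and a uniform 0.5 threshold): the open-set decision rule behind
# DOCPrediction above. Each sample gets the class with the highest sigmoid score, unless that
# score is below the per-class threshold, in which case it is rejected as unknown (-1).
import torch

logits = torch.tensor([[2.0, -1.0, 0.5],
                       [-0.2, -0.1, -0.3]])
threshold = torch.tensor([0.5, 0.5, 0.5])

scores = torch.sigmoid(logits)
values, classes = scores.max(dim=1)
prediction = torch.where(values > threshold[classes], classes, torch.tensor(-1))
print(prediction)  # tensor([ 0, -1]) -> the second sample is rejected as unknown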
def test_metrics_lambda():
    m0 = ListGatherMetric(0)
    m1 = ListGatherMetric(1)
    m2 = ListGatherMetric(2)

    def process_function(engine, data):
        return data

    engine = Engine(process_function)

    def plus(this, other):
        return this + other

    m0_plus_m1 = MetricsLambda(plus, m0, other=m1)
    m2_plus_2 = MetricsLambda(plus, m2, 2)
    m0_plus_m1.attach(engine, "m0_plus_m1")
    m2_plus_2.attach(engine, "m2_plus_2")

    engine.run([[1, 10, 100]])
    assert engine.state.metrics["m0_plus_m1"] == 11
    assert engine.state.metrics["m2_plus_2"] == 102

    engine.run([[2, 20, 200]])
    assert engine.state.metrics["m0_plus_m1"] == 22
    assert engine.state.metrics["m2_plus_2"] == 202

    # metrics are partially attached
    assert not m0.is_attached(engine)
    assert not m1.is_attached(engine)
    assert not m2.is_attached(engine)

    # a dependency is detached
    m0.detach(engine)
    # so the lambda metric is too
    assert not m0_plus_m1.is_attached(engine)

    # the lambda is attached again
    m0_plus_m1.attach(engine, "m0_plus_m1")
    assert m0_plus_m1.is_attached(engine)

    # metrics are always partially attached
    assert not m0.is_attached(engine)

    m0_plus_m1.detach(engine)
    assert not m0_plus_m1.is_attached(engine)
    # detached (and no longer partially attached)
    assert not m0.is_attached(engine)
def __floordiv__(self, other):
    from ignite.metrics import MetricsLambda
    return MetricsLambda(lambda x, y: x // y, self, other)
def train(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default='wikitext-2', help="One of ('wikitext-103', 'wikitext-2') or a dict of splits paths." ) parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--embed_dim", type=int, default=410, help="Embeddings dim") parser.add_argument("--hidden_dim", type=int, default=2100, help="Hidden dimension") parser.add_argument("--num_max_positions", type=int, default=256, help="Max input length") parser.add_argument("--num_heads", type=int, default=10, help="Number of heads") parser.add_argument("--num_layers", type=int, default=16, help="NUmber of layers") parser.add_argument("--dropout", type=float, default=0.1, help="Dropout") parser.add_argument("--initializer_range", type=float, default=0.02, help="Normal initialization standard deviation") parser.add_argument("--sinusoidal_embeddings", action="store_true", help="Use sinusoidal embeddings") parser.add_argument( "--mlm", action="store_true", help= "Train with masked-language modeling loss instead of language modeling" ) parser.add_argument( "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss") parser.add_argument("--train_batch_size", type=int, default=8, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=8, help="Batch size for validation") parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate") parser.add_argument("--max_norm", type=float, default=0.25, help="Clipping gradient norm") parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay") parser.add_argument("--n_epochs", type=int, default=200, help="Number of training epochs") parser.add_argument("--n_warmup", type=int, default=1000, help="Number of warmup iterations") parser.add_argument("--eval_every", type=int, default=-1, help="Evaluate every X steps (-1 => end of epoch)") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradient") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log on main process only, logger.warning => log on all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat( args)) # This is a logger.info: only printed on the first process # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, model and optimizer") tokenizer = BertTokenizer.from_pretrained( 'bert-base-cased', do_lower_case=False) # Let's use a pre-defined tokenizer args.num_embeddings = len( tokenizer.vocab ) # We need this to create the model at next line (number of embeddings to use) model = TransformerWithLMHead(args) model.to(args.device) optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad)) # Prepare model for distributed training if needed if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders( args, tokenizer) # Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original def mask_tokens(inputs): labels = inputs.clone() masked_indices = torch.bernoulli( torch.full(labels.shape, args.mlm_probability)).byte() labels[~masked_indices] = -1 # We only compute loss on masked tokens indices_replaced = torch.bernoulli(torch.full( labels.shape, 0.8)).byte() & masked_indices inputs[indices_replaced] = tokenizer.vocab[ "[MASK]"] # 80% of the time, replace masked input tokens with [MASK] indices_random = torch.bernoulli(torch.full( labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long, device=args.device) inputs[indices_random] = random_words[ indices_random] # 10% of the time, replace masked input tokens with random word return inputs, labels # Training function and trainer def update(engine, batch): model.train() inputs = batch.transpose(0, 1).contiguous().to( args.device) # to shape [seq length, batch] inputs, labels = mask_tokens(inputs) if args.mlm else ( inputs, inputs) # Prepare masked input/labels if we use masked LM logits, loss = model(inputs, labels=labels) loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): inputs = batch.transpose(0, 1).contiguous().to( args.device) # to shape [seq length, batch] inputs, labels = mask_tokens(inputs) if args.mlm else ( inputs, inputs) # Prepare masked input/labels if we use masked LM logits = model(inputs) shift_logits = logits[:-1] if not args.mlm else logits shift_labels = labels[1:] if not args.mlm else labels return shift_logits.view(-1, 
logits.size(-1)), shift_labels.view(-1) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_every > 0: trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: evaluator.run(val_loader) if engine.state.iteration % args.eval_every == 0 else None) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine schedule cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(train_loader) * args.n_epochs) scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.n_warmup) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we average distributed metrics using average_distributed_scalar metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))} metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) # Let's convert sub-word perplexities in word perplexities. If you need details: http://sjmielke.com/comparing-perplexities.htm metrics["average_word_ppl"] = MetricsLambda( lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words), metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train if args.local_rank in [-1, 0]: checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving( trainer, evaluator, metrics, model, optimizer, args) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint for easy re-loading if args.local_rank in [-1, 0] and args.n_epochs > 0: tb_logger.close()
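# Hedged sketch (toy vocabulary ids, bool masks instead of the .byte() masks used above): the
# standard BERT-style 80/10/10 corruption from mask_tokens. Of the positions selected for
# prediction, 80% become the mask token, 10% a random token, and 10% keep their original id;
# unselected positions get label -1 so CrossEntropyLoss(ignore_index=-1) skips them.
import torch

def toy_mask_tokens(inputs, mask_token_id, vocab_size, mlm_probability=0.15):
    labels = inputs.clone()
    masked = torch.bernoulli(torch.full(labels.shape, mlm_probability)).bool()
    labels[~masked] = -1                                   # only compute loss on masked tokens

    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replaced] = mask_token_id                       # 80% of masked positions -> [MASK]

    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    inputs[randomized] = random_words[randomized]          # 10% -> random token, 10% unchanged
    return inputs, labels

tokens = torch.randint(5, 100, (2, 16))
corrupted, labels = toy_mask_tokens(tokens.clone(), mask_token_id=0, vocab_size=100)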
def __mod__(self, other):
    from ignite.metrics import MetricsLambda
    return MetricsLambda(lambda x, y: x % y, self, other)
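# Like the other operator overloads in this file, __mod__ simply defers to MetricsLambda, which
# re-evaluates the lambda from its parent metrics when the evaluator completes. For context, the
# same mechanism is what lets composite metrics be assembled lazily from other metrics in ignite;
# the F1 example below is standard ignite usage, shown here only as an illustration.
from ignite.metrics import MetricsLambda, Precision, Recall

precision = Precision(average=False)
recall = Recall(average=False)
# Built from other metrics exactly the way __mod__ builds a MetricsLambda
f1 = MetricsLambda(lambda p, r: (2 * p * r / (p + r + 1e-20)).mean().item(),
                   precision, recall)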
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--use_adapter",
                        default=False,
                        action='store_true',
                        help="Use adapter or not")
    parser.add_argument("--keyword_module",
                        type=str,
                        default="",
                        help="add, attention, ")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="bertGpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--bert_model_path",
                        default="./",
                        type=str,
                        help="Bert pre-trained model path")
    parser.add_argument(
        "--vocab_file",
        default="./vocab.korean.rawtext.list",
        type=str,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log on the main process only, logger.warning => log on all processes
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    # tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Load KoBERT model and tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)

    # Load KoGPT2 model and tokenizer
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_kogpt2_model(
        keyword_module=args.keyword_module, use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    # if args.fp16:
    #     from apex import amp  # Apex is only required if we use fp16 training
    #     model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, bert_tokenizer, gpt_tokenizer, gpt_vocab)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        source_ids, target_ids, lm_labels, keyword_scores = batch
        # (lm_loss), *_ = model(input_ids, token_type_ids=token_type_ids, labels=lm_labels)
        (lm_loss), *_ = model(source_ids,
                              target_ids,
                              key_score=keyword_scores,
                              lm_labels=lm_labels)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            source_ids, target_ids, lm_labels, keyword_scores = batch
            # lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids,)
            lm_logits, *_ = model(source_ids,
                                  target_ids,
                                  key_score=keyword_scores)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted), (lm_labels_flat_shifted)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0], x[1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint, args.dataset_path,
                              args.use_adapter, args.keyword_module)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=2)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': model})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        # getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        # tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
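# The shifting in inference above pairs the logits at position t with the label at position t+1,
# because a causal LM is trained to predict the next token. A toy illustration of how the flattened
# shapes line up for CrossEntropyLoss (hypothetical sizes, not tied to this model):
import torch

batch, seq_len, vocab = 2, 5, 11
lm_logits = torch.randn(batch, seq_len, vocab)
lm_labels = torch.randint(vocab, (batch, seq_len))

logits_shifted = lm_logits[..., :-1, :].contiguous().view(-1, vocab)  # (batch*(seq_len-1), vocab)
labels_shifted = lm_labels[..., 1:].contiguous().view(-1)             # (batch*(seq_len-1),)
loss = torch.nn.CrossEntropyLoss(ignore_index=-100)(logits_shifted, labels_shifted)
print(logits_shifted.shape, labels_shifted.shape, loss.item())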
def __div__(self, other):
    from ignite.metrics import MetricsLambda
    return MetricsLambda(lambda x, y: x.__div__(y), self, other)
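# As with __mod__, division on metrics is lazy: the result is a MetricsLambda evaluated once the
# parent metric completes. On Python 3 the `/` operator resolves to __truediv__, so the __div__
# above is mostly reached when called explicitly. A hypothetical usage sketch (illustrative, not
# from this repo):
from ignite.metrics import Loss
import torch

nll = Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))
half_nll = nll / 2            # routed through __truediv__ on Python 3
legacy_half = nll.__div__(2)  # reaches the method above directly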
def MatthewsCorrelationCoefficient(output_transform):
    return MetricsLambda(
        _matthews_correlation_coefficient,
        ConfusionMatrix(num_classes=2, output_transform=output_transform))
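# _matthews_correlation_coefficient itself is not shown in this file. A plausible implementation
# from a 2x2 confusion matrix (an assumption for illustration, not necessarily this repo's code)
# computes MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)):
import math
import torch

def _matthews_correlation_coefficient(cm: torch.Tensor) -> float:
    # cm is assumed to be a 2x2 confusion matrix with rows = actual, columns = predicted
    tn, fp = cm[0, 0].item(), cm[0, 1].item()
    fn, tp = cm[1, 0].item(), cm[1, 1].item()
    denom = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return (tp * tn - fp * fn) / denom if denom != 0 else 0.0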
def train(args):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=False)
    args.num_embeddings = len(
        tokenizer.vocab
    )  # We need this to create the model at next line (number of embeddings to use)
    model = TransformerWithLMHead(args)
    model.to(args.device)
    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)
    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders(
        args, tokenizer)

    # Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original
    def mask_tokens(inputs):
        labels = inputs.clone()
        masked_indices = torch.bernoulli(
            torch.full(labels.shape, args.mlm_probability)).byte()
        labels[~masked_indices] = -1  # We only compute loss on masked tokens
        indices_replaced = torch.bernoulli(torch.full(
            labels.shape, 0.8)).byte() & masked_indices
        inputs[indices_replaced] = tokenizer.vocab[
            "[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
        indices_random = torch.bernoulli(torch.full(
            labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
        random_words = torch.randint(args.num_embeddings,
                                     labels.shape,
                                     dtype=torch.long,
                                     device=args.device)
        inputs[indices_random] = random_words[
            indices_random]  # 10% of the time, replace masked input tokens with a random word
        return inputs, labels

    def update(engine, batch):
        model.train()
        inputs = batch.transpose(0, 1).contiguous().to(args.device)
        inputs, labels = mask_tokens(inputs) if args.mlm else (inputs, inputs)
        logits, loss = model(inputs, labels=labels)
        loss = loss / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            inputs = batch.transpose(0, 1).contiguous().to(args.device)
            inputs, labels = mask_tokens(inputs) if args.mlm else (
                inputs, inputs)  # Prepare masked input/labels if we use masked LM
            logits = model(inputs)
            shift_logits = logits[:-1] if not args.mlm else logits
            shift_labels = labels[1:] if not args.mlm else labels
            return shift_logits.view(-1, logits.size(-1)), shift_labels.view(-1)

    evaluator = Engine(inference)

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(val_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))

    # Learning rate schedule: linearly warm up to lr and then decrease the learning rate to zero with a cosine schedule
    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                             len(train_loader) * args.n_epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                                args.n_warmup)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we average distributed metrics using average_distributed_scalar
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    # Let's convert sub-word perplexities into word perplexities. If you need details: http://sjmielke.com/comparing-perplexities.htm
    metrics["average_word_ppl"] = MetricsLambda(
        lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words),
        metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train
    if args.local_rank in [-1, 0]:
        checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving(
            trainer, evaluator, metrics, model, optimizer, args)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)
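# The word-level perplexity conversion above re-scales the per-sub-word NLL by the ratio of
# sub-word tokens to whitespace words before exponentiating, so that models with different
# tokenizers stay comparable. A small numeric sketch with made-up counts:
import math

avg_nll_per_subword = 2.0       # value of the "average_nll" metric
num_subword_tokens = 130_000    # plays the role of val_loader.dataset.numel()
num_words = 100_000             # plays the role of valid_num_words

subword_ppl = math.exp(avg_nll_per_subword)                                 # ~7.39
word_ppl = math.exp(avg_nll_per_subword * num_subword_tokens / num_words)   # ~13.46
print(subword_ppl, word_ppl)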
def train():
    args = parser.parse_args()

    if not os.path.exists(
            args.output_dir
    ):  # !!!NOTICE: change the output dir for each different setting.
        os.makedirs(args.output_dir)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log on the main process only, logger.warning => log on all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES["gpt2"]
    config = config_class.from_pretrained(args.model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    tokenizer.add_tokens(SPECIAL_TOKENS)

    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    # model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)

    # Prepare optimizer and schedule (linear warmup and decay)
    # optimizer = OpenAIAdam(model.parameters(), lr=args.lr)
    optimizer = AdamW(model.parameters(), lr=args.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        cur_input_ids = batch[0]
        cur_lm_labels = batch[1]
        cur_token_type_ids = batch[2]
        model_outputs = model(input_ids=cur_input_ids,
                              labels=cur_lm_labels,
                              token_type_ids=cur_token_type_ids)
        lm_loss = model_outputs[0]
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, lm_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, :].tolist()))
            model_outputs = model(input_ids, token_type_ids=token_type_ids)
            lm_logits = model_outputs[0]
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        # tb_logger = TensorboardLogger(log_dir=args.output_dir)
        # tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        # tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(
            args.output_dir, 'checkpoint', save_interval=1, n_saved=3
        )  # !!!NOTICE: if the file already exists this raises an error; set require_empty=False to avoid it.
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, args.output_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(args.output_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(args.output_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(args.output_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
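# The PiecewiseLinear handler used above interpolates the learning rate between the given
# (event_index, value) milestones, so with milestones (0, lr) and (n_epochs * len(train_loader), 0.0)
# the rate decays linearly to zero over all iterations. A small standalone sketch of that
# interpolation (illustrative numbers, not the ignite class itself):
def linear_lr(step, lr, total_steps):
    # value interpolated between milestone (0, lr) and (total_steps, 0.0)
    return lr * max(0.0, 1.0 - step / total_steps)

total = 3 * 1000  # e.g. n_epochs * len(train_loader)
print(linear_lr(0, 6.25e-5, total),
      linear_lr(total // 2, 6.25e-5, total),
      linear_lr(total, 6.25e-5, total))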
def train(dataset_path,
          dataset_cache='./dataset_cache',
          model_checkpoint='gpt2',
          num_candidates=2,
          max_history=2,
          train_batch_size=4,
          valid_batch_size=4,
          gradient_accumulation_steps=8,
          lr=6.25e-5,
          lm_coef=1.0,
          mc_coef=1.0,
          max_norm=1.0,
          n_epochs=3,
          personality_permutations=1,
          eval_before_start=False,
          device="cuda" if torch.cuda.is_available() else "cpu",
          fp16='',
          path_prefix='',
          log_dir='',
          local_rank=-1):
    args = {**locals()}

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log on the main process only, logger.warning => log on all processes
    logging.basicConfig(
        level=logging.INFO if local_rank in [-1, 0] else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning("Running process %d", local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    distributed = (local_rank != -1)
    args['distributed'] = distributed
    if distributed:
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer_class = GPT2Tokenizer if "gpt2" in model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(model_checkpoint)
    model.to(device)

    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16)
    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[local_rank],
                                        output_device=local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        dataset_path, dataset_cache, num_candidates, personality_permutations,
        max_history, train_batch_size, valid_batch_size, distributed, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids,
                                         token_type_ids=token_type_ids,
                                         mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels,
                                         lm_labels=lm_labels)
        loss = (lm_loss * lm_coef +
                mc_loss * mc_coef) / gradient_accumulation_steps
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        if engine.state.iteration % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, lr), (n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], local_rank,
                      device),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"],
                      local_rank, device)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = log_dir if log_dir else make_logdir(model_checkpoint,
                                                      path=path_prefix)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_bin')
        getattr(model, 'module',
                model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if local_rank in [-1, 0] and n_epochs > 0:
        # TODO: PR in ignite to have better access to saved file paths (cleaner)
        os.rename(checkpoint_handler._saved[-1][1][-1],
                  os.path.join(log_dir, WEIGHTS_NAME))
        tb_logger.close()
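# average_distributed_scalar is used by the metrics in all of these training functions but is not
# defined in this section. A typical implementation (an assumption based on its call signature
# here: scalar, local_rank, device) all-reduces the scalar and divides by the world size so every
# process reports the same average; this is a sketch, not necessarily the repo's exact helper.
import torch
import torch.distributed as dist

def average_distributed_scalar(scalar, local_rank, device):
    # Average a python scalar over all distributed workers; a no-op outside distributed mode
    if local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=device) / dist.get_world_size()
    dist.all_reduce(scalar_t, op=dist.ReduceOp.SUM)
    return scalar_t.item()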