def test_no_torch_utils_tensorboard_package(dirname):
    """Check which writer TensorboardLogger uses without torch.utils.tensorboard.

    ``torch.utils.tensorboard`` is masked in ``sys.modules`` (a ``None`` entry
    makes importing it fail), and the logger's writer is then expected to be
    tensorboardX's ``SummaryWriter``.

    Args:
        dirname: directory passed to the logger as ``log_dir``.
    """
    from tensorboardX import SummaryWriter

    with patch.dict("sys.modules", {"torch.utils.tensorboard": None}):
        tb_logger = TensorboardLogger(log_dir=dirname)
        try:
            assert isinstance(tb_logger.writer, SummaryWriter), type(tb_logger.writer)
        finally:
            # Close the writer even when the assertion fails so the test
            # does not leave an open event file behind.
            tb_logger.close()
def add_tensorboard(engine_train, optimizer, model, log_dir):
    """Create an ignite TensorBoard logger and attach training handlers to it.

    The logger records the training loss after every iteration and, after
    every epoch, the optimizer parameters plus histograms of the model's
    weights and gradients. The logger is closed when ``engine_train`` fires
    ``Events.COMPLETED`` instead of immediately after the handlers are
    attached, so it is not closed before the run has logged anything.

    Args:
        engine_train (:obj:`ignite.engine`): the train engine to attach to the logger
        optimizer (:obj:`torch.optim`): the model's optimizer
        model (:obj:`torch.nn.Module`): the model being trained
        log_dir (string): path to where tensorboard data should be saved
    """
    # Create a logger
    tb_logger = TensorboardLogger(log_dir=log_dir)

    # Log the training loss at each iteration
    tb_logger.attach(
        engine_train,
        log_handler=OutputHandler(
            tag="training",
            output_transform=lambda loss: {"loss": loss},
        ),
        event_name=Events.ITERATION_COMPLETED,
    )

    # Log the optimizer's parameters, e.g. learning rate, after each epoch
    tb_logger.attach(
        engine_train,
        log_handler=OptimizerParamsHandler(optimizer),
        event_name=Events.EPOCH_COMPLETED,
    )

    # Log the model's weights as a histogram after each epoch
    tb_logger.attach(
        engine_train,
        log_handler=WeightsHistHandler(model),
        event_name=Events.EPOCH_COMPLETED,
    )

    # Log the model's gradients as a histogram after each epoch
    tb_logger.attach(
        engine_train,
        log_handler=GradsHistHandler(model),
        event_name=Events.EPOCH_COMPLETED,
    )

    # Close the logger once the engine has finished its run.
    engine_train.add_event_handler(Events.COMPLETED, lambda _: tb_logger.close())
def setup_training_loggers(name, level=logging.INFO):
    """Build the text logger, TensorBoard logger and checkpoint handler for a run.

    Directories named after ``name`` are created under ``cfg.tb_log_folder``
    and ``cfg.checkpoint_log_folder``. ``cfg.validation_log_folder`` is
    created as well before the log file is opened, so a missing folder no
    longer makes ``logging.FileHandler`` fail.

    Args:
        name: run name; used as the logger name, the sub-directory name and
            the ``<name>.log`` file name.
        level: logging level applied to the logger when it is first set up.

    Returns:
        Tuple ``(logger, tb_logger, checkpoint_handler)``.
    """
    log_setup = logging.getLogger(name)

    tb_dir = os.path.join(cfg.tb_log_folder, name)
    os.makedirs(tb_dir, exist_ok=True)
    tb_logger = TensorboardLogger(tb_dir)

    checkpoint_dir = os.path.join(cfg.checkpoint_log_folder, name)
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_handler = ModelCheckpoint(
        checkpoint_dir,
        'checkpoint',
        save_interval=1,
        n_saved=3,
        require_empty=False,
    )

    if len(log_setup.handlers) == 2:
        # Logger already set up for current run (file + console handlers)
        return log_setup, tb_logger, checkpoint_handler

    # The two folders above are created on demand; do the same for the
    # folder that receives the text log.
    os.makedirs(cfg.validation_log_folder, exist_ok=True)
    file_handler = logging.FileHandler(
        os.path.join(cfg.validation_log_folder, name + '.log'), mode='a')
    formatter = logging.Formatter('%(levelname)s: %(asctime)s %(message)s',
                                  datefmt='%m/%d/%Y %I:%M:%S %p')
    file_handler.setFormatter(formatter)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)

    log_setup.setLevel(level)
    log_setup.addHandler(file_handler)
    log_setup.addHandler(console_handler)

    # logging.getLogger(name) returns this same logger object.
    return log_setup, tb_logger, checkpoint_handler
def get_tensorboard_logger(trainer: Engine, evaluators: ThreeEvaluators, metric_names: List[str]) -> TensorboardLogger:
    """
    creates a ``tensorboard`` logger which read metrics from given evaluators and attaches it to a given trainer

    The log directory is ``runs/<timestamp>``. The timestamp is the current
    time in ISO format with ``:`` replaced by ``-``, because colons are not
    valid in file names on Windows.

    :param trainer: an ``ignite`` trainer to attach to
    :param evaluators: a triple of train, validation, and test evaluators to get metrics from
    :param metric_names: a list of metrics to log during validation and testing
    :return: the logger, with all handlers attached
    """
    timestamp = datetime.now().isoformat().replace(":", "-")
    tb_logger = TensorboardLogger(log_dir=f"runs/{timestamp}", flush_secs=1)

    # All three handlers use the trainer's step as the global step.
    training_loss = OutputHandler(
        "training",
        ["running_loss"],
        global_step_transform=global_step_from_engine(trainer),
    )
    tb_logger.attach(trainer, training_loss, Events.EPOCH_COMPLETED)

    validation_loss = OutputHandler(
        "validation",
        metric_names,
        global_step_transform=global_step_from_engine(trainer),
    )
    tb_logger.attach(evaluators.validation, validation_loss, Events.COMPLETED)

    test_loss = OutputHandler(
        "test",
        metric_names,
        global_step_transform=global_step_from_engine(trainer),
    )
    tb_logger.attach(evaluators.test, test_loss, Events.COMPLETED)

    return tb_logger
def add_logging_and_checkpoint_saving(trainer, evaluator, metrics, model, optimizer, args, prefix=""):
    """ Add to training engine tensorboard logging, progress bar with average loss, checkpoint saving and save
        training config.

    Args:
        trainer: training engine; its output is averaged as ``prefix + "loss"``.
        evaluator: evaluation engine whose metrics are printed and written to
            TensorBoard when it completes.
        metrics: mapping whose keys name the evaluator metrics to log.
        model: model to checkpoint (unwrapped through ``.module`` if present).
        optimizer: optimizer whose parameters are logged.
        args: training configuration, saved with ``torch.save``.
        prefix: string prepended to the ``"loss"`` metric name.

    Returns:
        Tuple ``(checkpoint_handler, tb_logger)``.
    """
    loss_name = prefix + "loss"

    # Add progress bar with average loss
    RunningAverage(output_transform=lambda x: x).attach(trainer, loss_name)
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=[loss_name])
    evaluator.add_event_handler(
        Events.COMPLETED,
        lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    # Add tensorboard logging with training and evaluation metrics
    tb_logger = TensorboardLogger(log_dir=None)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training", metric_names=[loss_name]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)

    @evaluator.on(Events.COMPLETED)
    def tb_log_metrics(engine):
        # Evaluation metrics are plotted against the trainer's iteration.
        for name in metrics:
            tb_logger.writer.add_scalar(name, engine.state.metrics[name], trainer.state.iteration)

    # torch.utils.tensorboard's SummaryWriter stores its directory as
    # ``log_dir`` while tensorboardX's stores it as ``logdir``; accept either.
    writer = tb_logger.writer
    log_dir = getattr(writer, 'log_dir', None) or writer.logdir

    # Add checkpoint saving after each epoch - take care of distributed encapsulation ('getattr()')
    checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                              {'mymodel': getattr(model, 'module', model)})

    # Save training configuration
    torch.save(args, os.path.join(log_dir, CONFIG_NAME))

    return checkpoint_handler, tb_logger
def test_integration_as_context_manager(dirname):
    """Run a short training loop with TensorboardLogger used as a context manager.

    A dummy handler writes one scalar per completed epoch; afterwards the
    test checks that at least one ``tfevents`` file exists in ``dirname``.
    """
    n_epochs = 5
    data = list(range(50))
    # One random "loss" per iteration of the whole run.
    losses_iter = iter(torch.rand(n_epochs * len(data)))

    def update_fn(engine, batch):
        return next(losses_iter)

    def dummy_handler(engine, logger, event_name):
        step = engine.state.get_event_attrib_value(event_name)
        logger.writer.add_scalar("test_value", step, step)

    with TensorboardLogger(log_dir=dirname) as tb_logger:
        trainer = Engine(update_fn)
        tb_logger.attach(trainer, log_handler=dummy_handler, event_name=Events.EPOCH_COMPLETED)
        trainer.run(data, max_epochs=n_epochs)

    # Check if event files are present
    event_files = [entry for entry in os.listdir(dirname) if "tfevents" in entry]
    assert len(event_files) > 0
def custom_setup(self):
    """Attach TensorBoard logging and embedding export to the trainer.

    When ``self.tensorboard_logs`` is set, a TensorboardLogger records the
    training loss per iteration and the validation ``LossMetric`` per epoch;
    with ``self.optional_tensorboard_features`` it also records optimizer
    parameters, weight scalars/histograms and gradient scalars. The logger is
    closed when training completes. When ``self.embeddings_name`` is set, the
    named embedding matrix is written through ``self.writer`` at completion.
    """
    if self.tensorboard_logs:
        tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)

        training_handler = OutputHandler(
            tag="training", output_transform=lambda loss: {'loss': loss})
        tb_logger.attach(self.trainer,
                         log_handler=training_handler,
                         event_name=Events.ITERATION_COMPLETED)

        validation_handler = OutputHandler(
            tag="validation", metric_names=["LossMetric"], another_engine=self.trainer)
        tb_logger.attach(self.evaluator,
                         log_handler=validation_handler,
                         event_name=Events.EPOCH_COMPLETED)

        if self.optional_tensorboard_features:
            tb_logger.attach(self.trainer,
                             log_handler=OptimizerParamsHandler(self.optimizer),
                             event_name=Events.ITERATION_STARTED)
            # The remaining optional handlers all observe the model.
            for handler_cls, event in (
                    (WeightsScalarHandler, Events.ITERATION_COMPLETED),
                    (WeightsHistHandler, Events.EPOCH_COMPLETED),
                    (GradsScalarHandler, Events.ITERATION_COMPLETED),
            ):
                tb_logger.attach(self.trainer,
                                 log_handler=handler_cls(self.model),
                                 event_name=event)

        # This is important to close the tensorboard file logger
        @self.trainer.on(Events.COMPLETED)
        def end_tensorboard(trainer):
            logger.info("Training completed")
            tb_logger.close()

    if self.embeddings_name:

        @self.trainer.on(Events.COMPLETED)
        def log_embeddings(trainer):
            has_embeddings = hasattr(self.model, self.embeddings_name)
            if not (has_embeddings
                    and hasattr(self.dataset_splits, "vectorizer")
                    and TENSORBOARD):
                return

            logger.info(
                f"Logging embeddings ({self.embeddings_name}) to Tensorboard!"
            )
            embeddings = getattr(self.model, self.embeddings_name).weight.data
            id2token = self.dataset_splits.vectorizer.data_vocab._id2token
            # One UTF-8 encoded label per embedding row.
            metadata = [
                str(id2token[token_index]).encode('utf-8')
                for token_index in range(embeddings.shape[0])
            ]
            self.writer.add_embedding(mat=embeddings,
                                      metadata=metadata,
                                      global_step=self.trainer.state.epoch)
def test_no_tensorboardX_nor_torch_utils_tensorboard():
    """TensorboardLogger must raise RuntimeError when no writer backend imports.

    Both ``tensorboardX`` and ``torch.utils.tensorboard`` are masked in
    ``sys.modules`` so neither can be imported.
    """
    masked_modules = {"tensorboardX": None, "torch.utils.tensorboard": None}
    expected_message = r"This contrib module requires either tensorboardX or torch"

    with patch.dict("sys.modules", masked_modules), \
            pytest.raises(RuntimeError, match=expected_message):
        TensorboardLogger(log_dir=None)
def setup_ignite(engine: Engine, params: SimpleNamespace, exp_source, run_name: str, extra_metrics: Iterable[str] = ()):
    """Attach episode tracking, console reporting and TensorBoard logging to ``engine``.

    Args:
        engine: the training engine to instrument.
        params: settings; ``stop_reward`` and ``run_name`` are read here.
        exp_source: experience source handed to ``EndOfEpisodeHandler``.
        run_name: suffix used in the TensorBoard log directory name.
        extra_metrics: additional metric names logged under the ``train`` tag.
    """
    simplefilter("ignore", category=UserWarning)

    EndOfEpisodeHandler(exp_source, bound_avg_reward=params.stop_reward).attach(engine)
    EpisodeFPSHandler().attach(engine)

    def episode_completed(trainer: Engine):
        state = trainer.state
        passed = state.metrics.get("time_passed", 0)
        print(
            "Episode {}: reward={:.0f}, steps={}, speed={:.1f} f/s, elapsed={}"
            .format(state.episode, state.episode_reward, state.episode_steps,
                    state.metrics.get("avg_fps", 0),
                    timedelta(seconds=int(passed))))

    def game_solved(trainer: Engine):
        passed = trainer.state.metrics["time_passed"]
        print(
            f"Game solved in {timedelta(seconds=int(passed))} after {trainer.state.episode}"
            f" episodes and {trainer.state.iteration} iterations!")
        # Stop the run once the reward bound has been reached.
        trainer.should_terminate = True

    engine.add_event_handler(EpisodeEvents.EPISODE_COMPLETED, episode_completed)
    engine.add_event_handler(EpisodeEvents.BOUND_REWARD_REACHED, game_solved)

    # Minute-resolution timestamp with ":" replaced by "-" in the directory name.
    stamp = datetime.now().isoformat(timespec="minutes").replace(":", "-")
    tb = TensorboardLogger(log_dir=f"runs/{stamp}-{params.run_name}-{run_name}")

    RunningAverage(output_transform=lambda v: v["loss"]).attach(engine, "avg_loss")

    episode_handler = OutputHandler(tag="episodes",
                                    metric_names=["reward", "steps", "avg_reward"])
    tb.attach(engine,
              log_handler=episode_handler,
              event_name=EpisodeEvents.EPISODE_COMPLETED)

    # write to tensorboard every 100 iterations
    PeriodicEvents().attach(engine)
    train_metrics = ["avg_loss", "avg_fps", *extra_metrics]
    train_handler = OutputHandler(tag="train",
                                  metric_names=train_metrics,
                                  output_transform=lambda a: a)
    tb.attach(engine,
              log_handler=train_handler,
              event_name=PeriodEvents.ITERS_100_COMPLETED)
def train():
    """Train ``Bert_SQG`` with an ignite engine, logging to TensorBoard and checkpointing.

    The learning rate decays linearly from 3e-5 to 0 via ``PiecewiseLinear``.
    The running loss is shown in a progress bar and written to ``./logs``, and
    a checkpoint is saved to ``./checkpoint`` after every epoch. The
    TensorBoard logger is closed when the run ends, including on failure.

    Reads ``EPOCHS``, ``BATCH_SIZE``, ``ITERATION_STEP`` and ``criterion``
    from outside this function.
    """
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    model = Bert_SQG()
    optimizer = AdamW(model.parameters(), lr=3e-5)
    ds = dataloader.BertSQG_DataClass()
    # NOTE(review): the loader uses a literal batch size of 4 while the
    # scheduler horizon below is computed from BATCH_SIZE - confirm they agree.
    dl = DataLoader(ds, num_workers=4, batch_size=4)
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, 3e-5), (EPOCHS * len(ds) // BATCH_SIZE, 0.0)])

    def update(engine, batch):
        model.train()
        # Each element of the batch is the input and the next one its target.
        for i in range(0, len(batch) - 1):
            x = batch[i].to(device)
            y = batch[i + 1].to(device)
            y_prime = model(x)
            # Scale the loss so accumulated gradients average over ITERATION_STEP.
            loss = criterion(y_prime[-1], y[-1]) / ITERATION_STEP
            loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Gradient accumulation: only step every ITERATION_STEP iterations.
        if engine.state.iteration % ITERATION_STEP == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])

    tb_logger = TensorboardLogger(log_dir='./logs')
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)

    checkpoint_handler = ModelCheckpoint('./checkpoint', '_checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                              {'bert_sqg': getattr(model, 'module', model)})

    try:
        trainer.run(dl, max_epochs=EPOCHS)
    finally:
        # The original called the misspelled ``tb_loger.close()``, which
        # raised NameError after training and never closed the logger.
        tb_logger.close()
def add_tensorboard_logging(self, logging_dir=None):
    """Create ``self.tb_logger`` and attach loss, metric and histogram handlers.

    Logs are written to a ``tb_logs`` sub-directory of ``logging_dir``, or of
    ``self.config.DIRS.WORKING_DIR`` when ``logging_dir`` is None. The
    original computed these paths but discarded them.

    Args:
        logging_dir: base directory for the TensorBoard logs; defaults to the
            configured working directory.
    """
    # Add TensorBoard logging
    base_dir = self.config.DIRS.WORKING_DIR if logging_dir is None else logging_dir
    logging_dir = os.path.join(base_dir, 'tb_logs')

    print('Tensorboard logging saving to:: {} ...'.format(logging_dir), end='')

    self.tb_logger = TensorboardLogger(log_dir=logging_dir)

    # Logging iteration loss
    self.tb_logger.attach_output_handler(
        engine=self.train_engine,
        event_name=Events.ITERATION_COMPLETED,
        tag='training',
        output_transform=lambda loss: {"batch loss": loss})

    epoch_metric_names = ["loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"]

    # Logging epoch training metrics
    self.tb_logger.attach_output_handler(
        engine=self.train_evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="training",
        metric_names=list(epoch_metric_names),
        global_step_transform=global_step_from_engine(self.train_engine),
    )

    # Logging epoch validation metrics
    self.tb_logger.attach_output_handler(
        engine=self.evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="validation",
        metric_names=list(epoch_metric_names),
        global_step_transform=global_step_from_engine(self.train_engine),
    )

    # Attach the logger to the trainer to log model's weights as a histogram after each epoch
    self.tb_logger.attach(self.train_engine,
                          event_name=Events.EPOCH_COMPLETED,
                          log_handler=WeightsHistHandler(self.model))

    # Attach the logger to the trainer to log model's gradients as a histogram after each epoch
    self.tb_logger.attach(self.train_engine,
                          event_name=Events.EPOCH_COMPLETED,
                          log_handler=GradsHistHandler(self.model))

    print('Tensorboard Logging...', end='')
    print('done')
def __init__(self, name, model, log_dir, lr, lr_decay_step, adam=False):
    """
    Set up the engine that trains the given model.

    :param name: The name of the model to be trained.
    :param model: The model to be trained.
    :param log_dir: String. The log directory of the tensorboard.
    :param lr: Float. The learning rate.
    :param lr_decay_step: Integer. The amount of steps the learning rate decays.
    :param adam: Bool. Whether to use adam optimizer or not.
    """
    super(Trainer, self).__init__(self.update_model)
    self.model = model

    # tqdm
    ProgressBar(persist=True).attach(self)

    # Optimizer: only parameters that require gradients are optimised.
    trainable = [param for param in model.parameters() if param.requires_grad]
    self.optimizer = (torch.optim.Adam(trainable, lr=lr) if adam
                      else torch.optim.SGD(trainable, lr=lr, momentum=0.9))

    # Scheduler: step decay (x0.1) applied at the end of each epoch.
    if lr_decay_step > 0:
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer, step_size=lr_decay_step, gamma=0.1)
        self.add_event_handler(Events.EPOCH_COMPLETED, lambda e: e.scheduler.step())
    else:
        self.scheduler = None

    # Terminate if nan values found
    self.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    def _rounded_losses(_output):
        # Reads the engine's own state output, as the original lambda did.
        return {
            'rpn_box_loss': round(self.state.output['loss_rpn_box_reg'].item(), 4),
            'rpn_cls_loss': round(self.state.output['loss_objectness'].item(), 4),
            'roi_box_loss': round(self.state.output['loss_box_reg'].item(), 4),
            'roi_cls_loss': round(self.state.output['loss_classifier'].item(), 4),
        }

    # Tensorboard logging
    self.tb_logger = TensorboardLogger(log_dir=os.path.join(log_dir, name))
    self.add_event_handler(Events.COMPLETED, lambda x: self.tb_logger.close())
    self.tb_logger.attach(self,
                          log_handler=OptimizerParamsHandler(self.optimizer),
                          event_name=Events.EPOCH_COMPLETED)
    self.tb_logger.attach(self,
                          log_handler=OutputHandler(tag='training',
                                                    output_transform=_rounded_losses),
                          event_name=Events.EPOCH_COMPLETED)

    # Run on GPU (cuda) if available
    if torch.cuda.is_available():
        torch.cuda.set_device(int(get_free_gpu()))
        model.cuda(torch.cuda.current_device())
def evaluate(config):
    """Load the model checkpoint and run every evaluation datastream through it.

    Args:
        config: mapping providing ``use_cuda``, ``eval_batch_size`` and
            ``n_workers``.
    """
    device = torch.device('cuda' if config['use_cuda'] else 'cpu')
    model = architecture.Model().to(device)

    print('Loading model checkpoint')
    workflow.ignite.handlers.ModelCheckpoint.load(
        {'model': model}, 'model/checkpoints', device
    )

    @workflow.ignite.decorators.evaluate(model)
    def evaluate_batch(engine, examples):
        features = architecture.FeatureBatch.from_examples(examples)
        predictions = model.predictions(features)
        loss = predictions.loss(examples)
        return {
            'examples': examples,
            'predictions': predictions.cpu().detach(),
            'loss': loss,
        }

    # One data loader per evaluation datastream, keyed "evaluate_<name>".
    evaluate_data_loaders = {
        f'evaluate_{stream_name}': stream.data_loader(
            batch_size=config['eval_batch_size'],
            num_workers=config['n_workers'],
            collate_fn=tuple,
        )
        for stream_name, stream in datastream.evaluate_datastreams().items()
    }

    tensorboard_logger = TensorboardLogger(log_dir='tb')

    for description, loader in evaluate_data_loaders.items():
        engine = evaluator(
            evaluate_batch,
            description,
            metrics.evaluate_metrics(),
            tensorboard_logger,
        )
        engine.run(data=loader)
def logging_board(model_name="densenet121"):
    """Attach TensorBoard logging for the module-level ``trainer`` and ``val_evaluator``.

    Logs go to ``board/<model_name>``. The logger is closed when ``trainer``
    fires ``Events.COMPLETED``; the original closed it right after attaching
    the handlers, before any training event could be logged.

    Args:
        model_name: name of the sub-directory under ``board/``.
    """
    from ignite.contrib.handlers.tensorboard_logger import (
        TensorboardLogger,
        OutputHandler,
        OptimizerParamsHandler,
        GradsHistHandler,
    )

    tb_logger = TensorboardLogger("board/" + model_name)

    # Training loss after every iteration.
    tb_logger.attach(
        trainer,
        log_handler=OutputHandler(
            tag="training", output_transform=lambda loss: {"loss": loss}),
        event_name=Events.ITERATION_COMPLETED,
    )

    # Validation metrics, reported against the trainer engine.
    tb_logger.attach(
        val_evaluator,
        log_handler=OutputHandler(
            tag="validation",
            metric_names=["accuracy", "loss"],
            another_engine=trainer,
        ),
        event_name=Events.EPOCH_COMPLETED,
    )

    # Optimizer parameters at the start of every iteration.
    tb_logger.attach(
        trainer,
        log_handler=OptimizerParamsHandler(IGTrainer.optimizer),
        event_name=Events.ITERATION_STARTED,
    )

    # Gradient histograms after every epoch.
    tb_logger.attach(
        trainer,
        log_handler=GradsHistHandler(IGTrainer.model),
        event_name=Events.EPOCH_COMPLETED,
    )

    # Close the logger once the training run has completed.
    trainer.add_event_handler(Events.COMPLETED, lambda _: tb_logger.close())
def setup(self, training_metrics):
    """Wire progress bars, validation/test evaluation and TensorBoard logging.

    Args:
        training_metrics: mapping of metric name to metric; each is wrapped
            in a ``RunningAverage`` attached to the trainer as ``r<name>``.
    """

    def metric_name(n) -> str:
        """Shorten a metric key: ``*Accuracy`` -> ``acc``; strip a ``Metric`` suffix."""
        if n.endswith('Accuracy'):
            return 'acc'
        return n[:-6] if n.endswith('Metric') else n

    def print_metrics(metrics) -> str:
        """Render metrics sorted by key as ``name: value`` entries joined by `` | ``.

        The original concatenated the entries with no separator, so
        consecutive metrics ran into each other in the log line.
        """
        parts = []
        for k in sorted(metrics):
            # Accuracy is shown with 3 significant digits, the rest with 6.
            precision = '.3' if k == 'Accuracy' else '.6'
            parts.append(f'{metric_name(k)}: {metrics[k]:{precision}}')
        return ' | '.join(parts)

    # NOTE(review): a seed of 0 is falsy and therefore skipped here - confirm intended.
    if self.seed:
        set_seed_everywhere(self.seed, self.cuda)

    pbar = ProgressBar()
    names = []
    for k, v in training_metrics.items():
        name = f'r{k}'
        names.append(name)
        RunningAverage(v).attach(self.trainer, name)
    # Running loss: last element of the trainer output, scaled by the
    # accumulation steps.
    RunningAverage(
        None,
        output_transform=lambda x: x[-1] * self.loss_accumulation_steps,
    ).attach(self.trainer, 'rloss')
    names.append('rloss')
    pbar.attach(self.trainer, names)

    pbar = ProgressBar()
    pbar.attach(self.evaluator)

    # A few events handler. To add / modify the events handler, you need to extend the __init__ method of RunnerABC
    # Ignite provides the necessary abstractions and a furnished repository of useful tools

    @self.trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        self.evaluator.run(self.dataset_splits.val_data_loader())
        metrics = self.evaluator.state.metrics
        logger.info(
            f"Validation Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
        )
        if self.scheduler:
            # The scheduler is stepped with the metric named after the
            # loss metric's class.
            self.scheduler.step(metrics[self.loss_metric.__class__.__name__])

    @self.trainer.on(Events.COMPLETED)
    def log_test_results(trainer):
        self.evaluator.run(self.dataset_splits.test_data_loader())
        metrics = self.evaluator.state.metrics
        logger.info(
            f"Test Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
        )

    if self.tensorboard_logs:
        tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)
        tb_logger.attach(self.trainer,
                         log_handler=OutputHandler(
                             tag="training",
                             output_transform=lambda loss: {'loss': loss}),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(self.evaluator,
                         log_handler=OutputHandler(
                             tag="validation",
                             metric_names=["LossMetric"],
                             another_engine=self.trainer),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=OptimizerParamsHandler(self.optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(self.trainer,
                         log_handler=WeightsScalarHandler(self.model),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=WeightsHistHandler(self.model),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=GradsScalarHandler(self.model),
                         event_name=Events.ITERATION_COMPLETED)

        # This is important to close the tensorboard file logger
        @self.trainer.on(Events.COMPLETED)
        def end_tensorboard(trainer):
            logger.info("Training completed")
            tb_logger.close()

    if self.embeddings_name:

        @self.trainer.on(Events.COMPLETED)
        def log_embeddings(trainer):
            if hasattr(self.model, self.embeddings_name) and hasattr(
                    self.dataset_splits, "vectorizer"):
                logger.info(
                    f"Logging embeddings ({self.embeddings_name}) to Tensorboard!"
                )
                embeddings = getattr(self.model, self.embeddings_name).weight.data
                # One UTF-8 encoded label per embedding row.
                metadata = [
                    str(self.dataset_splits.vectorizer.data_vocab.
                        _id2token[token_index]).encode('utf-8')
                    for token_index in range(embeddings.shape[0])
                ]
                self.writer.add_embedding(
                    mat=embeddings,
                    metadata=metadata,
                    global_step=self.trainer.state.epoch)
def _create_tb_logger(self) -> TensorboardLogger:
    """Return a TensorboardLogger writing to ``<logfolder>/<timestamp>-<run_name>``.

    The timestamp is the current time in ISO format at minute resolution
    with the colons removed.
    """
    timestamp = datetime.now().isoformat(timespec='minutes').replace(":", "")
    return TensorboardLogger(log_dir=f"{self.logfolder}/{timestamp}-{self.run_name}")
# Prepare metrics - note how we compute distributed metrics
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
nll_metric = Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0][0], x[1][0]))
metrics = {"nll": nll_metric}
metrics["average_nll"] = MetricsLambda(average_distributed_scalar, metrics["nll"], args)
metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
for metric_key, metric_obj in metrics.items():
    metric_obj.attach(evaluator, metric_key)

# On the main process: add progress bar, tensorboard, checkpoints and save model,
# configuration and tokenizer before we start to train
if args.local_rank in (-1, 0):
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    evaluator.add_event_handler(
        Events.COMPLETED,
        lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    log_dir = make_logdir(args, args.model_checkpoint)
    tb_logger = TensorboardLogger(log_dir)

    training_handler = OutputHandler(tag="training", metric_names=["loss"])
    tb_logger.attach(trainer, log_handler=training_handler, event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    validation_handler = OutputHandler(tag="validation",
                                       metric_names=list(metrics.keys()),
                                       another_engine=trainer)
    tb_logger.attach(evaluator, log_handler=validation_handler, event_name=Events.EPOCH_COMPLETED)

    # "getattr" takes care of distributed encapsulation
    unwrapped_model = getattr(model, 'module', model)

    checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': unwrapped_model})

    # Persist the arguments, model config and tokenizer next to the checkpoints.
    torch.save(args, log_dir + '/model_training_args.bin')
    unwrapped_model.config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
    tokenizer.save_pretrained(log_dir)

# Run the training
trainer.run(train_loader, max_epochs=args.n_epochs)
def train():
    """Fine-tune a BERT or RoBERTa sequence classifier from command-line arguments.

    Parses the arguments, builds the tokenizer, model, optimizer and
    scheduler, trains with an ignite engine, and evaluates on the "cornell"
    and "spont" validation subsets after every epoch. Metrics go to a
    progress bar and TensorBoard; checkpoints, arguments, model config and
    vocabulary are written to the TensorBoard writer's log directory.

    Raises:
        ValueError: if ``--pretrained_path`` names neither a RoBERTa nor a
            BERT model.
    """

    def _str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string, including
        "False", into True.
        """
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        # argparse reports a ValueError from a type callable as a usage error.
        raise ValueError(f"not a boolean value: {value!r}")

    parser = ArgumentParser()
    parser.add_argument("--train_path", type=str, default='data/spolin-train-acl.json', help="Set data path")
    parser.add_argument("--valid_path", type=str, default='data/spolin-valid.json', help="Set data path")
    parser.add_argument("--correct_bias", type=_str2bool, default=False, help="Set to true to correct bias for Adam optimizer")
    parser.add_argument("--lr", type=float, default=2e-5, help="Set learning rate")
    parser.add_argument("--n_epochs", type=int, default=4, help="Set number of epochs")
    parser.add_argument("--num_warmup_steps", type=float, default=1000, help="Set number of warm-up steps")
    parser.add_argument("--num_total_steps", type=float, default=10000, help="Set number of total steps")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Set maximum gradient normalization.")
    parser.add_argument("--pretrained_path", type=str, default='bert-base-uncased', help="Choose which pretrained model to use (bert-base-uncased, roberta-base, roberta-large, roberta-large-mnli)")
    parser.add_argument("--batch_size", type=int, default=32, help="Provide the batch size")
    parser.add_argument("--random_seed", type=int, default=42, help="Set the random seed")
    parser.add_argument("--test", action='store_true', help="If true, run with small dataset for testing code")
    parser.add_argument("--base", action='store_true', help="If true, run with base experiment configuration (training with spont only) for comparison")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger.info("Arguments: {}".format(pformat(args)))

    # 'roberta' must be tested first because the name also contains 'bert'.
    if 'roberta' in args.pretrained_path:
        # initialize tokenizer and model
        logger.info("Initialize model and tokenizer.")
        tokenizer = RobertaTokenizer.from_pretrained(args.pretrained_path, cache_dir='../pretrained_models')
        model = RobertaForSequenceClassification.from_pretrained(args.pretrained_path, cache_dir='../pretrained_models')

        ### START MODEL MODIFICATION
        # Pretrained model was not trained with token type ids.
        # fix token type embeddings for finetuning. Without this, the model can only take 0s as valid input for token_type_ids
        model.config.type_vocab_size = 2
        model.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(2, model.config.hidden_size)
        model.roberta.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
        ### END MOD
    elif 'bert' in args.pretrained_path:
        model = BertForSequenceClassification.from_pretrained(args.pretrained_path, cache_dir='../pretrained_models')
        tokenizer = BertTokenizer.from_pretrained(args.pretrained_path, cache_dir='../pretrained_models')
    else:
        # Without this branch the next line failed with a NameError on `model`.
        raise ValueError(
            f"Unsupported --pretrained_path {args.pretrained_path!r}: expected a BERT or RoBERTa model name")

    model.to(args.device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # NOTE(review): the groups use the key 'weight_decay_rate'; confirm the
    # AdamW implementation in use reads this key rather than 'weight_decay'.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, correct_bias=args.correct_bias)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.num_warmup_steps, t_total=args.num_total_steps)

    logger.info("Prepare datasets")
    logger.info("Loading train set...")

    train_data = get_data(args.train_path)
    valid_data = get_data(args.valid_path)

    # Split the validation data into its two sources so each is evaluated separately.
    cornell_valid_data = {k: {'cornell': valid_data[k]['cornell']} for k in valid_data.keys()}
    spont_valid_data = {k: {'spont': valid_data[k]['spont']} for k in valid_data.keys()}

    train_loader, train_sampler = get_data_loaders(args, train_data, args.train_path, tokenizer)
    logger.info("Loading validation set...")
    valid_p = Path(args.valid_path)
    cornell_valid_loader, cornell_valid_sampler = get_data_loaders(
        args, cornell_valid_data, f"{str(valid_p.parent)}/cornell_{valid_p.name}", tokenizer)
    spont_valid_loader, spont_valid_sampler = get_data_loaders(
        args, spont_valid_data, f"{str(valid_p.parent)}/spont_{valid_p.name}", tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()

        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        b_input_ids, b_input_mask, b_input_segment, b_labels = batch

        optimizer.zero_grad()
        # roberta has issues with token_type_ids
        loss, logits = model(b_input_ids, token_type_ids=b_input_segment, attention_mask=b_input_mask, labels=b_labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        optimizer.step()
        scheduler.step()

        return loss.item(), logits, b_labels

    trainer = Engine(update)

    # Evaluation function and evaluator
    def inference(engine, batch):
        model.eval()

        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        b_input_ids, b_input_mask, b_input_segment, b_labels = batch

        with torch.no_grad():
            # roberta has issues with token_type_ids
            loss, logits = model(b_input_ids, token_type_ids=b_input_segment, attention_mask=b_input_mask, labels=b_labels)
            label_ids = b_labels

        return logits, label_ids, loss.item()

    cornell_evaluator = Engine(inference)
    spont_evaluator = Engine(inference)

    # Both evaluators run at the end of every training epoch.
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: cornell_evaluator.run(cornell_valid_loader))
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: spont_evaluator.run(spont_valid_loader))

    # Trainer output is (loss, logits, labels).
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(Accuracy(output_transform=lambda x: (x[1], x[2]))).attach(trainer, "accuracy")
    if torch.cuda.is_available():
        GpuInfo().attach(trainer, name='gpu')

    # Evaluator output is (logits, labels, loss).
    recall = Recall(output_transform=lambda x: (x[0], x[1]))
    precision = Precision(output_transform=lambda x: (x[0], x[1]))
    F1 = (precision * recall * 2 / (precision + recall)).mean()
    accuracy = Accuracy(output_transform=lambda x: (x[0], x[1]))
    metrics = {"recall": recall, "precision": precision, "f1": F1, "accuracy": accuracy,
               "loss": Average(output_transform=lambda x: x[2])}

    for name, metric in metrics.items():
        metric.attach(cornell_evaluator, name)
        metric.attach(spont_evaluator, name)

    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss', 'accuracy'])
    # NOTE(review): this second attach runs even when GpuInfo was not attached
    # above - confirm the intended progress-bar output.
    pbar.attach(trainer, metric_names=['gpu:0 mem(%)', 'gpu:0 util(%)'])

    cornell_evaluator.add_event_handler(
        Events.COMPLETED,
        lambda _: pbar.log_message("Cornell validation metrics:\n %s" % pformat(cornell_evaluator.state.metrics)))
    spont_evaluator.add_event_handler(
        Events.COMPLETED,
        lambda _: pbar.log_message("Spont validation metrics:\n %s" % pformat(spont_evaluator.state.metrics)))

    tb_logger = TensorboardLogger(log_dir=None)
    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(cornell_evaluator,
                     log_handler=OutputHandler(tag="valid", metric_names=list(metrics.keys()), another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(spont_evaluator,
                     log_handler=OutputHandler(tag="valid", metric_names=list(metrics.keys()), another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    # tensorboardX's SummaryWriter exposes its directory as ``logdir`` while
    # torch.utils.tensorboard's exposes ``log_dir``; accept either.
    writer = tb_logger.writer
    log_dir = getattr(writer, 'logdir', None) or writer.log_dir

    checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=5)
    # "getattr" take care of distributed encapsulation
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                              {'mymodel': getattr(model, 'module', model)})

    torch.save(args, log_dir + '/model_training_args.bin')
    getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
    tokenizer.save_vocabulary(log_dir)

    trainer.run(train_loader, max_epochs=args.n_epochs)

    if args.n_epochs > 0:
        # Rename the last saved checkpoint to WEIGHTS_NAME.
        # TODO: PR in ignite to have better access to saved file paths (cleaner)
        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(log_dir, WEIGHTS_NAME))
    tb_logger.close()
def main():
    """Fine-tune a GPT-2 language-modelling head (optionally image-conditioned) with ignite.

    The run configuration comes from ``get_args()``. A timestamped output
    directory is created under ``args.output_folder`` and receives the log
    file, TensorBoard events, checkpoints, the pickled training arguments,
    the model config and the tokenizer vocabulary.

    Training runs for ``args.n_epochs`` epochs; validation runs at the end of
    every epoch (and before training when ``args.eval_before_start`` is set).
    Progress bar, TensorBoard logging and checkpointing are only attached on
    the main process (``args.local_rank`` in ``[-1, 0]``).
    """
    args = get_args()

    # Images are only used when the data path points at the e-SNLI-VE dataset.
    args.no_image = 'e-SNLI-VE' not in args.data_path
    if not args.no_image:
        args.no_premise = True
        args.with_expl = True

    # Setup: one output directory per run, named after the start time.
    t = datetime.today()
    output_dir = os.path.join(args.output_folder,
                              f"{t.month}_{t.day}_{t.hour}_{t.minute}_{t.second}")
    os.makedirs(output_dir, exist_ok=True)

    # Logging is set to INFO (resp. WARN) for the main (resp. auxiliary) process:
    # logger.info => main process only, logger.warning => all processes.
    logging.basicConfig(filename=os.path.join(output_dir, 'app.log'),
                        filemode='a',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes.
    logger.warning(f"Running process {args.local_rank}")
    logger.info(f"Arguments: {pformat(args)}")
    logger.info(f'Image not used:{args.no_image}')
    logger.info(f'Premise not used:{args.no_premise}')
    logger.info(f'Explanations used:{args.with_expl}')

    # Initialize distributed training if needed.
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
    tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
    if args.no_image:
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        # The image-conditioned model lives in a project module that is only
        # imported when it is actually needed.
        import image_gpt2_291
        model = image_gpt2_291.GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)
    optimizer = AdamW(model.parameters(), lr=args.lr)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last).
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)
        # NOTE(review): the DDP wrapper is unwrapped again right away, so the
        # forward/backward passes below go through the bare module - confirm
        # this is intended.
        model = model.module

    logger.info("Prepare datasets")
    train_loader, val_loader = get_data_loaders(args, tokenizer)

    def train(engine, batch):
        """Run one training iteration and return the values the trainer metrics read."""
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        # input_mask is unpacked but not passed to the model (the
        # attention_mask argument is disabled).
        if args.no_image:
            input_ids, lm_label, label, input_mask = batch
            output = model(input_ids=input_ids, labels=lm_label)
        else:
            image, input_ids, lm_label, label, input_mask = batch
            output = model(input_ids=input_ids, images=image, labels=lm_label)
        loss, logits, _ = output
        loss = loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step the optimizer once every gradient_accumulation_steps iterations.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if not args.with_expl:
            lbl_accuracy = torch.eq(label, logits.argmax(dim=1)).float().sum() / len(label)
            return {
                'loss': loss.item(),
                'lbl_accuracy': lbl_accuracy.item()
            }

        # Periodically log one randomly chosen input together with the
        # greedy (argmax) decoding of the model output for inspection.
        if engine.state.iteration % (args.gradient_accumulation_steps * 500) == 0:
            input_output = list(zip(input_ids, logits))
            random_item = random.choice(input_output)
            in_sent = tokenizer.decode(list(filter(
                lambda x: x != tokenizer.eos_token_id, random_item[0])))
            out_expl = tokenizer.decode(random_item[1].argmax(dim=1),
                                        skip_special_tokens=True)
            logger.info(f'MODEL INPUT: {in_sent}')
            logger.info(f'GEN. EXPL {out_expl}')
            logger.info('--------------------------------')
        return {
            'loss': loss.item(),
        }

    def validation(engine, batch):
        """Run one validation iteration; the returned pair is the input of the metrics."""
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            if args.no_image:
                input_ids, lm_label, label, input_mask = batch
                output = model(input_ids=input_ids)
            else:
                image, input_ids, lm_label, label, input_mask = batch
                output = model(input_ids=input_ids, images=image)
            logits, _ = output
            # Shift so that the logits at position i are scored against the
            # label at position i + 1, then flatten both.
            logits_shifted = logits[..., :-1, :].contiguous().view(-1, logits.size(-1))
            labels_shifted = lm_label[..., 1:].contiguous().view(-1)
            return logits_shifted, labels_shifted

    # Engines
    trainer = Engine(train)
    validator = Engine(validation)

    # Linearly decrease the learning rate from lr to zero.
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Attach validation to trainer: we evaluate at the end of each epoch and,
    # if requested, when training starts.
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: validator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: validator.run(val_loader))

    # Prepare metrics - note how we compute distributed metrics.
    RunningAverage(output_transform=lambda x: x['loss']).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: math.exp(
        average_distributed_scalar(x['loss'], args))).attach(trainer, "ppl")
    if not args.with_expl:
        RunningAverage(output_transform=lambda x: 100 * x['lbl_accuracy']).attach(
            trainer, "lbl_accuracy")

    metrics = {}
    metrics["lbl_loss"] = Loss(torch.nn.CrossEntropyLoss(),
                               output_transform=lambda x: (x[0], x[1]))
    metrics["loss"] = MetricsLambda(
        lambda l, a: average_distributed_scalar(l / a.gradient_accumulation_steps, a),
        metrics["lbl_loss"], args)
    metrics["ppl"] = MetricsLambda(math.exp, metrics["loss"])
    if not args.with_expl:
        metrics["lbl_accuracy"] = 100 * Accuracy(output_transform=lambda x: (x[0], x[1]))
    for name, metric in metrics.items():
        metric.attach(validator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save
    # model, configuration and tokenizer before we start to train.
    tb_logger = None
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer,
                    metric_names=["loss", 'ppl'] if args.with_expl
                    else ["loss", 'lbl_accuracy', 'ppl'])
        validator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message(
            "Validation: %s" % pformat(validator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=output_dir)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(
                             tag="training",
                             metric_names=["ppl"] if args.with_expl else ["lbl_accuracy", "ppl"]),
                         event_name=Events.EPOCH_COMPLETED)
        # Validation points are plotted against the trainer's iteration counter.
        tb_logger.attach(validator,
                         log_handler=OutputHandler(
                             tag="validation",
                             metric_names=['ppl', 'loss'] if args.with_expl
                             else ['ppl', 'loss', 'lbl_accuracy'],
                             global_step_transform=lambda *_args, **_kwargs: trainer.state.iteration),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(output_dir, 'checkpoint',
                                             n_saved=8, require_empty=False)
        # "getattr" takes care of distributed encapsulation.
        trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1),
                                  checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        torch.save(args, os.path.join(output_dir, 'model_training_args.bin'))
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(output_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(output_dir)

    # Run the training; always release the TensorBoard writer afterwards so
    # pending events are flushed even if training is interrupted.
    try:
        trainer.run(train_loader, max_epochs=args.n_epochs)
    finally:
        if tb_logger is not None:
            tb_logger.close()
def train(args):
    """Fine-tune the KoGPT2 model with ignite and log the run to TensorBoard.

    Args:
        args: run configuration. The attributes read here are ``device``,
            ``lr``, ``gradient_accumulation_steps``, ``max_norm``,
            ``n_epochs`` and ``eval_before_start``; ``args`` is also passed
            on to ``get_data_loaders`` and pickled into the log directory.

    Side effects:
        Creates a log directory via ``make_logdir("kogpt2_personachat")`` that
        receives TensorBoard events, checkpoints, the pickled ``args`` and the
        model config. When at least one epoch was trained, the last checkpoint
        is renamed to ``WEIGHTS_NAME``.
    """
    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer, _, vocab = get_kogpt2_tokenizer()
    model = get_kogpt2_model()
    model.to(args.device)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    logger.info("Prepare datasets")
    train_loader, val_loader = get_data_loaders(args, tokenizer, vocab)

    def update(engine, batch):
        """Run one training iteration and return the (accumulation-scaled) loss."""
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, labels, token_type_ids = batch
        loss, *_ = model(input_ids, token_type_ids=token_type_ids, labels=labels)
        loss = loss / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step the optimizer once every gradient_accumulation_steps iterations.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    def inference(engine, batch):
        """Run one evaluation iteration; the returned pair is the input of the metrics."""
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, labels, token_type_ids = batch
            # If we don't send labels to the model, it doesn't return losses.
            logits, *_ = model(input_ids, token_type_ids=token_type_ids)
            # Shift so that the logits at position i are scored against the
            # label at position i + 1, then flatten both.
            logits_flat_shifted = logits[..., :-1, :].contiguous().view(-1, logits.size(-1))
            labels_flat_shifted = labels[..., 1:].contiguous().view(-1)
            return (logits_flat_shifted), (labels_flat_shifted)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch, once
    # on completion when no epoch is trained, and optionally before training.
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Linearly decrease the learning rate from lr to zero.
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics.
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
                    output_transform=lambda x: (x[0], x[1])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0], x[1]))
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # Add progress bar, tensorboard, checkpoints and save model and
    # configuration before we start to train.
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    evaluator.add_event_handler(
        Events.COMPLETED,
        lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    log_dir = make_logdir("kogpt2_personachat")
    tb_logger = TensorboardLogger(log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag="validation",
            metric_names=list(metrics.keys()),
            global_step_transform=global_step_from_engine(trainer)),
        event_name=Events.EPOCH_COMPLETED)

    checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
    # "getattr" takes care of distributed encapsulation.
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'mymodel': getattr(model, 'module', model)})

    torch.save(args, os.path.join(log_dir, 'model_training_args.bin'))
    getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # Rename the last checkpoint for easy re-loading with from_pretrained.
    # With n_epochs < 1 no checkpoint was written and the handler's list of
    # saved files is empty, so there is nothing to rename.
    # TODO: PR in ignite to have better access to saved file paths (cleaner)
    if args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
                  os.path.join(log_dir, WEIGHTS_NAME))
    tb_logger.close()
def train():
    """Parse command-line options and train a GPT-2 language model with ignite.

    The tokenizer is a ``BertTokenizer`` and the model a ``GPT2LMHeadModel``,
    loaded from ``--model_checkpoint`` when ``--pretrained`` is given and from
    ``"uer/gpt2-chinese-cluecorpussmall"`` otherwise. Evaluation runs at the
    end of each epoch and every ``--valid_steps`` iterations. On the main
    process a progress bar, TensorBoard logging and checkpointing are
    attached; after training the last checkpoint is renamed to
    ``WEIGHTS_NAME``.
    """
    # NOTE(review): this overrides any CUDA_VISIBLE_DEVICES set by the caller.
    os.environ['CUDA_VISIBLE_DEVICES'] = '7'

    parser = ArgumentParser()
    parser.add_argument('--gpt2', action='store_true', help="use gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="uer/gpt2-chinese-cluecorpussmall",
                        help="Path or URL of the model")
    parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step")
    parser.add_argument('--pretrained', action='store_true', help="If False train from scratch")
    parser.add_argument("--data_path", type=str, default="data/autocloze.json",
                        help="Path or url of the dataset. ")
    parser.add_argument("--train_path", type=str, default="data/toy_train.txt",
                        help="Path of the train dataset for dist dataset. ")
    parser.add_argument("--valid_path", type=str, default="data/toy_valid.txt",
                        help="Path of the valid dataset for dist dataset. ")
    parser.add_argument("--dataset_cache", type=str, default="dataset_zh",
                        help="Path or url of the dataset cache")
    parser.add_argument('--log_file', '-log_file', type=str, default="",
                        help="Output logs to a file under this path")
    parser.add_argument("--num_workers", type=int, default=8, help="Number of subprocesses for data loading")
    parser.add_argument("--n_epochs", type=int, default=40, help="Number of training epochs")
    parser.add_argument("--train_batch_size", type=int, default=1, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=1, help="Batch size for validation")
    parser.add_argument("--max_history", type=int, default=15,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--scheduler", type=str, default="noam", choices=['noam', 'linear'],
                        help="method of optim")
    parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps")
    parser.add_argument("--valid_steps", type=int, default=5000, help="Perfom validation every X steps")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=64,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()
    print('cuda ', torch.cuda.is_available())

    # Logging is set to INFO (resp. WARN) for the main (resp. auxiliary) process:
    # logger.info => main process only, logger.warning => all processes.
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Distributed process-group initialisation is currently disabled; only the
    # flag is derived from the local rank.
    args.distributed = (args.local_rank != -1)
    # Honour --device (which already falls back to "cpu" when CUDA is not
    # available) instead of unconditionally forcing "cuda".
    args.device = torch.device(args.device)
    print('device ', args.device)

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    print('pretrained:', args.pretrained)
    if args.pretrained:
        print("----------------pretrained")
        tokenizer = BertTokenizer.from_pretrained(args.model_checkpoint, do_lower_case=True)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
        model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall", from_tf=True)
    model.to(args.device)
    # 'initial_lr' is set on the param group because the LambdaLR scheduler
    # below may be created with last_epoch != -1 (see --from_step).
    optimizer = AdamW([{'params': model.parameters(), 'initial_lr': args.lr}],
                      lr=args.lr, correct_bias=True)

    logger.info("Prepare datasets")
    loader_class = build_dist_loaders if not args.data_path else build_dataloaders
    train_loader, val_loader, train_sampler, valid_sampler = loader_class(args, tokenizer, logger)
    logger.info("Prepare datasets ends")

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last).
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank],
                                        output_device=args.local_rank)
        # NOTE(review): the DDP wrapper is unwrapped again right away - confirm
        # this is intended.
        model = model.module

    def update(engine, batch):
        """Run one training iteration; return (scaled loss, current learning rate)."""
        input_ids, token_type_ids, lm_labels = tuple(
            input_tensor.to(args.device) for input_tensor in batch)
        model.train()
        (lm_loss), *_ = model(input_ids=input_ids, labels=lm_labels, return_dict=False)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step the optimizer once every gradient_accumulation_steps iterations.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item(), optimizer.param_groups[0]['lr']

    trainer = Engine(update)

    def inference(engine, batch):
        """Run one evaluation iteration; the returned pair is the input of the metrics."""
        model.eval()
        with torch.no_grad():
            input_ids, token_type_ids, lm_labels = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            lm_logits, *_ = model(input_ids=input_ids, return_dict=False)
            # Shift so that the logits at position i are scored against the
            # label at position i + 1, then flatten both.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch, once
    # on completion when no epoch is trained, and optionally before training.
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Evaluation during training, every valid_steps iterations.
    @trainer.on(Events.ITERATION_STARTED)
    def log_iterations(engine):
        if engine.state.iteration % args.valid_steps == 0:
            evaluator.run(val_loader)

    # Make sure distributed data samplers split the dataset nicely between the
    # distributed processes.
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Noam learning-rate schedule (default); replaced by a linear decay from
    # lr to zero when --scheduler linear is selected.
    model_size = args.n_emd

    def noam_lambda(step):
        return model_size ** (-0.5) * min((step + 1) ** (-0.5),
                                          (step + 1) * args.warmup_steps ** (-1.5))

    noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr",
                                    [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics.
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
                           output_transform=lambda x: (x[0], x[1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints, and save
    # model, configuration and tokenizer before we start to train.
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True, mininterval=2)
        pbar.attach(trainer, metric_names=["loss", "lr"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint',
                                             save_interval=1, n_saved=6)
        # Save the model after each evaluation and after each training epoch;
        # "getattr" takes care of distributed encapsulation.
        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                    {'mymodel': getattr(model, 'module', model)})
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.logdir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: rename the last checkpoint (for easy re-loading with
    # from_pretrained) and close the tensorboard logger. The rename is skipped
    # when no epoch was trained; the logger is closed either way.
    if args.local_rank in [-1, 0]:
        if args.n_epochs > 0:
            # TODO: PR in ignite to have better access to saved file paths (cleaner)
            os.rename(checkpoint_handler._saved[-1][1][-1],
                      os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME))
        tb_logger.close()
def attach_handlers(run, model, optimizer, learning_rule, trainer, evaluator, train_loader, val_loader, params):
    """Wire metrics, progress reporting, LR scheduling, checkpointing and TensorBoard onto a run.

    Args:
        run: run identifier; used as the checkpoint filename prefix (with
            ``/`` replaced by ``-``) and as the TensorBoard sub-directory.
        model: model being trained; ``model[0]`` is handed to the
            ``UnitConvergence`` metric.
        optimizer: optimizer whose learning rate is scheduled and logged.
        learning_rule: object providing the ``norm`` used by ``UnitConvergence``.
        trainer: wrapper exposing the training ignite engine as ``.engine``.
        evaluator: wrapper exposing the evaluation engine as ``.engine`` and
            an ``attach`` method.
        train_loader: training data loader.
        val_loader: validation data loader.
        params: hyper-parameter dict; ``params['epochs']`` drives the LR decay.

    Returns:
        The ``TensorboardLogger`` that was created.
    """
    # Metrics
    unit_convergence = UnitConvergence(model[0], learning_rule.norm)
    unit_convergence.attach(trainer.engine, 'unit_conv')

    # Progress bar, plus a tqdm-based logger that reports validation output.
    progress_bar = ProgressBar(persist=True, bar_format=config.IGNITE_BAR_FORMAT)
    progress_bar.attach(trainer.engine, metric_names='all')
    tqdm_logger = TqdmLogger(pbar=progress_bar)
    # noinspection PyTypeChecker
    tqdm_logger.attach_output_handler(
        evaluator.engine,
        event_name=Events.COMPLETED,
        tag="validation",
        global_step_transform=global_step_from_engine(trainer.engine),
    )

    # Evaluation is triggered every 100 epochs.
    evaluator.attach(trainer.engine, Events.EPOCH_COMPLETED(every=100), train_loader, val_loader)

    # Learning rate decays linearly with the epoch count.
    def linear_decay(epoch):
        return 1 - epoch / params['epochs']

    torch_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=linear_decay)
    trainer.engine.add_event_handler(Events.EPOCH_COMPLETED, LRScheduler(torch_scheduler))

    # Checkpointing: keep only the most recent model.
    checkpoint_prefix = run.replace('/', '-')
    checkpointer = ModelCheckpoint(
        config.MODELS_DIR,
        checkpoint_prefix,
        n_saved=1,
        create_dir=True,
        require_empty=False,
        global_step_transform=global_step_from_engine(trainer.engine),
    )
    trainer.engine.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'m': model})

    # TensorBoard logger: model graph and hyper-parameters first.
    tb_logger = TensorboardLogger(log_dir=os.path.join(config.TENSORBOARD_DIR, run))
    images, _ = next(iter(train_loader))
    model_on_cpu = copy.deepcopy(model).cpu()
    tb_logger.writer.add_graph(model_on_cpu, images)
    tb_logger.writer.add_hparams(params, {})

    # noinspection PyTypeChecker
    tb_logger.attach_output_handler(
        evaluator.engine,
        event_name=Events.COMPLETED,
        tag="validation",
        metric_names="all",
        global_step_transform=global_step_from_engine(trainer.engine),
    )
    # noinspection PyTypeChecker
    tb_logger.attach_output_handler(
        trainer.engine,
        event_name=Events.EPOCH_COMPLETED,
        tag="train",
        metric_names=["unit_conv"],
    )

    # Per-sample input shape (batch dimension dropped) for the weights-image handler.
    first_batch = next(iter(train_loader))
    input_shape = tuple(first_batch[0].shape[1:])
    weights_image_handler = WeightsImageHandler(model, input_shape)
    tb_logger.attach(trainer.engine, log_handler=weights_image_handler, event_name=Events.EPOCH_COMPLETED)

    optimizer_params_handler = OptimizerParamsHandler(optimizer)
    tb_logger.attach(trainer.engine, log_handler=optimizer_params_handler, event_name=Events.EPOCH_STARTED)

    return tb_logger
def train_network(model: nn.Module, training_loader: DataLoader, validation_loader: DataLoader):
    """Trains the given neural network model.

    The model is trained with Adam (lr=0.001) against an MSE loss. A
    checkpoint of model, optimizer and trainer is written to ``dist/models``
    every 100 epochs, and the training and validation sets are evaluated
    (R2 score and MSE loss) every 10 epochs. Losses and metrics are logged to
    TensorBoard under ``logs/training-<timestamp>``.

    The TensorBoard logger is closed when training ends, including when the
    run is interrupted or fails.

    Parameters
    ----------
    model (nn.Module): The PyTorch model to be trained
    training_loader (DataLoader): Training data loader
    validation_loader (DataLoader): Validation data loader
    """
    device = "cuda:0" if cast(Any, torch).cuda.is_available() else "cpu"
    if device == "cuda:0":
        model.cuda()

    optimizer = cast(Any, torch).optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

    # Keep the two most recent checkpoints, written every 100 epochs.
    save_handler = Checkpoint(
        {
            "model": model,
            "optimizer": optimizer,
            "trainer": trainer
        },
        DiskSaver("dist/models", create_dir=True),
        n_saved=2,
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=100), save_handler)

    # Create a logger
    tb_logger = TensorboardLogger(
        log_dir="logs/training" + datetime.now().strftime("-%Y%m%d-%H%M%S"),
        flush_secs=1)
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED,
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Training evaluator
    training_evaluator = create_supervised_evaluator(
        model,
        metrics={
            "r2": R2Score(),
            "MSELoss": Loss(criterion)
        },
        device=device)
    tb_logger.attach_output_handler(
        training_evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="training",
        metric_names=["MSELoss", "r2"],
        global_step_transform=global_step_from_engine(trainer),
    )

    # Validation evaluator
    evaluator = create_supervised_evaluator(
        model,
        metrics={
            "r2": R2Score(),
            "MSELoss": Loss(criterion)
        },
        device=device)
    tb_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="validation",
        metric_names=["MSELoss", "r2"],
        global_step_transform=global_step_from_engine(trainer),
    )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_training_results(trainer):
        training_evaluator.run(training_loader)
        metrics = training_evaluator.state.metrics
        print(
            f"Training Results - Epoch: {trainer.state.epoch}",
            f" Avg r2: {metrics['r2']:.2f} Avg loss: {metrics['MSELoss']:.2f}",
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        evaluator.run(validation_loader)
        metrics = evaluator.state.metrics
        print(
            f"Validation Results - Epoch: {trainer.state.epoch}",
            f" Avg r2: {metrics['r2']:.2f} Avg loss: {metrics['MSELoss']:.2f}\n",
        )

    # The epoch budget is very large, so the run may well be stopped by an
    # interrupt; close the logger on every exit path so the writer is released.
    try:
        trainer.run(training_loader, max_epochs=int(1e6))
    finally:
        tb_logger.close()
def trainer(
    train_batch,
    evaluate_batch,
    evaluate_data_loaders,
    metrics,
    optimizers,
):
    '''
    Create standard trainer with evaluators.

    Parameters
    ----------
    train_batch : function
        function that trains on given batch
    evaluate_batch : function
        function that evaluates a given batch
    evaluate_data_loaders: dict
        maps evaluator name to the data loader that yields the batches to
        evaluate on; one evaluator engine is created per entry
    metrics : dict
        dict of metric dicts keyed by description. The PROGRESS_DESC and
        TRAIN_DESC entries are optional and are attached to the trainer
        (PROGRESS_DESC metrics are shown in the progress bar); an entry is
        required for every evaluator name.
    optimizers : dict or optimizer
        dict with optimizers for logging; any other value is treated as a
        single optimizer and logged under the name 'optimizer'

    Returns
    -------
    tuple
        trainer engine
        dict of evaluator engines keyed by evaluator name
        tensorboard logger
    '''
    train_engine = ignite.engine.Engine(train_batch)

    progress_metrics = metrics.get(PROGRESS_DESC, dict())
    train_metrics = metrics.get(TRAIN_DESC, dict())

    for name, metric in progress_metrics.items():
        metric.attach(train_engine, name)

    for name, metric in train_metrics.items():
        metric.attach(train_engine, name)

    evaluators = {
        evaluator_name: ignite.engine.Engine(evaluate_batch)
        for evaluator_name in evaluate_data_loaders.keys()
    }

    for evaluator_name, evaluator in evaluators.items():
        for metric_name, metric in metrics[evaluator_name].items():
            metric.attach(evaluator, metric_name)

    tensorboard_logger = TensorboardLogger(log_dir='tb')

    EpochLogger().attach(train_engine)

    # Order of attaching progress bars is important for vscode / atom
    ProgressBar(desc=TRAIN_DESC).attach(train_engine, metric_names=list(progress_metrics.keys()))
    tensorboard_logger.attach(
        train_engine,
        OutputHandler(
            tag=PROGRESS_DESC,
            metric_names=list(progress_metrics.keys()),
        ),
        Events.ITERATION_COMPLETED,
    )

    MetricsLogger(TRAIN_DESC).attach(train_engine, train_metrics.keys())
    tensorboard_logger.attach(
        train_engine,
        OutputHandler(
            tag=TRAIN_DESC,
            metric_names=list(train_metrics.keys()),
        ),
        Events.ITERATION_COMPLETED,
    )

    def run_evaluator(evaluator_desc):
        # Factory so that each handler is bound to its own evaluator name
        # rather than to the loop variable.
        return lambda engine: evaluators[evaluator_desc].run(
            evaluate_data_loaders[evaluator_desc])

    for evaluator_desc, evaluator in evaluators.items():
        evaluator_metric_names = list(metrics[evaluator_desc].keys())

        trainer.add_event_handler if False else None  # placeholder removed below
        train_engine.add_event_handler(
            Events.EPOCH_COMPLETED,
            run_evaluator(evaluator_desc),
        )

        ProgressBar(desc=evaluator_desc).attach(evaluator)
        MetricsLogger(evaluator_desc).attach(evaluator, evaluator_metric_names)
        tensorboard_logger.attach(
            evaluator,
            OutputHandler(
                tag=evaluator_desc,
                metric_names=evaluator_metric_names,
                global_step_transform=global_step_from_engine(train_engine),
            ),
            Events.EPOCH_COMPLETED,
        )

    # Accept a single optimizer as well as a mapping of them. isinstance also
    # recognises dict subclasses (e.g. OrderedDict) as mappings.
    if not isinstance(optimizers, dict):
        optimizers = dict(optimizer=optimizers)

    for name, optimizer in optimizers.items():
        tensorboard_logger.attach(
            train_engine,
            log_handler=OptimizerParamsHandler(
                tag=f'{TRAIN_DESC}/{name}',
                param_name='lr',
                optimizer=optimizer,
            ),
            event_name=Events.ITERATION_COMPLETED,
        )

    return train_engine, evaluators, tensorboard_logger
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    """Train ``Net`` with SGD and log the run to TensorBoard.

    Args:
        train_batch_size: batch size forwarded to ``get_data_loaders`` for
            the training loader.
        val_batch_size: batch size forwarded to ``get_data_loaders`` for the
            validation loader.
        epochs: number of epochs the trainer runs for.
        lr: learning rate of the SGD optimizer.
        momentum: momentum of the SGD optimizer.
        log_dir: directory used both for the TensorBoard event files and for
            the "best" model checkpoints.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    # GPU statistics are optional: without pynvml the metric cannot be
    # attached and training simply goes on without it.
    from ignite.contrib.metrics.gpu_info import GpuInfo

    try:
        GpuInfo().attach(trainer)
    except RuntimeError:
        print(
            "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
            "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
            "install it : `pip install pynvml`")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Evaluate on both splits at the end of every training epoch.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)
    # From here on the logger owns an open writer; the finally clause below
    # closes it even when attaching a handler or the training itself fails.
    try:
        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )

        for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )

        tb_logger.attach_opt_params_handler(
            trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer)

        # Scalar summaries every 100 iterations, histograms every 100 epochs.
        tb_logger.attach(trainer,
                         log_handler=WeightsScalarHandler(model),
                         event_name=Events.ITERATION_COMPLETED(every=100))
        tb_logger.attach(trainer,
                         log_handler=WeightsHistHandler(model),
                         event_name=Events.EPOCH_COMPLETED(every=100))
        tb_logger.attach(trainer,
                         log_handler=GradsScalarHandler(model),
                         event_name=Events.ITERATION_COMPLETED(every=100))
        tb_logger.attach(trainer,
                         log_handler=GradsHistHandler(model),
                         event_name=Events.EPOCH_COMPLETED(every=100))

        def score_function(engine):
            return engine.state.metrics["accuracy"]

        # Keep the two checkpoints with the highest validation accuracy.
        model_checkpoint = ModelCheckpoint(
            log_dir,
            n_saved=2,
            filename_prefix="best",
            score_function=score_function,
            score_name="validation_accuracy",
            global_step_transform=global_step_from_engine(trainer),
        )
        validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

        # kick everything off
        trainer.run(train_loader, max_epochs=epochs)
    finally:
        tb_logger.close()
end_value=LR / 4, cycle_size=TOTAL_UPDATE_STEPS // 1) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'_': mude}) trainer.add_event_handler(Events.ITERATION_COMPLETED, nan_handler) trainer.add_event_handler(Events.ITERATION_COMPLETED, coslr) GpuInfo().attach(trainer, name='gpu') pbar.attach(trainer, output_transform=lambda output: {'loss': output['loss']}, metric_names=[f"gpu:{args.gpu} mem(%)"]) # FIRE tb_logger = TensorboardLogger(log_dir=TENSORBOARD_RUN_LOG_DIR_PATH) tb_logger.attach( trainer, log_handler=OutputHandler( tag='training', output_transform=lambda output: {'loss': output['loss']}), event_name=Events.ITERATION_COMPLETED( every=LOG_TRAINING_PROGRESS_EVERY_N)) tb_logger.attach( evaluator, log_handler=OutputHandler( tag='validation', metric_names='all', global_step_transform=global_step_from_engine(trainer)), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(trainer,
def train():
    """Fine-tune a GPT / GPT-2 double-heads model from a JSON configuration.

    All hyper-parameters are read from
    ``configs/train_daily_dialog_emotion_action_config.json``. On the main
    process (``local_rank`` of -1 or 0) a progress bar, a TensorBoard logger
    and a checkpoint handler are attached, and the training arguments, model
    configuration and vocabulary are saved to the TensorBoard log directory
    before training starts.
    """
    config_file = "configs/train_daily_dialog_emotion_action_config.json"
    config = Config.from_json_file(config_file)
    is_main_process = config.local_rank in [-1, 0]

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if is_main_process else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning("Running process %d", config.local_rank)
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    use_gpt2 = "gpt2" in config.model_checkpoint
    tokenizer_class = GPT2Tokenizer if use_gpt2 else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if use_gpt2 else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        (input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids,
         token_emotion_ids, token_action_ids) = tuple(
             input_tensor.to(config.device) for input_tensor in batch)
        lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels,
                                 token_type_ids, token_emotion_ids, token_action_ids)
        # Scale the loss so the accumulated gradient matches one large batch.
        loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        # Only step once every `gradient_accumulation_steps` iterations.
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
            (input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids,
             token_emotion_ids, token_action_ids) = batch
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids,
                                  token_action_ids=token_action_ids)
            # So we can also use GPT2 outputs
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]
            # Shift by one: the logits at position t are scored against the
            # label at position t + 1.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, config.lr),
                                 (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
                    output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])),
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config),
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if is_main_process:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        output_dir = tb_logger.writer.log_dir
        checkpoint_handler = ModelCheckpoint(output_dir, 'checkpoint', save_interval=1, n_saved=3)
        # "getattr" take care of distributed encapsulation
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        torch.save(config, os.path.join(output_dir, 'model_training_args.bin'))
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(output_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(output_dir)

    try:
        # Run the training
        trainer.run(train_loader, max_epochs=config.n_epochs)

        # On the main process: rename the last checkpoint (for easy re-loading
        # with OpenAIGPTModel.from_pretrained method). Without a completed
        # epoch there is no checkpoint to rename.
        if is_main_process and config.n_epochs > 0:
            # TODO: PR in ignite to have better access to saved file paths (cleaner)
            os.rename(checkpoint_handler._saved[-1][1][-1],
                      os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))
    finally:
        # Close the tensorboard logger whenever it was created, including
        # evaluation-only runs (n_epochs < 1) and failed runs.
        if is_main_process:
            tb_logger.close()
def train():
    """Fine-tune GPT-2 (double-heads model, LM loss only) with ignite.

    Reads ``--local_rank`` from the command line; any value other than -1
    enables distributed mode. Batch size, epoch count and gradient
    accumulation come from the module-level ``BATCH_SIZE``, ``EPOCHS`` and
    ``ITERATION_STEP``. On the main process (``local_rank`` of 0 or -1) a
    progress bar, a TensorBoard logger writing to ``./logs`` and a checkpoint
    handler writing to ``./checkpoint`` are attached.
    """
    parser = ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=-1)
    args = parser.parse_args()
    is_main_process = args.local_rank in [0, -1]

    # Use the GPU as soon as one is available; requiring more than one device
    # would silently fall back to the CPU on single-GPU machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    DISTRIBUTED = args.local_rank != -1

    if DISTRIBUTED and torch.distributed.is_available():
        print("Distributed")
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        #BATCH_SIZE *= 2

    def average_distributed_scalar(scalar):
        # Average a python scalar over all processes; identity when the run
        # is not distributed.
        if not DISTRIBUTED:
            return scalar
        scalar_t = torch.tensor(
            scalar, dtype=torch.float, device=device) / torch.distributed.get_world_size()
        torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
        return scalar_t.item()

    ds = dataloader.Conv_GPT2_DataClass(tokenizer)
    v_ds = dataloader.Conv_GPT2_DataClass(tokenizer, dev=True)

    orig_added_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(dataloader.ATTR_SPECIAL_TOKENS)
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_added_tokens + num_added_tokens)

    model = model.to(device)

    # Build the optimizer only after the embeddings were resized and the model
    # was moved: resizing swaps in a new embedding matrix, which an optimizer
    # created earlier would not be tracking.
    optimizer = AdamW(model.parameters(), lr=6.25e-5)

    # NOTE(review): the model is never wrapped in DistributedDataParallel even
    # in distributed mode, although `getattr(model, 'module', model)` below
    # suggests that was intended - confirm.

    train_sampler = torch.utils.data.distributed.DistributedSampler(ds) if DISTRIBUTED else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(v_ds) if DISTRIBUTED else None

    dl = DataLoader(ds, sampler=train_sampler, batch_size=BATCH_SIZE, shuffle=not DISTRIBUTED)
    v_dl = DataLoader(v_ds, sampler=valid_sampler, shuffle=False)

    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
                    output_transform=lambda x: (x[0][0], x[1][0])),
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"]),
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])

    def update(engine, batch):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        lm_loss, *_ = model(batch[0], token_type_ids=batch[1], lm_labels=batch[2])
        # Scale the loss so the accumulated gradient matches one large batch.
        loss = lm_loss / ITERATION_STEP
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Only step once every ITERATION_STEP iterations.
        if engine.state.iteration % ITERATION_STEP == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(t.to(device) for t in batch)
            input_ids, token_type_ids, lm_labels = batch
            model_outputs = model(input_ids, token_type_ids=token_type_ids)
            lm_logits = model_outputs[0]
            # Shift by one: the logits at position t are scored against the
            # label at position t + 1.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted

    trainer = Engine(update)
    evaluator = Engine(inference)

    # Linearly decay the learning rate to zero over the iterations this
    # process actually runs: len(dl) accounts for the distributed sampler's
    # shard size, which len(ds) // BATCH_SIZE does not.
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, 6.25e-5), (EPOCHS * len(dl), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    # trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(v_dl))

    if DISTRIBUTED:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        #evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    if is_main_process:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        #evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir='./logs')
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        #tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint('./checkpoint', '_checkpoint', n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'gpt2_qg': getattr(model, 'module', model)})

        getattr(model, 'module', model).config.to_json_file(
            os.path.join('./checkpoint', 'config'))
        tokenizer.save_pretrained('./checkpoint')

    trainer.run(dl, max_epochs=EPOCHS)

    if is_main_process:
        tb_logger.close()
def train():
    """Fine-tune a GPT / GPT-2 language model from command-line arguments.

    Parses the hyper-parameters below, optionally sets up distributed and
    fp16 (apex) training, builds an ignite trainer/evaluator pair and runs
    the trainer for ``--n_epochs`` epochs. On the main process
    (``--local_rank`` of -1 or 0) a progress bar, a TensorBoard logger and a
    checkpoint handler are attached, and the training arguments, model
    configuration and vocabulary are saved to the TensorBoard log directory
    before training starts.
    """
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()
    is_main_process = args.local_rank in [-1, 0]

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if is_main_process else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    use_gpt2 = "gpt2" in args.model_checkpoint
    tokenizer_class = GPT2Tokenizer if use_gpt2 else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if use_gpt2 else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(args.device)
    optimizer = OpenAIAdam(model.parameters(), lr=args.lr)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        lm_loss, mc_loss = model(*batch)
        # Scale the loss so the accumulated gradient matches one large batch.
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step once every `gradient_accumulation_steps` iterations.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids)
            # So we can also use GPT2 outputs
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]
            # Shift by one: the logits at position t are scored against the
            # label at position t + 1.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
                    output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])),
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args),
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if is_main_process:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        output_dir = tb_logger.writer.log_dir
        checkpoint_handler = ModelCheckpoint(output_dir, 'checkpoint', save_interval=1, n_saved=3)
        # "getattr" take care of distributed encapsulation
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        torch.save(args, os.path.join(output_dir, 'model_training_args.bin'))
        getattr(model, 'module', model).config.to_json_file(os.path.join(output_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(output_dir)

    try:
        # Run the training
        trainer.run(train_loader, max_epochs=args.n_epochs)

        # On the main process: rename the last checkpoint (for easy re-loading
        # with OpenAIGPTModel.from_pretrained method). Without a completed
        # epoch there is no checkpoint to rename.
        if is_main_process and args.n_epochs > 0:
            # TODO: PR in ignite to have better access to saved file paths (cleaner)
            os.rename(checkpoint_handler._saved[-1][1][-1],
                      os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))
    finally:
        # Close the tensorboard logger whenever it was created, including
        # evaluation-only runs (n_epochs < 1) and failed runs.
        if is_main_process:
            tb_logger.close()
def train():
    """Pre-train ``TransformerWithLMHead`` as a language model.

    Parses the hyper-parameters below, optionally sets up distributed
    training, builds an ignite trainer/evaluator pair with a warm-up plus
    cosine learning-rate schedule and runs the trainer for ``--n_epochs``
    epochs. On the main process (``--local_rank`` of -1 or 0) a progress bar,
    a TensorBoard logger and a checkpoint handler are attached, and the
    arguments are saved to the TensorBoard log directory before training.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path", type=str, default='wikitext-2',
        help="One of ('wikitext-103', 'wikitext-2') or a dict of splits paths.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--embed_dim", type=int, default=410, help="Embeddings dim")
    parser.add_argument("--hidden_dim", type=int, default=2100, help="Hidden dimension")
    parser.add_argument("--num_max_positions", type=int, default=256, help="Max input length")
    parser.add_argument("--num_heads", type=int, default=10, help="Number of heads")
    parser.add_argument("--num_layers", type=int, default=16, help="Number of layers")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--initializer_range", type=float, default=0.02,
                        help="Weight initialization range")
    parser.add_argument("--train_batch_size", type=int, default=8,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=8,
                        help="Batch size for validation")
    parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate")
    parser.add_argument("--max_norm", type=float, default=0.25, help="Clipping gradient norm")
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay")
    parser.add_argument("--n_epochs", type=int, default=200, help="Number of training epochs")
    # Kept as float so values such as "1e3" stay accepted on the command
    # line; it is converted to an iteration count where it is used.
    parser.add_argument("--n_warmup", type=float, default=1000,
                        help="Number of warmup iterations")
    parser.add_argument("--eval_every", type=int, default=-1,
                        help="Evaluate every X steps (-1 => end of epoch)")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Accumulate gradient")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()
    is_main_process = args.local_rank in [-1, 0]

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log on main process only, logger.warning => log on all processes
    logging.basicConfig(level=logging.INFO if is_main_process else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning("Running process %d", args.local_rank)
    # This is a logger.info: only printed on the first process
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, model and optimizer")
    # Let's use a pre-defined tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    # We need this to create the model at next line (number of embeddings to use)
    args.num_embeddings = len(tokenizer.vocab)
    model = TransformerWithLMHead(args)
    model.to(args.device)
    optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    (train_loader, val_loader, train_sampler, valid_sampler,
     train_num_words, valid_num_words) = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        # to shape [seq length, batch]
        batch = batch.transpose(0, 1).contiguous().to(args.device)
        logits, loss = model(batch, labels=batch)
        # Scale the loss so the accumulated gradient matches one large batch.
        loss = loss / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step once every `gradient_accumulation_steps` iterations.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            # to shape [seq length, batch]
            batch = batch.transpose(0, 1).contiguous().to(args.device)
            logits = model(batch)
            # Shift by one along the sequence axis: the logits at position t
            # are scored against the token at position t + 1.
            shift_logits = logits[:-1].view(-1, logits.size(-1))
            shift_labels = batch[1:].view(-1)
            return shift_logits, shift_labels

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch and
    # every 'eval_every' iterations if needed
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(val_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Learning rate schedule: linearly warm-up to lr and then decrease the
    # learning rate to zero with cosine schedule
    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                             len(train_loader) * args.n_epochs)
    # The warm-up duration is a number of iterations, so hand over an int even
    # when the value was parsed as a float from the command line.
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                                int(args.n_warmup))
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we average distributed metrics using average_distributed_scalar
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    # Rescale the per-token loss by (tokens in validation set / valid_num_words)
    # before exponentiating.
    metrics["average_word_ppl"] = MetricsLambda(
        lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words),
        metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save
    # model and configuration before we start to train
    if is_main_process:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)

        @evaluator.on(Events.COMPLETED)  # Log evaluator metrics on tensorboard
        def tb_log_metrics(engine):
            for name in metrics.keys():
                tb_logger.writer.add_scalar(name, engine.state.metrics[name],
                                            trainer.state.iteration)

        output_dir = tb_logger.writer.log_dir
        checkpoint_handler = ModelCheckpoint(output_dir, 'checkpoint', save_interval=1, n_saved=3)
        # "getattr" take care of distributed encapsulation
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        torch.save(args, os.path.join(output_dir, CONFIG_NAME))

    try:
        # Run the training
        trainer.run(train_loader, max_epochs=args.n_epochs)

        # On the main process: rename the last checkpoint for easy re-loading.
        # Without a completed epoch there is no checkpoint to rename.
        if is_main_process and args.n_epochs > 0:
            # TODO: PR in ignite to have better access to saved file paths (cleaner)
            os.rename(checkpoint_handler._saved[-1][1][-1],
                      os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))
    finally:
        # Close the tensorboard logger whenever it was created, including
        # evaluation-only runs (n_epochs < 1) and failed runs.
        if is_main_process:
            tb_logger.close()