def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval,
        restore_from, crash_iteration=1000):
    """Train a ``Net`` model with ignite on CPU, deliberately crashing at a
    fixed iteration to exercise checkpoint saving.

    Args:
        train_batch_size: batch size for the training loader.
        val_batch_size: batch size for the validation loader.
        epochs: maximum number of training epochs.
        lr: SGD learning rate.
        momentum: SGD momentum.
        log_interval: print the training loss every ``log_interval`` iterations.
        restore_from: checkpoint path to resume from; ``""`` trains from
            scratch. Resuming is not implemented and raises
            ``NotImplementedError``.
        crash_iteration: global iteration at which an ``Exception`` is raised
            on purpose (to test checkpoint recovery).
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cpu'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(
        model,
        metrics={'accuracy': Accuracy(), 'nll': Loss(F.nll_loss)},
        device=device)

    # Setup debug level of engine logger.
    # NOTE(review): `_logger` is a private ignite attribute; this depends on
    # ignite internals -- confirm against the pinned ignite version.
    trainer._logger.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s|%(name)s|%(levelname)s| %(message)s")
    ch.setFormatter(formatter)
    trainer._logger.addHandler(ch)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # 1-based iteration index within the current epoch
        # (renamed from `iter` to avoid shadowing the builtin).
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iteration, len(train_loader),
                            engine.state.output))
        # Deliberate crash so the saved checkpoints can be tested.
        if engine.state.iteration == crash_iteration:
            raise Exception("STOP at {}".format(engine.state.iteration))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        # Full pass over the training set with the evaluator engine.
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Training Results - Epoch: {} Avg accuracy: {:.2f} "
              "Avg loss: {:.2f}".format(engine.state.epoch, avg_accuracy,
                                        avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Validation Results - Epoch: {} Avg accuracy: {:.2f} "
              "Avg loss: {:.2f}".format(engine.state.epoch, avg_accuracy,
                                        avg_nll))

    # Checkpoint model + optimizer every 100 iterations.
    # NOTE(review): `save_interval` was removed in recent ignite releases;
    # this assumes an older ignite -- confirm the pinned dependency.
    objects_to_checkpoint = {"model": model, "optimizer": optimizer}
    engine_checkpoint = ModelCheckpoint(dirname="engine_checkpoint",
                                        filename_prefix='ignite_checking',
                                        require_empty=False,
                                        save_interval=100)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, engine_checkpoint,
                              objects_to_checkpoint)

    if restore_from == "":
        trainer.run(train_loader, max_epochs=epochs)
    else:
        raise NotImplementedError('Not implemented yet')
def fit(self, model, optimizer, train_loader, val_loader, epochs, batch_size):
    """Train ``model`` with ignite, tracking accuracy/NLL/confusion matrix,
    with optional early stopping and per-epoch checkpointing.

    The best model by validation accuracy is saved to
    ``self.weight_path_best`` and kept in ``self.model``.

    Note: ``epochs`` and ``batch_size`` are accepted for interface
    compatibility but the actual epoch count comes from ``self.deep_epochs``.
    """
    print('starting fit()')
    if torch.cuda.is_available():
        # BUGFIX: the original called `torch.cuda.device(self.device)`, which
        # only *constructs* a context manager; unless entered with `with`, it
        # never selects the device. `set_device` performs the selection.
        torch.cuda.set_device(self.device)
        model.cuda()

    trainer = create_supervised_trainer(model, optimizer, self.criterion,
                                        device=self.device_str)
    metrics = {
        'accuracy': Accuracy(),
        'nll': Loss(self.criterion),
        'cm': ConfusionMatrix(num_classes=len(self.classLabels))
    }
    # Per-epoch history, filled by the epoch-completed handlers below.
    training_history = {'accuracy': [], 'loss': []}
    validation_history = {'accuracy': [], 'loss': []}
    evaluator = create_supervised_evaluator(model, metrics=metrics,
                                            device=self.device_str)

    def score_function(engine):
        # EarlyStopping maximizes the score, so return the negated loss.
        val_loss = engine.state.metrics['nll']
        return -val_loss

    if self.torch_patience > 0:
        early_stopping = EarlyStopping(patience=self.torch_patience,
                                       score_function=score_function,
                                       trainer=trainer)
        evaluator.add_event_handler(Events.COMPLETED, early_stopping)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        model.eval()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        loss = metrics['nll']
        accuracy = metrics['accuracy']
        training_history['accuracy'].append(accuracy)
        training_history['loss'].append(loss)
        print(
            "Training - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}".
            format(trainer.state.epoch, accuracy, loss))
        model.train()

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        model.eval()
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['nll']
        accuracy = metrics['accuracy']
        validation_history['accuracy'].append(accuracy)
        validation_history['loss'].append(loss)
        print(
            "Validation Results - Epoch: {} Avg val accuracy: {:.4f} Avg val loss: {:.4f}"
            .format(trainer.state.epoch, accuracy, loss))
        # Save the model with the best accuracy seen so far.
        if self.best_accuracy < accuracy:
            if not isdir(self.weight_dir):
                mkdir(self.weight_dir)
            torch.save(model.state_dict(), self.weight_path_best)
            print('--> At Epoch: ', trainer.state.epoch, ', saved to ',
                  self.weight_path_best, sep='')
            self.model = model
            self.best_accuracy = accuracy
        model.train()

    # NOTE(review): `save_interval` / `save_as_state_dict` were removed in
    # recent ignite releases; this assumes an older ignite -- confirm the
    # pinned dependency version.
    checkpointer = ModelCheckpoint(self.weight_dir, 'modelCheckpoint',
                                   save_interval=1, n_saved=2, create_dir=True,
                                   save_as_state_dict=True, require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer,
                              {'epoch': model})
    print('before trainer.run()')
    trainer.run(train_loader, max_epochs=self.deep_epochs)
def train():
    """Fine-tune a (GPT / GPT-2) double-heads dialogue model.

    Parses all hyper-parameters from the command line, sets up (optionally
    distributed / fp16) training with ignite, evaluates at each epoch end,
    and on the main process logs to TensorBoard and checkpoints the model.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations",
                        type=int,
                        default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument(
        "--data_faiss",
        type=str,
        default="data_persona_faiss_fase1_opcion4",
        help=
        "list of the personalities selected with faiss according to the strategy selected"
    )
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        # One optimization step over a batch; returns the scalar loss so the
        # RunningAverage metric attached below can track it.
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        output_loss = model(input_ids,
                            token_type_ids=token_type_ids,
                            mc_token_ids=mc_token_ids,
                            mc_labels=mc_labels,
                            labels=lm_labels)
        # Combined LM + multiple-choice loss, scaled for gradient accumulation.
        loss = (output_loss.loss * args.lm_coef + output_loss.mc_loss *
                args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step the optimizer every `gradient_accumulation_steps` batches.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator
    # (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses
            output_gpt = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )
            # Shift logits/labels by one position for next-token prediction.
            lm_logits_flat_shifted = output_gpt.logits[
                ..., :-1, :].contiguous().view(-1, output_gpt.logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    output_gpt.mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training
    # and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between
    # the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and
    # save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="validation",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer)),
            event_name=Events.EPOCH_COMPLETED)

        # NOTE(review): `save_interval` was removed in recent ignite
        # releases; assumes an older ignite -- confirm pinned version.
        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module',
                model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last
    # checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
def _start(self):
    """Run the training task.

    Moves model/criterion to the configured device, logs run parameters to
    mlflow, wires up the ignite trainer with train/validation metric
    computation, LR-on-plateau reduction, early stopping, best-model
    checkpointing and LR scheduling, then runs training for
    ``self.num_epochs`` epochs.
    """
    if 'cuda' in self.device:
        self.model = self.model.to(self.device)

    mlflow.log_param("model", get_object_name(self.model))

    self.logger.debug("Setup criterion")
    if "cuda" in self.device:
        self.criterion = self.criterion.to(self.device)
    mlflow.log_param("criterion", get_object_name(self.criterion))
    mlflow.log_param("optimizer", get_object_name(self.optimizer))

    self.logger.debug("Setup ignite trainer")
    trainer = self._setup_trainer()
    self._setup_trainer_handlers(trainer)

    # Always track the training loss; merge in any user-supplied metrics.
    metrics = {'loss': Loss(self.criterion)}
    metrics.update(self.metrics)

    self.logger.debug("Input data info: ")
    msg = "- train data loader: {} number of batches".format(
        len(self.train_dataloader))
    if isinstance(self.train_dataloader, DataLoader):
        msg += " | {} number of samples".format(
            len(self.train_dataloader.sampler))
    self.logger.debug(msg)

    if isinstance(self.train_dataloader, DataLoader):
        write_model_graph(self.writer,
                          model=self.model,
                          data_loader=self.train_dataloader,
                          device=self.device)

    self.pbar_eval = None
    if self.train_eval_dataloader is not None:
        self.pbar_eval = ProgressBar()
        self._setup_offline_train_metrics_computation(trainer, metrics)

    if self.val_dataloader is not None:
        if self.val_metrics is None:
            self.val_metrics = metrics

        if self.pbar_eval is None:
            self.pbar_eval = ProgressBar()

        val_evaluator = self._setup_val_metrics_computation(trainer)

        if self.reduce_lr_on_plateau is not None:
            # BUGFIX: the failure message previously formatted `metrics`
            # although the membership test is against `self.val_metrics`,
            # which made the diagnostic misleading when they differ.
            assert self.reduce_lr_on_plateau_var in self.val_metrics, \
                "Monitor variable {} is not found in metrics {}" \
                .format(self.reduce_lr_on_plateau_var, self.val_metrics)

            @val_evaluator.on(Events.COMPLETED)
            def update_reduce_on_plateau(engine):
                val_var = engine.state.metrics[self.reduce_lr_on_plateau_var]
                self.reduce_lr_on_plateau.step(val_var)

        def default_score_function(engine):
            val_loss = engine.state.metrics['loss']
            # Objects with highest scores will be retained.
            return -val_loss

        # Setup early stopping:
        if self.early_stopping_kwargs is not None:
            if 'score_function' in self.early_stopping_kwargs:
                es_score_function = self.early_stopping_kwargs[
                    'score_function']
            else:
                es_score_function = default_score_function
            self._setup_early_stopping(trainer, val_evaluator,
                                       es_score_function)

        # Setup model checkpoint:
        if self.model_checkpoint_kwargs is None:
            self.model_checkpoint_kwargs = {
                "filename_prefix": "model",
                "score_name": "val_loss",
                "score_function": default_score_function,
                "n_saved": 3,
                "atomic": True,
                "create_dir": True,
                "save_as_state_dict": True
            }
        self._setup_best_model_checkpointing(val_evaluator)

    self.logger.debug("Setup other handlers")
    if self.lr_scheduler is not None:

        @trainer.on(Events.ITERATION_STARTED)
        def update_lr_scheduler(engine):
            # Per-iteration LR stepping (scheduler granularity is a
            # configuration choice of this task).
            self.lr_scheduler.step()

    self._setup_log_learning_rate(trainer)

    self.logger.info("Start training: {} epochs".format(self.num_epochs))
    mlflow.log_param("num_epochs", self.num_epochs)
    trainer.run(self.train_dataloader, max_epochs=self.num_epochs)
    self.logger.debug("Training is ended")
def training(local_rank, config):
    """Distributed CIFAR10 training entry point (one process per rank).

    Sets up seeding, output directory and optional ClearML tracking on rank
    0, builds dataflow/model/optimizer, runs periodic train/test validation,
    TensorBoard logging and best-model checkpointing, and supports stopping
    at a fixed iteration to test training resumption.
    """
    rank = idist.get_rank()
    # Different seed per rank so data augmentation differs across workers.
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training",
                          distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and
    # compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    # In order to check training resuming we can stop training on a given
    # iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        logger.exception("")
        # Bare `raise` re-raises the active exception with its original
        # traceback intact (instead of `raise e`, which rewrites it here).
        raise

    if rank == 0:
        tb_logger.close()
def run_classification(model, train_loader, val_loader, epochs, early_stopping,
                       lr, momentum, log_interval, experiment_name,
                       continueing=False):
    """Train a classifier with ignite, logging losses/accuracies and gradient
    magnitudes to TensorBoard, checkpointing every epoch (plus a separate
    best-by-validation-accuracy copy) and early-stopping on validation NLL.

    Args:
        model: the network to train (NLL-loss based, e.g. log-softmax output).
        train_loader, val_loader: data loaders for training and validation.
        epochs: maximum number of epochs.
        early_stopping: patience (in validation runs) for early stopping.
        lr, momentum: SGD hyper-parameters.
        log_interval: iteration interval for loss/gradient logging.
        experiment_name: used for TensorBoard run dir and checkpoint names.
        continueing: if True, resume model/optimizer/epoch/best-accuracy from
            the latest checkpoint for this experiment.
    """
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

    writer = SummaryWriter(here / f'tensorboard/runs_{experiment_name}')
    data_loader_iter = iter(train_loader)
    x, y = next(data_loader_iter)
    writer.add_graph(model, x)

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    start_epoch = 0
    start_best_accuracy = 0.0
    if continueing:
        model, optimizer, start_epoch, start_best_accuracy = _load_checkpoint(
            model, optimizer, experiment_name)
    model.train(
    )  # In case the model was saved after a test loop where model.eval() was called

    evaluator = create_supervised_evaluator(model,
                                            device=device,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            })
    evaluator_val = create_supervised_evaluator(model,
                                                device=device,
                                                metrics={
                                                    'accuracy': Accuracy(),
                                                    'nll': Loss(F.nll_loss)
                                                })
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        nn.NLLLoss(),
                                        device=device)

    desc = 'ITERATION - loss: {:.4f}'
    progress_bar = tqdm(initial=0,
                        leave=False,
                        total=len(train_loader),
                        desc=desc.format(0))

    @trainer.on(Events.STARTED)
    def init(engine):
        # Restore bookkeeping when resuming from a checkpoint.
        engine.state.epoch = start_epoch
        engine.state.best_accuracy = start_best_accuracy

    # One iteration = one batch
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # 1-based iteration index within the current epoch
        # (renamed from `iter` to avoid shadowing the builtin).
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            progress_bar.desc = desc.format(engine.state.output)
            progress_bar.update(log_interval)
            writer.add_scalar('training/loss', engine.state.output,
                              engine.state.iteration)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_gradients(engine):
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            for n, p in model.named_parameters():
                if p.requires_grad:
                    writer.add_scalar(f'{n}/gradient',
                                      p.grad.abs().mean(),
                                      engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        progress_bar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        logger.info(
            'Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}'
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar('training/avg_loss', avg_nll, engine.state.epoch)
        writer.add_scalar('training/avg_accuracy', avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results_and_save(engine):
        evaluator_val.run(val_loader)
        metrics = evaluator_val.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        logger.info(
            'Validation Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}'
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        progress_bar.n = progress_bar.last_print_n = 0
        # NOTE(review): 'valdation' is a typo for 'validation'; kept as-is so
        # existing TensorBoard dashboards reading this tag keep working.
        writer.add_scalar('valdation/avg_loss', avg_nll, engine.state.epoch)
        writer.add_scalar('valdation/avg_accuracy', avg_accuracy,
                          engine.state.epoch)

        # Save the model every epoch. If it's the best seen so far, save it
        # separately
        torch.save(
            {
                'epoch': engine.state.epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'accuracy': avg_accuracy,
                'best_accuracy': engine.state.best_accuracy,
                'loss': avg_nll
            }, f'model_latest_{experiment_name}.pt')
        if avg_accuracy > engine.state.best_accuracy:
            engine.state.best_accuracy = avg_accuracy
            shutil.copyfile(f'model_latest_{experiment_name}.pt',
                            f'model_best_{experiment_name}.pt')

    # Early stopping on validation NLL (higher score = better, so negate).
    handler = EarlyStopping(
        patience=early_stopping,
        score_function=(lambda engine: -evaluator_val.state.metrics['nll']),
        trainer=trainer)
    evaluator_val.add_event_handler(Events.COMPLETED, handler)

    trainer.run(train_loader, max_epochs=epochs)
    progress_bar.close()
    writer.close()
trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0][0], x[1][0]))} metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)}) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args,args.model_checkpoint) tb_logger = TensorboardLogger(log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
def __init__(self,
             optimizer: OptimizerType,
             train_loader: DataLoaderType,
             model: torch.nn.Module,
             train_engine: Optional[ignite.engine.Engine] = None,
             test_engine: Optional[ignite.engine.Engine] = None,
             test_loader: Optional[DataLoaderType] = None,
             loss_fn: Optional[LossFnType] = None,
             eval_metric: Optional[ignite.metrics.Metric] = None,
             descending: bool = True,
             device: str = 'cuda') -> None:
    """Build the training wrapper from either ready-made ignite engines or a
    plain model + loss function.

    Exactly one of two configurations must be usable:
      * ``train_engine`` is given and used as-is; or
      * ``model`` (plus ``loss_fn``) is given, and a default supervised
        trainer is created from it.
    A test engine is only set up when ``test_loader`` is provided; it is
    either the supplied ``test_engine`` or a default supervised evaluator
    built from ``model``. The evaluation metric is ``eval_metric`` when
    provided, otherwise ``Loss(loss_fn)``, and is exposed under the name
    ``'loss'``.

    Raises:
        TypeError: when neither ``train_engine`` nor ``model`` is provided,
            or when the default trainer/evaluator would be created without
            the required ``loss_fn``/``model``.
    """
    super().__init__()
    self.descending = descending
    self.optimizer: OptimizerType = optimizer
    self.model: Optional[torch.nn.Module] = model
    self.train_engine: ignite.engine.Engine
    self.train_loader: DataLoaderType = train_loader
    self.test_loader: Optional[DataLoaderType] = test_loader
    self.test_engine: Optional[ignite.engine.Engine]

    # create the train engine if necessary
    # if so, build it from the model and loss_fn
    if train_engine is None and model is None:
        raise TypeError('either train_engine or model have to be provided')
    if train_engine is not None:
        self.train_engine = train_engine  # directly use it
    elif model is not None:
        if loss_fn is None:
            raise TypeError(
                'loss_fn has to be provided if passing a plain pytorch model'
            )
        self.train_engine = ignite.engine.create_supervised_trainer(
            model,
            optimizer,
            loss_fn=loss_fn,
            device=device,
            non_blocking=True)

    # get the metric to use
    new_metric = None
    if eval_metric is not None:
        new_metric = eval_metric
    elif loss_fn is not None:
        # use the given eval_metric if provided, but fallback
        # to using the loss averaged over the entire epoch
        new_metric = Loss(loss_fn)

    # if the test loader is present, then we need an engine for training
    if test_loader is not None:
        # test engine is needed only if we have a test loader
        if test_engine is None:
            if eval_metric is None:
                if loss_fn is None:
                    # error if no metric or loss_fn
                    raise TypeError(
                        'loss_fn has to be provided if using the default evaluator and not '
                        'providing a metric')
            if model is None:
                raise TypeError(
                    'model must be provided if using the default evaluator'
                )
            # create a default test engine
            self.test_engine = ignite.engine.create_supervised_evaluator(
                model,
                metrics={'loss': new_metric},
                device=device,
                non_blocking=True)
        else:
            self.test_engine = test_engine  # use the specified engine
            # attach a new metric if present
            if new_metric is not None:
                new_metric.attach(self.test_engine, 'loss')
    else:
        self.test_engine = None  # no need for a test engine if no test loader specified
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=-1,
    deterministic=False,
):
    """Train a ``Net`` model with ignite, with full checkpoint/resume support.

    Checkpoints trainer/model/optimizer/scheduler every ``checkpoint_every``
    epochs; can deliberately crash at ``crash_iteration`` (when > 0) and can
    resume from a saved checkpoint via ``resume_from``. Weight/grad/data
    statistics are dumped around checkpoints so a resumed run can be compared
    with the original one.
    """
    # Setup seed to have same model's initialization:
    manual_seed(75)

    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    writer = SummaryWriter(log_dir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

    # Setup trainer and evaluator
    if deterministic:
        tqdm.write("Setup deterministic trainer")
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device,
                                        deterministic=deterministic)

    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(criterion)
                                            },
                                            device=device)

    # Apply learning rate scheduling
    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()

    pbar = tqdm(
        initial=0,
        leave=False,
        total=len(train_loader),
        desc=f"Epoch {0} - loss: {0:.4f} - lr: {lr:.4f}",
    )

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        lr_ = optimizer.param_groups[0]["lr"]
        pbar.desc = f"Epoch {engine.state.epoch} - loss: {engine.state.output:.4f} - lr: {lr_:.4f}"
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)
        writer.add_scalar("lr", lr_, engine.state.iteration)

    # Deliberate crash to exercise checkpoint/resume (disabled when <= 0).
    if crash_iteration > 0:

        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception(f"STOP at {engine.state.iteration}")

    if resume_from is not None:

        @trainer.on(Events.STARTED)
        def _(engine):
            # Fast-forward the progress bar to the restored position.
            pbar.n = engine.state.iteration % engine.state.epoch_length

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    # Compute and log validation metrics
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        pbar.n = pbar.last_print_n = 0
        # NOTE(review): 'valdation' looks like a typo for 'validation';
        # left unchanged so existing TensorBoard dashboards keep working.
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    # Setup object to checkpoint
    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
    }
    training_checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=None,
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=checkpoint_every),
                              training_checkpoint)

    # Setup logger to print and dump into file: model weights, model grads
    # and data stats
    #  - first 3 iterations
    #  - 4 iterations after checkpointing
    # This helps to compare resumed training with checkpointed training
    def log_event_filter(e_, event):
        if event in [1, 2, 3]:
            return True
        elif 0 <= (event % (checkpoint_every * e_.state.epoch_length)) < 5:
            return True
        return False

    fp = Path(log_dir) / ("run.log"
                          if resume_from is None else "resume_run.log")
    fp = fp.as_posix()
    for h in [log_data_stats, log_model_weights, log_model_grads]:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(event_filter=log_event_filter),
            h,
            model=model,
            fp=fp)

    if resume_from is not None:
        tqdm.write(f"Resume from the checkpoint: {resume_from}")
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    try:
        # Synchronize random states
        manual_seed(15)
        trainer.run(train_loader, max_epochs=epochs)
    except Exception as e:
        # NOTE(review): the exception is printed but deliberately swallowed so
        # the progress bar and writer below are still closed after the
        # intentional crash -- confirm this best-effort behavior is intended.
        import traceback

        print(traceback.format_exc())

    pbar.close()
    writer.close()
def run(*options, cfg=None, debug=False):
    """Run training and validation of a segmentation model.

    Notes:
        Options can be passed in via the ``options`` argument and loaded from the cfg file.
        Options from default.py will be overridden by options loaded from the cfg file;
        options passed in via the ``options`` argument override both.

    Args:
        *options (str, int, optional): Options used to override what is loaded from the
            config. To see what options are available consult default.py.
        cfg (str, optional): Location of config file to load. Defaults to None.
        debug (bool): Places scripts in debug/test mode and only executes a few iterations.
    """
    # Configuration: merge CLI options and optional config file into `config`.
    update_config(config, options=options, config_file=cfg)
    # The model will be saved under: outputs/<config_file_name>/<model_dir>
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]
    try:
        output_dir = generate_path(
            config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),
        )
    # NOTE(review): bare except silently falls back when git metadata is unavailable;
    # consider narrowing to the specific exception raised by git_branch()/git_hash().
    except:
        output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),)

    # Logging:
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)

    # Set CUDNN benchmark mode (autotunes conv kernels for fixed input sizes):
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    # We will write the model under outputs / config_file_name / model_dir
    # NOTE(review): this duplicates the identical assignment above and is redundant.
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

    # Fix random seeds for reproducibility:
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Augmentation pipeline applied to every sample (train and val):
    # normalize, pad to patch size, resize, then pad to the configured output size.
    basic_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=1),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,  # 255 marks padded mask pixels; matches loss ignore_index below
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,
            ),
        ]
    )
    # Optionally add horizontal flips for training only.
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    # Training and Validation Loaders:
    TrainPatchLoader = get_patch_loader(config)
    logging.info(f"Using {TrainPatchLoader}")
    train_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="train",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=train_aug,
        debug=debug,
    )
    logger.info(train_set)
    n_classes = train_set.n_classes
    val_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="val",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=val_aug,
        debug=debug,
    )
    logger.info(val_set)

    # In debug mode shrink both sets so a full run completes in a few iterations.
    if debug:
        logger.info("Running in debug mode..")
        train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES))
        val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))

    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True
    )
    val_loader = data.DataLoader(
        val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1
    )  # config.WORKERS)

    # Model:
    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Optimizer and LR Scheduler:
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )
    # Snapshot duration (in iterations) defines one cosine-annealing cycle; shortened in debug mode.
    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader)
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration
    )

    # Tensorboard writer:
    summary_writer = create_summary_writer(log_dir=path.join(output_dir, "logs"))

    # Class weights are inversely proportional to the frequency of the classes in the training set.
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    # Loss: ignore_index=255 matches the mask padding value used in the augmentations above.
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean")

    # Ignite trainer and evaluator:
    trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device)
    # Adapt the evaluator's output dict to the (y_pred, y) pairs the metrics expect.
    transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze())
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll": Loss(criterion, output_transform=transform_fn),
            "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device),
            "cacc": class_accuracy(n_classes, output_transform=transform_fn),
            "mca": mean_class_accuracy(n_classes, output_transform=transform_fn),
            "ciou": class_iou(n_classes, output_transform=transform_fn),
            "mIoU": mean_iou(n_classes, output_transform=transform_fn),
        },
        device=device,
    )
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Logging:
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer))

    # Tensorboard and Logging:
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer))
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer))

    # Add specific logger which also triggers printed metrics on the training set.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training")
        logging_handlers.log_metrics(engine, evaluator, stage="Training")

    # Add specific logger which also triggers printed metrics on the validation set.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation")
        logging_handlers.log_metrics(engine, evaluator, stage="Validation")
        # Dump validation set metrics at the very end, for debugging purposes only.
        if engine.state.epoch == config.TRAIN.END_EPOCH and debug:
            fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
            metrics = evaluator.state.metrics
            out_dict = {x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"]}
            with open(fname, "w") as fid:
                json.dump(out_dict, fid)
            log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys())
            logging.info(log_msg)

    # Checkpointing: snapshotting trained models to disk at the end of each annealing cycle.
    checkpoint_handler = SnapshotHandler(
        output_dir,
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        lambda: (trainer.state.iteration % snapshot_duration) == 0,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model})

    logger.info("Starting training")
    trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
    summary_writer.close()
model = models.alexnet(pretrained=True) model = model.to(device) loss_func = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=lr) # --------------------------------- Training ---------------------------------- # Set up pytorch-ignite trainer and evaluator. trainer = create_supervised_trainer( model, optimizer, loss_func, device=device, ) metrics = { "accuracy": Accuracy(), "loss": Loss(loss_func), } evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) @trainer.on(Events.ITERATION_COMPLETED(every=print_every)) def log_batch(trainer): batch = (trainer.state.iteration - 1) % trainer.state.epoch_length + 1 print(f"Epoch {trainer.state.epoch} / {num_epochs}, " f"batch {batch} / {trainer.state.epoch_length}: " f"loss: {trainer.state.output:.3f}") @trainer.on(Events.EPOCH_COMPLETED) def log_epoch(trainer): print(f"Epoch {trainer.state.epoch} / {num_epochs} average results: ")
if args.loss_fn == "MSE": loss_fn = nn.MSELoss(reduction='sum').to(device) print("use MSELoss") elif args.loss_fn == "L1": loss_fn = nn.L1Loss(reduction='sum').to(device) print("use L1Loss") optimizer = torch.optim.Adam(model.parameters(), args.lr, weight_decay=args.decay) trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device) evaluator_train = create_supervised_evaluator(model, metrics={ 'mae': CrowdCountingMeanAbsoluteError(), 'mse': CrowdCountingMeanSquaredError(), 'loss': Loss(loss_fn) }, device=device) evaluator_validate = create_supervised_evaluator(model, metrics={ 'mae': CrowdCountingMeanAbsoluteError(), 'mse': CrowdCountingMeanSquaredError(), 'loss': Loss(loss_fn) }, device=device) print(model) print(args) # timer train_timer = Timer(average=True) # time to train whole epoch
def test_step(engine, batch): global model, g with th.no_grad(): model.eval() model = model.to(gpu) g = g.to(gpu) (idx, ) = [x.to(gpu) for x in batch] y_pred = model(g, idx) y_true = g.ndata['label'][idx] return y_pred, y_true evaluator = Engine(test_step) metrics = {'acc': Accuracy(), 'nll': Loss(th.nn.NLLLoss())} for n, f in metrics.items(): f.attach(evaluator, n) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer): evaluator.run(idx_loader_train) metrics = evaluator.state.metrics train_acc, train_nll = metrics["acc"], metrics["nll"] evaluator.run(idx_loader_val) metrics = evaluator.state.metrics val_acc, val_nll = metrics["acc"], metrics["nll"] evaluator.run(idx_loader_test) metrics = evaluator.state.metrics test_acc, test_nll = metrics["acc"], metrics["nll"]
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train `Net` with SGD/NLL-loss, reporting progress through a tqdm bar.

    After every epoch the evaluator recomputes accuracy and average NLL on
    both the training and validation loaders, and per-event timings are
    printed when training completes.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)

    model = Net()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    trainer.logger = setup_logger("trainer")

    eval_metrics = {"accuracy": Accuracy(), "nll": Loss(F.nll_loss)}
    evaluator = create_supervised_evaluator(model, metrics=eval_metrics, device=device)
    evaluator.logger = setup_logger("evaluator")

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(trainer_engine):
        # Refresh the bar caption with the most recent batch loss.
        pbar.desc = desc.format(trainer_engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer_engine):
        pbar.refresh()
        evaluator.run(train_loader)
        results = evaluator.state.metrics
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                trainer_engine.state.epoch, results["accuracy"], results["nll"]
            )
        )

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer_engine):
        evaluator.run(val_loader)
        results = evaluator.state.metrics
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                trainer_engine.state.epoch, results["accuracy"], results["nll"]
            )
        )
        # Rewind the bar so the next epoch starts from zero.
        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time(trainer_engine):
        event_name = trainer.last_event_name.name
        tqdm.write("{} took {} seconds".format(event_name, trainer.state.times[event_name]))

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def run(tb, vb, lr, epochs, writer):
    """Train an EfficientNet-b3 open-set classifier with the DOC loss.

    Args:
        tb: training batch size (passed to get_dataloaders).
        vb: validation batch size (passed to get_dataloaders).
        lr: initial SGD learning rate.
        epochs: number of training epochs.
        writer: TensorBoard SummaryWriter used for text and scalar logging.
    """
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    train_loader, train4val_loader, val_loader, num_of_images, mapping = get_dataloaders(
        tb, vb)
    # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb)
    # Class weights are inversely proportional to per-class image counts, normalized to sum to 1.
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images))
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained(
        'efficientnet-b3', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics

    class DOCLoss(nn.Module):
        # Deep Open Classification loss: one-vs-rest sigmoid cross-entropy where
        # the target class contribution is flipped (1 - sigmoid).
        def __init__(self, weight):
            super(DOCLoss, self).__init__()
            self.class_weights = weight

        def forward(self, input, target):
            sigmoid = 1 - 1 / (1 + torch.exp(-input))
            # Flip the probability at the target class position for each row.
            sigmoid[range(0, sigmoid.shape[0]), target] = 1 - sigmoid[range(0, sigmoid.shape[0]), target]
            sigmoid = torch.log(sigmoid)
            if self.class_weights is not None:
                loss = -torch.sum(sigmoid * self.class_weights)
            else:
                loss = -torch.sum(sigmoid)
            return loss

    train_metrics = {
        'accuracy': Accuracy(),
        'loss': Loss(DOCLoss(weight=weights)),
        'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(), Recall(), train_loader.dataset.classes),
        'cmatrix': MetricsLambda(CMatrixTable, ConfusionMatrix(INFO['dataset-info']['num-of-classes']), train_loader.dataset.classes)
    }

    def val_pred_transform(output):
        # Remap training-class scores into validation-class columns and reserve an
        # extra UNKNOWN column (forced to -inf so it is never predicted directly).
        y_pred, y = output
        new_y_pred = torch.zeros(
            (y_pred.shape[0], INFO['dataset-info']['num-of-classes'] + 1)).to(device=device)
        for ind, c in enumerate(train_loader.dataset.classes):
            new_col = val_loader.dataset.class_to_idx[c]
            new_y_pred[:, new_col] += y_pred[:, ind]
        ukn_ind = val_loader.dataset.class_to_idx['UNKNOWN']
        import math
        new_y_pred[:, ukn_ind] = -math.inf
        return new_y_pred, y

    val_metrics = {
        'accuracy': Accuracy(),
        'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(val_pred_transform), Recall(val_pred_transform), val_loader.dataset.classes),
        'cmatrix': MetricsLambda(
            CMatrixTable,
            ConfusionMatrix(INFO['dataset-info']['num-of-classes'] + 1, output_transform=val_pred_transform),
            val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model, optimizer, DOCLoss(weight=weights), device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model, metrics=train_metrics, device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks

    # Update process bar on each iteration completed.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        # NOTE(review): `iter` shadows the builtin; harmless here but worth renaming.
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    # Compute metrics on train data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
Training Results - Epoch: {}
Avg accuracy: {:.4f}
Avg loss: {:.4f}
precision_recall: \n{}
confusion matrix: \n{}
""".format(engine.state.epoch, avg_accuracy, avg_loss, precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy}, engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss}, engine.state.epoch)

    # Compute metrics on val data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
Validating Results - Epoch: {}
Avg accuracy: {:.4f}
precision_recall: \n{}
confusion matrix: \n{}
""".format(engine.state.epoch, avg_accuracy, precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy}, engine.state.epoch)
        writer.add_scalars(
            'Aggregate/Score', {
                'Val avg precision': precision_recall['data'][0, -1],
                'Val avg recall': precision_recall['data'][1, -1]
            }, engine.state.epoch)
        # Rewind the progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0

    # Save model every N epochs.
    # NOTE(review): `save_interval` was removed in newer ignite releases; this call
    # assumes the older ModelCheckpoint API — confirm the pinned ignite version.
    save_model_handler = ModelCheckpoint(os.environ['savedir'], '', save_interval=50, n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler, {'model': model})

    # Update learning-rate due to scheduler.
    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def test_zero_div():
    """Loss.compute() must raise NotComputableError before any update."""
    metric = Loss(nll_loss)
    with pytest.raises(NotComputableError):
        metric.compute()
batch_size=25, shuffle=True) val_loader = data.DataLoader(TGSSaltDataset(validate_images, validate_masks), batch_size=50, shuffle=False) learning_rate = 1e-4 loss_fn = torch.nn.BCELoss() optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) trainer = create_supervised_trainer(model, optimizer, loss_fn, device="cuda") evaluator = create_supervised_evaluator(model, device="cuda", metrics={ 'accuracy': BinaryAccuracy(), 'my_loss': Loss(loss_fn) }) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): #print("Epoch[{}] Loss: {:.2f}".format(trainer.state.epoch, trainer.state.output)) pass @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer): evaluator.run(train_loader) metrics = evaluator.state.metrics print( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".
def train(prefix, epochs, batch_size, num_workers, embedding_size, num_layers,
          learning_rate, weight_decay, model_dir, run_dir):
    """Train the DilatedConvolutional embedding model on the triggered-earthquake
    dataset, periodically saving checkpoints, embeddings, SVC classifiers, and a
    metadata file for later inference.

    Args:
        prefix: run-name prefix used for the output directory and saved artifacts.
        epochs: number of training epochs.
        batch_size, num_workers: DataLoader settings.
        embedding_size, num_layers: DilatedConvolutional hyperparameters.
        learning_rate, weight_decay: Adam optimizer settings.
        model_dir: root directory for model artifacts (a timestamped subdir is created).
        run_dir: root directory for TensorBoard logs.
    """
    # Timestamped run name keeps artifacts from different runs separate.
    ts = datetime.now().strftime("%m_%d_%Y__%H_%M")
    run_name = '{}_{}'.format(prefix, ts)
    model_dir = os.path.join(model_dir, run_name)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    writer = SummaryWriter(os.path.join(run_dir, run_name))

    ds_train = TriggeredEarthquake(
        mode=DatasetMode.TRAIN,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE)
    ds_test = TriggeredEarthquake(
        mode=DatasetMode.TEST,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        transform=triggered_earthquake_transform(random_trim_offset=False))
    # ds_train = SiameseDataset(ds_train)
    train_loader = DataLoader(ds_train, batch_size=batch_size, num_workers=num_workers, shuffle=True)
    test_loader = DataLoader(ds_test, batch_size=batch_size, num_workers=num_workers, shuffle=True)

    model = DilatedConvolutional(embedding_size=embedding_size, num_layers=num_layers)
    # Only optimize parameters that require gradients.
    params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
    loss_fn = DeepClusteringLoss()
    trainer = create_engine(model, optimizer, loss_fn, device)
    evaluator = create_eval(model, {'dcl': Loss(loss_fn)}, device)

    # Print a layer summary and log the graph to TensorBoard using one real batch.
    summary(
        model,
        (1, gin.query_parameter('triggered_earthquake_transform.target_length')))
    writer.add_graph(model, next(iter(train_loader))[0].unsqueeze(1).to(device))

    # Keep only the latest checkpoint on disk.
    save_handler = ModelCheckpoint(model_dir, prefix, n_saved=1, create_dir=True, require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_handler, {'model': model})

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        """Report per-iteration training loss to TensorBoard."""
        writer.add_scalar('Iter/train_loss', trainer.state.output, trainer.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(_):
        """Report per-epoch training loss to TensorBoard and stdout."""
        # NOTE(review): evaluator.run computes the 'dcl' metric but its result is
        # unused; the value logged is the last trainer batch loss — confirm intent.
        evaluator.run(train_loader)
        loss = trainer.state.output
        writer.add_scalar('Loss/train', loss, trainer.state.epoch)
        print("Training Results - Epoch: {} Avg loss: {:.2f}".format(
            trainer.state.epoch, trainer.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def test_acc(_):
        """Report testing accuracy (via an SVC fit on embeddings) each epoch."""
        acc, cm, _, = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        writer.add_scalar('Accurarcy/test', acc, trainer.state.epoch)
        print('Testing Accurarcy: {:.2f}'.format(acc))
        print(cm)

    def report_embeddings(_):
        """Write train/test embeddings (with text labels) to TensorBoard."""
        # Fresh batch_size=1 loaders so each sample becomes one embedding point.
        train_loader = DataLoader(ds_train, batch_size=1)
        test_loader = DataLoader(ds_test, batch_size=1)
        text_labels = gin.query_parameter('triggered_earthquake_dataset.labels')
        train_embeddings, train_labels = get_embeddings(model, train_loader, device=device)
        train_labels = [
            text_labels[np.argmax(l)] for l in train_labels.squeeze(1)
        ]
        writer.add_embedding(train_embeddings.squeeze(1),
                             metadata=train_labels,
                             global_step=trainer.state.epoch,
                             tag='train_embeddings')
        test_embeddings, test_labels = get_embeddings(model, test_loader, device=device)
        test_labels = [
            text_labels[np.argmax(l)] for l in test_labels.squeeze(1)
        ]
        writer.add_embedding(test_embeddings.squeeze(1),
                             metadata=test_labels,
                             global_step=trainer.state.epoch,
                             tag='test_embeddings')

    # Log embeddings after the first epoch and then every 5 epochs.
    trainer.add_event_handler(Events.EPOCH_COMPLETED(once=1), report_embeddings)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=5), report_embeddings)

    @trainer.on(Events.COMPLETED)
    def save_classifier(_):
        '''
        Create and save two SVC classifiers in the model_dir:
        - one trained with only training data
        - one trained with all data (for running inference)
        '''
        # save classifier only trained on training data
        _, _, classifier = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        with open(os.path.join(model_dir, '{}_classifier.p'.format(prefix)), 'wb') as f:
            pickle.dump(classifier, f)
        # save classifier trained on all data (for running inference)
        ds = TriggeredEarthquake(
            data_dir=gin.query_parameter(
                'triggered_earthquake_dataset.data_dir'),
            testing_quakes=[],
            downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
            mode=DatasetMode.INFERENCE,
            transform=triggered_earthquake_transform(random_trim_offset=False),
        )
        loader = DataLoader(ds, batch_size=1, num_workers=10, shuffle=True)
        classifier_alldata = create_classifier(model, loader, type='svc', device=device)
        with open(
                os.path.join(model_dir, '{}_svc_classifier.p'.format(prefix)),
                'wb') as f:
            pickle.dump(classifier_alldata, f)

    @trainer.on(Events.COMPLETED)
    def save_metadata(_):
        '''
        Save a metadata file (paths, labels, hyperparameters) used for inference.
        '''
        transformer = triggered_earthquake_transform(random_trim_offset=False)
        transformer_path = os.path.join(model_dir, 'transformer.p')
        pickle.dump(transformer, open(transformer_path, 'wb'))
        metadata = {
            'name': run_name,
            'classes': gin.query_parameter('triggered_earthquake_dataset.labels'),
            'model_state_path': save_handler.last_checkpoint,
            'classifier_path': os.path.join(model_dir, '{}_classifier.p'.format(prefix)),
            'embedding_size': embedding_size,
            'num_layers': num_layers,
            'transformer': transformer_path
        }
        with open(os.path.join(model_dir, 'metadata.json'), 'w') as f:
            json.dump(metadata, f)

    trainer.run(train_loader, max_epochs=epochs)
    writer.close()
def train_with_ignite(networks, dataset, data_dir, batch_size, img_size, epochs,
                      lr, momentum, num_workers, optimizer, logger):
    """Train a binary segmentation network with pytorch-ignite.

    Builds model/loss/optimizer, joint image+mask transforms, train/test loaders,
    then runs an ignite trainer that logs losses, evaluates multi-threshold
    metrics each epoch, steps an LR scheduler on validation loss, and saves a
    checkpoint file per epoch.

    Args:
        networks: network architecture name resolved by get_network().
        dataset, data_dir: dataset name and root directory for get_loader().
        batch_size, img_size, epochs, lr, momentum, num_workers: training settings.
        optimizer: optimizer name resolved by get_optimizer().
        logger: logger used for progress and metric reports.
    """
    from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
    from ignite.metrics import Loss
    from utils.metrics import MultiThresholdMeasures, Accuracy, IoU, F1score

    # device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # build model
    model = get_network(networks)

    # log model summary
    input_size = (3, img_size, img_size)
    summarize_model(model.to(device), input_size, logger, batch_size, device)

    # build loss
    loss = torch.nn.BCEWithLogitsLoss()

    # build optimizer and scheduler
    model_optimizer = get_optimizer(optimizer, model, lr, momentum)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer)

    # transforms on both image and mask
    train_joint_transforms = jnt_trnsf.Compose([
        jnt_trnsf.RandomCrop(img_size),
        jnt_trnsf.RandomRotate(5),
        jnt_trnsf.RandomHorizontallyFlip()
    ])

    # transforms only on images (ImageNet normalization stats)
    train_image_transforms = std_trnsf.Compose([
        std_trnsf.ColorJitter(0.05, 0.05, 0.05, 0.05),
        std_trnsf.ToTensor(),
        std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    test_joint_transforms = jnt_trnsf.Compose([jnt_trnsf.Safe32Padding()])
    test_image_transforms = std_trnsf.Compose([
        std_trnsf.ToTensor(),
        std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # transforms only on mask
    mask_transforms = std_trnsf.Compose([std_trnsf.ToTensor()])

    # build train / test loader
    train_loader = get_loader(dataset=dataset,
                              data_dir=data_dir,
                              train=True,
                              joint_transforms=train_joint_transforms,
                              image_transforms=train_image_transforms,
                              mask_transforms=mask_transforms,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers)
    test_loader = get_loader(dataset=dataset,
                             data_dir=data_dir,
                             train=False,
                             joint_transforms=test_joint_transforms,
                             image_transforms=test_image_transforms,
                             mask_transforms=mask_transforms,
                             batch_size=1,
                             shuffle=False,
                             num_workers=num_workers)

    # build trainer / evaluator with ignite
    trainer = create_supervised_trainer(model, model_optimizer, loss, device=device)
    measure = MultiThresholdMeasures()
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                '': measure,
                                                'pix-acc': Accuracy(measure),
                                                'iou': IoU(measure),
                                                'loss': Loss(loss),
                                                'f1': F1score(measure),
                                            },
                                            device=device)

    # initialize state variable for checkpoint
    # NOTE(review): handlers below call update_state() without reassigning `state`;
    # this presumably mutates a shared module-level state — confirm in update_state.
    state = update_state(model.state_dict(), 0, 0, 0, 0, 0)

    # make ckpt path template (filled per-epoch in log_validation_results)
    ckpt_root = './ckpt/'
    filename = '{network}_{optimizer}_lr_{lr}_epoch_{epoch}.pth'
    ckpt_path = os.path.join(ckpt_root, filename)

    # execution after every training iteration
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        # Iteration index within the current epoch (1-based).
        num_iter = (trainer.state.iteration - 1) % len(train_loader) + 1
        if num_iter % 20 == 0:
            logger.info("Epoch[{}] Iter[{:03d}] Loss: {:.2f}".format(
                trainer.state.epoch, num_iter, trainer.state.output))

    # execution after every training epoch
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        # evaluate on training set
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        logger.info(
            "Training Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IoU: {}\n F1: {}\n"
            .format(trainer.state.epoch, metrics['loss'],
                    str(metrics['pix-acc']), str(metrics['iou']),
                    str(metrics['f1'])))
        # update state
        update_state(weight=model.state_dict(),
                     train_loss=metrics['loss'],
                     val_loss=state['val_loss'],
                     val_pix_acc=state['val_pix_acc'],
                     val_iou=state['val_iou'],
                     val_f1=state['val_f1'])

    # execution after every epoch
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        # evaluate test(validation) set
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        logger.info(
            "Validation Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IoU: {}\n F1: {}\n"
            .format(trainer.state.epoch, metrics['loss'],
                    str(metrics['pix-acc']), str(metrics['iou']),
                    str(metrics['f1'])))
        # update scheduler (ReduceLROnPlateau keys off validation loss)
        lr_scheduler.step(metrics['loss'])
        # update and save state
        update_state(weight=model.state_dict(),
                     train_loss=state['train_loss'],
                     val_loss=metrics['loss'],
                     val_pix_acc=metrics['pix-acc'],
                     val_iou=metrics['iou'],
                     val_f1=metrics['f1'])
        path = ckpt_path.format(network=networks,
                                optimizer=optimizer,
                                lr=lr,
                                epoch=trainer.state.epoch)
        save_ckpt_file(path, state)

    trainer.run(train_loader, max_epochs=epochs)
collate_fn=collate_fn) val_loader = DataLoader(dataset, batch_size=4, sampler=val_sampler, drop_last=False, collate_fn=collate_fn) #bertmodel = BertModel.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') for names, parameters in model.bert.named_parameters(): parameters.requiers_grad = False #optimizer = Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3) optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False) criterion = nn.CrossEntropyLoss() metrics = {'loss': Loss(criterion), 'accuracy': Accuracy()} trainer = BertTrainer(model, optimizer, newbob_period=3, checkpoint_dir='./checkpoints/bert', metrics=metrics, non_blocking=True, retain_graph=True, patience=3, loss_fn=criterion, device=DEVICE, parallel=False) trainer.fit(train_loader, val_loader, epochs=10) trainer = BertTrainer(model, optimizer=None, checkpoint_dir='./checkpoints/bert',
def run(tb, vb, lr, epochs, writer):
    """Train an EfficientNet-b3 classifier with class-weighted cross-entropy,
    remapping known-class predictions onto validation classes (plus UNKNOWN)
    for open-set evaluation.

    Args:
        tb: training batch size (passed to get_dataloaders).
        vb: validation batch size (passed to get_dataloaders).
        lr: initial SGD learning rate.
        epochs: number of training epochs.
        writer: TensorBoard SummaryWriter used for text and scalar logging.
    """
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    # `mapping` translates training-class columns to validation-class columns.
    train_loader, train4val_loader, val_loader, num_of_images, mapping = get_dataloaders(
        tb, vb)
    # Class weights are inversely proportional to per-class image counts, normalized to sum to 1.
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained(
        'efficientnet-b3', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics
    train_metrics = {
        'accuracy': Accuracy(),
        'loss': Loss(nn.CrossEntropyLoss(weight=weights)),
        'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(), Recall(), train_loader.dataset.classes),
        'cmatrix': MetricsLambda(CMatrixTable, ConfusionMatrix(INFO['dataset-info']['num-of-classes']), train_loader.dataset.classes)
    }

    def val_pred_transform(output):
        # Fold training-class scores into known validation classes; several
        # training classes may map to UNKNOWN, where the max score is kept.
        y_pred, y = output
        new_y_pred = torch.zeros(
            (y_pred.shape[0], len(INFO['dataset-info']['known-classes']) + 1)).to(device=device)
        for c in range(y_pred.shape[1]):
            if c == 0:
                new_y_pred[:, mapping[c]] += y_pred[:, c]
            elif mapping[c] == val_loader.dataset.class_to_idx['UNKNOWN']:
                # Keep the element-wise maximum across classes mapped to UNKNOWN.
                new_y_pred[:, mapping[c]] = torch.where(
                    new_y_pred[:, mapping[c]] > y_pred[:, c],
                    new_y_pred[:, mapping[c]], y_pred[:, c])
            else:
                new_y_pred[:, mapping[c]] += y_pred[:, c]
        return new_y_pred, y

    val_metrics = {
        'accuracy': Accuracy(val_pred_transform),
        'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(val_pred_transform), Recall(val_pred_transform), val_loader.dataset.classes),
        'cmatrix': MetricsLambda(
            CMatrixTable,
            ConfusionMatrix(len(INFO['dataset-info']['known-classes']) + 1, output_transform=val_pred_transform),
            val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model, optimizer, nn.CrossEntropyLoss(weight=weights), device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model, metrics=train_metrics, device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks

    # Update progress bar on each iteration completed.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        # NOTE(review): `iter` shadows the builtin; harmless here but worth renaming.
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    # Compute metrics on train data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
<Training> Results - Epoch: {}
Avg accuracy: {:.4f}
Avg loss: {:.4f}
precision_recall: \n{}
confusion matrix: \n{}
""".format(engine.state.epoch, avg_accuracy, avg_loss, precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy}, engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss}, engine.state.epoch)

    # Compute metrics on val data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
<Validating> Results - Epoch: {}
Avg accuracy: {:.4f}
precision_recall: \n{}
confusion matrix: \n{}
""".format(engine.state.epoch, avg_accuracy, precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy}, engine.state.epoch)
        writer.add_scalars(
            'Aggregate/Score', {
                'Val avg precision': precision_recall['data'][0, -1],
                'Val avg recall': precision_recall['data'][1, -1]
            }, engine.state.epoch)
        # Rewind the progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0

    # Update learning-rate due to scheduler.
    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def main():
    """Train the head-pattern classifier on normalized attention images.

    Loads the image/label data, optionally holds out a stratified validation
    split, trains the model with ignite Engines, saves the best model (by
    validation accuracy, or the final model when no validation split is used)
    and optionally prints confusion matrices.
    """
    set_seed(13)  # fixed seed for reproducible split and initialization
    batch_size = 64
    nb_epochs = 200
    val_set_size = 0.2  # fraction held out for validation; 0 disables the split
    print_report = True
    data_dir = Path('../data/head_classification_data/normed/')
    train_dir = data_dir.joinpath('data/')
    labels_filename = data_dir.joinpath('attention_norm_annotated.tsv')
    model_filename = '../models/head_classifier/classify_normed_patterns.tar'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # load data
    images, labels, label2id, min_max_size = load_data(train_dir, labels_filename)
    if val_set_size > 0:
        # stratify keeps the class distribution identical in both splits
        images_train, images_val, labels_train, labels_val = train_test_split(
            images, labels, test_size=val_set_size, stratify=labels)
    else:
        images_train, labels_train = images, labels
        images_val, labels_val = None, None
    print(f'Train: {images_train.shape} {labels_train.shape}')
    if labels_val is not None:
        print(f'Val: {images_val.shape}, {labels_val.shape}')

    dataset_train = torch.utils.data.TensorDataset(
        torch.from_numpy(images_train), torch.from_numpy(labels_train))
    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size=batch_size, shuffle=True)
    if labels_val is not None:
        dataset_val = torch.utils.data.TensorDataset(
            torch.from_numpy(images_val), torch.from_numpy(labels_val))
        data_loader_val = torch.utils.data.DataLoader(
            dataset_val, batch_size=batch_size, shuffle=False)

    model = Net(len(label2id))
    model = model.to(device)
    init_weights(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = torch.nn.CrossEntropyLoss()

    def update_function(engine, batch):
        # One optimization step; returns the batch loss for logging.
        model.train()
        optimizer.zero_grad()
        inputs, targets = [x.to(device) for x in batch]
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        return loss

    def inference_function(engine, batch):
        # Forward pass without gradients; returns (outputs, targets) for metrics.
        model.eval()
        with torch.no_grad():
            inputs, targets = [x.to(device) for x in batch]
            outputs = model(inputs)
            return outputs, targets

    trainer = Engine(update_function)
    evaluator = Engine(inference_function)
    metrics = [
        ('loss', Loss(torch.nn.CrossEntropyLoss())),
        ('accuracy', Accuracy()),
    ]
    for name, metric in metrics:
        metric.attach(evaluator, name)
    best_val_acc = 0

    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch_completed(engine):
        nonlocal best_val_acc
        evaluator.run(data_loader_train)
        metrics_train = format_metrics_str(evaluator.state.metrics)
        if labels_val is not None:
            evaluator.run(data_loader_val)
            metrics_val = format_metrics_str(evaluator.state.metrics)
            acc_val = evaluator.state.metrics['accuracy']
            # >= so ties still refresh the checkpoint with the newer weights
            if acc_val >= best_val_acc:
                save_model(model_filename, model, label2id, min_max_size)
                best_val_acc = acc_val
        else:
            metrics_val = {}
        print(
            f'Epoch: {engine.state.epoch} | Train: {metrics_train} | Val: {metrics_val}'
        )

    trainer.run(data_loader_train, max_epochs=nb_epochs)
    if labels_val is None:
        # no validation set: keep the final model instead of a "best" one
        save_model(model_filename, model, label2id, min_max_size)
    else:
        print(f'Best val accuracy: {best_val_acc}')

    if print_report:
        print('Train classification report')
        plot_confusion_matrix(model, data_loader_train, device, label2id)
        # Fix: only announce (and plot) the validation report when a
        # validation split actually exists; previously the header printed
        # even when there was nothing to report.
        if labels_val is not None:
            print('Val classification report')
            plot_confusion_matrix(model, data_loader_val, device, label2id)
train_set = NonterminalFeaturesDataset(os.path.join( args.data_dir, 'train')) train_loader = DataLoader(train_set, batch_size=None) val_set = NonterminalFeaturesDataset(os.path.join(args.data_dir, 'val')) val_loader = DataLoader(train_set, batch_size=None) gen_set = NonterminalFeaturesDataset(os.path.join(args.data_dir, 'gen')) gen_loader = DataLoader(train_set, batch_size=None) classifier = Classifier(args.features, args.nonterminals) classifier = classifier.to(device) optimizer = torch.optim.SGD(classifier.parameters(), lr=args.lr) # Trainer and metrics save_dict = {'classifier': classifier} trainer = Engine(step_train(classifier, optimizer)) metric_names = ['loss', 'accuracy'] RunningAverage(Loss(F.cross_entropy, lambda x: (x['y_pred'], x['y_true']))).attach(trainer, 'loss') RunningAverage(Accuracy(lambda x: (x['y_pred'], x['y_true']))).attach( trainer, 'accuracy') # Evaluator and metrics evaluator = Engine(step_train(classifier, None, train=False)) Accuracy(lambda x: (x['y_pred'], x['y_true'])).attach( evaluator, 'accuracy') # Begin training run(args.run_name, save_dict, metric_names, trainer, evaluator, train_loader, val_loader, gen_loader, args.epochs, 'accuracy')
def train(NN_index, trainsetsize, log, max_epoch):
    """Train one perovskite-property network and log the run to TensorBoard.

    Parameters
    ----------
    NN_index : index of this network, used in checkpoint and log directory
        names (presumably to distinguish ensemble members — TODO confirm).
    trainsetsize : number of samples used when generating a fresh train set.
    log : root directory containing the ``run_<n>`` log folders.
    max_epoch : number of epochs to train for.

    Side effects: reads/writes ``traindata.npy``/``valdata.npy`` in the
    working directory, creates a new ``al_<k>`` log folder, and saves
    model/optimizer state both next to the script and inside the log folder.
    """
    if_gpu = True
    if if_gpu:
        # Double precision by default; falls back to CPU tensors without CUDA.
        torch.set_default_tensor_type(torch.cuda.DoubleTensor if torch.cuda.is_available() else torch.DoubleTensor)
        device = "cuda:0"
        # print("Graphics Power!")
    else:
        torch.set_default_tensor_type(torch.DoubleTensor)
        device = None
    # file = 'grouprow.npy'
    # file = 'full_len.npy'
    file = 'data11.npy'  # feat1A, feat1B, feat1C, feat2A, feat2B
    if os.path.isfile('traindata.npy'):
        # Splits already exist on disk; still parse the raw file to recover
        # the feature metadata (feattotal etc.) used below.
        newdata, elementdict, featperelem, datavariables, feattotal = generateData(file)
        print("loaded given datasets")
        train_data = np.load(open('traindata.npy', 'rb'))
        val_data = np.load(open('valdata.npy', 'rb'))
    else:
        # disable features in classes to gen new data
        newdata, elementdict, featperelem, datavariables, feattotal = generateData(file)  # insert filename
        print("Shape of read data: ", newdata.shape)
        print("generating random files")
        # create_dataset writes fresh traindata.npy / valdata.npy to disk.
        create_dataset(newdata, trainsetsize)
        train_data = np.load(open('traindata.npy', 'rb'))
        val_data = np.load(open('valdata.npy', 'rb'))
    # newdata = znormalize(newdata)
    # train_data, val_data = getRandomSets(newdata)  # now in create_dataset class
    # Z-normalize both splits with the *training* mean/stddev.
    mean, stnddev = get_mean_stndev(train_data)
    # normalization
    train_data = (train_data - mean) / stnddev
    val_data = (val_data - mean) / stnddev
    # which one???????????????????????? (NOTE(review): unresolved question in the
    # original — whether the first column should be excluded from normalization)
    # val_data = (val_data[:, 1::] - mean[1::]) / stnddev[1::]
    # print("val data shape and ex:", val_data.shape, val_data[0])
    train_set, val_set = PerovskiteDataset(train_data), PerovskiteDataset(val_data)
    # Variable batch and set loader
    train_batchsize = 1000
    val_batchsize = 10000  # len(val_data) # 231472 # all or small like 2000 ?
    train_loader, val_loader = DataLoader(train_set, batch_size=train_batchsize, shuffle=True, drop_last=False), \
        DataLoader(val_set, batch_size=val_batchsize, drop_last=True)  # shuffle=True
    # model = get_NN(feattotal)
    model = get_CNN(feattotal)
    # Shape for saving netstucture
    modelform = str(model)
    # print("Type:", type(modelform))
    # summary(netz, (1, train_batchsize, int(feattotal)))  # channel, H ,W
    lossMAE = nn.L1Loss()  # MAE # to ignite
    lossMSE = nn.MSELoss()
    # torch.optim.SGD(params, lr=0.01)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # list of trainers?
    # NOTE(review): `std=stnddev[0]` is not an argument of stock ignite
    # factories — these must be project-local wrappers that de-normalize
    # predictions before the loss; confirm against their definitions.
    trainer = create_supervised_trainer(model, optimizer, lossMAE, std=stnddev[0], prepare_batch=prepare_batch)  # model[:]
    evaluator = create_supervised_evaluator(model, std=stnddev[0], prepare_batch=prepare_batch,
                                            metrics={'MAE': Loss(lossMAE),
                                                     'MSE': Loss(lossMSE),
                                                     # 'accuracy': Accuracy(), ???
                                                     # 'NLL': Loss(lossNLL)
                                                     })  # output_transform=output_retransform_znormalize) expects (x, pred, y)
    # Progressbar
    pbar = ignite.contrib.handlers.ProgressBar(persist=False)
    pbar.attach(trainer, output_transform=lambda x: {'MAE': x})
    # Save n Load
    model_checkpoint = 'NN_'
    # NN_index = sys.argv[1]
    # log = 'active'
    logcount = 0
    al_level = 0
    # Find the newest existing run directory under `log` ...
    while (os.access(log + "/run_" + str(logcount), os.F_OK) == True):  # +str(NN_index)
        logcount += 1
    # ... then the next free active-learning level inside it, and create it.
    while (os.access(log + "/run_" + str(logcount-1) + "/" + model_checkpoint + str(NN_index) + "/al_" + str(al_level), os.F_OK) == True):
        al_level += 1
    os.mkdir(log + "/run_" + str(logcount-1) + "/" + model_checkpoint + str(NN_index) + "/al_" + str(al_level))
    writer = SummaryWriter(log_dir=log + "/run_" + str(logcount-1) + "/" + model_checkpoint + str(NN_index) + "/al_" + str(al_level))  # +"NN_1" ? declaration for multiple NN
    print("Run: ", (logcount - 1), "NN: ", NN_index, "AL: ", al_level, "len of trainset: ", len(train_data))  # , comment=modelform)
    # print("Modelform:", modelform)
    if (os.path.isfile(model_checkpoint + str(NN_index) + '.pt')):
        print("NN: ", NN_index, "loaded")
        # NOTE(review): the checkpoint file is read, but the load_state_dict
        # calls below are commented out, so the weights are never restored.
        checkpoint = torch.load(model_checkpoint + str(NN_index) + '.pt')
        # try to load only optimizer
        # model.load_state_dict(checkpoint['model_state_dict'])
        # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print("model not loaded!")
    start = timeit.default_timer()

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        # Per-iteration training loss curve.
        iteration = trainer.state.iteration
        writer.add_scalar('loss_vs_iteration', trainer.state.output, iteration)
        # writer.close()  # generating mass of files

    @trainer.on(ignite.engine.Events.EPOCH_STARTED)
    def log_time(trainer):
        # Wall-clock time since setup, recorded once per epoch.
        elapsed = round(timeit.default_timer() - start, 2)
        writer.add_scalar('time_vs_epoch', elapsed, trainer.state.epoch)
        epoch = trainer.state.epoch  # NOTE(review): unused local
        if trainer.state.epoch == 100:
            # Record the architecture string once, mid-run.
            writer.add_text(str(logcount), "Netzstruktur: " + modelform)
        # writer.close()  # generating mass of files

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        # Evaluate on both sets every `evaluate_every` epochs; the name is
        # bound later via closure (defined just before trainer.run()).
        if (trainer.state.epoch % evaluate_every == 0):
            evaluator.run(train_loader)
            metrics = evaluator.state.metrics
            print(trainer.state.epoch)
            print("\nTraining:", metrics)
            writer.add_scalar('MAEvsEpoch_training', metrics["MAE"], trainer.state.epoch)
            evaluator.run(val_loader)
            metrics = evaluator.state.metrics
            print("Validation: ", metrics)
            writer.add_scalar('MAEvsEpoch_validation', metrics["MAE"], trainer.state.epoch)
        # NOTE(review): this condition looks wrong — a modulo result is
        # compared to max_epoch, so the writer only closes when
        # epoch % evaluate_every happens to equal max_epoch; probably
        # `trainer.state.epoch == max_epoch` was intended.
        if trainer.state.epoch % evaluate_every == max_epoch:
            writer.close()

    evaluate_every = 100
    trainer.run(train_loader, max_epochs=max_epoch)
    # Persist model+optimizer twice: next to the script (for reloading on the
    # next call) and inside the run's log folder (for bookkeeping).
    torch.save({'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, model_checkpoint + str(NN_index) + '.pt')
    torch.save({'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, log + "/run_" + str(logcount-1) + "/" + model_checkpoint + str(NN_index) + "/al_" + str(al_level) + "/" + model_checkpoint + str(NN_index) + '.pt')
elif TARGET == "dvd": pre = './sdvd' elif TARGET == "electronics": pre = './sele' else: pre = './skit' model = DoubleHeadBert.from_pretrained(pre) #for names, parameters in model.bert.named_parameters(): # parameters.requiers_grad=False optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False) in_fn = nn.CrossEntropyLoss() criterion = DoubleLoss(in_fn) metrics = { 'loss': Loss(criterion) #'accuracy': Accuracy(transform_pred_tar) } path = SOURCE + TARGET trainer = DoubleBertTrainer(model, optimizer, newbob_period=3, checkpoint_dir=os.path.join( './checkpoints/double', path), metrics=metrics, non_blocking=True, retain_graph=True, patience=3, accumulation_steps=5, loss_fn=criterion, device=DEVICE,
""" y_pred, y = output return torch.max(y_pred, dim=1)[1], y # attach running loss (will be displayed in progess bar) RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss') # attach running accuracy (will be displayed in progess bar) RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]])).attach( trainer, 'acc') # attach accuracy and loss to train_evaluator Accuracy(output_transform=max_output_transform).attach(train_evaluator, 'accuracy') Loss(loss_fn).attach(train_evaluator, 'bce') # attach accuracy and loss to validation_evaluator Accuracy(output_transform=max_output_transform).attach(validation_evaluator, 'accuracy') Loss(loss_fn).attach(validation_evaluator, 'bce') ############################################# # Report progress through tqdm progress bar # ############################################# pbar = ProgressBar(persist=True, bar_format="") pbar.attach(trainer, ['loss', 'acc']) # Log after each EPOCH @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine):
optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.decay) trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device) evaluator = create_supervised_evaluator( model, metrics={ 'mae': CrowdCountingMeanAbsoluteError(), 'mse': CrowdCountingMeanSquaredError(), 'nll': Loss(loss_fn) }, device=device) print(model) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): print("Epoch[{}] Interation [{}] Loss: {:.2f}".format( trainer.state.epoch, trainer.state.iteration, trainer.state.output)) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer): evaluator.run(train_loader) metrics = evaluator.state.metrics print(
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train Net with SGD on the MNIST-style loaders, plotting to Visdom.

    Creates one Visdom window per curve (training loss, per-epoch train/val
    accuracy and loss) and appends a point to the appropriate window from
    the ignite event handlers.
    """
    vis = visdom.Visdom()
    # if not vis.check_connection():
    #     raise RuntimeError("Visdom server not running. Please run python -m visdom.server")
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)
    # One Visdom window per curve.
    train_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Loss')
    train_avg_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Average Loss')
    train_avg_accuracy_window = create_plot_window(
        vis, '#Iterations', 'Accuracy', 'Training Average Accuracy')
    val_avg_loss_window = create_plot_window(vis, '#Epochs', 'Loss', 'Validation Average Loss')
    val_avg_accuracy_window = create_plot_window(
        vis, '#Epochs', 'Accuracy', 'Validation Average Accuracy')

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        # Fix: the original interpolated the *builtin* `iter` function into the
        # message (printing "<built-in function iter>"); compute the 1-based
        # iteration index within the current epoch instead.
        epoch_iter = (engine.state.iteration - 1) % len(train_loader) + 1
        print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
              "".format(engine.state.epoch, epoch_iter, len(train_loader),
                        engine.state.output))
        vis.line(X=np.array([engine.state.iteration]),
                 Y=np.array([engine.state.output]),
                 update='append', win=train_loss_window)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_accuracy]),
                 win=train_avg_accuracy_window,
                 update='append')
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_nll]),
                 win=train_avg_loss_window,
                 update='append')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_accuracy]),
                 win=val_avg_accuracy_window,
                 update='append')
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_nll]),
                 win=val_avg_loss_window,
                 update='append')

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
batch_size = 64 lr = 1e-3 train_loader, val_loader = get_data_loaders(batch_size, batch_size) model = ConvNet() device = 'cuda' optimizer = optim.Adam(model.parameters(), lr=lr) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device) evaluator = create_supervised_evaluator(model=model, metrics={ 'accuracy': Accuracy(), 'nll': Loss(F.nll_loss) }, device=device) desc = "ITERATION - loss: {:.2f}" pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0)) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 pbar.desc = desc.format(engine.state.output) pbar.update(1)
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=1000,
):
    """Train Net with checkpointing and optional crash/resume support.

    Logs losses and metrics to TensorBoard under ``log_dir``, checkpoints
    trainer/model/optimizer/scheduler every ``checkpoint_every`` iterations,
    and deliberately raises at ``crash_iteration`` unless ``resume_from``
    points at a checkpoint to restore training from.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    writer = SummaryWriter(log_dir=log_dir)
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model.to(device)  # Move model before creating optimizer
    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(criterion)
                                            },
                                            device=device)

    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        # Halve the learning rate once per epoch (StepLR gamma=0.5).
        lr_scheduler.step()

    desc = "ITERATION - loss: {:.4f} - lr: {:.4f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0, lr))

    if log_interval is None:
        e = Events.ITERATION_COMPLETED
        log_interval = 1
    else:
        e = Events.ITERATION_COMPLETED(every=log_interval)

    @trainer.on(e)
    def log_training_loss(engine):
        # Fix: local renamed so it no longer shadows the `lr` parameter.
        current_lr = optimizer.param_groups[0]["lr"]
        pbar.desc = desc.format(engine.state.output, current_lr)
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)
        writer.add_scalar("lr", current_lr, engine.state.iteration)

    if resume_from is None:
        # Fresh run: simulate a failure once, so resuming can be exercised.
        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception("STOP at {}".format(engine.state.iteration))
    else:
        # Resumed run: fast-forward the progress bar to the restored iteration.
        @trainer.on(Events.STARTED)
        def _(engine):
            pbar.n = engine.state.iteration

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0
        # Fix: scalar tags were misspelled "valdation/...", splitting the
        # validation curves away from any correctly-tagged runs.
        writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("validation/avg_accuracy", avg_accuracy, engine.state.epoch)

    # Everything needed to restore training exactly where it stopped.
    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler
    }
    training_checkpoint = Checkpoint(to_save=objects_to_checkpoint,
                                     save_handler=DiskSaver(log_dir, require_empty=False))
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=checkpoint_every), training_checkpoint)

    if resume_from is not None:
        tqdm.write("Resume from a checkpoint: {}".format(resume_from))
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=epochs)
    except Exception:
        # The intentional crash above lands here; report it but still close
        # the progress bar and the TensorBoard writer below.
        import traceback
        print(traceback.format_exc())
    pbar.close()
    writer.close()