def _init_model_restoring_callbacks(self, initial_epoch, save_every_epoch):
    callbacks = []
    best_checkpoint = ModelCheckpoint(self.best_checkpoint_filename,
                                      monitor=self.monitor_metric,
                                      mode=self.monitor_mode,
                                      save_best_only=not save_every_epoch,
                                      restore_best=not save_every_epoch,
                                      verbose=not save_every_epoch,
                                      temporary_filename=self.best_checkpoint_tmp_filename)
    callbacks.append(best_checkpoint)

    if save_every_epoch:
        best_restore = BestModelRestore(monitor=self.monitor_metric, mode=self.monitor_mode, verbose=True)
        callbacks.append(best_restore)

    if initial_epoch > 1:
        # We set the current best metric score in the ModelCheckpoint so that
        # it does not save checkpoints it would not have saved had the
        # optimization not been stopped.
        best_epoch_stats = self.get_best_epoch_stats()
        best_epoch = best_epoch_stats['epoch'].item()
        best_filename = self.best_checkpoint_filename.format(epoch=best_epoch)
        if not save_every_epoch:
            best_checkpoint.best_filename = best_filename
            best_checkpoint.current_best = best_epoch_stats[self.monitor_metric].item()
        else:
            best_restore.best_weights = torch.load(best_filename, map_location='cpu')
            best_restore.current_best = best_epoch_stats[self.monitor_metric].item()

    return callbacks
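# Illustrative sketch (not from the original code) of the two callback
# configurations the helper above produces, written out standalone. The import
# path is an assumption based on Poutyne's layout.
from poutyne.framework import BestModelRestore, ModelCheckpoint

# save_every_epoch=False: a single checkpoint saves only the best weights and
# restores them at the end of training.
best_only_callbacks = [
    ModelCheckpoint('best_epoch_{epoch}.ckpt', monitor='val_loss', mode='min',
                    save_best_only=True, restore_best=True, verbose=True)
]

# save_every_epoch=True: every epoch is saved, and BestModelRestore reloads
# the best weights at the end of training.
every_epoch_callbacks = [
    ModelCheckpoint('best_epoch_{epoch}.ckpt', monitor='val_loss', mode='min'),
    BestModelRestore(monitor='val_loss', mode='min', verbose=True),
]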
def test_restore_best_without_save_best_only(self):
    with self.assertRaises(ValueError):
        ModelCheckpoint(self.checkpoint_filename,
                        monitor='val_loss',
                        verbose=True,
                        save_best_only=False,
                        restore_best=True)

    with self.assertRaises(ValueError):
        ModelCheckpoint(self.checkpoint_filename, monitor='val_loss', verbose=True, restore_best=True)
def _callbacks(self):
    return [
        EarlyStopping(patience=ANN_PATIENCE),
        ModelCheckpoint(filename=self.modelfile_save_path, save_best_only=True, restore_best=True)
    ]
def test_save_best_only(self):
    checkpointer = ModelCheckpoint(self.checkpoint_filename, monitor='val_loss', verbose=True, save_best_only=True)
    val_losses = [10, 3, 8, 5, 2]
    has_checkpoints = [True, True, False, False, True]
    self._test_checkpointer_with_val_losses(checkpointer, val_losses, has_checkpoints)
def test_periodic_with_period_of_2(self):
    checkpointer = ModelCheckpoint(self.checkpoint_filename,
                                   monitor='val_loss',
                                   verbose=True,
                                   period=2,
                                   save_best_only=False)
    val_losses = [1] * 10
    has_checkpoints = [False, True] * 5
    self._test_checkpointer_with_val_losses(checkpointer, val_losses, has_checkpoints)
def main(rnn_type, n_layers, dataset, embedding, device, save_path):
    train_iter, valid_iter, test_iter = dataset_factory(dataset, embedding=embedding)

    embedding_dim = int(embedding.split(".")[-1][:-1])
    save_path = Path(save_path) / f"{rnn_type}_{n_layers}layer_{embedding_dim}"
    save_path.mkdir(parents=True, exist_ok=True)

    kwargs = dict(
        vocab_size=len(TEXT.vocab),
        embedding_dim=embedding_dim,
        hidden_dim=256,
        output_dim=1,
        n_layers=n_layers,
        dropout=0.5,
        pad_idx=TEXT.vocab.stoi[TEXT.pad_token],
        rnn_type=rnn_type,  # was hardcoded to "gru", ignoring the rnn_type argument
    )
    with open(save_path / "kwargs.json", "w") as kwargs_file:
        json.dump(kwargs, kwargs_file)

    pretrained_embeddings = TEXT.vocab.vectors
    network = RNN(**kwargs)
    network.embedding.weight.data.copy_(pretrained_embeddings)

    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
    network.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
    network.embedding.weight.data[PAD_IDX] = torch.zeros(embedding_dim)

    optimizer = torch.optim.Adam(network.parameters())
    model = Model(
        network=network,
        optimizer=optimizer,
        loss_function=custom_loss,
        batch_metrics=[acc],
    )
    model.to(device)

    history = model.fit_generator(
        train_generator=train_iter,
        valid_generator=valid_iter,
        epochs=10,
        callbacks=[
            ModelCheckpoint(
                filename=str(save_path / "model.pkl"),
                save_best_only=True,
                restore_best=True,
            )
        ],
    )
    print(f"Model saved to {save_path}")

    test_loss, test_acc, y_pred, y_true = model.evaluate_generator(
        generator=test_iter, return_pred=True, return_ground_truth=True
    )
    print(f"Test Loss: {test_loss:.4f}, Test Binary Accuracy: {test_acc:.4f}")
def test_non_atomic_write(self):
    checkpoint_filename = os.path.join(self.temp_dir_obj.name, 'my_checkpoint.ckpt')
    train_gen = some_data_generator(ModelCheckpointTest.batch_size)
    valid_gen = some_data_generator(ModelCheckpointTest.batch_size)
    checkpointer = ModelCheckpoint(checkpoint_filename,
                                   monitor='val_loss',
                                   verbose=True,
                                   period=1,
                                   atomic_write=False)
    self.model.fit_generator(train_gen, valid_gen, epochs=10, steps_per_epoch=5, callbacks=[checkpointer])
    self.assertTrue(os.path.isfile(checkpoint_filename))
def test_integration(self):
    train_gen = some_data_generator(ModelCheckpointTest.batch_size)
    valid_gen = some_data_generator(ModelCheckpointTest.batch_size)
    checkpointer = ModelCheckpoint(self.checkpoint_filename, monitor='val_loss', verbose=True, save_best_only=True)
    self.model.fit_generator(train_gen, valid_gen, epochs=10, steps_per_epoch=5, callbacks=[checkpointer])
def test_temporary_filename_arg_with_differing_checkpoint_filename(self):
    epochs = 10
    tmp_filename = os.path.join(self.temp_dir_obj.name, 'my_checkpoint.tmp.ckpt')
    checkpoint_filename = os.path.join(self.temp_dir_obj.name, 'my_checkpoint_{epoch}.ckpt')
    train_gen = some_data_generator(ModelCheckpointTest.batch_size)
    valid_gen = some_data_generator(ModelCheckpointTest.batch_size)
    checkpointer = ModelCheckpoint(checkpoint_filename,
                                   monitor='val_loss',
                                   verbose=True,
                                   period=1,
                                   temporary_filename=tmp_filename)
    self.model.fit_generator(train_gen, valid_gen, epochs=epochs, steps_per_epoch=5, callbacks=[checkpointer])
    self.assertFalse(os.path.isfile(tmp_filename))
    for i in range(1, epochs + 1):
        self.assertTrue(os.path.isfile(checkpoint_filename.format(epoch=i)))
def fit(self, meta_train, meta_valid, meta_test=None, n_epochs=100, steps_per_epoch=100,
        log_filename=None, mse_filename=None, checkpoint_filename=None, tboard_folder=None,
        grads_inspect_dir=None, graph_flow_filename=None, do_early_stopping=True,
        mse_test=False, config=None):
    if hasattr(self.model, 'is_eval'):
        self.model.is_eval = False
    self.is_eval = False

    try:
        self.model.filename = log_filename[:-8]
    except TypeError:  # log_filename is None
        self.model.filename = 'test'
    self.steps_per_epoch = steps_per_epoch

    callbacks = [ReduceLROnPlateau(patience=2, factor=1 / 2, min_lr=1e-6, verbose=True),
                 BestModelRestore(verbose=True)]
    if do_early_stopping:
        callbacks.append(EarlyStopping(patience=10, verbose=False))
    if log_filename:
        callbacks.append(CSVLogger(log_filename, batch_granularity=False, separator='\t'))
    if mse_test:
        callbacks.append(MseMetaTest(meta_test=meta_test, filename=mse_filename, periodicity='epoch'))
    if checkpoint_filename:
        callbacks.append(ModelCheckpoint(checkpoint_filename,
                                         monitor='val_loss',
                                         save_best_only=True,
                                         temporary_filename=checkpoint_filename + 'temp'))
    if tboard_folder is not None:
        self.writer = SummaryWriter(tboard_folder)
        self.plain_writer = PlainWriter(tboard_folder)

    self.fit_generator(meta_train, meta_valid,
                       epochs=n_epochs,
                       steps_per_epoch=steps_per_epoch,
                       validation_steps=steps_per_epoch,
                       callbacks=callbacks,
                       verbose=True)
    self.is_fitted = True
    # plain_writer only exists when tboard_folder was given.
    if getattr(self, 'plain_writer', None) is not None:
        self.plain_writer.close()
    return self
def train(self, train_loader, valid_loader=None, *,
          callbacks=None, lr_schedulers=None, save_every_epoch=False,
          disable_tensorboard=False, epochs=1000, steps_per_epoch=None,
          validation_steps=None, seed=42):
    if seed is not None:
        # Make training deterministic.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Copy the callback list to avoid mutating the caller's list.
    callbacks = [] if callbacks is None else list(callbacks)
    lr_schedulers = [] if lr_schedulers is None else lr_schedulers

    tensorboard_writer = None
    initial_epoch = 1
    if self.logging:
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

        # Restarting optimization if needed.
        initial_epoch = self._load_epoch_state(lr_schedulers)

        callbacks += [CSVLogger(self.log_filename, separator='\t', append=initial_epoch != 1)]

        callbacks += self._init_model_restoring_callbacks(initial_epoch, save_every_epoch)
        callbacks += [
            ModelCheckpoint(self.model_checkpoint_filename,
                            verbose=False,
                            temporary_filename=self.model_checkpoint_tmp_filename)
        ]
        callbacks += [
            OptimizerCheckpoint(self.optimizer_checkpoint_filename,
                                verbose=False,
                                temporary_filename=self.optimizer_checkpoint_tmp_filename)
        ]

        # We save the last epoch number at the end of each epoch so that
        # _load_epoch_state() knows at which epoch to restart the optimization.
        callbacks += [
            PeriodicSaveLambda(lambda fd, epoch, logs: print(epoch, file=fd),
                               self.epoch_filename,
                               temporary_filename=self.epoch_tmp_filename,
                               open_mode='w')
        ]

        tensorboard_writer, cb_list = self._init_tensorboard_callbacks(disable_tensorboard)
        callbacks += cb_list

    # This method returns callbacks that checkpoint the LR schedulers if logging is enabled.
    # Otherwise, it just returns the list of LR schedulers with a BestModelRestore callback.
    callbacks += self._init_lr_scheduler_callbacks(lr_schedulers)

    try:
        return self.model.fit_generator(train_loader,
                                        valid_loader,
                                        epochs=epochs,
                                        steps_per_epoch=steps_per_epoch,
                                        validation_steps=validation_steps,
                                        initial_epoch=initial_epoch,
                                        callbacks=callbacks)
    finally:
        if tensorboard_writer is not None:
            tensorboard_writer.close()
def run(self, constant_input, hidden_state):
    constant_input.requires_grad = False
    hidden_state.requires_grad = True

    class DataIterator(Dataset):
        """
        Dataset class required to circumvent one of Poutyne's limitations.

        The short version is that calling `.fit()` creates a TensorDataset
        (Poutyne's own version), which checks that all inputs have the same
        first dimension. The nature of RNNCell makes it such that the input
        and the hidden state cannot be aligned on the first dimension, so we
        create our own iterator and use `.fit_generator()` instead.
        """

        def __init__(self, constant_input, hidden_state, batch_size=32):
            self.constant_input = constant_input
            self.hidden_state = hidden_state
            self._batch_size = batch_size
            assert self.constant_input.shape[0] == self.hidden_state.shape[1]

        def __len__(self):
            num_items = self.constant_input.shape[0]
            length = num_items // self._batch_size
            length += 1 if num_items % self._batch_size != 0 else 0
            return length

        def __iter__(self):
            last_idx = self.constant_input.shape[0]
            last_idx += last_idx % self._batch_size
            for start in range(0, last_idx, self._batch_size):
                end = start + self._batch_size
                x = self.constant_input[start:end]
                y = self.hidden_state[:, start:end]
                yield (x, y), y

    model = Model(
        network=self._rnn_cell,
        loss_function=speed_loss,
        optimizer=torch.optim.Adam(params=[hidden_state], lr=self._lr),
    )
    model.fit_generator(
        DataIterator(constant_input, hidden_state, batch_size=self._batch_size),
        epochs=self._n_iter,
        verbose=False,
        callbacks=[
            StepLR(step_size=1000, gamma=0.5),
            EarlyStopping(monitor="loss", min_delta=1e-6, patience=1000),
            ModelCheckpoint(
                filename=NamedTemporaryFile().name,
                monitor="loss",
                save_best_only=True,
                restore_best=True,
            ),
        ],
    )

    trained = hidden_state.clone().detach()
    _, output = model.evaluate_generator(
        DataIterator(constant_input, trained, batch_size=self._batch_size),
        return_pred=True,
    )
    output = np.concatenate([o[0] for o in output])

    if trained.device.type == "cuda":
        trained = trained.cpu().numpy()
        hidden_state = hidden_state.detach().cpu().numpy()
    else:
        trained = trained.numpy()
        hidden_state = hidden_state.detach().numpy()

    return (
        hidden_state.squeeze(),
        _speed_loss(np.squeeze(trained), np.squeeze(output)),
    )
def train(self, train_generator, valid_generator=None, *,
          callbacks=None, lr_schedulers=None, save_every_epoch=False,
          disable_tensorboard=False, epochs=1000, steps_per_epoch=None,
          validation_steps=None, batches_per_step=1, seed=42):
    # pylint: disable=too-many-locals
    """
    Trains or finetunes the model on a dataset using a generator. If a previous training already
    occurred and lasted a total of `n_previous` epochs, then the model's weights will be set to
    the last checkpoint and the training will be resumed for the epoch range
    (`n_previous`, `epochs`].

    If the Experiment has logging enabled (i.e. self.logging is True), numerous callbacks will
    be automatically included. Notably, two :class:`~callbacks.ModelCheckpoint` objects will
    take care of saving the last and every new best (according to the monitor mode) model
    weights in appropriate checkpoint files. :class:`~callbacks.OptimizerCheckpoint` and
    :class:`~callbacks.LRSchedulerCheckpoint` will also respectively handle the saving of the
    optimizer's and the LR schedulers' states for future retrieval. Moreover, a
    :class:`~callbacks.AtomicCSVLogger` will save all available epoch statistics in an output
    .tsv file. Lastly, a :class:`~callbacks.TensorBoardLogger` handles automatic TensorBoard
    logging of various neural network statistics.

    Args:
        train_generator: Generator-like object for the training set. See
            :func:`~Model.fit_generator()` for details on the types of generators supported.
        valid_generator (optional): Generator-like object for the validation set. See
            :func:`~Model.fit_generator()` for details on the types of generators supported.
            (Default value = None)
        callbacks (List[~poutyne.framework.callbacks.Callback]): List of callbacks that will be
            called during training. (Default value = None)
        lr_schedulers (List[~poutyne.framework.callbacks.lr_scheduler._PyTorchLRSchedulerWrapper]):
            List of learning rate schedulers. (Default value = None)
        save_every_epoch (bool, optional): Whether or not to save the experiment model's weights
            after every epoch. (Default value = False)
        disable_tensorboard (bool, optional): Whether or not to disable the automatic
            TensorBoard logging callbacks. (Default value = False)
        epochs (int): Number of times the entire training dataset is seen.
            (Default value = 1000)
        steps_per_epoch (int, optional): Number of batches used during one epoch. Obviously,
            using this argument may cause one epoch not to see the entire training dataset or
            to see it multiple times. (Defaults to the number of steps needed to see the entire
            training dataset)
        validation_steps (int, optional): Same as for ``steps_per_epoch`` but for the validation
            dataset. (Defaults to ``steps_per_epoch`` if provided or the number of steps needed
            to see the entire validation dataset)
        batches_per_step (int): Number of batches on which to compute the running loss before
            backpropagating it through the network. Note that the total loss used for
            backpropagation is the mean of the `batches_per_step` batch losses.
            (Default value = 1)
        seed (int, optional): Seed used to make the sampling deterministic.
            (Default value = 42)

    Returns:
        List of dict containing the history of each epoch.
    """
    set_seeds(seed)

    callbacks = [] if callbacks is None else callbacks
    lr_schedulers = [] if lr_schedulers is None else lr_schedulers

    # Copy callback list.
    callbacks = list(callbacks)

    tensorboard_writer = None
    initial_epoch = 1
    if self.logging:
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

        # Restarting optimization if needed.
        initial_epoch = self._load_epoch_state(lr_schedulers)

        callbacks += [
            AtomicCSVLogger(self.log_filename,
                            separator='\t',
                            append=initial_epoch != 1,
                            temporary_filename=self.log_tmp_filename)
        ]

        callbacks += self._init_model_restoring_callbacks(initial_epoch, save_every_epoch)
        callbacks += [
            ModelCheckpoint(self.model_checkpoint_filename,
                            verbose=False,
                            temporary_filename=self.model_checkpoint_tmp_filename)
        ]
        callbacks += [
            OptimizerCheckpoint(self.optimizer_checkpoint_filename,
                                verbose=False,
                                temporary_filename=self.optimizer_checkpoint_tmp_filename)
        ]

        # We save the last epoch number at the end of each epoch so that
        # _load_epoch_state() knows at which epoch to restart the optimization.
        callbacks += [
            PeriodicSaveLambda(lambda fd, epoch, logs: print(epoch, file=fd),
                               self.epoch_filename,
                               temporary_filename=self.epoch_tmp_filename,
                               open_mode='w')
        ]

        tensorboard_writer, cb_list = self._init_tensorboard_callbacks(disable_tensorboard)
        callbacks += cb_list

    # This method returns callbacks that checkpoint the LR schedulers if logging is enabled.
    # Otherwise, it just returns the list of LR schedulers with a BestModelRestore callback.
    callbacks += self._init_lr_scheduler_callbacks(lr_schedulers)

    try:
        return self.model.fit_generator(train_generator,
                                        valid_generator,
                                        epochs=epochs,
                                        steps_per_epoch=steps_per_epoch,
                                        validation_steps=validation_steps,
                                        batches_per_step=batches_per_step,
                                        initial_epoch=initial_epoch,
                                        callbacks=callbacks)
    finally:
        if tensorboard_writer is not None:
            tensorboard_writer.close()
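# A hedged usage sketch of the train() method above (illustrative, not from the
# original code), mirroring the Experiment(...) call pattern used later in this
# file. The import path, network, and toy data are assumptions.
import torch
from torch.utils.data import DataLoader, TensorDataset
from poutyne.framework import Experiment

net = torch.nn.Linear(10, 1)
train_loader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 1)), batch_size=8)
valid_loader = DataLoader(TensorDataset(torch.randn(16, 10), torch.randn(16, 1)), batch_size=8)

expt = Experiment(directory='./logs/example_experiment',
                  network=net,
                  optimizer=torch.optim.Adam(net.parameters()),
                  loss_function=torch.nn.functional.mse_loss,
                  monitor_metric='val_loss',
                  logging=True)
expt.train(train_generator=train_loader, valid_generator=valid_loader, epochs=5, seed=42)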
def train(args):
    set_random_seed(42)
    model = get_model(args.network, args.classification_head)
    print('Loading model')
    model.encoder.conv1 = nn.Conv2d(count_channels(args.channels) * args.neighbours,
                                    64,
                                    kernel_size=(7, 7),
                                    stride=(2, 2),
                                    padding=(3, 3),
                                    bias=False)
    model, device = UtilsFactory.prepare_model(model)

    train_df = pd.read_csv(args.train_df).to_dict('records')
    val_df = pd.read_csv(args.val_df).to_dict('records')

    ds = Dataset(args.channels, args.dataset_path, args.image_size, args.batch_size,
                 args.num_workers, args.neighbours, args.classification_head)
    loaders = ds.create_loaders(train_df, val_df)

    save_path = os.path.join(args.logdir, args.name)
    optimizer = get_optimizer(args.optimizer, args.lr, model)

    if not args.classification_head:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 40, 80, 150, 300],
                                                         gamma=0.1)
        criterion = get_loss(args.loss)
        runner = SupervisedRunner()
        if args.model_weights_path:
            checkpoint = torch.load(args.model_weights_path, map_location='cpu')
            model.load_state_dict(checkpoint['model_state_dict'])

        runner.train(model=model,
                     criterion=criterion,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     loaders=loaders,
                     callbacks=[DiceCallback()],
                     logdir=save_path,
                     num_epochs=args.epochs,
                     verbose=True)

        infer_loader = collections.OrderedDict([('infer', loaders['valid'])])
        runner.infer(
            model=model,
            loaders=infer_loader,
            callbacks=[
                CheckpointCallback(resume=f'{save_path}/checkpoints/best.pth'),
                InferCallback()
            ],
        )
    else:
        criterion = get_loss('multi')
        net = Model(model, optimizer, criterion,
                    batch_metrics=[classification_head_accuracy, segmentation_head_dice])
        net = net.to(device)
        net.fit_generator(loaders['train'],
                          loaders['valid'],
                          epochs=args.epochs,
                          callbacks=[
                              ModelCheckpoint(f'{save_path}/checkpoints/best.pth'),
                              MultiStepLR(milestones=[10, 40, 80, 150, 300], gamma=0.1)
                          ])
# Split into train/val sets
total = len(dataset)
lengths = [int(len(dataset) * SPLIT)]
lengths.append(total - lengths[0])
print("Splitting into {} train and {} val".format(lengths[0], lengths[1]))
train_set, val_set = random_split(dataset, lengths)

# Setup dataloaders
train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_set, batch_size=BATCH_SIZE)

# Callbacks
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
checkpoint = ModelCheckpoint(filename=os.path.join(MODEL_DIR, args.model + ".pt"),
                             monitor="val_loss",
                             save_best_only=True)
writer = SummaryWriter(LOG_DIR)
tb_logger = TensorBoardLogger(writer)
callbacks = [checkpoint, tb_logger]

# Metrics
top3 = TopKAccuracy(k=3)
top5 = TopKAccuracy(k=5)
metrics = ["acc", top3, top5]

# Train
model = Model(network=net,
              optimizer="Adam",
              loss_function=nn.CrossEntropyLoss(),
              batch_metrics=metrics)
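# The snippet above stops right after constructing the Model; a hedged
# completion (assuming DEVICE and EPOCHS constants defined alongside the other
# uppercase settings) would move the model to the device and launch training
# with the callbacks prepared above:
model.to(DEVICE)
model.fit_generator(train_dataloader, val_dataloader,
                    epochs=EPOCHS,
                    callbacks=callbacks)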
project_name="project_name", workspace="workspace_name") experiment.log_parameters(params) # create our special resnet18 cnn = resnet18(n_classes=4).to(device) # print the model summary to show useful information logging.info(summary(cnn, (3, 224, 244))) # define custom optimizer and instantiace the trainer `Model` optimizer = optim.Adam(cnn.parameters(), lr=params['lr']) model = Model(cnn, optimizer, "cross_entropy", batch_metrics=["accuracy"]).to(device) # usually you want to reduce the lr on plateau and store the best model callbacks = [ ReduceLROnPlateau(monitor="val_acc", patience=5, verbose=True), ModelCheckpoint(str(project.checkpoint_dir / f"{time.time()}-model.pt"), save_best_only="True", verbose=True), EarlyStopping(monitor="val_acc", patience=10, mode='max'), CometCallback(experiment) ] model.fit_generator( train_dl, val_dl, epochs=params['epochs'], callbacks=callbacks, ) # # get the results on the test set # loss, test_acc = model.evaluate_generator(test_dl) # logging.info(f'test_acc=({test_acc})') # experiment.log_metric('test_acc', test_acc)
def launch(dataset, experiment_name, network, hidden_size, hidden_layers, sample_size,
           weight_decay, prior, learning_rate, lr_patience, optim_algo, epochs, batch_size,
           valid_size, pre_epochs, stop_early, gpu_device, random_seed, logging):
    # Setting random seed for reproducibility
    random_state = check_random_state(random_seed)
    torch.manual_seed(random_seed)

    # PAC-Bayes bound parameters
    delta = 0.05
    C_range = torch.Tensor(np.arange(0.1, 20.0, 0.01))

    # Setting GPU device
    device = None
    if torch.cuda.is_available() and gpu_device != -1:
        torch.cuda.set_device(gpu_device)
        device = torch.device('cuda:%d' % gpu_device)
        print("Running on GPU %d" % gpu_device)
    else:
        print("Running on CPU")

    # Logging
    experiment_setting = dict([('experiment_name', experiment_name),
                               ('dataset', dataset),
                               ('network', network),
                               ('hidden_size', hidden_size),
                               ('hidden_layers', hidden_layers),
                               ('sample_size', sample_size),
                               ('epochs', epochs),
                               ('weight_decay', weight_decay),
                               ('prior', prior),
                               ('learning_rate', learning_rate),
                               ('lr_patience', lr_patience),
                               ('optim_algo', optim_algo),
                               ('batch_size', batch_size),
                               ('valid_size', valid_size),
                               ('pre_epochs', pre_epochs),
                               ('stop_early', stop_early),
                               ('random_seed', random_seed)])

    directory_name = get_logging_dir_name(experiment_setting)
    logging_path = join(RESULTS_PATH, experiment_name, dataset, directory_name)
    if logging:
        if not exists(logging_path):
            makedirs(logging_path)
        with open(join(logging_path, "setting.json"), 'w') as out_file:
            json.dump(experiment_setting, out_file, sort_keys=True, indent=4)

    # Loading dataset
    dataset_loader = DatasetLoader(random_state=random_state)
    X_train, X_test, y_train, y_test = dataset_loader.load(dataset)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=valid_size,
                                                          random_state=random_state)

    # Experiment
    batch_metrics = [accuracy]
    epoch_metrics = []
    save_every_epoch = False
    cost_function = linear_loss
    monitor_metric = 'val_loss'
    valid_set_use = 'val'
    callbacks = []

    if network in ['pbgnet', 'pbcombinet']:
        print("### Using Pac-Bayes Binary Gradient Network ###")
        if prior in ['zero', 'init']:
            valid_set_use = 'train'
            X_train = np.vstack([X_train, X_valid])
            y_train = np.vstack([y_train, y_valid])
        elif prior == 'pretrain':
            valid_set_use = 'pretrain'

        if network == 'pbgnet':
            net = PBGNet(X_train.shape[1], hidden_layers * [hidden_size], X_train.shape[0],
                         sample_size, delta)
        else:
            net = PBCombiNet(X_train.shape[1], hidden_layers * [hidden_size],
                             X_train.shape[0], delta)

        monitor_metric = 'bound'
        cost_function = net.bound
        epoch_metrics.append(MasterMetricLogger(network=net,
                                                loss_function=linear_loss,
                                                delta=delta,
                                                n_examples=X_train.shape[0]))

    elif network in ['pbgnet_ll', 'pbcombinet_ll']:
        print("### Using PAC-Bayes Gradient Network Architecture and Optimizing Linear Loss ###")
        if network == 'pbgnet_ll':
            net = PBGNet(X_train.shape[1], hidden_layers * [hidden_size], X_train.shape[0],
                         sample_size, delta)
        else:
            net = PBCombiNet(X_train.shape[1], hidden_layers * [hidden_size],
                             X_train.shape[0], delta)

        epoch_metrics.append(MasterMetricLogger(network=net,
                                                loss_function=linear_loss,
                                                delta=delta,
                                                n_examples=X_train.shape[0],
                                                C_range=C_range.to(device)))

        callbacks.append(ModelCheckpoint(join(logging_path, 'bound_checkpoint_epoch.ckpt'),
                                         temporary_filename=join(logging_path,
                                                                 'bound_checkpoint_epoch.tmp.ckpt'),
                                         monitor='bound',
                                         mode='min',
                                         save_best_only=True))

    elif network == "baseline":
        print("### Running the Baseline Network with Tanh activations ###")
        net = BaselineNet(X_train.shape[1], hidden_layers * [hidden_size], torch.nn.Tanh)

    if network.startswith('pb'):
        epoch_metrics.append(MetricLogger(network=net, key='bound'))
        epoch_metrics.append(MetricLogger(network=net, key='kl'))
        epoch_metrics.append(MetricLogger(network=net, key='C'))

    # Parameters initialization
    if prior in ['zero', 'init']:
        net.init_weights()
    elif prior == 'pretrain':
        print("### Pre-training network ###")
        if network == 'pbgnet':
            pre_net = PBGNet(X_valid.shape[1], hidden_layers * [hidden_size],
                             X_valid.shape[0], sample_size, delta)
        else:
            pre_net = PBCombiNet(X_valid.shape[1], hidden_layers * [hidden_size],
                                 X_valid.shape[0], delta)
        pre_net.init_weights()
        pre_optimizer = torch.optim.Adam(pre_net.parameters(), lr=learning_rate, weight_decay=0.0)

        pre_logging_path = join(logging_path, 'pretrain')
        if not exists(pre_logging_path):
            makedirs(pre_logging_path)

        pretrain = Experiment(directory=pre_logging_path,
                              network=pre_net,
                              optimizer=pre_optimizer,
                              loss_function=linear_loss,
                              monitor_metric='loss',
                              device=device,
                              logging=logging,
                              batch_metrics=[accuracy])

        pretrain_loader = DataLoader(TensorDataset(torch.Tensor(X_valid), torch.Tensor(y_valid)),
                                     batch_size,
                                     shuffle=True)
        pretrain.train(train_generator=pretrain_loader,
                       valid_generator=None,
                       epochs=pre_epochs,
                       save_every_epoch=False,
                       disable_tensorboard=True,
                       seed=random_seed)

        history = pd.read_csv(pretrain.log_filename, sep='\t')
        best_epoch_index = history['loss'].idxmin()
        best_epoch_stats = history.iloc[best_epoch_index:best_epoch_index + 1]
        best_epoch = best_epoch_stats['epoch'].item()
        ckpt_filename = pretrain.best_checkpoint_filename.format(epoch=best_epoch)
        weights = torch.load(ckpt_filename, map_location='cpu')
        net.load_state_dict(weights, strict=False)

    print("### Training ###")

    # Setting prior
    if network.startswith('pb') and prior in ['init', 'pretrain']:
        net.set_priors(net.state_dict())

    # Adding early stopping and LR scheduler
    reduce_lr = ReduceLROnPlateau(monitor=monitor_metric, mode='min', patience=lr_patience,
                                  factor=0.5, threshold_mode='abs', threshold=1e-4, verbose=True)
    lr_schedulers = [reduce_lr]

    early_stopping = EarlyStopping(monitor=monitor_metric, mode='min', min_delta=1e-4,
                                   patience=stop_early, verbose=True)
    if stop_early > 0:
        callbacks.append(early_stopping)

    # Initializing optimizer
    if optim_algo == "sgd":
        optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9,
                                    weight_decay=weight_decay)
    elif optim_algo == "adam":
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                     weight_decay=weight_decay)

    # Creating Poutyne Experiment
    expt = Experiment(directory=logging_path,
                      network=net,
                      optimizer=optimizer,
                      loss_function=cost_function,
                      monitor_metric=monitor_metric,
                      device=device,
                      logging=logging,
                      batch_metrics=batch_metrics,
                      epoch_metrics=epoch_metrics)

    # Initializing data loaders
    train_loader = DataLoader(TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train)),
                              batch_size,
                              shuffle=True)
    valid_loader = None
    if valid_set_use == 'val':
        valid_loader = DataLoader(TensorDataset(torch.Tensor(X_valid), torch.Tensor(y_valid)),
                                  batch_size)

    # Launching training
    expt.train(train_generator=train_loader,
               valid_generator=valid_loader,
               epochs=epochs,
               callbacks=callbacks,
               lr_schedulers=lr_schedulers,
               save_every_epoch=save_every_epoch,
               disable_tensorboard=True,
               seed=random_seed)

    print("### Testing ###")
    sign_act_fct = lambda: Lambda(lambda x: torch.sign(x))

    test_loader = DataLoader(TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test)),
                             batch_size)

    if network == 'baseline':
        expt.test(test_generator=test_loader, checkpoint='best', seed=random_seed)

        # Binary network testing (sign activation)
        best_epoch = expt.get_best_epoch_stats()['epoch'].item()
        ckpt_filename = expt.best_checkpoint_filename.format(epoch=best_epoch)
        binary_net = BaselineNet(X_test.shape[1], hidden_layers * [hidden_size], sign_act_fct)
        weights = torch.load(ckpt_filename, map_location='cpu')
        binary_net.load_state_dict(weights, strict=False)
        binary_model = Model(binary_net, 'sgd', linear_loss, batch_metrics=[accuracy])
        test_loss, test_accuracy = binary_model.evaluate_generator(test_loader, steps=None)

        test_stats = pd.read_csv(expt.test_log_filename.format(name='test'), sep='\t')
        test_stats['bin_test_linear_loss'] = test_loss
        test_stats['bin_test_accuracy'] = test_accuracy
        test_stats['linear_loss'] = test_stats['loss']
        test_stats['val_linear_loss'] = test_stats['val_loss']
        test_stats['test_linear_loss'] = test_stats['test_loss']
        test_stats.to_csv(expt.test_log_filename.format(name='test'), sep='\t', index=False)

    def pbgnet_testing(target_metric, irrelevant_columns, n_repetitions=20):
        print(f"Restoring best model according to {target_metric}")

        # Cleaning logs
        history = pd.read_csv(expt.log_filename, sep='\t').drop(irrelevant_columns,
                                                                axis=1,
                                                                errors='ignore')
        history.to_csv(expt.log_filename, sep='\t', index=False)

        # Loading best weights
        best_epoch_index = history[target_metric].idxmin()
        best_epoch_stats = history.iloc[best_epoch_index:best_epoch_index + 1].reset_index(drop=True)
        best_epoch = best_epoch_stats['epoch'].item()
        print(f"Found best checkpoint at epoch: {best_epoch}")

        ckpt_filename = expt.best_checkpoint_filename.format(epoch=best_epoch)
        if network in ['pbgnet_ll', 'pbcombinet_ll'] and target_metric == 'bound':
            ckpt_filename = join(logging_path, 'bound_checkpoint_epoch.ckpt')
        weights = torch.load(ckpt_filename, map_location='cpu')

        # Binary network testing (sign activation)
        binary_net = BaselineNet(X_test.shape[1], hidden_layers * [hidden_size], sign_act_fct)
        updated_weights = {}
        for name, weight in weights.items():
            if name.startswith('layers'):
                name = name.split('.', 2)
                name[1] = str(2 * int(name[1]))
                name = '.'.join(name)
            updated_weights[name] = weight
        binary_net.load_state_dict(updated_weights, strict=False)
        binary_model = Model(binary_net, 'sgd', linear_loss, batch_metrics=[accuracy])
        test_loss, test_accuracy = binary_model.evaluate_generator(test_loader, steps=None)
        best_epoch_stats['bin_test_linear_loss'] = test_loss
        best_epoch_stats['bin_test_accuracy'] = test_accuracy

        model = expt.model
        model.load_weights(ckpt_filename)

        def repeat_inference(loader, prefix='', drop_keys=None, n_times=20):
            drop_keys = [] if drop_keys is None else drop_keys
            metrics_names = [prefix + 'loss'] \
                            + [prefix + metric_name for metric_name in model.metrics_names]
            metrics_list = []

            for _ in range(n_times):
                loss, metrics = model.evaluate_generator(loader, steps=None)
                if not isinstance(metrics, np.ndarray):
                    metrics = np.array([metrics])
                metrics_list.append(np.concatenate(([loss], metrics)))

            metrics_list = [list(e) for e in zip(*metrics_list)]
            metrics_stats = pd.DataFrame({col: val for col, val in zip(metrics_names, metrics_list)})
            return metrics_stats.drop(drop_keys, axis=1, errors='ignore')

        metrics_stats = repeat_inference(train_loader, n_times=n_repetitions)
        metrics_stats = metrics_stats.join(
            repeat_inference(test_loader,
                             prefix='test_',
                             drop_keys=['test_bound', 'test_kl', 'test_C'],
                             n_times=n_repetitions))

        best_epoch_stats = best_epoch_stats.drop(metrics_stats.keys().tolist(),
                                                 axis=1,
                                                 errors='ignore')
        metrics_stats = metrics_stats.join(pd.concat([best_epoch_stats] * n_repetitions,
                                                     ignore_index=True))

        log_filename = expt.test_log_filename.format(name='test')
        if network in ['pbgnet_ll', 'pbcombinet_ll'] and target_metric == 'bound':
            log_filename = join(logging_path, 'bound_test_log.tsv')
        metrics_stats.to_csv(log_filename, sep='\t', index=False)

    default_irrelevant_columns = ['val_bound', 'val_kl', 'val_C']
    if network == 'pbgnet_ll':
        pbgnet_testing(target_metric='val_loss',
                       irrelevant_columns=default_irrelevant_columns,
                       n_repetitions=20)
        pbgnet_testing(target_metric='bound',
                       irrelevant_columns=default_irrelevant_columns,
                       n_repetitions=20)
    elif network == 'pbgnet':
        pbgnet_testing(target_metric='bound',
                       irrelevant_columns=['val_loss', 'val_accuracy', 'val_linear_loss']
                                          + default_irrelevant_columns,
                       n_repetitions=20)
    elif network == 'pbcombinet_ll':
        pbgnet_testing(target_metric='val_loss',
                       irrelevant_columns=default_irrelevant_columns,
                       n_repetitions=1)
        pbgnet_testing(target_metric='bound',
                       irrelevant_columns=default_irrelevant_columns,
                       n_repetitions=1)
    elif network == 'pbcombinet':
        pbgnet_testing(target_metric='bound',
                       irrelevant_columns=['val_loss', 'val_accuracy', 'val_linear_loss']
                                          + default_irrelevant_columns,
                       n_repetitions=1)

    if logging:
        with open(join(logging_path, 'done.txt'), 'w') as done_file:
            done_file.write("done")

    print("### DONE ###")
# create our Text_CNN model
args = {'classes': 2}
cnn = Text_CNN(args).to(device)
# print the model summary to show useful information
# logging.info(summary(cnn, (1, 2000), device='cpu'))

# define a custom optimizer and instantiate the trainer `Model`
optimizer = optim.Adam(cnn.parameters(), lr=params['lr'])
model = Model(cnn, optimizer, "cross_entropy", batch_metrics=["accuracy"]).to(device)

# usually you want to reduce the lr on plateau and store the best model
callbacks = [
    ReduceLROnPlateau(monitor="val_acc", patience=5, verbose=True),
    ModelCheckpoint(str(project.checkpoint_dir / 'best_epoch_{epoch}.ckpt'),
                    monitor='val_acc',
                    mode='max',
                    save_best_only=True,
                    restore_best=True,
                    verbose=True,
                    temporary_filename='best_epoch.ckpt.tmp'),
    EarlyStopping(monitor="val_acc", patience=10, mode='max'),
    CometCallback(experiment)
]

model.fit_generator(
    train_dl,
    val_dl,
    epochs=30,
    callbacks=callbacks,
)

# get the results on the test set
loss, test_acc = model.evaluate_generator(test_dl)
logging.info(f'test_acc=({test_acc})')