def __init__(self, model, controller, hyperparameters):
    super(Trainer, self).__init__()
    self.model = model
    self.controller = controller
    self.hyperparameters = hyperparameters

    # One optimizer per sub-network (gen / dis / control), built from the
    # hyperparameter config.
    self.gen_opt = optimization.create_optimizer_from_params(
        hyperparameters['gen']['optimizer'])
    self.dis_opt = optimization.create_optimizer_from_params(
        hyperparameters['dis']['optimizer'])
    self.control_opt = optimization.create_optimizer_from_params(
        hyperparameters['control']['optimizer'])

    # Loss criteria.
    self.dis_loss_criterion = utils.get_loss_fn('bce')
    self.ll_loss_criterion = utils.get_loss_fn('mae')
    self.z_recon_loss_criterion = utils.get_loss_fn('mae')
    self.control_loss_criterion = utils.get_loss_fn(
        hyperparameters['loss']['control'])
def test_mae_loss(self):
    loss = utils.get_loss_fn('mae')
    a = tf.constant([[0.0, 0.0], [0.0, 0.0]])
    b = tf.constant([[1.0, 1.0], [2.0, 2.0]])
    loss_result = loss(a, b)
    # Should be 1/2 * (1/2 * (1 + 1) + 1/2 * (2 + 2)) = 1.5.
    self.assertEqual(tf.constant(1.5), loss_result)
def test_mse_loss(self):
    loss = utils.get_loss_fn('mse')
    a = tf.constant([[0.0, 0.0], [0.0, 0.0]])
    b = tf.constant([[1.0, 1.0], [2.0, 2.0]])
    loss_result = loss(a, b)
    # Should be 1/2 * (1/2 * (1**2 + 1**2) + 1/2 * (2**2 + 2**2)) = 2.5.
    self.assertEqual(tf.constant(2.5), loss_result)
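# Not part of this file: a minimal sketch of what utils.get_loss_fn might look like,
# assuming it simply wraps the standard tf.keras losses with their default
# SUM_OVER_BATCH_SIZE reduction. The short names 'mae' and 'mse' are pinned down by
# the tests above (per-sample mean followed by the batch mean gives 1.5 and 2.5);
# 'bce' is an assumption inferred from the Trainer.__init__ snippet.
import tensorflow as tf


def get_loss_fn(name):
    """Hypothetical loss factory: map a short name to a tf.keras loss object."""
    losses = {
        'bce': tf.keras.losses.BinaryCrossentropy(),
        'mae': tf.keras.losses.MeanAbsoluteError(),
        'mse': tf.keras.losses.MeanSquaredError(),
    }
    return losses[name]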
def train(args):
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)
    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    model_saver = ModelSaver(args.model_path)
    recorder = Recorder(['train_miou', 'train_acc', 'train_loss',
                         'val_miou', 'val_acc', 'val_loss'])

    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch+1}:")
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=criterion,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")

        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=criterion,
            device=args.device,
        )
        val_miou, val_ious, val_acc = val_scores['mIoU'], val_scores['IoUs'], val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")

        recorder.update([train_miou, train_acc, train_loss,
                         val_miou, val_acc, val_loss])
        recorder.save(args.record_path)

        if args.metric.startswith("IoU"):
            metric = val_ious[int(args.metric.split('_')[1])]
        else:
            metric = val_miou
        model_saver.save_models(metric, epoch+1, model,
                                ious={'train': train_ious, 'val': val_ious})

    print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
def evaluate(args, mode, save_pred=False):
    _, val_loader, test_loader = get_dataloaders(args)
    if mode == 'val':
        dataloader = val_loader
    elif mode == 'test':
        dataloader = test_loader
    else:
        raise ValueError(f"{mode} not supported. Choose from 'val' or 'test'")

    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    model.load_state_dict(torch.load(args.model_path)['model_state_dict'],
                          strict=False)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)

    eval_loss, scores = eval_epoch(model=model,
                                   dataloader=dataloader,
                                   n_classes=args.n_classes,
                                   criterion=criterion,
                                   device=args.device,
                                   pred_dir=save_pred and args.pred_dir)
    miou, acc = scores['mIoU'], scores['accuracy']
    print(f"{mode} | mIoU: {miou:.3f} | accuracy: {acc:.3f} | loss: {eval_loss:.3f}")
    return scores
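# The PyTorch get_loss_fn called by train()/evaluate() above is not defined in this
# snippet. A minimal sketch, assuming a cross-entropy criterion that respects the
# dataset's ignore_index; the loss_type name 'ce' is a hypothetical placeholder,
# not confirmed by the source.
import torch.nn as nn


def get_loss_fn(loss_type, ignore_index):
    """Hypothetical segmentation loss factory; returns an nn.Module so .to(device) works."""
    if loss_type == 'ce':
        return nn.CrossEntropyLoss(ignore_index=ignore_index)
    raise ValueError(f"Unsupported loss_type: {loss_type}")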
def train_full_model(self,
                     epochs=None,
                     optimizer_name='sgd',
                     optimizer_lr=1e-5,
                     weights_decay=None,
                     early_stop_cb=True,
                     reduce_lr_cb=True,
                     tboard_cb=None,
                     save_intermediate_models=False,
                     save_last_model=True,
                     jupyter_mode=False):
    epochs = 800 if epochs is None else epochs
    model_name = self.create_model_name('full',
                                        top_model_opt=self.top_model_opt,
                                        top_model_lr=self.top_model_lr,
                                        full_model_opt=optimizer_name,
                                        full_model_lr=optimizer_lr,
                                        full_model_decay=weights_decay)
    model_path = os.path.join('models', model_name)

    # Save model information for inference
    dump_json(self.config, os.path.join('configs', model_name), indent=4)

    # Compile options
    # Fit automatically scales loss
    full_model_optimizer = get_optimizer(optimizer_name, optimizer_lr)
    loss_fn = get_loss_fn(self.n_outputs, self.loss_name)
    metrics = tf.keras.metrics.MeanAbsoluteError()

    # Callbacks
    self.prepare_full_model_callbacks(
        model_name,
        early_stop_cb=early_stop_cb,
        reduce_lr_cb=reduce_lr_cb,
        tboard_cb=tboard_cb,
        save_intermediate_models=save_intermediate_models,
        save_models_path=model_path,
        jupyter_mode=jupyter_mode)

    self.prepare_full_model_layers()
    add_l2_weights_decay(self.full_model, weights_decay)
    self.full_model.compile(optimizer=full_model_optimizer,
                            loss=loss_fn,
                            metrics=metrics)
    self.full_model_history = self.full_model.fit(
        x=self.train_ds,
        validation_data=self.valid_ds,
        epochs=epochs,
        # Shuffling in dataset
        shuffle=False,
        callbacks=self.full_model_callbacks,
        verbose=0)

    history_data = {
        'history': numpy_dict_to_json_format(self.full_model_history.history),
        'model_name': model_name
    }
    if self.use_test_split:
        test_scores = self.full_model.evaluate(self.test_ds)
        test_scores = dict(zip(self.full_model.metrics_names, test_scores))
        history_data['test_scores'] = numpy_dict_to_json_format(test_scores)
    dump_json(history_data, os.path.join('history', model_name))

    if save_last_model:
        self.full_model.save(model_path + '_final', include_optimizer=False)
def train_top_model(self,
                    epochs=None,
                    optimizer_name='adam',
                    optimizer_lr=0.001,
                    early_stop_cb=True,
                    reduce_lr_cb=True,
                    tboard_cb=None,
                    save_last_model=True,
                    jupyter_mode=False):
    if epochs is None:
        epochs = 100 if self.tl_workflow == 'fast' else 300
    batch_size = 32
    self.top_model_opt = optimizer_name
    self.top_model_lr = optimizer_lr
    model_name = self.create_model_name('top',
                                        top_model_opt=self.top_model_opt,
                                        top_model_lr=self.top_model_lr)

    # Save model information for inference
    dump_json(self.config, os.path.join('configs', model_name), indent=4)

    # Compile options
    # Fit automatically scales loss
    top_model_optimizer = get_optimizer(optimizer_name, optimizer_lr)
    loss_fn = get_loss_fn(self.n_outputs, self.loss_name)
    metrics = tf.keras.metrics.MeanAbsoluteError()

    # Callbacks
    self.prepare_top_model_callbacks(model_name,
                                     early_stop_cb=early_stop_cb,
                                     reduce_lr_cb=reduce_lr_cb,
                                     tboard_cb=tboard_cb,
                                     jupyter_mode=jupyter_mode)

    if self.tl_workflow == 'fast':
        self.create_bottleneck_data(jupyter_mode)
        self.top_model.compile(optimizer=top_model_optimizer,
                               loss=loss_fn,
                               metrics=metrics)
        self.top_model_history = self.top_model.fit(
            x=self.train_bottleneck_preds,
            y=self.train_bottleneck_target,
            validation_data=(self.valid_bottleneck_preds,
                             self.valid_bottleneck_target),
            epochs=epochs,
            batch_size=batch_size,
            validation_batch_size=batch_size,
            shuffle=True,
            callbacks=self.top_model_callbacks,
            verbose=0)
        if self.use_test_split:
            test_scores = self.top_model.evaluate(
                self.test_bottleneck_preds,
                self.test_bottleneck_target,
                batch_size=batch_size)
            test_scores = dict(zip(self.top_model.metrics_names, test_scores))
    else:
        self.full_model.layers[0].trainable = False
        self.full_model.compile(optimizer=top_model_optimizer,
                                loss=loss_fn,
                                metrics=metrics)
        self.top_model_history = self.full_model.fit(
            x=self.train_ds,
            validation_data=self.valid_ds,
            epochs=epochs,
            # Shuffling in dataset
            shuffle=False,
            callbacks=self.top_model_callbacks,
            verbose=0)
        if self.use_test_split:
            test_scores = self.full_model.evaluate(self.test_ds)
            test_scores = dict(zip(self.full_model.metrics_names, test_scores))

    history_data = {
        'history': numpy_dict_to_json_format(self.top_model_history.history),
        'model_name': model_name
    }
    if self.use_test_split:
        history_data['test_scores'] = numpy_dict_to_json_format(test_scores)
    dump_json(history_data, os.path.join('history', model_name))

    if save_last_model:
        self.full_model.save(os.path.join('models', model_name),
                             include_optimizer=False)
def run(args):
    start_epoch = 1
    best = {'L1': 1e+9, 'MAE': 1e+9}

    # logs
    if args.expid == '':
        args.expid = dt.datetime.now().strftime('%Y%m%d%H%M')
    args.log_dir = os.path.join(args.log_dir, args.expid)
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
        os.chmod(args.log_dir, 0o0777)
    logger = get_logger(os.path.join(args.log_dir, 'main.log'))
    logger.info(args)
    writer = SummaryWriter(args.log_dir)
    args.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')

    # data
    if args.trainset == 'trainset':
        train_set = WCTrainset(args.data_root, args.train_csv, args=args)
    else:
        train_set = WCDataset(args.data_root, args.train_csv, args=args)
    valid_set = WCValidset(args.data_root, args.valid_csv, args=args)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=True)
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=False)

    # network
    model = models.__dict__[args.model](args=args)
    if torch.cuda.device_count() > 1:
        logger.info('{} GPUs found.'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model = model.to(args.device)

    # training
    criterion, valid_loss_fn = get_loss_fn(args)
    optimizer = get_optimizer(model, args.optim_str)
    scheduler = get_scheduler(optimizer, args)
    logger.debug(optimizer)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch'] + 1
            best['L1'] = checkpoint['best/L1']
            best['MAE'] = checkpoint['best/MAE']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))

    for epoch_i in range(start_epoch, args.epochs + 1):
        message = '[{}] Epoch {} Train/{} {:.2f} /MAE {:.4f} Valid/L1 {:.2f} /MAE {:.4f} (Best {:.4f}) '  # noqa
        for param_group in optimizer.param_groups:
            message += 'LR {:.4f} '.format(param_group['lr'])

        training = train(train_loader, model, criterion, optimizer,
                         logger=logger, args=args)
        validation = validate(valid_loader, model, valid_loss_fn,
                              logger=logger, args=args)

        writer.add_scalar('{}/Train'.format(args.loss), training['loss'],
                          epoch_i)
        writer.add_scalar('{}/Valid'.format(args.loss), validation['loss'],
                          epoch_i)
        writer.add_scalar('MAE/Train', training['mae'], epoch_i)
        writer.add_scalar('MAE/Valid', validation['mae'], epoch_i)
        writer.add_scalar('Grad/L2/Mean/BeforeClipped/Train',
                          training['grad/L2/BeforeClipped'], epoch_i)
        writer.add_scalar('Grad/L2/Mean/Clipped/Train',
                          training['grad/L2/Clipped'], epoch_i)
        writer.add_scalar('Grad/L2/Mean/Train', training['grad/L2/Mean'],
                          epoch_i)

        if epoch_i % args.freq_to_log_image == 0:
            writer.add_image('Train/Predict',
                             _get_images(training['pred'], args), epoch_i)
            writer.add_image('Train/Target',
                             _get_images(training['true'], args), epoch_i)
            writer.add_image('Valid/Predict',
                             _get_images(validation['pred'], args), epoch_i)
            writer.add_image('Valid/Target',
                             _get_images(validation['true'], args), epoch_i)

        is_best = (validation['mae'] < best['MAE'],
                   validation['loss'] < best['L1'])
        if is_best[0]:
            best['MAE'] = validation['mae']
        if is_best[1]:
            best['L1'] = validation['loss']
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'valid/L1': validation['loss'],
                'valid/MAE': validation['mae'],
                'best/L1': best['L1'],
                'best/MAE': best['MAE'],
                'optimizer': optimizer.state_dict(),
            }, is_best, args.log_dir)

        if scheduler is not None:
            scheduler.step(epoch=epoch_i)

        message = message.format(args.expid, epoch_i, args.loss,
                                 training['loss'], training['mae'],
                                 validation['loss'], validation['mae'],
                                 best['MAE'])
        logger.info(message)
def main(args):
    start_epoch = 1
    best_loss = 1e+6

    # logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
        os.chmod(args.log_dir, 0o0777)
    logger = get_logger(os.path.join(args.log_dir, 'main.log'))
    logger.info(args)
    writer = SummaryWriter(args.log_dir)

    # data
    train_set = MovingMNIST(root='./data/train', train=True, download=True)
    test_set = MovingMNIST(root='./data/test', train=False, download=True)
    train_loader = DataLoader(train_set, batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_set, batch_size=args.batch_size,
                             shuffle=False)

    # network
    model = models.__dict__[args.model](args=args)
    model = nn.DataParallel(model)
    args.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(args.device)

    # training
    criterion = get_loss_fn(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch'] + 1
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))

    for epoch_i in range(start_epoch, args.epochs + 1):
        model.train()
        losses = 0.
        for i, (inputs, targets) in enumerate(train_loader):
            bs, ts, h, w = targets.size()
            inputs = inputs.unsqueeze(2)
            inputs, targets = inputs.float() / 255., targets.float() / 255.
            inputs, targets = inputs.to(args.device), targets.to(args.device)
            outputs = model(inputs)
            # (bs, ts, c, h, w) -> (bs, ts, h, w) -> (ts, bs, h, w)
            outputs = outputs.squeeze(2).permute(1, 0, 2, 3)
            # (bs, ts, h, w) -> (ts, bs, h, w)
            targets = targets.permute(1, 0, 2, 3)
            loss = 0.
            for t_i in range(ts):
                loss += criterion(outputs[t_i], targets[t_i]) / bs
            losses += loss.item() * bs
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            logger.debug('Train/Batch {}/{}'.format(i + 1, len(train_loader)))

        model.eval()
        test_losses = 0.
        for i, (inputs, targets) in enumerate(test_loader):
            bs, ts, h, w = targets.size()
            inputs = inputs.unsqueeze(2)
            inputs, targets = inputs.float() / 255., targets.float() / 255.
            inputs, targets = inputs.to(args.device), targets.to(args.device)
            with torch.no_grad():
                outputs = model(inputs)
            # (bs, ts, c, h, w) -> (bs, ts, h, w) -> (ts, bs, h, w)
            outputs = outputs.squeeze(2).permute(1, 0, 2, 3)
            # (bs, ts, h, w) -> (ts, bs, h, w)
            targets = targets.permute(1, 0, 2, 3)
            loss = 0.
            for t_i in range(ts):
                loss += criterion(outputs[t_i], targets[t_i]) / bs
            test_losses += loss.item() * bs
            logger.debug('Test/Batch {}/{}'.format(i + 1, len(test_loader)))

        train_loss = losses / len(train_set)
        test_loss = test_losses / len(test_set)
        writer.add_scalar('Train/{}'.format(args.loss), train_loss, epoch_i)
        writer.add_scalar('Test/{}'.format(args.loss), test_loss, epoch_i)
        logger.info('Epoch {} Train/Loss {:.4f} Test/Loss {:.4f}'.format(
            epoch_i, train_loss, test_loss))

        is_best = test_loss < best_loss
        if is_best:
            best_loss = test_loss
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'test_loss': test_loss,
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.log_dir)

        if scheduler is not None:
            scheduler.step()
def run(args):
    start_epoch = 1
    best_loss = 1e+9

    # logs
    args.logdir = get_logdir(args)
    logger = get_logger(os.path.join(args.logdir, 'main.log'))
    logger.info(args)
    writer = SummaryWriter(args.logdir)

    # data
    train_set = MovingMNIST(root='./data', train=True, download=True)
    valid_set = MovingMNIST(root='./data', train=False, download=True,
                            split=args.test_size)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=True)
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=False)

    # network
    model = models.__dict__[args.model](args=args)
    model = nn.DataParallel(model)
    args.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(args.device)

    # training
    criterion = get_loss_fn(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch'] + 1
            best_loss = checkpoint['best/{}'.format(args.loss)]
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))

    for epoch_i in range(start_epoch, args.epochs + 1):
        training = train(train_loader, model, criterion, optimizer,
                         logger=logger, args=args)
        validation = validate(valid_loader, model, criterion,
                              logger=logger, args=args)

        writer.add_scalar('Train/{}'.format(args.loss), training[args.loss],
                          epoch_i)
        writer.add_scalar('Valid/{}'.format(args.loss), validation[args.loss],
                          epoch_i)
        writer.add_image('Train/Predict',
                         _get_images(training['output'], args), epoch_i)
        writer.add_image('Train/Target',
                         _get_images(training['target'], args), epoch_i)
        writer.add_image('Valid/Predict',
                         _get_images(validation['output'], args), epoch_i)
        writer.add_image('Valid/Target',
                         _get_images(validation['target'], args), epoch_i)

        message = '[{}] Epoch {} Train/{} {:.4f} Valid/{} {:.4f} '
        message = message.format(
            args.expid,
            epoch_i,
            args.loss,
            training[args.loss],
            args.loss,
            validation[args.loss],
        )

        is_best = validation[args.loss] < best_loss
        if is_best:
            best_loss = validation[args.loss]
            message += '(Best)'
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'valid/{}'.format(args.loss): validation[args.loss],
                'best/{}'.format(args.loss): best_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.logdir)

        if scheduler is not None:
            scheduler.step(epoch=epoch_i)
            logger.debug('Scheduler stepped.')
            for param_group in optimizer.param_groups:
                logger.debug(param_group['lr'])

        logger.info(message)
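# save_checkpoint is referenced by the training loops above but not defined here.
# A minimal sketch for the boolean is_best form used in the two MovingMNIST loops,
# assuming the common convention of writing the latest state to the log directory
# and copying it when it is the best so far; the file names are assumptions.
import os
import shutil

import torch


def save_checkpoint(state, is_best, log_dir, filename='checkpoint.pth'):
    """Hypothetical checkpoint helper: persist state and keep a copy of the best one."""
    path = os.path.join(log_dir, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(log_dir, 'model_best.pth'))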