예제 #1
0
파일: train.py 프로젝트: niksaz/sim2real
 def __init__(self, model, controller, hyperparameters):
     """Keep the given objects and build the optimizers and loss criteria.

     Args:
         model: object stored as ``self.model``.
         controller: object stored as ``self.controller``.
         hyperparameters: nested mapping stored as ``self.hyperparameters``.
             The keys read here are ``['gen']['optimizer']``,
             ``['dis']['optimizer']``, ``['control']['optimizer']`` and
             ``['loss']['control']``.
     """
     super(Trainer, self).__init__()
     self.model, self.controller = model, controller
     self.hyperparameters = hyperparameters

     # One optimizer per part, each created from that part's own settings.
     build_optimizer = optimization.create_optimizer_from_params
     self.gen_opt = build_optimizer(hyperparameters['gen']['optimizer'])
     self.dis_opt = build_optimizer(hyperparameters['dis']['optimizer'])
     self.control_opt = build_optimizer(
         hyperparameters['control']['optimizer'])

     # Fixed criteria for three of the losses; the control loss is configurable.
     lookup_loss = utils.get_loss_fn
     self.dis_loss_criterion = lookup_loss('bce')
     self.ll_loss_criterion = lookup_loss('mae')
     self.z_recon_loss_criterion = lookup_loss('mae')
     self.control_loss_criterion = lookup_loss(
         hyperparameters['loss']['control'])
예제 #2
0
 def test_mae_loss(self):
     # Checks the 'mae' loss from utils.get_loss_fn on a hand-computed example.
     mae_fn = utils.get_loss_fn('mae')
     zeros = tf.constant([[0.0, 0.0], [0.0, 0.0]])
     values = tf.constant([[1.0, 1.0], [2.0, 2.0]])
     computed = mae_fn(zeros, values)
     # Expected: 1/2 * (1/2 * (1 + 1) + 1/2 * (2 + 2)) = 1.5.
     expected = tf.constant(1.5)
     self.assertEqual(expected, computed)
예제 #3
0
 def test_mse_loss(self):
     # Checks the 'mse' loss from utils.get_loss_fn on a hand-computed example.
     mse_fn = utils.get_loss_fn('mse')
     zeros = tf.constant([[0.0, 0.0], [0.0, 0.0]])
     values = tf.constant([[1.0, 1.0], [2.0, 2.0]])
     computed = mse_fn(zeros, values)
     # Expected: 1/2 * (1/2 * (1**2 + 1**2) + 1/2 * (2**2 + 2**2)) = 2.5.
     expected = tf.constant(2.5)
     self.assertEqual(expected, computed)
예제 #4
0
파일: train.py 프로젝트: leibo-cmu/MatSeg
def train(args):
    """Train a UNetVgg16 model, recording metrics and saving it each epoch.

    The run arguments are saved first, then for every epoch the model is
    trained with ``train_epoch``, scored with ``eval_epoch``, the six
    train/val metrics are appended to a ``Recorder`` that is written to
    ``args.record_path``, and ``ModelSaver.save_models`` is called with the
    selection metric.

    Args:
        args: run configuration. Attributes read here: ``args_path``,
            ``n_classes``, ``device``, ``optimizer``, ``lr_scheduler``,
            ``loss_type``, ``ignore_index``, ``model_path``, ``n_epochs``,
            ``experim_name``, ``record_path`` and ``metric``.
    """
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)

    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    model_saver = ModelSaver(args.model_path)

    metric_names = ['train_miou', 'train_acc', 'train_loss',
                    'val_miou', 'val_acc', 'val_loss']
    recorder = Recorder(metric_names)

    # Epochs are reported and saved 1-based.
    for epoch_number in range(1, args.n_epochs + 1):
        print(f"{args.experim_name} Epoch {epoch_number}:")

        train_result = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=criterion,
            device=args.device,
        )
        train_loss, train_acc, train_miou, train_ious = train_result
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")

        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=criterion,
            device=args.device,
        )
        val_miou = val_scores['mIoU']
        val_ious = val_scores['IoUs']
        val_acc = val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")

        recorder.update([train_miou, train_acc, train_loss,
                         val_miou, val_acc, val_loss])
        recorder.save(args.record_path)

        # A metric spelled "IoU_<k>" selects the validation IoU at index k;
        # anything else falls back to the validation mIoU.
        if args.metric.startswith("IoU"):
            iou_index = int(args.metric.split('_')[1])
            metric = val_ious[iou_index]
        else:
            metric = val_miou

        model_saver.save_models(metric, epoch_number, model,
                                ious={'train': train_ious, 'val': val_ious})

    print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
예제 #5
0
파일: eval.py 프로젝트: leibo-cmu/MatSeg
def evaluate(args, mode, save_pred=False):
    """Score a saved UNetVgg16 checkpoint on the validation or test loader.

    Args:
        args: run configuration. Attributes read here: ``n_classes``,
            ``device``, ``model_path``, ``loss_type``, ``ignore_index`` and,
            when ``save_pred`` is truthy, ``pred_dir``.
        mode: ``'val'`` or ``'test'``; selects which loader is evaluated.
        save_pred: when truthy, ``args.pred_dir`` is forwarded to
            ``eval_epoch`` as ``pred_dir``; otherwise ``save_pred`` itself
            (a falsy value) is forwarded.

    Returns:
        The scores object returned by ``eval_epoch`` (indexed here by
        ``'mIoU'`` and ``'accuracy'``).

    Raises:
        ValueError: if ``mode`` is neither ``'val'`` nor ``'test'``.
    """
    _, val_loader, test_loader = get_dataloaders(args)
    if mode == 'val':
        dataloader = val_loader
    elif mode == 'test':
        dataloader = test_loader
    else:
        raise ValueError(f"{mode} not supported. Choose from 'val' or 'test'")

    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    # map_location remaps the stored tensors onto args.device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only host.
    checkpoint = torch.load(args.model_path, map_location=args.device)
    # strict=False: keys that do not match the model are tolerated.
    model.load_state_dict(checkpoint['model_state_dict'], strict=False)

    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    eval_loss, scores = eval_epoch(model=model,
                                   dataloader=dataloader,
                                   n_classes=args.n_classes,
                                   criterion=criterion,
                                   device=args.device,
                                   pred_dir=save_pred and args.pred_dir)
    miou, acc = scores['mIoU'], scores['accuracy']
    print(
        f"{mode} | mIoU: {miou:.3f} | accuracy: {acc:.3f} | loss: {eval_loss:.3f}"
    )
    return scores
예제 #6
0
    def train_full_model(self,
                         epochs=None,
                         optimizer_name='sgd',
                         optimizer_lr=1e-5,
                         weights_decay=None,
                         early_stop_cb=True,
                         reduce_lr_cb=True,
                         tboard_cb=None,
                         save_intermediate_models=False,
                         save_last_model=True,
                         jupyter_mode=False):
        """Compile and fit ``self.full_model``, then dump its history.

        The model name is built from the top-model optimizer settings stored
        on ``self`` (``top_model_opt`` / ``top_model_lr``) together with the
        settings passed here. The config is written under ``configs/``, the
        training history (plus test scores when ``self.use_test_split`` is
        set) under ``history/``, and the final model under ``models/``.

        Args:
            epochs: number of epochs; 800 when ``None``.
            optimizer_name: optimizer identifier passed to ``get_optimizer``.
            optimizer_lr: learning rate passed to ``get_optimizer``.
            weights_decay: value passed to ``add_l2_weights_decay`` and used
                in the model name.
            early_stop_cb: forwarded to ``prepare_full_model_callbacks``.
            reduce_lr_cb: forwarded to ``prepare_full_model_callbacks``.
            tboard_cb: forwarded to ``prepare_full_model_callbacks``.
            save_intermediate_models: forwarded to
                ``prepare_full_model_callbacks``.
            save_last_model: when true, save the fitted model (without its
                optimizer) to ``models/<model_name>_final``.
            jupyter_mode: forwarded to ``prepare_full_model_callbacks``.
        """
        epochs = 800 if epochs is None else epochs

        model_name = self.create_model_name('full',
                                            top_model_opt=self.top_model_opt,
                                            top_model_lr=self.top_model_lr,
                                            full_model_opt=optimizer_name,
                                            full_model_lr=optimizer_lr,
                                            full_model_decay=weights_decay)
        model_path = os.path.join('models', model_name)

        # Save model information for inference
        dump_json(self.config, os.path.join('configs', model_name), indent=4)

        # Compile options
        # Fit automatically scales loss
        full_model_optimizer = get_optimizer(optimizer_name, optimizer_lr)
        # No trailing comma here: a stray comma would wrap the loss in a
        # 1-tuple before it is handed to compile().
        loss_fn = get_loss_fn(self.n_outputs, self.loss_name)
        metrics = tf.keras.metrics.MeanAbsoluteError()

        # Callbacks
        self.prepare_full_model_callbacks(
            model_name,
            early_stop_cb=early_stop_cb,
            reduce_lr_cb=reduce_lr_cb,
            tboard_cb=tboard_cb,
            save_intermediate_models=save_intermediate_models,
            save_models_path=model_path,
            jupyter_mode=jupyter_mode)

        self.prepare_full_model_layers()
        add_l2_weights_decay(self.full_model, weights_decay)
        self.full_model.compile(optimizer=full_model_optimizer,
                                loss=loss_fn,
                                metrics=metrics)
        self.full_model_history = self.full_model.fit(
            x=self.train_ds,
            validation_data=self.valid_ds,
            epochs=epochs,
            # Shuffling in dataset
            shuffle=False,
            callbacks=self.full_model_callbacks,
            verbose=0)

        history_data = {
            'history':
            numpy_dict_to_json_format(self.full_model_history.history),
            'model_name': model_name
        }
        if self.use_test_split:
            test_scores = self.full_model.evaluate(self.test_ds)
            # Pair each reported value with its metric name.
            test_scores = dict(zip(self.full_model.metrics_names, test_scores))
            history_data['test_scores'] = numpy_dict_to_json_format(
                test_scores)
        dump_json(history_data, os.path.join('history', model_name))

        if save_last_model:
            self.full_model.save(model_path + '_final',
                                 include_optimizer=False)
예제 #7
0
    def train_top_model(self,
                        epochs=None,
                        optimizer_name='adam',
                        optimizer_lr=0.001,
                        early_stop_cb=True,
                        reduce_lr_cb=True,
                        tboard_cb=None,
                        save_last_model=True,
                        jupyter_mode=False):
        """Train the top model and dump its history.

        With ``self.tl_workflow == 'fast'`` the separate ``self.top_model``
        is fitted on bottleneck data produced by
        ``create_bottleneck_data``. Otherwise the first layer of
        ``self.full_model`` is frozen and the full model is fitted on
        ``self.train_ds``. The optimizer name and learning rate are stored
        on ``self.top_model_opt`` / ``self.top_model_lr``.

        Args:
            epochs: number of epochs; when ``None``, 100 for the 'fast'
                workflow and 300 otherwise.
            optimizer_name: optimizer identifier passed to ``get_optimizer``.
            optimizer_lr: learning rate passed to ``get_optimizer``.
            early_stop_cb: forwarded to ``prepare_top_model_callbacks``.
            reduce_lr_cb: forwarded to ``prepare_top_model_callbacks``.
            tboard_cb: forwarded to ``prepare_top_model_callbacks``.
            save_last_model: when true, save ``self.full_model`` (without
                its optimizer) to ``models/<model_name>``.
            jupyter_mode: forwarded to ``prepare_top_model_callbacks`` and,
                in the 'fast' workflow, to ``create_bottleneck_data``.
        """
        if epochs is None:
            epochs = 100 if self.tl_workflow == 'fast' else 300
        # Only used by the 'fast' workflow, which fits on in-memory arrays.
        batch_size = 32

        self.top_model_opt = optimizer_name
        self.top_model_lr = optimizer_lr
        model_name = self.create_model_name('top',
                                            top_model_opt=self.top_model_opt,
                                            top_model_lr=self.top_model_lr)

        # Save model information for inference
        dump_json(self.config, os.path.join('configs', model_name), indent=4)

        # Compile options
        # Fit automatically scales loss
        top_model_optimizer = get_optimizer(optimizer_name, optimizer_lr)
        # No trailing comma here: a stray comma would wrap the loss in a
        # 1-tuple before it is handed to compile().
        loss_fn = get_loss_fn(self.n_outputs, self.loss_name)
        metrics = tf.keras.metrics.MeanAbsoluteError()

        # Callbacks
        self.prepare_top_model_callbacks(model_name,
                                         early_stop_cb=early_stop_cb,
                                         reduce_lr_cb=reduce_lr_cb,
                                         tboard_cb=tboard_cb,
                                         jupyter_mode=jupyter_mode)

        if self.tl_workflow == 'fast':
            self.create_bottleneck_data(jupyter_mode)
            self.top_model.compile(optimizer=top_model_optimizer,
                                   loss=loss_fn,
                                   metrics=metrics)
            self.top_model_history = self.top_model.fit(
                x=self.train_bottleneck_preds,
                y=self.train_bottleneck_target,
                validation_data=(self.valid_bottleneck_preds,
                                 self.valid_bottleneck_target),
                epochs=epochs,
                batch_size=batch_size,
                validation_batch_size=batch_size,
                shuffle=True,
                callbacks=self.top_model_callbacks,
                verbose=0)
            if self.use_test_split:
                test_scores = self.top_model.evaluate(
                    self.test_bottleneck_preds,
                    self.test_bottleneck_target,
                    batch_size=batch_size)
                test_scores = dict(
                    zip(self.top_model.metrics_names, test_scores))
        else:
            # Freeze the first layer so that it is not updated by this fit.
            self.full_model.layers[0].trainable = False
            self.full_model.compile(optimizer=top_model_optimizer,
                                    loss=loss_fn,
                                    metrics=metrics)
            self.top_model_history = self.full_model.fit(
                x=self.train_ds,
                validation_data=self.valid_ds,
                epochs=epochs,
                # Shuffling in dataset
                shuffle=False,
                callbacks=self.top_model_callbacks,
                verbose=0)
            if self.use_test_split:
                test_scores = self.full_model.evaluate(self.test_ds)
                test_scores = dict(
                    zip(self.full_model.metrics_names, test_scores))

        history_data = {
            'history':
            numpy_dict_to_json_format(self.top_model_history.history),
            'model_name': model_name
        }
        # test_scores only exists when use_test_split is set (see above).
        if self.use_test_split:
            history_data['test_scores'] = numpy_dict_to_json_format(
                test_scores)
        dump_json(history_data, os.path.join('history', model_name))

        if save_last_model:
            self.full_model.save(os.path.join('models', model_name),
                                 include_optimizer=False)
예제 #8
0
def run(args):
    """Train and validate a model, logging scalars/images and checkpointing.

    ``args`` is mutated: an empty ``args.expid`` is replaced by a
    ``%Y%m%d%H%M`` timestamp, ``args.log_dir`` gets the experiment id
    appended, and ``args.device`` is set to ``cuda:0`` when CUDA is
    available and ``cpu`` otherwise.

    Args:
        args: run options. Attributes read here include ``expid``,
            ``log_dir``, ``trainset``, ``data_root``, ``train_csv``,
            ``valid_csv``, ``batch_size``, ``n_workers``, ``model``,
            ``optim_str``, ``resume``, ``epochs``, ``loss`` and
            ``freq_to_log_image``.

    Raises:
        IOError: if ``args.resume`` is set but is not an existing file.
    """
    start_epoch = 1
    best = {'L1': 1e+9, 'MAE': 1e+9}

    # logs
    if args.expid == '':
        args.expid = dt.datetime.now().strftime('%Y%m%d%H%M')
    args.log_dir = os.path.join(args.log_dir, args.expid)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.log_dir, exist_ok=True)
    os.chmod(args.log_dir, 0o0777)
    logger = get_logger(os.path.join(args.log_dir, 'main.log'))
    logger.info(args)
    writer = SummaryWriter(args.log_dir)

    args.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')

    # data
    if args.trainset == 'trainset':
        train_set = WCTrainset(args.data_root, args.train_csv, args=args)
    else:
        train_set = WCDataset(args.data_root, args.train_csv, args=args)
    valid_set = WCValidset(args.data_root, args.valid_csv, args=args)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=True)
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=False)

    # network
    model = models.__dict__[args.model](args=args)
    if torch.cuda.device_count() > 1:
        logger.info('{} GPUs found.'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model = model.to(args.device)
    # training
    criterion, valid_loss_fn = get_loss_fn(args)
    optimizer = get_optimizer(model, args.optim_str)
    scheduler = get_scheduler(optimizer, args)
    logger.debug(optimizer)

    if args.resume:
        if os.path.isfile(args.resume):
            # map_location lets a checkpoint written on a GPU be resumed on
            # the CPU fallback selected above.
            checkpoint = torch.load(args.resume, map_location=args.device)
            start_epoch = checkpoint['epoch'] + 1
            best['L1'] = checkpoint['best/L1']
            best['MAE'] = checkpoint['best/MAE']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))

    for epoch_i in range(start_epoch, args.epochs + 1):
        message = '[{}] Epoch {} Train/{} {:.2f} /MAE {:.4f} Valid/L1 {:.2f} /MAE {:.4f} (Best {:.4f}) '  # noqa
        # The learning rates are captured here, before this epoch's training
        # and scheduler step; the placeholders above are filled in later.
        for param_group in optimizer.param_groups:
            message += 'LR {:.4f} '.format(param_group['lr'])

        training = train(train_loader,
                         model,
                         criterion,
                         optimizer,
                         logger=logger,
                         args=args)
        validation = validate(valid_loader,
                              model,
                              valid_loss_fn,
                              logger=logger,
                              args=args)

        writer.add_scalar('{}/Train'.format(args.loss), training['loss'],
                          epoch_i)
        writer.add_scalar('{}/Valid'.format(args.loss), validation['loss'],
                          epoch_i)
        writer.add_scalar('MAE/Train', training['mae'], epoch_i)
        writer.add_scalar('MAE/Valid', validation['mae'], epoch_i)
        writer.add_scalar('Grad/L2/Mean/BeforeClipped/Train',
                          training['grad/L2/BeforeClipped'], epoch_i)
        writer.add_scalar('Grad/L2/Mean/Clipped/Train',
                          training['grad/L2/Clipped'], epoch_i)
        writer.add_scalar('Grad/L2/Mean/Train', training['grad/L2/Mean'],
                          epoch_i)
        if epoch_i % args.freq_to_log_image == 0:
            writer.add_image('Train/Predict',
                             _get_images(training['pred'], args), epoch_i)
            writer.add_image('Train/Target',
                             _get_images(training['true'], args), epoch_i)
            writer.add_image('Valid/Predict',
                             _get_images(validation['pred'], args), epoch_i)
            writer.add_image('Valid/Target',
                             _get_images(validation['true'], args), epoch_i)

        # Pair of flags: (new best MAE, new best validation loss).
        is_best = (validation['mae'] < best['MAE'],
                   validation['loss'] < best['L1'])
        if is_best[0]:
            best['MAE'] = validation['mae']
        if is_best[1]:
            best['L1'] = validation['loss']
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'valid/L1': validation['loss'],
                'valid/MAE': validation['mae'],
                'best/L1': best['L1'],
                'best/MAE': best['MAE'],
                'optimizer': optimizer.state_dict(),
            }, is_best, args.log_dir)

        if scheduler is not None:
            scheduler.step(epoch=epoch_i)

        message = message.format(args.expid, epoch_i, args.loss,
                                 training['loss'], training['mae'],
                                 validation['loss'], validation['mae'],
                                 best['MAE'])
        logger.info(message)
예제 #9
0
def main(args):
    """Train a model on MovingMNIST, evaluating and checkpointing each epoch.

    For every epoch the per-time-step criterion values are summed into one
    loss per batch, the model is updated, the test set is scored under
    ``torch.no_grad()``, both averages are logged, and a checkpoint is
    written to ``args.log_dir``.

    Args:
        args: run options. Attributes read here: ``log_dir``,
            ``batch_size``, ``model``, ``resume``, ``epochs`` and ``loss``;
            ``args`` is also passed to ``get_loss_fn``, ``get_optimizer``,
            ``get_scheduler`` and the model constructor.

    Raises:
        IOError: if ``args.resume`` is set but is not an existing file.
    """
    start_epoch = 1
    best_loss = 1e+6

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.log_dir, exist_ok=True)
    os.chmod(args.log_dir, 0o0777)
    logger = get_logger(os.path.join(args.log_dir, 'main.log'))
    logger.info(args)

    writer = SummaryWriter(args.log_dir)

    # data
    train_set = MovingMNIST(root='./data/train', train=True, download=True)
    test_set = MovingMNIST(root='./data/test', train=False, download=True)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_set,
                             batch_size=args.batch_size,
                             shuffle=False)

    # network
    model = models.__dict__[args.model](args=args)
    model = nn.DataParallel(model)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # training
    criterion = get_loss_fn(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    if args.resume:
        if os.path.isfile(args.resume):
            # map_location lets a checkpoint written on a GPU be resumed on
            # the CPU fallback selected above.
            checkpoint = torch.load(args.resume, map_location=device)
            start_epoch = checkpoint['epoch'] + 1
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))

    for epoch_i in range(start_epoch, args.epochs + 1):
        model.train()
        losses = 0.
        for i, (inputs, targets) in enumerate(train_loader):
            bs, ts, h, w = targets.size()
            inputs = inputs.unsqueeze(2)
            inputs, targets = inputs.float() / 255., targets.float() / 255.
            # Batches go to the same device the model was moved to.
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            # (bs ,ts, c, h, w) -> (bs, ts, h, w) -> (ts, bs, h, w)
            outputs = outputs.squeeze(2).permute(1, 0, 2, 3)
            # (bs, ts, h, w) -> (ts, bs, h, w)
            targets = targets.permute(1, 0, 2, 3)
            loss = 0.
            for t_i in range(ts):
                loss += criterion(outputs[t_i], targets[t_i]) / bs

            # Undo the per-batch division so that the epoch total can be
            # divided by the dataset size below.
            losses += loss.item() * bs

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            logger.debug('Train/Batch {}/{}'.format(i + 1, len(train_loader)))

        model.eval()
        test_losses = 0.
        for i, (inputs, targets) in enumerate(test_loader):
            bs, ts, h, w = targets.size()
            inputs = inputs.unsqueeze(2)
            inputs, targets = inputs.float() / 255., targets.float() / 255.
            inputs, targets = inputs.to(device), targets.to(device)
            with torch.no_grad():
                outputs = model(inputs)
                # (bs ,ts, c, h, w) -> (bs, ts, h, w) -> (ts, bs, h, w)
                outputs = outputs.squeeze(2).permute(1, 0, 2, 3)
                # (bs, ts, h, w) -> (ts, bs, h, w)
                targets = targets.permute(1, 0, 2, 3)
                loss = 0.
                for t_i in range(ts):
                    loss += criterion(outputs[t_i], targets[t_i]) / bs
            test_losses += loss.item() * bs
            logger.debug('Test/Batch {}/{}'.format(i + 1, len(test_loader)))

        train_loss = losses / len(train_set)
        test_loss = test_losses / len(test_set)
        writer.add_scalar('Train/{}'.format(args.loss), train_loss, epoch_i)
        writer.add_scalar('Test/{}'.format(args.loss), test_loss, epoch_i)
        logger.info('Epoch {} Train/Loss {:.4f} Test/Loss {:.4f}'.format(
            epoch_i, train_loss, test_loss))

        is_best = test_loss < best_loss
        if is_best:
            best_loss = test_loss
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'test_loss': test_loss,
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.log_dir)

        if scheduler is not None:
            scheduler.step()
def run(args):
    """Train and validate a model on MovingMNIST with per-epoch checkpoints.

    ``args`` is mutated: ``args.logdir`` is set from ``get_logdir(args)``
    and ``args.device`` is set to ``cuda:0`` when CUDA is available and
    ``cpu`` otherwise. Each epoch calls ``train`` and ``validate``, writes
    the ``args.loss`` scalar and prediction/target images to the summary
    writer, and saves a checkpoint into ``args.logdir``.

    Args:
        args: run options. Attributes read here include ``test_size``,
            ``batch_size``, ``n_workers``, ``model``, ``resume``,
            ``epochs``, ``loss`` and ``expid``.

    Raises:
        IOError: if ``args.resume`` is set but is not an existing file.
    """
    start_epoch = 1
    best_loss = 1e+9

    # logs
    args.logdir = get_logdir(args)
    logger = get_logger(os.path.join(args.logdir, 'main.log'))
    logger.info(args)
    writer = SummaryWriter(args.logdir)

    # data
    train_set = MovingMNIST(root='./data', train=True, download=True)
    valid_set = MovingMNIST(root='./data',
                            train=False,
                            download=True,
                            split=args.test_size)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=True)
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=False)

    # network
    model = models.__dict__[args.model](args=args)
    model = nn.DataParallel(model)
    args.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(args.device)
    # training
    criterion = get_loss_fn(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    if args.resume:
        if os.path.isfile(args.resume):
            # map_location lets a checkpoint written on a GPU be resumed on
            # the CPU fallback selected above.
            checkpoint = torch.load(args.resume, map_location=args.device)
            start_epoch = checkpoint['epoch'] + 1
            best_loss = checkpoint['best/{}'.format(args.loss)]
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))

    for epoch_i in range(start_epoch, args.epochs + 1):
        training = train(train_loader,
                         model,
                         criterion,
                         optimizer,
                         logger=logger,
                         args=args)
        validation = validate(valid_loader,
                              model,
                              criterion,
                              logger=logger,
                              args=args)

        writer.add_scalar('Train/{}'.format(args.loss), training[args.loss],
                          epoch_i)
        writer.add_scalar('Valid/{}'.format(args.loss), validation[args.loss],
                          epoch_i)
        writer.add_image('Train/Predict', _get_images(training['output'],
                                                      args), epoch_i)
        writer.add_image('Train/Target', _get_images(training['target'], args),
                         epoch_i)
        writer.add_image('Valid/Predict',
                         _get_images(validation['output'], args), epoch_i)
        writer.add_image('Valid/Target', _get_images(validation['target'],
                                                     args), epoch_i)

        message = '[{}] Epoch {} Train/{} {:.4f} Valid/{} {:.4f} '
        message = message.format(
            args.expid,
            epoch_i,
            args.loss,
            training[args.loss],
            args.loss,
            validation[args.loss],
        )

        # A strictly lower validation loss marks this epoch as the best so far.
        is_best = validation[args.loss] < best_loss
        if is_best:
            best_loss = validation[args.loss]
            message += '(Best)'
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'valid/{}'.format(args.loss): validation[args.loss],
                'best/{}'.format(args.loss): best_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.logdir)

        if scheduler is not None:
            scheduler.step(epoch=epoch_i)
            logger.debug('Scheduler stepped.')
            for param_group in optimizer.param_groups:
                logger.debug(param_group['lr'])

        logger.info(message)