Example #1
def save_checkpoint(net,
                    checkpoints_path,
                    epoch=None,
                    prefix='',
                    verbose=True):
    if epoch is None:
        checkpoint_name = 'last_checkpoint.params'
    else:
        checkpoint_name = f'{epoch:03d}.params'

    if prefix:
        checkpoint_name = f'{prefix}_{checkpoint_name}'

    if not checkpoints_path.exists():
        checkpoints_path.mkdir(parents=True)

    checkpoint_path = checkpoints_path / checkpoint_name
    if verbose:
        logger.info(f'Save checkpoint to {str(checkpoint_path)}')

    torch.save(net.state_dict(), str(checkpoint_path))
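A minimal usage sketch for save_checkpoint; the model and the destination directory below are illustrative, and verbose=False sidesteps the module-level logger dependency:

from pathlib import Path
import torch
import torch.nn as nn

net = nn.Linear(4, 2)  # stand-in model for illustration
save_checkpoint(net, Path('./checkpoints'), verbose=False)
# -> ./checkpoints/last_checkpoint.params
save_checkpoint(net, Path('./checkpoints'), epoch=7, prefix='proposals', verbose=False)
# -> ./checkpoints/proposals_007.params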
Example #2
def init_experiment(experiment_name, add_exp_args, script_path=None):
    parser = get_train_arguments()
    parser = add_exp_args(parser)
    args = parser.parse_args()

    experiments_path = Path('./experiments') / experiment_name
    experiments_path.mkdir(parents=True, exist_ok=True)

    exp_indx = find_last_exp_indx(experiments_path)
    experiment_name = f'{exp_indx:03d}'
    if args.exp_name:
        experiment_name += f'_{args.exp_name}'

    experiment_path = experiments_path / experiment_name

    args.logs_path = experiment_path / 'logs'
    args.run_path = experiment_path
    args.checkpoints_path = experiment_path / 'checkpoints'

    experiment_path.mkdir(parents=True)
    if script_path is not None:
        temp_script_name = Path(script_path).stem + datetime.strftime(
            datetime.today(), '_%Y-%m-%d_%H-%M-%S.py')
        shutil.copy(script_path, experiment_path / temp_script_name)

    if not args.checkpoints_path.exists():
        args.checkpoints_path.mkdir(parents=True)
    if not args.logs_path.exists():
        args.logs_path.mkdir(parents=True)

    stdout_log_path = args.logs_path / 'train_log.txt'

    if stdout_log_path is not None:
        fh = logging.FileHandler(str(stdout_log_path))
        formatter = logging.Formatter(
            fmt='(%(levelname)s) %(asctime)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    if args.no_cuda:
        logger.info('Using CPU')
        args.device = torch.device('cpu')
    else:
        if args.gpus:
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{args.gpus.split(',')[0]}"
        args.device = torch.device(f'cuda:{0}')
        args.ngpus = 1
        logger.info(f'Number of GPUs: {args.ngpus}')

        if args.ngpus < 2:
            args.syncbn = False

    logger.info(args)

    return args
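find_last_exp_indx is called above but not defined in these examples. A minimal sketch of the behaviour init_experiment relies on (returning the next free three-digit index among the existing experiment folders) might look like the following; the real implementation in the source project may differ:

def find_last_exp_indx(experiments_path):
    # Folders are named like '000', '001_baseline', ...; return the next unused index.
    last_indx = 0
    for child in experiments_path.iterdir():
        if child.is_dir() and child.name[:3].isdigit():
            last_indx = max(last_indx, int(child.name[:3]) + 1)
    return last_indx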
Example #3
    def __init__(self,
                 args,
                 model,
                 model_cfg,
                 loss_cfg,
                 trainset,
                 valset,
                 optimizer_params,
                 image_dump_interval=200,
                 checkpoint_interval=10,
                 tb_dump_period=25,
                 num_epochs=1,
                 lr_scheduler=None,
                 metrics=None,
                 additional_val_metrics=None,
                 train_proposals=False):
        self.args = args
        self.model_cfg = model_cfg
        self.loss_cfg = loss_cfg
        self.val_loss_cfg = deepcopy(loss_cfg)
        self.tb_dump_period = tb_dump_period

        self.train_metrics = metrics if metrics is not None else []
        self.val_metrics = deepcopy(self.train_metrics)
        if additional_val_metrics is not None:
            self.val_metrics.extend(additional_val_metrics)

        self.checkpoint_interval = checkpoint_interval
        self.image_dump_interval = image_dump_interval
        self.train_proposals = train_proposals
        self.task_prefix = ''
        self.summary_writer = None

        self.trainset = trainset
        self.valset = valset
        self.train_loader = DataLoader(trainset,
                                       batch_size=args.batch_size,
                                       pin_memory=True,
                                       shuffle=True,
                                       num_workers=args.workers,
                                       drop_last=True)
        self.val_loader = DataLoader(valset,
                                     batch_size=args.val_batch_size,
                                     pin_memory=True,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=True)

        self.device = torch.device(args.device)
        log.logger.info(model)
        if not args.no_cuda:
            self.net = model.to(self.device)

        self.evaluator = None
        self._load_weights()

        if train_proposals:
            self.task_prefix = 'proposals'
        self.optim = torch.optim.Adam(self.net.get_trainable_params(),
                                      **optimizer_params)
        self.tqdm_out = log.TqdmToLogger(log.logger, level=log.logging.INFO)

        self.lr_scheduler = None
        self.lr = optimizer_params['lr']
        if lr_scheduler is not None:
            self.lr_scheduler = lr_scheduler(
                optimizer=self.optim,
                T_max=num_epochs * len(self.train_loader))
            if args.start_epoch > 0:
                for _ in range(args.start_epoch):
                    self.lr_scheduler.step()

        if args.input_normalization:
            mean = torch.tensor(args.input_normalization['mean'],
                                dtype=torch.float32)
            std = torch.tensor(args.input_normalization['std'],
                               dtype=torch.float32)

            self.denormalizator = Normalize((-mean / std), (1.0 / std))
        else:
            self.denormalizator = lambda x: x

        if len(args.gpus.split(",")) > 1:
            logger.info("could use {} gpus.".format(torch.cuda.device_count()))
            assert args.batch_size % torch.cuda.device_count(
            ) == 0, "batch size should be divided by device count"
            self.net = torch.nn.DataParallel(self.net)

        self.epoch_loss = AverageMeter()
        self.best_loss = 2.0
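AverageMeter is not defined in these snippets. A minimal running-average helper matching the calls used here (update(value, n) per batch, get_avg() when checking for the best loss) could look like this; it is a sketch, not the project's actual class:

class AverageMeter:
    # Weighted running average of a scalar, e.g. the per-epoch loss.
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n

    def get_avg(self):
        return self.sum / max(self.count, 1)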
Example #4
    def training(self, epoch):
        if self.summary_writer is None:
            self.summary_writer = log.SummaryWriterAvg(
                log_dir=str(self.args.logs_path),
                flush_secs=10,
                dump_period=self.tb_dump_period)

        log_prefix = 'Train' + self.task_prefix.capitalize()
        tbar = tqdm(self.train_loader, file=self.tqdm_out, ncols=100)
        train_loss = 0.0

        for metric in self.train_metrics:
            metric.reset_epoch_stats()

        for i, batch_data in enumerate(tbar):
            global_step = epoch * len(self.train_loader) + i

            loss, losses_logging, batch_data, outputs = self.batch_forward_parallelloss(
                batch_data)

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

            loss = loss.detach().cpu().numpy().mean()
            train_loss += loss

            for loss_name, loss_values in losses_logging.items():
                self.summary_writer.add_scalar(
                    tag=f'{log_prefix}Losses/{loss_name}',
                    value=np.array(loss_values).mean(),
                    global_step=global_step)
            self.summary_writer.add_scalar(tag=f'{log_prefix}Losses/overall',
                                           value=loss,
                                           global_step=global_step)

            for k, v in self.loss_cfg.items():
                if ('_loss' in k and hasattr(v, 'log_states')
                        and self.loss_cfg.get(k + '_weight', 0.0) > 0):
                    v.log_states(self.summary_writer,
                                 f'{log_prefix}Losses/{k}', global_step)

            if self.image_dump_interval > 0 and global_step % self.image_dump_interval == 0:
                self.save_visualization(batch_data,
                                        outputs,
                                        global_step,
                                        prefix='train')

            self.summary_writer.add_scalar(
                tag=f'{log_prefix}States/learning_rate',
                value=self.lr
                if self.lr_scheduler is None else self.lr_scheduler.get_lr(),
                global_step=global_step)

            tbar.set_description(
                f'Epoch {epoch}, training loss {train_loss / (i + 1):.6f}')
            for metric in self.train_metrics:
                metric.log_states(self.summary_writer,
                                  f'{log_prefix}Metrics/{metric.name}',
                                  global_step)
            self.epoch_loss.update(loss.item(),
                                   batch_data['instances'].size(0))

        for metric in self.train_metrics:
            self.summary_writer.add_scalar(
                tag=f'{log_prefix}Metrics/{metric.name}',
                value=metric.get_epoch_value(),
                global_step=epoch,
                disable_avg=True)

        misc.save_checkpoint(self.net,
                             self.args.checkpoints_path,
                             prefix=self.task_prefix,
                             epoch=None)
        if epoch % self.checkpoint_interval == 0:
            misc.save_checkpoint(self.net,
                                 self.args.checkpoints_path,
                                 prefix=self.task_prefix,
                                 epoch=epoch)

        model_state_dic = self.net.module.state_dict()  # DataParallel
        if self.epoch_loss.get_avg() < self.best_loss:
            self.best_loss = self.epoch_loss.get_avg()
            logger.info("save best loss model epoch {}".format(epoch))
            torch.save(
                model_state_dic,
                os.path.join(
                    self.args.checkpoints_path,
                    'ep-{}-loss-{}_model.pth'.format(epoch, self.best_loss)))
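Restoring one of the checkpoints written above is a standard state_dict load; the path below is illustrative and should point at a file produced by save_checkpoint or by the best-loss save:

state_dict = torch.load('experiments/my_experiment/000/checkpoints/last_checkpoint.params',
                        map_location='cpu')
net.load_state_dict(state_dict)  # net must have the same architecture as the saved model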
Example #5
def train(model, model_cfg, args, train_proposals, start_epoch=0):
    loss_cfg = edict()
    loss_cfg.instance_loss = NormalizedFocalLossSigmoid(alpha=0.50, gamma=2)
    loss_cfg.instance_loss_weight = 1.0 if not train_proposals else 0.0

    if not train_proposals:
        num_epochs = 160
        num_points = 12

        loss_cfg.segmentation_loss = NormalizedFocalLossSoftmax(
            ignore_label=-1, gamma=1)
        loss_cfg.segmentation_loss_weight = 0.75
    else:
        num_epochs = 10
        num_points = 32

        loss_cfg.proposals_loss = AdaptISProposalsLossIoU(args.batch_size)
        loss_cfg.proposals_loss_weight = 1.0

    args.val_batch_size = args.batch_size
    args.input_normalization = model_cfg.input_normalization

    train_augmentator = Compose([Flip()], p=1.0)

    trainset = ToyDataset(args.dataset_path,
                          split='train',
                          num_points=num_points,
                          augmentator=train_augmentator,
                          with_segmentation=True,
                          points_from_one_object=train_proposals,
                          input_transform=model_cfg.input_transform,
                          epoch_len=10000)

    valset = ToyDataset(args.dataset_path,
                        split='test',
                        augmentator=None,
                        num_points=num_points,
                        with_segmentation=True,
                        points_from_one_object=train_proposals,
                        input_transform=model_cfg.input_transform)

    optimizer_params = {
        'learning_rate': 5e-4,
        'beta1': 0.9,
        'beta2': 0.999,
        'epsilon': 1e-8
    }

    lr_scheduler = partial(LRScheduler,
                           mode='cosine',
                           baselr=optimizer_params['learning_rate'],
                           nepochs=num_epochs)

    trainer = AdaptISTrainer(
        args,
        model,
        model_cfg,
        loss_cfg,
        trainset,
        valset,
        optimizer='adam',
        optimizer_params=optimizer_params,
        lr_scheduler=lr_scheduler,
        checkpoint_interval=40 if not train_proposals else 5,
        image_dump_interval=200 if not train_proposals else -1,
        train_proposals=train_proposals,
        hybridize_model=not train_proposals,
        metrics=[AdaptiveIoU()])

    logger.info(f'Starting Epoch: {start_epoch}')
    logger.info(f'Total Epochs: {num_epochs}')
    for epoch in range(start_epoch, num_epochs):
        trainer.training(epoch)
        trainer.validation(epoch)
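A hedged sketch of how this entry point is usually driven: the segmentation/instance branch is trained first, then the proposals head. get_model and add_exp_args are assumed helpers that are not shown in these examples:

model, model_cfg = get_model()                         # assumed helper, not shown here
args = init_experiment('toy', add_exp_args)            # see Example #2
train(model, model_cfg, args, train_proposals=False)   # segmentation + instance branches
train(model, model_cfg, args, train_proposals=True)    # proposals head only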
Example #6
def train(model, model_cfg, args, train_proposals, start_epoch=0):
    args.val_batch_size = args.batch_size
    args.input_normalization = model_cfg.input_normalization
    crop_size = model_cfg.crop_size

    loss_cfg = edict()
    loss_cfg.instance_loss = NormalizedFocalLossSigmoid(alpha=0.25, gamma=2)
    loss_cfg.instance_loss_weight = 1.0 if not train_proposals else 0.0

    if not train_proposals:
        num_epochs = 250
        num_points = 6

        loss_cfg.segmentation_loss = NormalizedFocalLossSoftmax(
            ignore_label=-1, gamma=1)
        loss_cfg.segmentation_loss_weight = 0.75
    else:
        num_epochs = 8
        num_points = 48

        loss_cfg.proposals_loss = AdaptISProposalsLossIoU(args.batch_size)
        loss_cfg.proposals_loss_weight = 1.0

    train_augmentator = Compose([
        HorizontalFlip(),
        ShiftScaleRotate(shift_limit=0.03,
                         scale_limit=0,
                         rotate_limit=(-3, 3),
                         border_mode=0,
                         p=0.75),
        PadIfNeeded(
            min_height=crop_size[0], min_width=crop_size[1], border_mode=0),
        RandomCrop(*crop_size),
        RandomBrightness(limit=(-0.25, 0.25), p=0.75),
        RandomContrast(limit=(-0.15, 0.4), p=0.75),
        RGBShift(r_shift_limit=10, g_shift_limit=10, b_shift_limit=10, p=0.75)
    ],
                                p=1.0)

    val_augmentator = Compose([
        PadIfNeeded(
            min_height=crop_size[0], min_width=crop_size[1], border_mode=0),
        RandomCrop(*crop_size)
    ],
                              p=1.0)

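    # Per-image random rescale factor in [0.85, 1.15]; the image_shape argument is not used here.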
    def scale_func(image_shape):
        return random.uniform(0.85, 1.15)

    trainset = CityscapesDataset(args.dataset_path,
                                 split='train',
                                 num_points=num_points,
                                 augmentator=train_augmentator,
                                 with_segmentation=True,
                                 points_from_one_object=train_proposals,
                                 input_transform=model_cfg.input_transform,
                                 min_object_area=80,
                                 sample_ignore_object_prob=0.025,
                                 keep_background_prob=0.05,
                                 image_rescale=scale_func,
                                 use_jpeg=False)

    valset = CityscapesDataset(args.dataset_path,
                               split='test',
                               augmentator=val_augmentator,
                               num_points=num_points,
                               with_segmentation=True,
                               points_from_one_object=train_proposals,
                               input_transform=model_cfg.input_transform,
                               min_object_area=80,
                               image_rescale=scale_func,
                               use_jpeg=False)

    if not train_proposals:
        optimizer_params = {'learning_rate': 0.01, 'momentum': 0.9, 'wd': 1e-4}
        lr_scheduler = partial(LRScheduler,
                               mode='poly',
                               baselr=optimizer_params['learning_rate'],
                               nepochs=num_epochs)
    else:
        optimizer_params = {
            'learning_rate': 5e-4,
            'beta1': 0.9,
            'beta2': 0.999,
            'epsilon': 1e-8
        }
        lr_scheduler = partial(LRScheduler,
                               mode='cosine',
                               baselr=optimizer_params['learning_rate'],
                               nepochs=num_epochs)

    trainer = AdaptISTrainer(
        args,
        model,
        model_cfg,
        loss_cfg,
        trainset,
        valset,
        optimizer='sgd' if not train_proposals else 'adam',
        optimizer_params=optimizer_params,
        lr_scheduler=lr_scheduler,
        checkpoint_interval=40 if not train_proposals else 2,
        image_dump_interval=100 if not train_proposals else -1,
        train_proposals=train_proposals,
        hybridize_model=not train_proposals,
        metrics=[AdaptiveIoU()])

    logger.info(f'Starting Epoch: {start_epoch}')
    logger.info(f'Total Epochs: {num_epochs}')
    for epoch in range(start_epoch, num_epochs):
        trainer.training(epoch)
        trainer.validation(epoch)
Example #7
    def __init__(self,
                 args,
                 model,
                 model_cfg,
                 loss_cfg,
                 trainset,
                 valset,
                 optimizer_params,
                 optimizer='adam',
                 image_dump_interval=200,
                 checkpoint_interval=10,
                 tb_dump_period=25,
                 lr_scheduler=None,
                 metrics=None,
                 additional_val_metrics=None,
                 train_proposals=False,
                 hybridize_model=True):
        self.args = args
        self.model_cfg = model_cfg
        self.loss_cfg = loss_cfg
        self.val_loss_cfg = deepcopy(loss_cfg)
        self.tb_dump_period = tb_dump_period

        if metrics is None:
            metrics = []
        self.train_metrics = metrics
        self.val_metrics = deepcopy(metrics)
        if additional_val_metrics is not None:
            self.val_metrics.extend(additional_val_metrics)

        self.hybridize_model = hybridize_model
        self.checkpoint_interval = checkpoint_interval
        self.train_proposals = train_proposals
        self.task_prefix = ''

        self.trainset = trainset
        self.valset = valset

        self.train_data = gluon.data.DataLoader(
            trainset,
            args.batch_size,
            shuffle=True,
            last_batch='rollover',
            batchify_fn=get_dict_batchify_fn(args.workers),
            thread_pool=args.thread_pool,
            num_workers=args.workers)

        self.val_data = gluon.data.DataLoader(valset,
                                              args.val_batch_size,
                                              batchify_fn=get_dict_batchify_fn(
                                                  args.workers),
                                              last_batch='rollover',
                                              thread_pool=args.thread_pool,
                                              num_workers=args.workers)

        logger.info(model)
        model.cast(args.dtype)
        model.collect_params().reset_ctx(ctx=args.ctx)

        self.net = model
        self.evaluator = None
        if args.weights is not None:
            if os.path.isfile(args.weights):
                model.load_parameters(args.weights,
                                      ctx=args.ctx,
                                      allow_missing=True)
                args.weights = None
            else:
                raise RuntimeError(
                    f"=> no checkpoint found at '{args.weights}'")

        self.lr_scheduler = None
        if lr_scheduler is not None:
            self.lr_scheduler = lr_scheduler(niters=len(self.train_data))
            optimizer_params['lr_scheduler'] = self.lr_scheduler

        kv = mx.kv.create(args.kvstore)
        if not train_proposals:
            train_params = self.net.collect_params()
        else:
            train_params = self.net.proposals_head.collect_params()
            self.task_prefix = 'proposals'

        self.trainer = gluon.Trainer(train_params,
                                     optimizer,
                                     optimizer_params,
                                     kvstore=kv,
                                     update_on_kvstore=len(args.ctx) > 1)

        self.tqdm_out = TqdmToLogger(logger, level=logging.INFO)
        if args.input_normalization:
            self.denormalizator = DeNormalize(args.input_normalization['mean'],
                                              args.input_normalization['std'])
        else:
            self.denormalizator = lambda x: x

        self.sw = None
        self.image_dump_interval = image_dump_interval
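The Gluon update loop itself is not part of this snippet. For reference, the usual gluon.Trainer pattern looks roughly like this, where trainer, train_data, net and args stand for the objects built above and compute_loss is a placeholder for the model's forward pass and loss computation:

from mxnet import autograd

for batch in train_data:
    with autograd.record():
        loss = compute_loss(net, batch)    # placeholder for the model's loss
    loss.backward()
    trainer.step(args.batch_size)          # gluon.Trainer applies the optimizer update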