Example #1
def single_gpu_test(model, dataset, show=False):
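    """Run the model over every image in the dataset on a single GPU,
    collect per-image bbox results, and evaluate them at the end."""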
    # create a loader for this runner
    tf_dataset, num_examples = build_dataloader(dataset,
                                                1,
                                                1,
                                                num_gpus=1,
                                                dist=False)
    results = []
    start = time.time()
    for i, data_batch in enumerate(tf_dataset):
        if i >= num_examples:
            break
        _, img_meta = data_batch
        print(dataset.img_ids[i])
        outputs = model(data_batch, training=False)
        bboxes = outputs['bboxes']
        # map boxes back to original scale
        bboxes = transforms.bbox_mapping_back(bboxes, img_meta)
        labels = outputs['labels']
        scores = outputs['scores']
        result = transforms.bbox2result(bboxes, labels, scores, num_classes=81)
        results.append(result)
    print("Forward pass through test set took {}s".format(time.time() - start))
    evaluate(dataset, results)
    return results
def main(cfg):
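    """Training entry point: build the data loader, model, and runner,
    register the standard hooks, and run the configured workflow."""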
    ######################################################################################
    # Create Training Data
    ######################################################################################
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]
    ######################################################################################
    # Build Model
    ######################################################################################
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass example through so tensor shapes are defined
    model.CLASSES = datasets.CLASSES
    _ = model(next(iter(tf_datasets[0][0])))
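    # load pretrained weights into the innermost nested Keras model
    # (presumably the backbone weights supplied via cfg.weights_path)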
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)
    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=cfg.optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time.time())
    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=None))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          interval=10,
                                          image_interval=100,
                                          s3_dir=None))
    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
Example #3
 def after_train_epoch(self, runner):
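     """Distributed evaluation hook: every `interval` epochs each rank runs
     inference on its shard of the validation set, non-zero ranks dump their
     results to temp files, and the shards are merged after a Horovod barrier."""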
     if not self.every_n_epochs(runner, self.interval):
         return
     # create a loader for this runner
     tf_dataset, num_examples = build_dataloader(self.dataset,
                                                 1,
                                                 1,
                                                 num_gpus=runner.local_size,
                                                 dist=True)
     # REVISIT - may require a lot of memory
     results = [None for _ in range(num_examples * runner.local_size)]
     if self.dataset.mask:
         masks = [None for _ in range(num_examples * runner.local_size)]
     if runner.rank == 0:
         prog_bar = ProgressBar(num_examples)
     for i, data_batch in enumerate(tf_dataset):
         if i >= num_examples:
             break
         _, img_meta = data_batch
         outputs = runner.model(data_batch, training=False)
         assert isinstance(outputs, dict)
         bboxes = outputs['bboxes']
         # map boxes back to original scale
         bboxes = transforms.bbox_mapping_back(bboxes, img_meta)
         labels = outputs['labels']
         scores = outputs['scores']
         result = transforms.bbox2result(
             bboxes, labels, scores,
             num_classes=self.dataset.CLASSES + 1)  # add background class
         if self.dataset.mask:
             mask = mask2result(outputs['masks'], labels, img_meta[0])
             results[i * runner.local_size + runner.local_rank] = (result,
                                                                   mask)
         else:
             results[i * runner.local_size + runner.local_rank] = result
         if runner.rank == 0:
             prog_bar.update()
     # write to a file
     tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(runner.rank))
     if runner.rank != 0:
         dump(results, tmp_file)
     # MPI barrier through horovod
     _ = get_barrier()
     self._accumulate_results(runner, results, num_examples)
Example #4
def _dist_train(model,
                dataset,
                cfg,
                num_gpus=1,
                mixed_precision=False,
                validate=False,
                logger=None,
                timestamp=None):
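    """Distributed training loop: build per-GPU data loaders and an optimizer
    (optionally wrapped with the mixed-precision graph rewrite), then drive
    training through a Runner with the standard hooks registered."""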
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    tf_datasets = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         1,
                         num_gpus=num_gpus,
                         dist=True) for ds in dataset
    ]

    # build runner
    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    gradient_clip = optimizer_config.get('gradient_clip', 15.0) # default is 15.0

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision,
                    gradient_clip=gradient_clip)
 
    runner.timestamp = timestamp
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)

    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
Example #5
def _dist_train(model,
                dataset,
                cfg,
                num_gpus=1,
                mixed_precision=False,
                validate=False,
                logger=None,
                timestamp=None):
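    """Distributed training loop; here the COCO mAP eval hook is registered
    only on the ranks of node 0 (runner.rank < runner.local_size)."""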
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    tf_datasets = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         1,
                         num_gpus=num_gpus,
                         dist=True) for ds in dataset
    ]

    # build runner
    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        # broken in TF 2.1
        # optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision)
    # workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register eval hooks
    if validate and runner.rank < runner.local_size:  # register this dist eval hook only for Node 0
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)

    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
Example #6
 def __init__(self,
              dataset_cfg,
              interval=1000,
              threshold=0.75,
              figsize=(8, 8),
              top_k=10):
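     """Visualization hook setup: builds a single-image, non-distributed
     loader over the dataset and keeps it as an endlessly repeating,
     shuffled, prefetched iterator."""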
     self.dataset = datasets.build_dataset(dataset_cfg)
     self.tf_dataset, self.num_examples = datasets.build_dataloader(
         self.dataset, 1, 1, num_gpus=1, dist=False)
     self.tf_dataset = iter(
         self.tf_dataset.prefetch(16).shuffle(4).repeat())
     self.interval = interval
     self.img_mean = dataset_cfg.mean
     self.threshold = threshold
     self.figsize = figsize
     self.top_k = top_k
     self.threads = ThreadPoolExecutor()
Example #7
def _non_dist_train(model,
                    dataset,
                    cfg,
                    mixed_precision=False,
                    validate=False,
                    logger=None,
                    timestamp=None):
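    """Single-process training loop; built-in validation is not supported
    here, so `validate=True` raises NotImplementedError."""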
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in non-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    tf_datasets = [
        build_dataloader(ds, cfg.data.imgs_per_gpu, 1, dist=False)
        for ds in dataset
    ]

    # build runner
    optimizer = build_optimizer(cfg.optimizer)
    # broken in TF2.1
    # if mixed_precision:
    #     optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 1024.0) # "dynamic")

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger)
    # workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
Example #8
 def __init__(self,
              dataset_cfg,
              interval=1000,
              threshold=0.75,
              figsize=(8, 8),
              top_k=10,
              run_on_sagemaker=False):
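     """Same visualization hook setup, with a SageMaker path: when
     `run_on_sagemaker` is set, the dataset root is rewritten to the COCO
     data channel (SM_CHANNEL_COCO) before the loader is built."""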
     if run_on_sagemaker:
         # update paths for SM
         import os, pathlib
         data_root = pathlib.Path(
             os.getenv('SM_CHANNEL_COCO')).joinpath('coco').as_posix()
         dataset_cfg['dataset_dir'] = data_root
     self.dataset = datasets.build_dataset(dataset_cfg)
     self.tf_dataset, self.num_examples = datasets.build_dataloader(
         self.dataset, 1, 1, num_gpus=1, dist=False)
     self.tf_dataset = iter(
         self.tf_dataset.prefetch(16).shuffle(4).repeat())
     self.interval = interval
     self.img_mean = dataset_cfg.mean
     self.threshold = threshold
     self.figsize = figsize
     self.top_k = top_k
     self.threads = ThreadPoolExecutor()
Example #9
def main(cfg):
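    """End-to-end training entry point: decompress the data, build loaders,
    model, LR schedule and optimizer, then train via the SageMaker runner."""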
    decompress_data(cfg)
    ######################################################################################
    # Create Training Data
    ######################################################################################
    cfg.global_batch_size = cfg.batch_size_per_device * hvd.size()
    cfg.steps_per_epoch = cfg.coco_images // cfg.global_batch_size

    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]
    ######################################################################################
    # Build Model
    ######################################################################################

    # update any hyperparameters passed in via arguments
    if cfg.ls > 0.0:
        cfg.model['bbox_head']['label_smoothing'] = cfg.ls
    if cfg.use_rcnn_bn:
        cfg.model['bbox_head']['use_bn'] = cfg.use_rcnn_bn
    if cfg.use_conv:
        cfg.model['bbox_head']['use_conv'] = cfg.use_conv

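    # NOTE: assumes a module-level `args` (e.g. an argparse namespace) is in scope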
    cfg.schedule = args.schedule
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass example through so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)

    ######################################################################################
    # Build optimizer and associate scheduler
    ######################################################################################

    # base learning rate is set for global batch size of 8, with linear scaling for larger batches
    base_learning_rate = cfg.base_learning_rate
    scaled_learning_rate = base_learning_rate * cfg.global_batch_size / 8
    steps_per_epoch = cfg.steps_per_epoch
    if cfg.schedule == '1x':
        scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            [steps_per_epoch * 8, steps_per_epoch * 10], [
                scaled_learning_rate, scaled_learning_rate * 0.1,
                scaled_learning_rate * 0.01
            ])
    elif cfg.schedule == 'cosine':
        scheduler = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=scaled_learning_rate,
            first_decay_steps=12 * steps_per_epoch,
            t_mul=1,
            m_mul=1)  #0-1-13
    else:
        raise NotImplementedError
    warmup_init_lr = 1.0 / cfg.warmup_init_lr_scale * scaled_learning_rate
    scheduler = WarmupScheduler(scheduler, warmup_init_lr, cfg.warmup_steps)
    # FIXME: currently hardcoded to SGD
    optimizer = tf.keras.optimizers.SGD(scheduler,
                                        momentum=0.9,
                                        nesterov=False)
    if cfg.fp16:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())
    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=cfg.s3_checkpoints,
                                  h5=True))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          image_interval=100,
                                          s3_dir=cfg.s3_tensorboard))
    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)