示例#1
0
def main():
    """Entry point of the ``mvpnet_3d`` test script.

    Parses the command line, merges the chosen config file and extra options
    into the task config, resolves (and creates if missing) the output
    directory, sets up logging, and finally hands over to ``test``.
    """
    args = parse_args()

    # The config objects are imported inside the function on purpose:
    # importing on-the-fly avoids overwriting cfg.
    from common.config import purge_cfg
    from mvpnet.config.mvpnet_3d import cfg
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    # Expand the '@' placeholder of OUTPUT_DIR with the config path (extension
    # stripped, 'configs' swapped for 'outputs'). An empty OUTPUT_DIR skips
    # this step entirely.
    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        config_stem, _ = osp.splitext(args.config_file)
        mirrored_path = config_stem.replace('configs', 'outputs')
        output_dir = output_dir.replace('@', mirrored_path)
        if not osp.isdir(output_dir):
            warnings.warn('Make a new directory: {}'.format(output_dir))
            os.makedirs(output_dir)

    # A run is identified by "<timestamp>.<hostname>".
    run_name = '{:s}.{:s}'.format(time.strftime('%m-%d_%H-%M-%S'),
                                  socket.gethostname())

    log_comment = 'test.{:s}'.format(run_name)
    logger = setup_logger('mvpnet', output_dir, comment=log_comment)
    gpu_count = torch.cuda.device_count()
    logger.info('{:d} GPUs available'.format(gpu_count))
    logger.info(args)

    from common.utils.misc import collect_env_info
    env_report = collect_env_info()
    logger.info('Collecting env info (might take some time)\n' + env_report)

    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # This script only knows how to test the 3D MVPNet task.
    assert cfg.TASK == 'mvpnet_3d'
    test(cfg, args, output_dir, run_name)
示例#2
0
def main():
    """Entry point of the ``sem_seg_2d`` training script.

    Parses the command line, merges the config file given by
    ``args.config_file`` and the extra ``args.opts`` overrides into the task
    config, resolves and creates the output directory, sets up logging, and
    finally hands over to ``train``.

    Raises:
        AssertionError: if the merged config does not describe the
            ``sem_seg_2d`` task.
    """
    args = parse_args()

    # load the configuration
    # import on-the-fly to avoid overwriting cfg
    from common.config import purge_cfg
    from mvpnet.config.sem_seg_2d import cfg
    # Load the file selected on the command line rather than a
    # machine-specific absolute path, so the script is portable and the
    # "Loaded configuration file" log line below reports the file really used.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    # replace '@' with config path
    if output_dir:
        # Strip the extension so the output directory is named after the
        # config (e.g. '.../unet_resnet34'), not after the '.yaml' file.
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace('@', config_path.replace('configs', 'outputs'))
        if osp.isdir(output_dir):
            warnings.warn('Output directory exists.')
        os.makedirs(output_dir, exist_ok=True)

    # run name
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    logger = setup_logger('mvpnet', output_dir, comment='train.{:s}'.format(run_name))
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.misc import collect_env_info
    logger.info('Collecting env info (might take some time)\n' + collect_env_info())

    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # This script only knows how to train the 2D semantic segmentation task.
    assert cfg.TASK == 'sem_seg_2d'
    train(cfg, output_dir, run_name)
示例#3
0
def main():
    """Run a full training session driven by the global ``cfg``.

    Steps, in order:

    1. Parse the command line, merge the config file and the extra options
       into ``cfg`` and freeze it.
    2. Resolve the output directory: '@' in ``cfg.OUTPUT_DIR`` is replaced by
       the config file path (extension stripped, 'configs' swapped for
       'outputs'); in dev mode a per-run sub-directory is appended.
    3. Build the model, optimizer, LR scheduler, checkpointer (which may
       restore a previous state) and the data loaders.
    4. Iterate over the training dataloader; log, write summaries, validate
       and checkpoint at the periods configured in ``cfg``.

    The model and every batch are moved to CUDA unconditionally, so a GPU is
    required.
    """
    # ---------------------------------------------------------------------------- #
    # Setup the experiment
    # ---------------------------------------------------------------------------- #
    args = parse_args()

    # load the configuration
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    # run name
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    output_dir = cfg.OUTPUT_DIR
    # replace '@' with config path
    # An empty OUTPUT_DIR skips directory creation here and tensorboard
    # logging below; it is still passed as-is to the logger and checkpointer.
    if output_dir:
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace(
            '@', config_path.replace('configs', 'outputs'))
        if args.dev:
            # Dev mode: write into a fresh sub-directory named after this run.
            output_dir = osp.join(output_dir, run_name)
            warnings.warn('Dev mode enabled.')
        if osp.isdir(output_dir):
            warnings.warn('Output directory exists.')
        os.makedirs(output_dir, exist_ok=True)

    logger = setup_logger('train',
                          output_dir,
                          filename='log.train.{:s}.txt'.format(run_name))
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.collect_env import collect_env_info
    logger.info('Collecting env info (might take some time)\n' +
                collect_env_info())

    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # ---------------------------------------------------------------------------- #
    # Build models, optimizer, scheduler, checkpointer, etc.
    # ---------------------------------------------------------------------------- #
    # build model
    set_random_seed(cfg.RNG_SEED)
    model = build_model(cfg)
    logger.info('Build model:\n{}'.format(str(model)))

    # Currently only support single-gpu mode
    model = model.cuda()

    # build optimizer
    optimizer = build_optimizer(cfg, model)

    # build lr scheduler
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    # build checkpointer
    # Note that checkpointer will load state_dict of model, optimizer and scheduler.
    checkpointer = CheckpointerV2(model,
                                  optimizer=optimizer,
                                  scheduler=lr_scheduler,
                                  save_dir=output_dir,
                                  logger=logger,
                                  max_to_keep=cfg.TRAIN.MAX_TO_KEEP)
    checkpoint_data = checkpointer.load(cfg.RESUME_PATH,
                                        resume=cfg.AUTO_RESUME,
                                        resume_states=cfg.RESUME_STATES,
                                        strict=cfg.RESUME_STRICT)
    ckpt_period = cfg.TRAIN.CHECKPOINT_PERIOD
    # Training starts at 0 unless the loaded checkpoint data carries an
    # 'iteration' entry.
    start_iter = checkpoint_data.get('iteration', 0)

    # build data loader
    # Reset the random seed again in case the initialization of models changes the random state.
    set_random_seed(cfg.RNG_SEED)
    train_dataloader = build_gnn_dataloader(cfg, True, start_iter)
    logger.info(train_dataloader.dataset)

    # build metrics
    train_meters = MetricLogger(delimiter='  ')

    def setup_train():
        # Switch the model to training mode and clear the training meters.
        model.train()
        train_meters.reset()

    # Build tensorboard logger
    summary_writer = None
    if output_dir:
        tb_dir = output_dir
        summary_writer = SummaryWriter(tb_dir, max_queue=64, flush_secs=30)

    # ---------------------------------------------------------------------------- #
    # Setup validation
    # ---------------------------------------------------------------------------- #
    # A non-positive VAL.PERIOD disables validation altogether; in that case
    # val_dataloader, val_meters, best_metric and setup_validate are never
    # defined, and every later use is guarded by do_validation.
    val_period = cfg.VAL.PERIOD
    do_validation = val_period > 0
    if do_validation:
        val_dataloader = build_gnn_dataloader(cfg, training=False)
        logger.info(val_dataloader.dataset)
        val_meters = MetricLogger(delimiter='  ')

        # The best value seen so far is restored from the checkpoint data
        # when present (None otherwise).
        best_metric_name = 'best_{}'.format(cfg.VAL.METRIC)
        best_metric = checkpoint_data.get(best_metric_name, None)

        def setup_validate():
            # Switch the model to evaluation mode and clear the validation meters.
            model.eval()
            val_meters.reset()

    # ---------------------------------------------------------------------------- #
    # Training begins.
    # ---------------------------------------------------------------------------- #
    setup_train()
    max_iter = cfg.TRAIN.MAX_ITER
    logger.info('Start training from iteration {}'.format(start_iter))
    tic = time.time()

    # enumerate() starts counting at start_iter so a resumed run keeps the
    # global iteration index.
    # NOTE(review): the loop only ends when train_dataloader is exhausted;
    # presumably build_gnn_dataloader sizes it from cfg.TRAIN.MAX_ITER and
    # start_iter — confirm.
    for iteration, data_batch in enumerate(train_dataloader, start_iter):
        cur_iter = iteration + 1  # 1-based iteration used for logs and periods
        data_time = time.time() - tic

        # copy data from cpu to gpu
        data_batch = data_batch.to('cuda')

        # forward
        pd_dict = model(data_batch)

        # update losses
        loss_dict = model.compute_losses(
            pd_dict,
            data_batch,
        )
        # The optimized objective is the plain sum of all loss terms.
        total_loss = sum(loss_dict.values())

        # It is slightly faster to update metrics and meters before backward
        with torch.no_grad():
            train_meters.update(total_loss=total_loss, **loss_dict)
            model.update_metrics(pd_dict, data_batch, train_meters.metrics)

        # backward
        optimizer.zero_grad()
        total_loss.backward()
        if cfg.OPTIMIZER.MAX_GRAD_NORM > 0:
            # CAUTION: built-in clip_grad_norm_ clips the total norm.
            total_norm = clip_grad_norm_(model.parameters(),
                                         max_norm=cfg.OPTIMIZER.MAX_GRAD_NORM)
        else:
            # No clipping: the grad_norm summary below is skipped.
            total_norm = None
        optimizer.step()

        batch_time = time.time() - tic
        train_meters.update(time=batch_time, data=data_time)

        # log
        # Logged every LOG_PERIOD iterations and additionally on the very
        # first iteration.
        log_period = cfg.TRAIN.LOG_PERIOD
        if log_period > 0 and (cur_iter % log_period == 0 or cur_iter == 1):
            logger.info(
                train_meters.delimiter.join([
                    'iter: {iter:4d}',
                    '{meters}',
                    'lr: {lr:.2e}',
                    'max mem: {memory:.0f}',
                ]).format(
                    iter=cur_iter,
                    meters=str(train_meters),
                    lr=optimizer.param_groups[0]['lr'],
                    memory=torch.cuda.max_memory_allocated() / (1024.0**2),
                ))

        # summary
        summary_period = cfg.TRAIN.SUMMARY_PERIOD
        if summary_writer is not None and (summary_period > 0
                                           and cur_iter % summary_period == 0):
            # Only metrics whose name contains one of these keywords are
            # written to tensorboard.
            keywords = (
                'loss',
                'acc',
            )
            for name, metric in train_meters.metrics.items():
                if all(k not in name for k in keywords):
                    continue
                summary_writer.add_scalar('train/' + name,
                                          metric.result,
                                          global_step=cur_iter)

            # summarize gradient norm
            if total_norm is not None:
                summary_writer.add_scalar('grad_norm',
                                          total_norm,
                                          global_step=cur_iter)

        # ---------------------------------------------------------------------------- #
        # validate for one epoch
        # ---------------------------------------------------------------------------- #
        # Validation runs every val_period iterations and at max_iter.
        if do_validation and (cur_iter % val_period == 0
                              or cur_iter == max_iter):
            setup_validate()
            logger.info('Validation begins at iteration {}.'.format(cur_iter))

            start_time_val = time.time()
            # NOTE: the validation loop rebinds the outer `tic` and
            # `data_batch`; `tic` is reset again at the end of the training step.
            tic = time.time()
            for iteration_val, data_batch in enumerate(val_dataloader):
                data_time = time.time() - tic

                # copy data from cpu to gpu
                data_batch = data_batch.to('cuda')

                # forward
                with torch.no_grad():
                    pd_dict = model(data_batch)

                # update losses and metrics
                # NOTE(review): losses are computed outside the no_grad block;
                # pd_dict was produced under no_grad, but confirm that
                # compute_losses does not build a graph on its own.
                loss_dict = model.compute_losses(pd_dict, data_batch)
                total_loss = sum(loss_dict.values())

                # update metrics and meters
                val_meters.update(loss=total_loss, **loss_dict)
                model.update_metrics(pd_dict, data_batch, val_meters.metrics)

                batch_time = time.time() - tic
                val_meters.update(time=batch_time, data=data_time)
                tic = time.time()

                if cfg.VAL.LOG_PERIOD > 0 and iteration_val % cfg.VAL.LOG_PERIOD == 0:
                    # NOTE(review): 'iter' below is the outer training
                    # `iteration`, not `iteration_val` — confirm this is intended.
                    logger.info(
                        val_meters.delimiter.join([
                            'iter: {iter:4d}',
                            '{meters}',
                            'max mem: {memory:.0f}',
                        ]).format(
                            iter=iteration,
                            meters=str(val_meters),
                            memory=torch.cuda.max_memory_allocated() /
                            (1024.0**2),
                        ))

            # END: validation loop
            epoch_time_val = time.time() - start_time_val
            logger.info('Iteration[{}]-Val {}  total_time: {:.2f}s'.format(
                cur_iter, val_meters.summary_str, epoch_time_val))

            # summary
            if summary_writer is not None:
                keywords = ('loss', 'acc', 'ap', 'recall')
                for name, metric in val_meters.metrics.items():
                    if all(k not in name for k in keywords):
                        continue
                    summary_writer.add_scalar('val/' + name,
                                              metric.result,
                                              global_step=cur_iter)

            # best validation
            # A new best is recorded when there is no previous best, or when
            # the metric moved in the direction selected by VAL.METRIC_ASCEND
            # (strictly greater if ascending, strictly smaller otherwise).
            if cfg.VAL.METRIC in val_meters.metrics:
                cur_metric = val_meters.metrics[cfg.VAL.METRIC].result
                if best_metric is None \
                        or (cfg.VAL.METRIC_ASCEND and cur_metric > best_metric) \
                        or (not cfg.VAL.METRIC_ASCEND and cur_metric < best_metric):
                    best_metric = cur_metric
                    checkpoint_data['iteration'] = cur_iter
                    checkpoint_data[best_metric_name] = best_metric
                    checkpointer.save('model_best',
                                      tag=False,
                                      **checkpoint_data)

            # restore training
            setup_train()

        # ---------------------------------------------------------------------------- #
        # After validation
        # ---------------------------------------------------------------------------- #
        # checkpoint
        # Saved every ckpt_period iterations and always at max_iter.
        if (ckpt_period > 0
                and cur_iter % ckpt_period == 0) or cur_iter == max_iter:
            checkpoint_data['iteration'] = cur_iter
            if do_validation and best_metric is not None:
                checkpoint_data[best_metric_name] = best_metric
            checkpointer.save('model_{:06d}'.format(cur_iter),
                              **checkpoint_data)

        # ---------------------------------------------------------------------------- #
        # Finalize one step
        # ---------------------------------------------------------------------------- #
        # since pytorch v1.1.0, lr_scheduler is called after optimization.
        if lr_scheduler is not None:
            lr_scheduler.step()
        # Restart the timer so data_time of the next step measures loading only.
        tic = time.time()

    # END: training loop
    if do_validation and cfg.VAL.METRIC:
        logger.info('Best val-{} = {}'.format(cfg.VAL.METRIC, best_metric))
def main():
    """Generate box proposals for a whole dataset with a trained model.

    The function merges the command-line config into the global ``cfg``,
    restores a checkpoint (an explicit ``--ckpt_path`` or the last one found
    by the checkpointer), runs the model over the validation dataset without
    its ``start``/``end`` limits, optionally saves visualizations of the
    first samples, and finally pickles the list of per-image predictions
    (``{'boxes': ..., 'scores': ...}``) to ``args.output_dir``.

    Raises:
        ValueError: if ``cfg.DATASET.NAME`` is not a supported dataset.
    """
    # ---------------------------------------------------------------------------- #
    # Experiment setup
    # ---------------------------------------------------------------------------- #
    args = parse_args()

    # Merge file and command-line options into the global config, then lock it.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    # Expand the '@' placeholder of OUTPUT_DIR with the config path (extension
    # stripped, 'configs' swapped for 'outputs').
    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        config_stem, _ = osp.splitext(args.config_file)
        mirrored_path = config_stem.replace('configs', 'outputs')
        output_dir = output_dir.replace('@', mirrored_path)
        if not osp.isdir(output_dir):
            warnings.warn('Make a new directory: {}'.format(output_dir))
            os.makedirs(output_dir)

    # A run is identified by "<timestamp>.<hostname>".
    run_name = '{:s}.{:s}'.format(time.strftime('%m-%d_%H-%M-%S'),
                                  socket.gethostname())

    # The logger is given an empty directory string here, not output_dir.
    log_filename = f'log.predict.{run_name}.txt'
    logger = setup_logger('predict', '', filename=log_filename)
    gpu_count = torch.cuda.device_count()
    logger.info('{:d} GPUs available'.format(gpu_count))
    logger.info(args)

    from common.utils.collect_env import collect_env_info
    env_report = collect_env_info()
    logger.info('Collecting env info (might take some time)\n' + env_report)

    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # ---------------------------------------------------------------------------- #
    # Model and dataset
    # ---------------------------------------------------------------------------- #
    batch_size = args.batch_size
    det_thresh = args.det_thresh
    vis_first_n = args.vis_first_n
    vis_thresh = args.vis_thresh

    # Build the network, move it to the GPU and switch to evaluation mode.
    model = build_model(cfg)
    logger.info('Build model:\n{}'.format(str(model)))
    model = model.cuda()
    model.eval()

    # Restore weights: an explicit checkpoint path wins ('@' expands to the
    # output directory); otherwise let the checkpointer resume the last one.
    checkpointer = CheckpointerV2(model, save_dir=output_dir, logger=logger)
    if not args.ckpt_path:
        checkpointer.load(None, resume=True)
    else:
        checkpointer.load(args.ckpt_path.replace('@', output_dir),
                          resume=False)

    # Proposals are generated for the whole dataset, so the 'start'/'end'
    # limits of the validation split are dropped.
    dataset_kwargs = dict(cfg.DATASET.VAL)
    for dropped_key in ('start', 'end'):
        dataset_kwargs.pop(dropped_key, None)

    if cfg.DATASET.NAME != 'FallingDigit':
        raise ValueError('Unsupported dataset: {}.'.format(cfg.DATASET.NAME))
    from space.datasets.falling_digit import FallingDigit
    if args.data_path is not None:
        dataset_kwargs['path'] = args.data_path
    dataset = FallingDigit(to_tensor=True, **dataset_kwargs)

    # Sequential, complete pass: batch position maps directly to dataset index.
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            drop_last=False,
                            num_workers=1)

    predictions = []

    # Visualizations go to "<output_dir>/vis" when requested.
    vis_dir = None
    if vis_first_n is not None:
        vis_dir = osp.join(output_dir, 'vis')
        os.makedirs(vis_dir, exist_ok=True)

    # ---------------------------------------------------------------------------- #
    # Inference
    # ---------------------------------------------------------------------------- #
    for batch_idx, data_batch in enumerate(dataloader):
        # Move every tensor of the batch to the GPU.
        data_batch = {
            key: tensor.cuda(non_blocking=True)
            for key, tensor in data_batch.items()
        }

        with torch.no_grad():
            preds = model(data_batch, fast=True)

        boxes = preds['boxes'].cpu().numpy()  # (b, A * h1 * w1, 4)
        z_pres_p = preds['z_pres_p'].cpu().numpy()  # (b, A * h1 * w1)

        batch_offset = batch_idx * batch_size
        for sample_idx, boxes_per_image in enumerate(boxes):
            boxes_per_image = boxes_per_image.reshape(-1, 4)
            scores_per_image = z_pres_p[sample_idx]

            # Keep every box when no detection threshold is given.
            valid_mask = (scores_per_image >= det_thresh
                          if det_thresh is not None else np.ones_like(
                              scores_per_image, dtype=bool))

            predictions.append({
                'boxes': boxes_per_image[valid_mask],
                'scores': scores_per_image[valid_mask],
            })

            # Visualize all samples (vis_first_n == -1) or only the first
            # vis_first_n of them.
            data_index = batch_offset + sample_idx
            should_visualize = vis_first_n is not None and (
                vis_first_n == -1 or data_index < vis_first_n)
            if not should_visualize:
                continue

            data = dataset.data[data_index]
            image = data['image'] if 'image' in data else data['original_image']

            vis_path = osp.join(vis_dir, '{:06d}.png'.format(data_index))
            vis_mask = scores_per_image >= vis_thresh
            score_labels = [
                '{:.2f}'.format(score) for score in scores_per_image[vis_mask]
            ]
            plot_results(image,
                         boxes_per_image[vis_mask],
                         labels=score_labels,
                         save_path=vis_path)

        if args.log_period > 0 and batch_idx % args.log_period == 0:
            print(batch_idx, '/', len(dataloader))

    # ---------------------------------------------------------------------------- #
    # Save the proposals
    # ---------------------------------------------------------------------------- #
    output_filename = args.output_filename
    if output_filename is None:
        # Default name is derived from the dataset file name.
        output_filename = 'proposals_' + osp.basename(dataset.path)
    proposals_path = osp.join(args.output_dir, output_filename)
    with open(proposals_path, 'wb') as f:
        pickle.dump(predictions, f, protocol=pickle.HIGHEST_PROTOCOL)