Example #1
def collect_results(result_part, size, tmpdir=None):
    results_out = {}
    # merge the per-batch tensors into one array per key
    for k in result_part[0].keys():
        results_out[k] = np.concatenate(
            [batch[k].numpy() for batch in result_part], axis=0)
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is the ASCII space character, used as padding
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(bytearray(tmpdir.encode()),
                                  dtype=torch.uint8,
                                  device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        commons.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    commons.dump(results_out, os.path.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = os.path.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(commons.load(part_file))
        # regroup the per-rank results by key
        ordered_results = defaultdict(list)
        out_dict = defaultdict(list)
        for res in part_list:
            for k in part_list[0].keys():
                out_dict[k].append(res[k])

        for k in part_list[0].keys():
            # interleave the per-rank arrays to restore dataset order
            for res in zip(*(out_dict[k])):
                ordered_results[k].extend(list(res))
            # the dataloader may pad some samples
            ordered_results[k] = ordered_results[k][:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
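
The string-to-tensor round trip used to broadcast the temp dir is easy to verify in isolation. Below is a minimal single-process sketch (CPU tensors, no dist calls; the path is hypothetical):

import torch

MAX_LEN = 512
tmpdir = '/tmp/tmpabc123'  # hypothetical; rank 0 would get this from mkdtemp()
# encode the path into a fixed-size uint8 buffer padded with spaces (ASCII 32)
buf = torch.full((MAX_LEN, ), 32, dtype=torch.uint8)
encoded = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8)
buf[:len(encoded)] = encoded
# ... dist.broadcast(buf, 0) would run here in the real function ...
decoded = buf.numpy().tobytes().decode().rstrip()  # drop the space padding
assert decoded == tmpdir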
Example #2
    def evaluate(self, results, logger=None, epoch=None, out_path=None, **kwargs):
        commons.mkdir_or_exist(out_path)
        tmp = {"results": results, "index": self.q_id_list}
        with open(os.path.join(out_path, "outputs_{}.pkl".format(epoch)), 'wb') as f:
            pickle.dump(tmp, f, protocol=4)
        ids = results["ids"]
        preds = results["pred"]

        out_list = []

        for idx, pred in zip(ids, preds):
            q_id = self.q_id_list[int(idx)]
            pred_index = np.argmax(pred, axis=0)
            answer = self.label2ans[pred_index]
            out_list.append({'question_id': q_id, 'answer': answer})

        commons.dump(out_list, os.path.join(out_path, "test_submit_{0}.json".format(str(epoch))))
        if logger is not None and logger != "silent":
            print_log("finished testing epoch {}".format(epoch), logger=logger)
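
The argmax-to-answer mapping in evaluate() is easy to exercise on toy data; a sketch with a hypothetical label vocabulary:

import numpy as np

label2ans = ['no', 'yes', 'maybe']    # hypothetical label vocabulary
pred = np.array([0.1, 0.7, 0.2])      # one prediction vector
pred_index = np.argmax(pred, axis=0)  # index of the highest score
assert label2ans[pred_index] == 'yes'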
Example #3
def collect_results_cpu(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is the ASCII space character, used as padding
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(bytearray(tmpdir.encode()),
                                  dtype=torch.uint8,
                                  device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        commons.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    commons.dump(result_part, os.path.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = os.path.join(tmpdir, f'part_{i}.pkl')
            part_list.append(commons.load(part_file))
        # reorder the results (interleaved across ranks)
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
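
The zip(*part_list) step relies on how a distributed sampler shards data: rank r gets samples r, r + world_size, r + 2 * world_size, ..., so round-robin interleaving of the per-rank lists restores dataset order. A runnable sketch with made-up numbers:

world_size, size = 3, 8
dataset = list(range(size))
# what each rank would produce; pad the short ranks the way a sampler would
parts = [dataset[r::world_size] for r in range(world_size)]
max_len = max(len(p) for p in parts)
parts = [p + p[:max_len - len(p)] for p in parts]
ordered = []
for res in zip(*parts):
    ordered.extend(list(res))
ordered = ordered[:size]  # drop the padded samples
assert ordered == dataset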
Example #4
def save_checkpoint(model, filename, optimizer=None, amp=None, meta=None):
    """Save checkpoint to file.

    The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
    ``optimizer`` (plus ``amp`` if AMP state is given). By default ``meta``
    will contain version and time info.

    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename.
        optimizer (:obj:`Optimizer` or dict, optional): Optimizer(s) to be
            saved.
        amp (dict, optional): AMP state to be saved under the ``amp`` key.
        meta (dict, optional): Metadata to be saved in checkpoint.
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    meta.update(torch_version=torch.__version__, time=time.asctime())

    commons.mkdir_or_exist(osp.dirname(filename))
    if is_module_wrapper(model):
        model = model.module

    checkpoint = {
        'meta': meta,
        'state_dict': weights_to_cpu(get_state_dict(model))
    }
    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        checkpoint['optimizer'] = {}
        for name, optim in optimizer.items():
            checkpoint['optimizer'][name] = optim.state_dict()
    if amp is not None:
        checkpoint['amp'] = amp
    # immediately flush buffer
    with open(filename, 'wb') as f:
        torch.save(checkpoint, f)
        f.flush()
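
Reading such a checkpoint back is the mirror image; a hedged sketch (the field names follow the structure written above, not a specific loader API):

import torch

def load_checkpoint_cpu(model, filename, optimizer=None):
    """Restore the fields written by save_checkpoint above."""
    checkpoint = torch.load(filename, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint.get('meta', {})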
Example #5
def main():
    args = parse_args()
    cfg = commons.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    # init distributed env first, since later steps depend on the dist info
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    # TODO: support multiple images per gpu (only minor changes are needed)
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(dataset,
                                   imgs_per_gpu=1,
                                   workers_per_gpu=cfg.data.workers_per_gpu,
                                   dist=distributed,
                                   shuffle=False)
    # build the model and load checkpoint
    model = build_model(cfg.model)
    check_item = args.checkpoint[0]
    checkpoint = load_checkpoint(model,
                                 os.path.join(
                                     cfg.work_dir,
                                     'epoch_' + str(check_item) + '.pth'),
                                 map_location='cpu')
    label2ans = dataset.label2ans

    gpu_id = dist.get_rank() % torch.cuda.device_count()
    torch.cuda.set_device(gpu_id)
    model = model.cuda()
    if cfg.fp_16.enable:
        model = amp.initialize(model,
                               opt_level=cfg.fp_16.opt_level,
                               loss_scale=cfg.fp_16.loss_scale,
                               max_loss_scale=cfg.fp_16.max_loss_scale)
        print('**** Initializing mixed precision done. ****')
    model = MMDistributedDataParallel(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
    )
    outputs = multi_gpu_test(model, data_loader, args.tmpdir)

    rank, _ = get_dist_info()
    if rank == 0:
        output_path = os.path.join(cfg.work_dir, "test_results")
        commons.mkdir_or_exist(output_path)
        out_list = []
        with open("outputs.pkl", 'wb') as f:
            pickle.dump(outputs, f)

        ids = outputs["ids"]
        preds = outputs["pred"]

        for idx, pred in zip(ids, preds):
            q_id = dataset.q_id_list[int(idx)]
            pred_index = np.argmax(pred, axis=0)
            answer = dataset.label2ans[pred_index]
            out_list.append({'question_id': q_id, 'answer': answer})

        print('\nwriting results to {}'.format(output_path))
        commons.dump(
            out_list,
            os.path.join(output_path,
                         "test_submit_{0}.json".format(str(check_item))))
        os.remove("outputs.pkl")
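
parse_args() is not shown; below is a hedged argparse sketch inferred from the flags the script reads (args.config, args.checkpoint, args.tmpdir, args.launcher; names and defaults are assumptions):

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='distributed testing')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('--checkpoint', nargs='+', type=int,
                        help='epoch number(s) to test; only the first is used above')
    parser.add_argument('--tmpdir', help='tmp dir for collecting partial results')
    parser.add_argument('--launcher', default='none',
                        choices=['none', 'pytorch', 'slurm', 'mpi'],
                        help='job launcher')
    return parser.parse_args()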
Example #6
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus

    # check whether the memcached package exists
    if importlib.util.find_spec('mc') is None:
        traverse_replace(cfg, 'memcached', False)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # create work_dir
    commons.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, 'train_{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([('{}: {}'.format(k, v))
                          for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('Config:\n{}'.format(cfg.text))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}, deterministic: {}'.format(
            args.seed, args.deterministic))
        set_random_seed(args.seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed

    if args.pretrained is not None:
        assert isinstance(args.pretrained, str)
        cfg.model.pretrained = args.pretrained
    model = build_model(cfg.model)

    datasets = [build_dataset(cfg.data.train)]
    assert len(cfg.workflow) == 1, "Validation is called by hook."
    if cfg.checkpoint_config is not None:
        # save VLB version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(VLB_version=__version__,
                                          config=cfg.text)
    train_model(model,
                datasets,
                cfg,
                distributed=distributed,
                timestamp=timestamp,
                meta=meta)
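
set_random_seed() is used but not defined here; a common implementation looks like the sketch below (the project's helper may differ, e.g. in its cudnn handling):

import random

import numpy as np
import torch

def set_random_seed(seed, deterministic=False):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False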
Example #7
    def __init__(self,
                 model,
                 batch_processor=None,
                 optimizer=None,
                 work_dir=None,
                 resume_from=None,
                 fp_16=None,
                 logger=None,
                 meta=None):
        if batch_processor is not None:
            if not callable(batch_processor):
                raise TypeError('batch_processor must be callable, '
                                f'but got {type(batch_processor)}')
            warnings.warn('batch_processor is deprecated, please implement '
                          'train_step() and val_step() in the model instead.')
            # raise an error if `batch_processor` is not None and
            # `model.train_step()` exists.
            if is_module_wrapper(model):
                _model = model.module
            else:
                _model = model
            if hasattr(_model, 'train_step') or hasattr(_model, 'val_step'):
                raise RuntimeError(
                    'batch_processor and model.train_step()/model.val_step() '
                    'cannot be both available.')
        else:
            assert hasattr(model, 'train_step')

        # check the type of `optimizer`
        if isinstance(optimizer, dict):
            for name, optim in optimizer.items():
                if not isinstance(optim, Optimizer):
                    raise TypeError(
                        f'optimizer must be a dict of torch.optim.Optimizers, '
                        f'but optimizer["{name}"] is a {type(optim)}')
        elif not isinstance(optimizer, Optimizer) and optimizer is not None:
            raise TypeError(
                f'optimizer must be a torch.optim.Optimizer object '
                f'or dict or None, but got {type(optimizer)}')

        # check the type of `logger`
        if not isinstance(logger, logging.Logger):
            raise TypeError(f'logger must be a logging.Logger object, '
                            f'but got {type(logger)}')

        # check the type of `meta`
        if meta is not None and not isinstance(meta, dict):
            raise TypeError(
                f'meta must be a dict or None, but got {type(meta)}')

        self.model = model
        self.batch_processor = batch_processor
        self.optimizer = optimizer
        self.logger = logger
        self.meta = meta

        # create work_dir
        if commons.is_str(work_dir):
            self.work_dir = osp.abspath(work_dir)
            commons.mkdir_or_exist(self.work_dir)
        elif work_dir is None:
            self.work_dir = None
        else:
            raise TypeError('"work_dir" must be a str or None')

        if commons.is_str(resume_from):
            self.resume_from = osp.abspath(resume_from)
        elif resume_from is None:
            self.resume_from = None
        else:
            raise TypeError('"resume_from" must be a str or None')

        # get model name from the model class
        if hasattr(self.model, 'module'):
            self._model_name = self.model.module.__class__.__name__
        else:
            self._model_name = self.model.__class__.__name__

        self._rank, self._world_size = get_dist_info()
        self.timestamp = get_time_str()
        self.mode = None
        self._hooks = []
        self._epoch = 0
        self._iter = 0
        self._inner_iter = 0
        self._max_epochs = 0
        self._max_iters = 0
        # TODO: Redesign LogBuffer, it is not flexible and elegant enough
        self.log_buffer = LogBuffer()
        self.fp_16 = fp_16
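
The optimizer argument accepted above may be a single torch.optim.Optimizer or a dict of named optimizers (the same shapes save_checkpoint in Example #4 handles). A hypothetical construction sketch:

import logging

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = {
    'backbone': torch.optim.SGD(model.parameters(), lr=0.01),
    'head': torch.optim.Adam(model.parameters(), lr=1e-4),
}
logger = logging.getLogger('runner')
# runner = Runner(model, optimizer=optimizer, work_dir='./work_dir', logger=logger)
# (left commented: a bare nn.Linear lacks the train_step() the runner asserts on)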