Пример #1
0
def make_data_loader(cfg, stage='train', is_distributed=False, start_iter=0):
    """Build the ``DataLoader`` for one stage of the pipeline.

    Args:
        cfg: Configuration node; its ``SOLVER``, ``DATASETS`` and
            ``DATALOADER`` entries are read.
        stage: One of ``'train'``, ``'val'`` or ``'test'``.
        is_distributed: Forwarded to ``make_data_sampler``.
        start_iter: Iteration to resume from. Only honoured for the
            ``'train'`` stage; reset to 0 for every other stage.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset of ``stage``.

    Raises:
        AssertionError: If ``stage`` is not one of the accepted names.
    """
    assert stage in ['train', 'val', 'test']
    num_gpus = get_world_size()  # value is not used further in this function
    training = stage == 'train'

    # Shuffling and ``drop_last`` are both switched on for training only.
    shuffle = drop_last = training
    if training:
        batch_size = cfg.SOLVER.PCS_PER_GPU_TRAIN
        max_iters = cfg.SOLVER.MAX_ITER
    else:
        batch_size = cfg.SOLVER.PCS_PER_GPU_VAL
        max_iters = None
        start_iter = 0

    catalog_module = import_file('config.paths_catalog',
                                 'config/path_catalog.py', True)

    catalog = catalog_module.DatasetCatalog
    name = cfg.DATASETS[stage.upper()]
    workers = cfg.DATALOADER.NUM_WORKERS

    dataset = build_dataset(cfg, name, catalog, stage)
    sampler = make_data_sampler(dataset, shuffle, is_distributed)
    batch_sampler = make_batch_data_sampler(
        sampler, batch_size, max_iters, start_iter, drop_last)

    return torch.utils.data.DataLoader(
        dataset,
        num_workers=workers,
        batch_sampler=batch_sampler,
        collate_fn=dataset.collate_fn,
        pin_memory=True,
    )
Пример #2
0
    def fcos_losses(self, instances):
        """Compute the classification, localisation and centerness losses.

        Args:
            instances: Container exposing ``logits_pred``, ``labels``,
                ``reg_pred``, ``reg_targets`` and ``ctrness_pred``; it must
                support indexing with an index tensor.

        Returns:
            A ``(extras, losses)`` tuple. ``extras`` holds the positive-only
            ``instances`` (with ``pos_inds`` and ``gt_ctrs`` attached) and
            the ``loss_denorm`` value; ``losses`` maps the three loss names
            to their values.
        """
        n_cls = instances.logits_pred.size(1)
        assert n_cls == self.num_classes

        flat_labels = instances.labels.flatten()

        # Every entry whose label differs from ``num_classes`` is a positive.
        positive_idx = torch.nonzero(flat_labels != n_cls).squeeze(1)
        local_pos = positive_idx.numel()
        world = get_world_size()
        global_pos = reduce_sum(positive_idx.new_tensor([local_pos])).item()
        # Average positives per process, floored at 1 to keep division safe.
        avg_pos = max(global_pos / world, 1.0)

        # One-hot classification target: 1 at (entry, label) for positives.
        cls_target = torch.zeros_like(instances.logits_pred)
        cls_target[positive_idx, flat_labels[positive_idx]] = 1

        focal_sum = sigmoid_focal_loss_jit(
            instances.logits_pred,
            cls_target,
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )
        class_loss = focal_sum / avg_pos

        # From here on only the positive entries are used.
        instances = instances[positive_idx]
        instances.pos_inds = positive_idx

        centerness = compute_ctrness_targets(instances.reg_targets)
        centerness_total = centerness.sum()
        loss_denorm = max(reduce_sum(centerness_total).item() / world, 1e-6)
        instances.gt_ctrs = centerness

        if local_pos > 0:
            reg_loss = self.loc_loss_func(
                instances.reg_pred, instances.reg_targets, centerness
            ) / loss_denorm

            ctrness_loss = F.binary_cross_entropy_with_logits(
                instances.ctrness_pred, centerness, reduction="sum"
            ) / avg_pos
        else:
            # No positives: emit zeros that are still derived from the
            # prediction tensors.
            reg_loss = instances.reg_pred.sum() * 0
            ctrness_loss = instances.ctrness_pred.sum() * 0

        extras = {"instances": instances, "loss_denorm": loss_denorm}
        losses = {
            "loss_fcos_cls": class_loss,
            "loss_fcos_loc": reg_loss,
            "loss_fcos_ctr": ctrness_loss,
        }
        return extras, losses
Пример #3
0
def make_data_loader(cfg,
                     is_train=True,
                     is_distributed=False,
                     start_iter=0,
                     is_for_period=False):
    """Build the data loader(s) described by ``cfg``.

    Args:
        cfg: Configuration node (``SOLVER``, ``TEST``, ``DATALOADER``,
            ``DATASET`` and ``PATHS_CATALOG`` entries are read).
        is_train: Selects the training or the testing configuration.
        is_distributed: Forwarded to ``make_data_sampler``; in test mode it
            also decides the ``shuffle`` flag.
        start_iter: Iteration to resume from; forced to 0 in test mode.
        is_for_period: When true, the datasets are built as for training
            and a single loader is returned even if ``is_train`` is false.

    Returns:
        A single ``DataLoader`` when ``is_train`` or ``is_for_period`` is
        true, otherwise a list with one ``DataLoader`` per dataset.
    """
    world = get_world_size()

    # Pick the batch size and the matching error message for this mode.
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        not_divisible = "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used."
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        not_divisible = "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used."
    assert images_per_batch % world == 0, not_divisible.format(
        images_per_batch, world)
    images_per_gpu = images_per_batch // world

    if is_train:
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        shuffle = bool(is_distributed)
        num_iters = None
        start_iter = 0

    aspect_grouping = []
    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        aspect_grouping.append(1)

    catalog_module = import_file("configs.paths_catalog", cfg.PATHS_CATALOG,
                                 True)
    dataset_catalog = catalog_module.DatasetCatalog
    if is_train:
        dataset_list = cfg.DATASET.TRAIN
    else:
        dataset_list = cfg.DATASET.TEST

    transforms = build_transforms(cfg, is_train)
    single_loader = is_train or is_for_period
    datasets = build_dataset(dataset_list, cfg.DATASET.DATA_TYPE, transforms,
                             dataset_catalog, single_loader)

    loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(
            dataset, sampler, aspect_grouping, images_per_gpu, num_iters,
            start_iter)
        loaders.append(
            torch.utils.data.DataLoader(
                dataset,
                num_workers=cfg.DATALOADER.NUM_WORKERS,
                batch_sampler=batch_sampler,
                collate_fn=BatchCollator(),
            ))

    if not single_loader:
        return loaders
    # Training (or period) mode expects exactly one, possibly concatenated,
    # loader.
    assert len(loaders) == 1
    return loaders[0]
Пример #4
0
    def forward(self, input):
        """Normalize ``input`` with batch statistics shared across workers.

        With a world size of 1, or outside training mode, the parent class
        implementation is used unchanged. Otherwise the per-channel mean and
        mean-of-squares are combined across workers through ``AllReduce``
        and used to normalize ``input``; ``running_mean`` and ``running_var``
        are updated in place.

        Args:
            input: Tensor reduced over dims 0, 2 and 3, so it is expected to
                be 4-D with channels on dim 1.

        Returns:
            ``input * scale + bias`` with per-channel ``scale`` and ``bias``
            broadcast over dims 0, 2 and 3.
        """
        # Single process or eval mode: defer to the parent implementation.
        if comm.get_world_size() == 1 or not self.training:
            return super().forward(input)

        # Local batch size and channel count.
        B, C = input.shape[0], input.shape[1]

        # Per-channel first and second moments of the local batch.
        mean = torch.mean(input, dim=[0, 2, 3])
        meansqr = torch.mean(input * input, dim=[0, 2, 3])

        if self._stats_mode == "":
            # Every worker contributes with equal weight: the reduced vector
            # is scaled by 1 / world_size (presumably AllReduce sums across
            # workers -- confirm against its definition).
            assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
            vec = torch.cat([mean, meansqr], dim=0)
            vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
            mean, meansqr = torch.split(vec, C)
            momentum = self.momentum
        else:
            # Workers are weighted by their batch size: a trailing element
            # is appended so that, after the multiplication by B below, the
            # last slot of the reduced vector carries the batch-size total.
            if B == 0:
                vec = torch.zeros([2 * C + 1],
                                  device=mean.device,
                                  dtype=mean.dtype)
                vec = vec + input.sum(
                )  # make sure there is gradient w.r.t input
            else:
                vec = torch.cat([
                    mean, meansqr,
                    torch.ones([1], device=mean.device, dtype=mean.dtype)
                ],
                                dim=0)
            vec = AllReduce.apply(vec * B)

            # Last element of the reduced vector; detached so it does not
            # take part in the backward pass.
            total_batch = vec[-1].detach()
            momentum = total_batch.clamp(
                max=1) * self.momentum  # no update if total_batch is 0
            total_batch = torch.max(
                total_batch, torch.ones_like(total_batch))  # avoid div-by-zero
            mean, meansqr, _ = torch.split(vec / total_batch, C)

        # Variance from the two moments: E[x^2] - E[x]^2.
        var = meansqr - mean * mean
        invstd = torch.rsqrt(var + self.eps)
        # Fold normalization and the affine parameters into one scale/bias.
        scale = self.weight * invstd
        bias = self.bias - mean * scale
        scale = scale.reshape(1, -1, 1, 1)
        bias = bias.reshape(1, -1, 1, 1)

        # In-place update of the running statistics, moving them towards the
        # current (detached) batch statistics by a factor of ``momentum``.
        self.running_mean += momentum * (mean.detach() - self.running_mean)
        self.running_var += momentum * (var.detach() - self.running_var)
        return input * scale + bias
Пример #5
0
def make_epoch_data_loader(cfg, is_train=True,
    drop_last=True, is_distributed=False, start_iter=0):
    """Build epoch-based data loader(s) with a fixed per-GPU batch size.

    Args:
        cfg: Configuration node; ``DATALOADER.BSZ`` and
            ``DATALOADER.WORKERS`` are read, and ``cfg`` is forwarded to
            ``build_dataset`` and ``make_data_sampler``.
        is_train: Selects the training datasets and enables shuffling.
        drop_last: Forwarded to ``DataLoader``.
        is_distributed: Forwarded to ``make_data_sampler``; outside training
            it also decides the ``shuffle`` flag.
        start_iter: Accepted for interface compatibility; not used here.

    Returns:
        A single ``DataLoader`` when ``is_train`` is true, otherwise a list
        with one ``DataLoader`` per dataset.

    Raises:
        AssertionError: If ``DATALOADER.BSZ`` is not divisible by the world
            size, or if training yields other than exactly one loader.
    """
    datasets = build_dataset(cfg, is_train)
    num_gpus = get_world_size()
    images_per_batch = cfg.DATALOADER.BSZ
    # The message is parenthesised so that both string pieces and the
    # ``.format`` call belong to the assert statement.
    assert images_per_batch % num_gpus == 0, (
        "DATALOADER.BSZ ({}) must be divisible by the number "
        "of GPUs ({}) used.".format(images_per_batch, num_gpus)
    )
    images_per_gpu = images_per_batch // num_gpus
    logger = logging.getLogger(__name__)
    logger.info("Experiment with {} images per GPU".format(images_per_gpu))

    # Always shuffle for training; otherwise only in distributed mode.
    shuffle = bool(is_train or is_distributed)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed, is_train, cfg)
        # default collator works!
        num_workers = cfg.DATALOADER.WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            sampler=sampler,
            batch_size=images_per_gpu,
            drop_last=drop_last,
            pin_memory=True,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]

    return data_loaders
def reduce_loss_dict(loss_dict):
    """
    Combine a dictionary of losses across all processes.

    The losses are stacked in sorted-key order and reduced onto rank 0,
    where the result is divided by the world size. Other ranks perform no
    division, so only the values on rank 0 are averages. With fewer than
    two processes the input dictionary is returned as is.

    Returns a dict with the same keys as loss_dict.
    """
    n_procs = get_world_size()
    if n_procs < 2:
        return loss_dict
    with torch.no_grad():
        # A fixed key order keeps the stacked layout identical on all ranks.
        names = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[name] for name in names], dim=0)
        dist.reduce(stacked, dst=0)
        if dist.get_rank() == 0:
            # only the destination rank holds the accumulated values, so
            # only there is the division meaningful
            stacked /= n_procs
        reduced = dict(zip(names, stacked))
    return reduced
Пример #7
0
def inference(
        model,
        data_loader,
        dataset_name,
        mem_active=False,
        output_folder=None,
):
    """Run the model over ``data_loader`` and evaluate the predictions.

    Args:
        model: Model forwarded to ``compute_on_dataset``.
        data_loader: Loader whose ``dataset`` attribute is evaluated.
        dataset_name: Name used in the log messages.
        mem_active: Forwarded to ``compute_on_dataset``.
        output_folder: When truthy, predictions are saved there as
            ``predictions.pth``; it is also forwarded to ``evaluate``.

    Returns:
        The result of ``evaluate`` on the main process, ``None`` elsewhere.
    """
    # convert to a torch.device for efficiency
    cuda = torch.device("cuda")
    world = get_world_size()
    log = logging.getLogger("AlphAction.inference")
    dataset = data_loader.dataset
    log.info("Start evaluation on {} dataset({} videos).".format(dataset_name, len(dataset)))

    tic = time.time()
    predictions = compute_on_dataset(model, data_loader, cuda, log, mem_active)
    # every process has to finish before the elapsed time is taken
    synchronize()
    elapsed = time.time() - tic
    elapsed_str = str(datetime.timedelta(seconds=elapsed))
    log.info(
        "Total inference time: {} ({} s / video per device, on {} devices)".format(
            elapsed_str, elapsed * world / len(dataset), world
        )
    )

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if is_main_process():
        if output_folder:
            torch.save(predictions, os.path.join(output_folder, "predictions.pth"))
        return evaluate(
            dataset=dataset,
            predictions=predictions,
            output_folder=output_folder,
        )
    return None
Пример #8
0
def make_train_data_loader(datasets, is_distributed=False, start_iter=0):
    """Build the training ``DataLoader`` from the module-level ``cfg``.

    Args:
        datasets: Dataset object handed to the samplers and the loader.
        is_distributed: Forwarded to ``make_data_sampler``.
        start_iter: Forwarded to ``make_batch_data_sampler``.

    Returns:
        A ``torch.utils.data.DataLoader`` driven by a batch sampler.
    """
    world = get_world_size()
    # Per-GPU batch size: the configured total divided by the world size,
    # truncated to an integer.
    per_gpu = int(cfg.TRAIN.BATCH_SIZE / world)
    max_iter = cfg.SOLVER.MAX_ITER

    # Aspect-ratio grouping: a single threshold of 1 is used when enabled
    # (width / height above or below 1); an empty list disables grouping.
    grouping = []
    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        grouping.append(1)

    sampler = make_data_sampler(datasets, True, is_distributed)
    batch_sampler = make_batch_data_sampler(
        datasets, sampler, grouping, per_gpu, max_iter, start_iter)
    collator = BatchCollator(cfg.TRAIN.SIZE_DIVISIBILITY)

    return torch.utils.data.DataLoader(
        datasets,
        num_workers=cfg.TRAIN.LOADER_THREADS,
        batch_sampler=batch_sampler,
        collate_fn=collator,
    )
Пример #9
0
def compute_on_dataset_1stage(model, data_loader, device):
    """Single-stage inference, for models without memory features.

    Args:
        model: Callable taking ``(slow_clips, fast_clips, boxes, objects,
            extras)`` and returning one result per video.
        data_loader: Iterable of 6-tuples ``(slow_clips, fast_clips, boxes,
            objects, extras, video_ids)``.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        A dict mapping each video id to its result, moved to the CPU.
    """
    cpu = torch.device("cpu")
    predictions = {}

    # Label the progress bar with the rank when several processes run.
    tqdm_kwargs = {}
    if get_world_size() != 1:
        tqdm_kwargs["desc"] = "rank {}".format(get_rank())

    for slow_clips, fast_clips, boxes, objects, extras, video_ids in tqdm(
            data_loader, **tqdm_kwargs):
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        # Entries of ``objects`` may be None and are passed through as is.
        objects = [obj.to(device) if obj is not None else None
                   for obj in objects]

        with torch.no_grad():
            raw = model(slow_clips, fast_clips, boxes, objects, extras)
            on_cpu = [item.to(cpu) for item in raw]

        for video_id, result in zip(video_ids, on_cpu):
            predictions[video_id] = result

    return predictions
Пример #10
0
def inference(model, criterion, data_loader, dataset_name, save_result=False):
    """Run the model over ``data_loader`` and evaluate per-file predictions.

    Args:
        model: Network called as ``model(inputs, y=targets)``; it is put in
            eval mode first.
        criterion: Accepted for interface compatibility; not used here.
        data_loader: Iterable of ``(locs, feats, targets, metadata)``
            batches whose ``dataset`` attribute exposes ``all_files``.
        dataset_name: Name used in the log message.
        save_result: Accepted for interface compatibility; not used here.
            Whether file paths reach ``evaluate`` is decided by
            ``cfg.LOGS.SAVE_RESULT``.

    Returns:
        The metrics from ``evaluate`` on the main process, ``None``
        elsewhere.

    Raises:
        AssertionError: If the number of gathered outputs differs from
            ``len(dataset.all_files)``.
    """
    logger = logging.getLogger('eve.' + __name__)

    device = torch.device('cuda')
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset ({} point clouds).".format(
        dataset_name, len(dataset)))

    if get_world_size() == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    model.eval()
    outputs_per_gpu = {}
    targets_per_gpu = {}
    file_path_per_gpu = {}

    # Wall-clock duration of each forward pass.
    times = []

    # The architecture does not change between batches, so decide once
    # whether the metadata is indexed per frame.
    per_frame_metadata = cfg.MODEL.ARCHITECTURE in ('minkunet4d',
                                                    'minkunet_eve')

    with torch.no_grad():
        for batch in tqdm(data_loader, **extra_args):
            locs, feats, targets, metadata = batch
            inputs = ME.SparseTensor(feats, coords=locs).to(device)
            targets = targets.to(device, non_blocking=True).long()

            # Synchronise around the forward pass so that the measured time
            # covers the GPU work.
            torch.cuda.synchronize()
            forward_start = time.time()
            outputs = model(inputs, y=targets)
            torch.cuda.synchronize()
            times.append(time.time() - forward_start)

            if per_frame_metadata:
                for batch_idx in range(len(metadata)):
                    for time_idx in range(cfg.INPUT.VIDEO.NUM_FRAMES):
                        inv_map = metadata[batch_idx][time_idx]['inverse_map']
                        file_path = metadata[batch_idx][time_idx]['file_path']

                        # Batch index is read from the last column of
                        # ``locs`` and the frame index from the one before.
                        locs_frame = (locs[:, -1] == batch_idx) & \
                            (locs[:, -2] == time_idx)
                        one_output, one_target = compute_one_frame(
                            outputs, targets, locs_frame, inv_map)

                        outputs_per_gpu[file_path] = one_output
                        targets_per_gpu[file_path] = one_target
                        file_path_per_gpu[file_path] = file_path
            else:  # other minknet
                for batch_idx in range(len(metadata)):
                    inv_map = metadata[batch_idx]['inverse_map']
                    file_path = metadata[batch_idx]['file_path']

                    # Batch index is read from the last column of ``locs``.
                    locs_frame = locs[:, -1] == batch_idx
                    one_output, one_target = compute_one_frame(
                        outputs, targets, locs_frame, inv_map)

                    outputs_per_gpu[file_path] = one_output
                    targets_per_gpu[file_path] = one_target
                    file_path_per_gpu[file_path] = file_path

    synchronize()

    logger.info("Total inference time: {}".format(np.sum(times)))

    # NOTE: `all_gather` will lead to CUDA out of memory
    # We use `scatter_gather` to save result of each process
    # in LOGS.DIR/tmp and will be cleared after gathering.
    outputs = scatter_gather(outputs_per_gpu)
    targets = scatter_gather(targets_per_gpu)
    file_paths = scatter_gather(file_path_per_gpu)
    if not is_main_process():
        return None

    # Merge the per-process dictionaries into one dictionary each.
    all_outputs = {k: v.numpy() for o in outputs for k, v in o.items()}
    all_targets = {k: v.numpy() for t in targets for k, v in t.items()}
    all_file_paths = {k: v for f in file_paths for k, v in f.items()}

    assert len(all_outputs) == len(dataset.all_files), \
        '%d vs %d' % (len(all_outputs), len(dataset.all_files))

    if cfg.LOGS.SAVE_RESULT is False:
        all_file_paths = None
    metrics = evaluate(dataset, all_outputs, all_targets, all_file_paths)

    return metrics
Пример #11
0
def compute_on_dataset_2stage(model, data_loader, device, logger):
    """Two-stage inference, for models with memory features.

    Stage 1 runs the model with ``part_forward=0`` to extract person and
    object features for every batch. The person features of all processes
    are then gathered into one pool, and stage 2 runs the model with
    ``part_forward=1`` on the stored features.

    Args:
        model: Model supporting the ``part_forward`` keyword.
        data_loader: Iterable of 6-tuples ``(slow_clips, fast_clips, boxes,
            objects, extras, video_ids)`` with a ``dataset`` attribute.
        device: Device used for the forward passes.
        logger: Logger receiving the stage and timing messages.

    Returns:
        A dict mapping each video id to its result, moved to the CPU.
    """
    cpu = torch.device("cpu")
    world = get_world_size()
    dataset = data_loader.dataset

    # Label the progress bars with the rank when several processes run.
    tqdm_kwargs = {}
    if world != 1:
        tqdm_kwargs["desc"] = "rank {}".format(get_rank())

    n_batches = len(data_loader)
    local_person_pool = MemoryPool()
    # One slot per batch, filled during stage 1 and consumed in stage 2.
    pending = [None] * n_batches
    logger.info("Stage 1: extracting clip features.")
    tic = time.time()

    for idx, batch in enumerate(tqdm(data_loader, **tqdm_kwargs)):
        slow_clips, fast_clips, boxes, objects, extras, video_ids = batch
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        objects = [obj.to(device) if obj is not None else None
                   for obj in objects]
        movie_ids = [extra["movie_id"] for extra in extras]
        timestamps = [extra["timestamp"] for extra in extras]
        with torch.no_grad():
            features = model(slow_clips, fast_clips, boxes, objects, part_forward=0)
            person_feats = [ft.to(cpu) for ft in features[0]]
            object_feats = [ft.to(cpu) for ft in features[1]]
        # Person features go into the pool, keyed by movie and timestamp.
        for movie_id, timestamp, person_ft, _object_ft in zip(
                movie_ids, timestamps, person_feats, object_feats):
            local_person_pool[movie_id, timestamp] = person_ft
        # Everything else stage 2 needs is kept per batch.
        pending[idx] = (movie_ids, timestamps, video_ids, object_feats)

    # All ranks must finish stage 1 before the pools are merged.
    synchronize()
    elapsed = time.time() - tic
    elapsed_str = str(datetime.timedelta(seconds=elapsed))
    logger.info(
        "Stage 1 time: {} ({} s / video per device, on {} devices)".format(
            elapsed_str, elapsed * world / len(dataset), world
        )
    )
    gathered_pools = all_gather(local_person_pool)
    merged_person_pool = MemoryPool()
    merged_person_pool.update_list(gathered_pools)
    del gathered_pools, local_person_pool

    # Stage 2: predict from the stored features.
    predictions = {}
    logger.info("Stage 2: predicting with extracted feature.")
    tic = time.time()
    for movie_ids, timestamps, video_ids, object_feats in tqdm(pending, **tqdm_kwargs):
        person_now = [merged_person_pool[movie_id, timestamp].to(device)
                      for movie_id, timestamp in zip(movie_ids, timestamps)]
        object_now = [ft.to(device) for ft in object_feats]
        extras = {
            "person_pool": merged_person_pool,
            "movie_ids": movie_ids,
            "timestamps": timestamps,
            "current_feat_p": person_now,
            "current_feat_o": object_now,
        }
        with torch.no_grad():
            raw = model(None, None, None, None, extras=extras, part_forward=1)
            on_cpu = [item.to(cpu) for item in raw]
        for video_id, result in zip(video_ids, on_cpu):
            predictions[video_id] = result
    synchronize()
    elapsed = time.time() - tic
    elapsed_str = str(datetime.timedelta(seconds=elapsed))
    logger.info(
        "Stage 2 time: {} ({} s / video per device, on {} devices)".format(
            elapsed_str, elapsed * world / len(dataset), world
        )
    )

    return predictions
Пример #12
0
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    """Build the video data loader(s) for training or testing.

    Args:
        cfg: Configuration node (``SOLVER``, ``TEST``, ``DATALOADER``,
            ``DATASETS`` and ``IA_STRUCTURE`` entries are read).
        is_train: Selects the training or the testing configuration.
        is_distributed: Forwarded to ``make_data_sampler``; in test mode it
            also decides the ``shuffle`` flag.
        start_iter: Iteration to resume from; forced to 0 in test mode.

    Returns:
        A single ``DataLoader`` when ``is_train`` is true, otherwise a list
        with one ``DataLoader`` per dataset.

    Raises:
        AssertionError: If the batch size is not divisible by the world
            size, or if training yields other than exactly one loader.
    """
    num_gpus = get_world_size()
    if is_train:
        # for training
        videos_per_batch = cfg.SOLVER.VIDEOS_PER_BATCH
        # The message is parenthesised so that both string pieces and the
        # ``.format`` call belong to the assert statement.
        assert videos_per_batch % num_gpus == 0, (
            "SOLVER.VIDEOS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(videos_per_batch, num_gpus)
        )
        videos_per_gpu = videos_per_batch // num_gpus
        shuffle = True
        drop_last = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        # for testing
        videos_per_batch = cfg.TEST.VIDEOS_PER_BATCH
        assert videos_per_batch % num_gpus == 0, (
            "TEST.VIDEOS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(videos_per_batch, num_gpus)
        )
        videos_per_gpu = videos_per_batch // num_gpus
        shuffle = bool(is_distributed)
        drop_last = False
        num_iters = None
        start_iter = 0

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    # ``paths_catalog`` is not defined in this function; it has to be
    # available at module scope.
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    # build dataset
    transforms = build_transforms(cfg, is_train)
    if has_object(cfg.IA_STRUCTURE):
        object_transforms = build_object_transforms(cfg, is_train=is_train)
    else:
        object_transforms = None
    datasets = build_dataset(cfg, dataset_list, transforms, DatasetCatalog,
                             is_train, object_transforms)

    # build sampler and dataloader
    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(dataset, sampler,
                                                aspect_grouping,
                                                videos_per_gpu, num_iters,
                                                start_iter, drop_last)
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
Пример #13
0
 def __len__(self):
     """Return the number of examples, but never less than the world size."""
     n_examples = len(self.examples)
     return max(n_examples, get_world_size())
def do_infer(
    model,
    data_loader,
    dataset_name,
    device="cuda",
    output_folder=None,
):
    """Run detection inference and score the results with COCO bbox mAP.

    Args:
        model: Model forwarded to ``compute_on_dataset``.
        data_loader: Loader whose ``dataset`` exposes ``image_ids``,
            ``return_coco_label`` and ``coco``.
        dataset_name: Name used in the log messages.
        device: Device specification passed to ``torch.device``.
        output_folder: Folder receiving ``bbox_results.json``. ``None``
            selects the current working directory; the folder is created
            if it does not exist.

    Returns:
        ``coco_eval.stats[0]`` on the main process, ``None`` elsewhere.
    """
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("EfficientDet.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(
        dataset_name, len(dataset)))
    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, device,
                                     inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset),
            num_devices))
    total_infer_time = get_time_str(inference_timer.total_time)
    logger.info(
        "Model inference time: {} ({} s / img per device, on {} devices)".
        format(
            total_infer_time,
            inference_timer.total_time * num_devices / len(dataset),
            num_devices,
        ))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    # Flatten the per-image predictions into COCO result records; the
    # position in ``predictions`` indexes ``dataset.image_ids``.
    coco_results = []
    image_ids = []
    for image_id, prediction in enumerate(predictions):
        original_id = dataset.image_ids[image_id]
        image_ids.append(original_id)
        coco_results.extend([{
            "image_id":
            original_id,
            "category_id":
            dataset.return_coco_label(e['class']),
            "bbox":
            e['bbox'],
            "score":
            e['score']
        } for e in prediction])

    # Honour an explicitly requested folder; fall back to the working
    # directory only when the caller did not supply one.
    if output_folder is None:
        output_folder = './'

    with tempfile.NamedTemporaryFile() as f:
        file_path = f.name
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
            file_path = os.path.join(output_folder, 'bbox_results.json')
        with open(file_path, "w") as w_obj:
            json.dump(coco_results, w_obj)

        # load results in COCO evaluation tool
        coco_true = dataset.coco
        coco_pred = coco_true.loadRes(file_path)

        # run COCO evaluation
        coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
        coco_eval.params.imgIds = image_ids
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

        map_05_09 = coco_eval.stats[0]
    return map_05_09
Пример #15
0
def make_data_loader(cfg,
                     is_train=True,
                     is_distributed=False,
                     start_iter=0,
                     partial_size=0):
    """Build the image data loader(s) for training or testing.

    Args:
        cfg: Configuration node (``SOLVER``, ``TEST``, ``DATALOADER``,
            ``DATASETS`` and ``PATHS_CATALOG`` entries are read).
        is_train: Selects the training or the testing configuration.
        is_distributed: Forwarded to ``make_data_sampler``; in test mode it
            also decides the ``shuffle`` flag.
        start_iter: Iteration to resume from; forced to 0 in test mode.
        partial_size: When truthy, a ``PartialSequentialSampler`` built
            with this value replaces the regular sampler.

    Returns:
        A single ``DataLoader`` when ``is_train`` is true, otherwise a list
        with one ``DataLoader`` per dataset.

    Raises:
        AssertionError: If the batch size is not divisible by the world
            size, or if training yields other than exactly one loader.
    """
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        # The message is parenthesised so that both string pieces and the
        # ``.format`` call belong to the assert statement.
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = bool(is_distributed)
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog",
                                cfg.PATHS_CATALOG, True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    transforms = build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog,
                             is_train)

    data_loaders = []
    for dataset in datasets:
        if partial_size:
            sampler = PartialSequentialSampler(dataset, partial_size)
        else:
            sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(dataset, sampler,
                                                aspect_grouping,
                                                images_per_gpu, num_iters,
                                                start_iter)
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders