Example #1
 def test_fine_tuning_pipeline_ignores_bbox_if_present(self):
     preprocess = get_preprocessing_pipeline(train=True, fine_tuning=True)
     assert isinstance(preprocess.transforms[0], IgnoreBboxIfPresent)
     img, tensor = TestCustomAugmentations._generate_img(256)
     # Just make sure processing with bbox doesn't result in an error.
     preprocess((img, "bbox"))
     preprocess(img)
     preprocess((tensor, "bbox"))
     preprocess(tensor)
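
The test only checks that IgnoreBboxIfPresent heads the fine-tuning pipeline and that bare images/tensors as well as (image, bbox) tuples pass through without error. A minimal sketch of such a transform, assuming it simply discards the bbox component (this behaviour is inferred from the test, not taken from the library source):

class IgnoreBboxIfPresent:
    """Drop the bounding box when the sample is an (image, bbox) tuple."""
    def __call__(self, sample):
        # Accept both bare images/tensors and (image, bbox) tuples.
        if isinstance(sample, tuple):
            return sample[0]
        return sample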
Example #2
 def test_inference_pipeline(self):
     """
     Compare the original validation pipeline against the optimised one.
     """
     img, tensor = TestCustomAugmentations._generate_img(256)
     ground_truth_preprocess = transforms.Compose([
         transforms.Resize(256),
         transforms.CenterCrop(224),
         transforms.ToTensor(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
     ])
     preprocess = get_preprocessing_pipeline(train=False, input_size=224, half_precision=False, normalize=True)
     result_img = preprocess(img)
     result_tensor = preprocess(tensor)
     ground_truth = ground_truth_preprocess(img)
     assert torch.allclose(result_img, ground_truth, atol=1e-06)  # reference vs custom(img)
     assert torch.allclose(result_img, result_tensor, atol=1e-06)  # custom(img) vs custom(tensor)
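
The _generate_img helper used above is not shown in the snippet; a plausible sketch, assuming it returns the same random RGB image both as a PIL.Image and as a CHW uint8 tensor (the name, signature, and return format are assumptions based on how the tests use it):

import numpy as np
import torch
from PIL import Image

def _generate_img(size):
    """Return a random RGB image as (PIL.Image, CHW uint8 tensor)."""
    arr = np.random.randint(0, 256, (size, size, 3), dtype=np.uint8)
    return Image.fromarray(arr), torch.from_numpy(arr).permute(2, 0, 1)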
Example #3
def test_preprocess_match(first_step, second_step):
    """
    The webdataset may already contain some augmentations. Check whether the duplicated preprocessing steps are removed from the pipeline.
    """
    first_step = [] if first_step == "" else [first_step]
    second_step = [] if second_step == "" else [second_step]
    done_transform = first_step + second_step
    pipeline = get_preprocessing_pipeline(train=False, input_size=224, half_precision=False, normalize=True)
    len_pipeline = len(pipeline.transforms)
    modified_pipeline = match_preprocess(pipeline, done_transform)
    if len(first_step) == 0 or first_step[0] != "Resize(256)":
        assert len_pipeline == len(modified_pipeline.transforms)
    else:
        if len(second_step) == 0:
            assert len_pipeline - 1 == len(modified_pipeline.transforms)
        else:
            assert len_pipeline - 2 == len(modified_pipeline.transforms)
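
first_step and second_step arrive via a pytest parametrization that is not included in the snippet; it could look roughly like the following (the parameter values are assumptions chosen to exercise the branches in the assertions above):

@pytest.mark.parametrize("first_step", ["", "Resize(256)"])
@pytest.mark.parametrize("second_step", ["", "CenterCrop(224)"])
def test_preprocess_match(first_step, second_step):
    ...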
Example #4
def get_data(args, opts, train=True, async_dataloader=False, return_remaining=False, fine_tuning=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in a dataloader.
    """
    logging.info("Loading the data")
    input_shape = models.model_input_shape(args, train)
    if args.precision[:3] == "16.":
        half_precision = True
    elif args.precision[:3] == "32.":
        half_precision = False
    use_bbox_info = getattr(args, "use_bbox_info", False)

    if args.data in ["real", "imagenet", "cifar10"]:
        transform = get_preprocessing_pipeline(train, input_shape[-1],
                                               half_precision, args.normalization_location == "host", eightbit = args.eight_bit_io,
                                               use_bbox_info=use_bbox_info, fine_tuning=fine_tuning)
    # Determine the size of the small datasets
    if hasattr(args, "iterations"):
        dataset_size = args.batch_size * \
                       opts.device_iterations * \
                       opts.replication_factor * \
                       opts.Training.gradient_accumulation * \
                       args.iterations

    # Select the right dataset
    if args.data in ["synthetic", "generated"]:
        if hasattr(args, "iterations"):
            dataset = GeneratedDataset(input_shape, size=dataset_size, half_precision=half_precision, eightbit=args.eight_bit_io)
        else:
            dataset = GeneratedDataset(input_shape, half_precision=half_precision, eightbit=args.eight_bit_io)
    elif args.data == "real":
        data_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("images")
        if hasattr(args, "iterations"):
            dataset = SampleDataset(img_dir=data_path, transform=transform, size=dataset_size)
        else:
            dataset = SampleDataset(img_dir=data_path, transform=transform)
    elif args.data == "imagenet":
        assert os.path.exists(args.imagenet_data_path), f"{args.imagenet_data_path} does not exist!"
        if os.path.exists(os.path.join(args.imagenet_data_path, 'metadata.json')):
            # WebDataset format
            dataset = get_webdataset(args, opts, train, transform=transform, use_bbox_info=use_bbox_info)
        else:
            data_folder = 'train' if train else 'validation'
            data_folder = os.path.join(args.imagenet_data_path, data_folder)
            if os.path.exists(data_folder):
                # Original ImageNet format
                bboxes = os.path.join(args.imagenet_data_path, 'imagenet_2012_bounding_boxes.csv') if use_bbox_info and train else None   # use bboxes only for training
                dataset = ImageNetDataset(data_folder, transform=transform, bbox_file=bboxes)
            else:
                # TFRecord format
                dataset = get_tfrecord(args, opts, train, transform=transform, use_bbox_info=use_bbox_info)
    elif args.data == "cifar10":
        data_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path, train=train, download=True, transform=transform)
    global_batch_size = args.batch_size * opts.device_iterations * opts.replication_factor * opts.Training.gradient_accumulation
    if async_dataloader:
        if global_batch_size == 1:
            # Avoid rebatch overhead
            mode = poptorch.DataLoaderMode.Async
        else:
            mode = poptorch.DataLoaderMode.AsyncRebatched
    else:
        mode = poptorch.DataLoaderMode.Sync
    worker_initialization = _WorkerInit(args.seed, opts.Distributed.processId, args.dataloader_worker) if hasattr(args, 'seed') else None
    rebatch_size = getattr(args, "dataloader_rebatch_size", None)
    rebatch_size = rebatch_size if rebatch_size is not None else min(1024, global_batch_size) // opts.Distributed.numProcesses
    # Make sure rebatch size is smaller than global batch size
    rebatch_size = min(rebatch_size, global_batch_size)
    dataloader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.dataloader_worker,
                                     shuffle=train and not isinstance(dataset, torch.utils.data.IterableDataset),
                                     drop_last=not return_remaining and not isinstance(dataset, torch.utils.data.IterableDataset),
                                     persistent_workers=True,
                                     auto_distributed_partitioning=not isinstance(dataset, torch.utils.data.IterableDataset),
                                     worker_init_fn=worker_initialization,
                                     mode=mode,
                                     rebatched_worker_size=rebatch_size,
                                     async_options={'load_indefinitely': True})
    if isinstance(dataset, torch.utils.data.IterableDataset):
        dataloader = DatasetRebatch(dataloader, global_batch_size, len(dataset), not(return_remaining))
    return dataloader
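
A rough usage sketch: assuming args comes from the repository's argument parser and opts is a configured poptorch.Options object (both are placeholders here, not definitions from the source), the factory would be called like this:

# Hypothetical call; args and opts are illustrative placeholders.
train_dataloader = get_data(args, opts, train=True, async_dataloader=True)
for batch in train_dataloader:
    ...  # batch layout depends on the dataset and on use_bbox_info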
Example #5
def get_data(configs, model_opts, train=True, async_dataloader=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in a dataloader.
    """
    if configs.precision.startswith("16."):
        half_precision = True
    elif configs.precision.startswith("32."):
        half_precision = False
    transform = get_preprocessing_pipeline(
        train, 224, half_precision, configs.normalization_location == "host")
    # Determine the size of the small datasets
    if hasattr(configs, "iterations"):
        dataset_size = configs.micro_batch_size * \
            model_opts.device_iterations * \
            model_opts.replication_factor * \
            model_opts.Training.gradient_accumulation * \
            configs.iterations

    rebatched_worker_size = None

    # Select the right dataset
    if configs.dataset in ["synthetic", "generated"]:
        if hasattr(configs, "iterations"):
            dataset = GeneratedDataset((3, 224, 224),
                                       size=dataset_size,
                                       half_precision=half_precision)
        else:
            dataset = GeneratedDataset((3, 224, 224),
                                       half_precision=half_precision)

    elif configs.dataset in ["imagenet1k", "imagenet21k"]:
        dataset = torchvision.datasets.ImageFolder(os.path.join(
            configs.dataset_path, "train" if train else "validation"),
                                                   transform=transform)
        if train:
            rebatched_worker_size = 128

    elif configs.dataset == "cifar10":
        dataset = torchvision.datasets.CIFAR10(root=configs.dataset_path,
                                               train=train,
                                               download=True,
                                               transform=transform)
        if train:
            rebatched_worker_size = 256
    else:
        raise Exception('Dataset type not recognized: %s' % configs.dataset)

    mode = poptorch.DataLoaderMode.AsyncRebatched if async_dataloader else poptorch.DataLoaderMode.Sync
    dataloader = poptorch.DataLoader(
        model_opts,
        dataset,
        batch_size=configs.micro_batch_size
        if not (isinstance(dataset, IterableDataset)) else None,
        num_workers=configs.dataloader_workers,
        shuffle=train and not (isinstance(dataset, IterableDataset)),
        drop_last=not (isinstance(dataset, IterableDataset)),
        persistent_workers=True,
        auto_distributed_partitioning=not isinstance(dataset, IterableDataset),
        worker_init_fn=None,
        mode=mode,
        rebatched_worker_size=rebatched_worker_size,
        async_options={
            'load_indefinitely': True,
            "buffer_size": 8
        })
    return dataloader
Example #6
def get_data(opts, model_opts, train=True, async_dataloader=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in a dataloader.
    """
    if opts.precision[:3] == "16.":
        half_precision = True
    elif opts.precision[:3] == "32.":
        half_precision = False
    transform = get_preprocessing_pipeline(
        train, models.available_models[opts.model]["input_shape"][-1],
        half_precision, opts.normalization_location == "host")
    # Determine the size of the small datasets
    if hasattr(opts, "iterations"):
        dataset_size = opts.batch_size * \
                       model_opts.device_iterations * \
                       model_opts.replication_factor * \
                       model_opts.Training.gradient_accumulation * \
                       opts.iterations

    # Select the right dataset
    if opts.data in ["synthetic", "generated"]:
        if hasattr(opts, "iterations"):
            dataset = GeneratedDataset(
                models.available_models[opts.model]["input_shape"],
                size=dataset_size,
                half_precision=half_precision)
        else:
            dataset = GeneratedDataset(
                models.available_models[opts.model]["input_shape"],
                half_precision=half_precision)
    elif opts.data == "real":
        data_path = Path(__file__).parent.parent.absolute().joinpath(
            "data").joinpath("images")
        if hasattr(opts, "iterations"):
            dataset = SampleDataset(img_dir=data_path,
                                    transform=transform,
                                    size=dataset_size)
        else:
            dataset = SampleDataset(img_dir=data_path, transform=transform)
    elif opts.data == "imagenet":
        if os.path.exists(
                os.path.join(opts.imagenet_data_path, 'metadata.json')):
            # WebDataset format
            dataset = get_webdataset(opts,
                                     model_opts,
                                     train,
                                     half_precision,
                                     transform=transform)
        else:
            # Original ImageNet format
            data_folder = 'train' if train else 'validation'
            dataset = torchvision.datasets.ImageFolder(os.path.join(
                opts.imagenet_data_path, data_folder),
                                                       transform=transform)
    elif opts.data == "cifar10":
        data_path = Path(__file__).parent.parent.absolute().joinpath(
            "data").joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path,
                                               train=train,
                                               download=True,
                                               transform=transform)
    mode = poptorch.DataLoaderMode.Async if async_dataloader and not isinstance(
        dataset,
        torch.utils.data.IterableDataset) else poptorch.DataLoaderMode.Sync
    dataloader = poptorch.DataLoader(
        model_opts,
        dataset,
        batch_size=opts.batch_size if
        not (isinstance(dataset, torch.utils.data.IterableDataset)) else None,
        num_workers=opts.dataloader_worker,
        shuffle=train
        and not (isinstance(dataset, torch.utils.data.IterableDataset)),
        drop_last=not (isinstance(dataset, torch.utils.data.IterableDataset)),
        persistent_workers=True,
        auto_distributed_partitioning=not isinstance(
            dataset, torch.utils.data.IterableDataset),
        worker_init_fn=None,
        mode=mode,
        async_options={'load_indefinitely': True})

    if isinstance(dataset, torch.utils.data.IterableDataset):
        global_batch_size = opts.batch_size * model_opts.device_iterations * model_opts.replication_factor * model_opts.Training.gradient_accumulation
        if async_dataloader:
            dataloader._accessor = poptorch.AsynchronousDataAccessor(
                DatasetRebatch(dataloader, global_batch_size),
                load_indefinitely=True)
        else:
            dataloader = DatasetRebatch(dataloader, global_batch_size)
    return dataloader