def load_dataset(self):
    """
    Overrides base dataset loading
    Fixes path to data
    Fixes all transformations to be identical
    Preprocessing from: https://github.com/pytorch/vision/issues/39
    """
    train_path = os.path.expanduser("~/nta/data/imagenet/train")
    val_path = os.path.expanduser("~/nta/data/imagenet/val")
    stats_mean, stats_std = datasets_stats["ImageNet"]

    train_transform = transforms.Compose(
        [
            transforms.RandomSizedCrop(224),  # legacy name for RandomResizedCrop
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(stats_mean, stats_std),
        ]
    )
    val_transform = transforms.Compose(
        [
            transforms.Scale(256),  # legacy name for Resize
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(stats_mean, stats_std),
        ]
    )

    # load datasets
    train_dataset = CachedDatasetFolder(
        train_path, transform=train_transform, num_classes=self.num_classes
    )
    test_dataset = CachedDatasetFolder(
        val_path, transform=val_transform, num_classes=self.num_classes
    )

    # load dataloaders
    # pin_memory=True added for faster host-to-GPU data transfer
    self.train_loader = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=self.batch_size_train,
        pin_memory=True,
        num_workers=56,
    )
    self.test_loader = DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=self.batch_size_test,
        pin_memory=True,
        num_workers=56,
    )
def create_validation_dataloader(data_dir, val_dir, batch_size, workers,
                                 num_classes=1000):
    """
    Configure Imagenet validation dataloader

    Creates :class:`torch.utils.data.DataLoader` using :class:`CachedDatasetFolder`
    or :class:`HDF5Dataset` pre-configured for the validation cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param val_dir: The directory or hdf5 group containing the validation data
    :param batch_size: Images per batch
    :param workers: How many data loading subprocesses to use
    :param num_classes: Limit the dataset size to the given number of classes
    :return: torch.utils.data.DataLoader
    """
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225],
                             inplace=True),
    ])
    if h5py.is_hdf5(data_dir):
        # Use fixed Imagenet classes if mapping is available
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir, root=val_dir,
                                  classes=classes, transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir, root=val_dir,
                                  num_classes=num_classes, transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, val_dir),
                                      num_classes=num_classes,
                                      transform=transform)

    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=workers,
        pin_memory=False,
    )
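# Illustrative usage sketch (not part of the original module): the path, batch
# size, worker count, and class subset below are assumptions chosen only to
# show how the helper is called. data_dir may point at a plain image folder or
# an hdf5 file; the helper picks CachedDatasetFolder or HDF5Dataset accordingly.
val_loader = create_validation_dataloader(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    val_dir="val",
    batch_size=128,
    workers=8,
    num_classes=100,
)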
def create_validation_dataset(data_dir, val_dir, num_classes=1000):
    """
    Configure Imagenet validation dataset

    Creates :class:`CachedDatasetFolder` or :class:`HDF5Dataset` pre-configured
    for the validation cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param val_dir: The directory or hdf5 group containing the validation data
    :param num_classes: Limit the dataset size to the given number of classes
    :return: CachedDatasetFolder or HDF5Dataset
    """
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225],
                             inplace=True),
    ])
    if h5py.is_hdf5(data_dir):
        # Use fixed Imagenet classes if mapping is available
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir, root=val_dir,
                                  classes=classes, transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir, root=val_dir,
                                  num_classes=num_classes, transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, val_dir),
                                      num_classes=num_classes,
                                      transform=transform)
    return dataset
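# Hedged sketch showing why dataset creation is split from loader creation:
# the dataset built here can be wrapped in whatever sampler/loader the caller
# needs. The path, class subset, and loader settings are illustrative
# assumptions.
val_dataset = create_validation_dataset(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    val_dir="val",
    num_classes=100,
)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=256,
                                         shuffle=False, num_workers=4)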
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(stats_mean, stats_std),
    ])
    val_transform = transforms.Compose([
        transforms.Scale(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(stats_mean, stats_std),
    ])

    # load train dataset
    t0 = time()
    train_dataset = CachedDatasetFolder(train_path, transform=train_transform,
                                        num_classes=num_classes)
    print("Loaded train dataset")
    t1 = time()
    print("Time spent to load train dataset: {:.2f}".format(t1 - t0))

    # load test dataset
    t0 = time()
    test_dataset = CachedDatasetFolder(val_path, transform=val_transform,
                                       num_classes=num_classes)
    print("Loaded test dataset")
    t1 = time()
    print("Time spent to load test dataset: {:.2f}".format(t1 - t0))

    # load dataloaders
def create_train_dataloader(
    data_dir,
    train_dir,
    batch_size,
    workers,
    distributed,
    num_classes=1000,
    use_auto_augment=False,
):
    """
    Configure Imagenet training dataloader

    Creates :class:`torch.utils.data.DataLoader` using :class:`CachedDatasetFolder`
    or :class:`HDF5Dataset` pre-configured for the training cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param train_dir: The directory or hdf5 group containing the training data
    :param batch_size: Images per batch
    :param workers: How many data loading subprocesses to use
    :param distributed: Whether or not to use `DistributedSampler`
    :param num_classes: Limit the dataset size to the given number of classes
    :param use_auto_augment: Whether to apply the AutoAugment ImageNet policy
    :return: torch.utils.data.DataLoader
    """
    if use_auto_augment:
        transform = transforms.Compose(
            transforms=[
                RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                ImageNetPolicy(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225],
                                     inplace=True),
            ],
        )
    else:
        transform = transforms.Compose(
            transforms=[
                RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225],
                                     inplace=True),
            ],
        )

    if h5py.is_hdf5(data_dir):
        # Use fixed Imagenet classes if mapping is available
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir, root=train_dir,
                                  classes=classes, transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir, root=train_dir,
                                  num_classes=num_classes, transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, train_dir),
                                      num_classes=num_classes,
                                      transform=transform)

    if distributed:
        train_sampler = DistributedSampler(dataset)
    else:
        train_sampler = None

    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        num_workers=workers,
        sampler=train_sampler,
        pin_memory=torch.cuda.is_available(),
    )
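# Illustrative sketch (not part of the original module): the path, batch size,
# and epoch count are assumptions. With distributed=True the loader is built
# around a DistributedSampler, so each rank should advance the sampler epoch to
# get a fresh shuffle; this also assumes torch.distributed is already
# initialized.
train_loader = create_train_dataloader(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    train_dir="train",
    batch_size=64,
    workers=8,
    distributed=True,
    num_classes=1000,
)
for epoch in range(90):
    if train_loader.sampler is not None:
        train_loader.sampler.set_epoch(epoch)  # reshuffle shards across ranks
    ...  # run one training epoch over train_loader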
def create_train_dataset(data_dir, train_dir, num_classes=1000,
                         use_auto_augment=False, sample_transform=None,
                         target_transform=None, replicas_per_sample=1):
    """
    Configure Imagenet training dataset

    Creates :class:`CachedDatasetFolder` or :class:`HDF5Dataset` pre-configured
    for the training cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param train_dir: The directory or hdf5 group containing the training data
    :param num_classes: Limit the dataset size to the given number of classes
    :param use_auto_augment: Whether to apply the AutoAugment ImageNet policy
    :param sample_transform: List of transforms acting on the samples,
                             added after the defaults below
    :param target_transform: List of transforms acting on the targets
    :param replicas_per_sample: Number of replicas to create per sample in the
                                batch (each replica is transformed
                                independently). Used in maxup.
    :return: CachedDatasetFolder or HDF5Dataset
    """
    if use_auto_augment:
        transform = transforms.Compose(
            transforms=[
                RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                ImageNetPolicy(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225],
                                     inplace=True),
            ],
        )
    else:
        transform = transforms.Compose(
            transforms=[
                RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225],
                                     inplace=True),
            ],
        )

    # Append any user-supplied sample transforms after the default pipeline
    transform = transforms.Compose(transforms=[transform] + (sample_transform or []))

    if h5py.is_hdf5(data_dir):
        # Use fixed Imagenet classes if mapping is available
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir, root=train_dir,
                                  classes=classes, transform=transform,
                                  target_transform=target_transform,
                                  replicas_per_sample=replicas_per_sample)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir, root=train_dir,
                                  num_classes=num_classes, transform=transform,
                                  target_transform=target_transform,
                                  replicas_per_sample=replicas_per_sample)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, train_dir),
                                      num_classes=num_classes,
                                      transform=transform,
                                      target_transform=target_transform)
    return dataset
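# Hedged maxup-style sketch: the extra RandomErasing sample transform and the
# replica count are assumptions used only to show where the hooks plug in.
# sample_transform entries run after the default crop/flip/normalize pipeline,
# and each of the two replicas per sample is transformed independently.
train_dataset = create_train_dataset(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    train_dir="train",
    num_classes=100,
    sample_transform=[transforms.RandomErasing()],
    replicas_per_sample=2,
)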
def _create_train_dataloader(data_dir, train_dir, batch_size, workers,
                             distributed, progressive_resize, num_classes=1000):
    """
    Configure Imagenet training dataloader

    Creates :class:`torch.utils.data.DataLoader` using :class:`CachedDatasetFolder`
    or :class:`HDF5Dataset` pre-configured for the training cycle, with an optional
    :class:`ProgressiveRandomResizedCrop` schedule where the image sizes can vary
    at different epochs during the cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param train_dir: The directory or hdf5 group containing the training data
    :param batch_size: Images per batch
    :param workers: How many data loading subprocesses to use
    :param distributed: Whether or not to use `DistributedSampler`
    :param progressive_resize: Dictionary containing the progressive resize schedule
    :param num_classes: Limit the dataset size to the given number of classes
    :return: torch.utils.data.DataLoader
    """
    if progressive_resize is None:
        # Standard size for all epochs
        resize_transform = RandomResizedCrop(224)
    else:
        # Convert progressive_resize dict from {str: int} to {int: int}
        progressive_resize = {int(k): v for k, v in progressive_resize.items()}
        resize_transform = ProgressiveRandomResizedCrop(
            progressive_resize=progressive_resize)

    transform = transforms.Compose(
        transforms=[
            resize_transform,
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ],
    )

    if h5py.is_hdf5(data_dir):
        # Use fixed Imagenet classes if mapping is available
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir, root=train_dir,
                                  classes=classes, transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir, root=train_dir,
                                  num_classes=num_classes, transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, train_dir),
                                      num_classes=num_classes,
                                      transform=transform)

    if distributed:
        train_sampler = DistributedSampler(dataset)
    else:
        train_sampler = None

    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        num_workers=workers,
        sampler=train_sampler,
        pin_memory=torch.cuda.is_available(),
    )
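# Illustrative schedule (assumed breakpoints and semantics): config files
# usually carry the epoch keys as strings, which is why the helper converts
# them to ints before building ProgressiveRandomResizedCrop. The assumption
# here is that each key marks the first epoch at which the paired crop size
# takes effect; the path and loader settings are likewise illustrative.
example_progressive_resize = {
    "0": 160,   # small crops early in training
    "14": 224,  # switch to the standard 224x224 crop
    "30": 288,  # finish on larger crops
}
train_loader = _create_train_dataloader(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    train_dir="train",
    batch_size=64,
    workers=8,
    distributed=False,
    progressive_resize=example_progressive_resize,
)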