def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # ImageNet can no longer be downloaded through torchvision; fail fast if requested.
    # Use .get() so a missing "download" key doesn't raise a KeyError.
    if kwargs.get("download"):
        raise RuntimeError("Can't download ImageNet, sry")
    self.vanilla_train = ImageNet(self.dataset_dir, split="train")
    self.vanilla_val = ImageNet(self.dataset_dir, split="val")
def main(args):
    cfg = get_cfg()
    cfg.OUTPUT_DIR = args.output_dir
    num_workers = cfg.DATASETS.NUM_WORKER
    batch_size = cfg.SOLVER.BATCH_SIZE
    transforms = get_transforms()
    train_dl = DataLoader(
        # Coco2017Dataset(root=args.root),
        # CIFAR10(root=args.root, train=True, download=False, transform=transforms['train']),  # NOQA
        ImageNet(root=args.root, split='train', transform=transforms['train']),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
    )
    val_dl = DataLoader(
        # Coco2017Dataset(root=args.root, data_type='val'),
        # CIFAR10(root=args.root, train=False, download=False, transform=transforms['val']),  # NOQA
        ImageNet(root=args.root, split='val', transform=transforms['val']),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )
    assert val_dl
    optimizer = Optimizer(
        cfg=cfg,
        weight_path=args.weights,
        train_dl=train_dl,
        # val_dl=val_dl
    )
    optimizer()
def create_dataset(h5file, root, image_size=224, batch_size=100, num_workers=0):
    path = os.path.dirname(h5file)
    if not os.path.exists(path):
        os.makedirs(path)
    f = h5py.File(h5file, mode='w')
    try:
        # Train
        print('Train data:')
        data_train = ImageNet(root, 'train', transform=get_transform(image_size))
        loader_train = get_loader(data_train, batch_size, num_workers)
        n_train = len(data_train)
        train_shape = (n_train, 3, image_size, image_size)
        f.create_dataset('train_data', train_shape, np.uint8)
        # np.int was removed in NumPy 1.24; use np.int64 for the targets.
        f.create_dataset('train_targets', (n_train,), np.int64)
        for i, batch in enumerate(tqdm(loader_train)):
            inds = list(range(i * batch_size, min((i + 1) * batch_size, n_train)))
            x = batch[0].detach().cpu().numpy()
            y = batch[1].detach().cpu().numpy() + 1
            f['train_data'][inds, ...] = x
            f['train_targets'][inds] = y

        # Validation
        print('Validation data:')
        data_val = ImageNet(root, 'val', transform=get_transform(image_size))
        loader_val = get_loader(data_val, batch_size, num_workers)
        n_val = len(data_val)
        val_shape = (n_val, 3, image_size, image_size)
        f.create_dataset('val_data', val_shape, np.uint8)
        f.create_dataset('val_targets', (n_val,), np.int64)
        for i, batch in enumerate(tqdm(loader_val)):
            inds = list(range(i * batch_size, min((i + 1) * batch_size, n_val)))
            x = batch[0].detach().cpu().numpy()
            y = batch[1].detach().cpu().numpy() + 1
            f['val_data'][inds, ...] = x
            f['val_targets'][inds] = y
    finally:
        f.close()
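# Hypothetical usage sketch for create_dataset above; the h5 path and
# ImageNet root are placeholders, and get_transform/get_loader are the
# helpers assumed by that function. Reading back verifies the layout.
import h5py

create_dataset('cache/imagenet224.h5', '/data/imagenet',
               image_size=224, batch_size=100, num_workers=4)
with h5py.File('cache/imagenet224.h5', 'r') as f:
    print(f['train_data'].shape, f['train_targets'].shape)  # (N, 3, 224, 224), (N,)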
def build(self):
    train_dt = ImageNet(self.data_dir, split='train', transform=self.train_trans)
    test_dt = ImageNet(self.data_dir, split='val', transform=self.test_trans)
    return train_dt, test_dt
def get_dataloader_imagenet(train_batch_size=256,
                            test_batch_size=512,
                            dataset_root_path=IMAGENET_DATASET_ROOT_FOLDER,
                            num_workers=IMAGENET_NUM_WORKERS):
    '''Build ImageNet dataloaders with the classic transforms.

    Returns a train loader plus a validation loader; the validation split is
    reused as both the val and the test loader. Canonical data augmentation
    is applied to the train loader, and normalization is applied to all
    loaders. The validation loader uses test_batch_size.

    Arguments:
        train_batch_size (int): batch size for the train loader
        test_batch_size (int): batch size for the valid/test loader
        dataset_root_path (str): root directory of the ImageNet dataset
        num_workers (int): number of workers to use
    '''
    dataset_root_path = expanduser(dataset_root_path)

    # Build regular data transformations
    train_transforms = v_transforms.Compose([
        v_transforms.RandomResizedCrop(224),
        v_transforms.RandomHorizontalFlip(),
        v_transforms.ToTensor(),
        v_transforms.Normalize(mean=MEAN, std=STD),
    ])
    test_transforms = v_transforms.Compose([
        v_transforms.Resize(256),
        v_transforms.CenterCrop(224),
        v_transforms.ToTensor(),
        v_transforms.Normalize(mean=MEAN, std=STD),
    ])

    train_dataset = ImageNet(
        dataset_root_path,
        split='train',
        transform=train_transforms,
    )
    # Note: passing download=True raises a RuntimeError since torchvision
    # cannot fetch ImageNet; the archives must already be in dataset_root_path.
    valid_dataset = ImageNet(
        dataset_root_path,
        split='val',
        transform=test_transforms,
    )

    # We use the valid split both as the val and the test loader
    data_loader_train = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   shuffle=True,
                                   num_workers=num_workers,
                                   pin_memory=True)
    data_loader_valid = DataLoader(valid_dataset,
                                   batch_size=test_batch_size,
                                   num_workers=num_workers,
                                   pin_memory=True)

    return data_loader_train, data_loader_valid, data_loader_valid
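# Hypothetical usage sketch for get_dataloader_imagenet; MEAN/STD and the
# *_FOLDER constants come from the surrounding module, and the override
# path here is an assumption.
train_loader, valid_loader, test_loader = get_dataloader_imagenet(
    train_batch_size=256,
    test_batch_size=512,
    dataset_root_path='~/datasets/imagenet',
    num_workers=8,
)
images, targets = next(iter(valid_loader))
print(images.shape)  # e.g. torch.Size([512, 3, 224, 224])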
def get_train_val_loaders(
    root_path: str,
    train_transforms: Callable,
    val_transforms: Callable,
    batch_size: int = 16,
    num_workers: int = 8,
    val_batch_size: Optional[int] = None,
    limit_train_num_samples: Optional[int] = None,
    limit_val_num_samples: Optional[int] = None,
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    train_ds = ImageNet(
        root_path,
        split="train",
        transform=lambda sample: train_transforms(image=sample)["image"],
        loader=opencv_loader,
    )
    val_ds = ImageNet(
        root_path,
        split="val",
        transform=lambda sample: val_transforms(image=sample)["image"],
        loader=opencv_loader,
    )

    if limit_train_num_samples is not None:
        np.random.seed(limit_train_num_samples)
        train_indices = np.random.permutation(len(train_ds))[:limit_train_num_samples]
        train_ds = Subset(train_ds, train_indices)

    if limit_val_num_samples is not None:
        np.random.seed(limit_val_num_samples)
        val_indices = np.random.permutation(len(val_ds))[:limit_val_num_samples]
        val_ds = Subset(val_ds, val_indices)

    # random samples for evaluation on training dataset
    if len(val_ds) < len(train_ds):
        np.random.seed(len(val_ds))
        train_eval_indices = np.random.permutation(len(train_ds))[: len(val_ds)]
        train_eval_ds = Subset(train_ds, train_eval_indices)
    else:
        train_eval_ds = train_ds

    train_loader = idist.auto_dataloader(
        train_ds,
        shuffle=True,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=True,
    )

    val_batch_size = batch_size * 4 if val_batch_size is None else val_batch_size
    val_loader = idist.auto_dataloader(
        val_ds,
        shuffle=False,
        batch_size=val_batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    train_eval_loader = idist.auto_dataloader(
        train_eval_ds,
        shuffle=False,
        batch_size=val_batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    return train_loader, val_loader, train_eval_loader
def get_val_split(self, transform=None, num_samples=None):
    if num_samples is None:
        num_samples = self.NUM_VAL_SAMPLES
    if self.TRN_SET_SIZE - self.TOT_TRN_SAMPLES >= num_samples:
        # If there are enough samples left in the training set, use them for validation
        return Subset(
            ImageNet(root=self.DATASET_FOLDER, split='train', transform=transform),
            self.indices[self.TRN_SET_SIZE - num_samples:self.TRN_SET_SIZE])
    # Validate directly on the test set otherwise
    return Subset(
        ImageNet(root=self.DATASET_FOLDER, split='val', transform=transform),
        range(num_samples))
def prepare_imagenet_subset(data_dir: Path, imagenet_dir: Path):
    print('Preparing ImageNet subset...')
    random_state = random.Random(42)

    # Use a predownloaded ImageNet. Only the validation set is used.
    imagenet = ImageNet(imagenet_dir, split='val')

    # Randomly select 10% of the data in each category
    images = defaultdict(list)
    for image_path, category_id in imagenet.imgs:
        images[category_id].append(image_path)

    # Target root dir
    subset_dir = data_dir / 'imagenet'
    shutil.rmtree(subset_dir, ignore_errors=True)
    subset_dir.mkdir(parents=True)
    shutil.copyfile(imagenet_dir / 'meta.bin', subset_dir / 'meta.bin')

    copied_count = 0
    for category_id, imgs in images.items():
        random_state.shuffle(imgs)
        for img in imgs[:len(imgs) // 10]:
            folder_name = Path(img).parent.name
            file_name = Path(img).name
            (subset_dir / 'val' / folder_name).mkdir(exist_ok=True, parents=True)
            shutil.copyfile(img, subset_dir / 'val' / folder_name / file_name)
            copied_count += 1
    print(f'Generated a subset of {copied_count} images.')
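# Hypothetical usage sketch for prepare_imagenet_subset; both paths are
# placeholders, and the source directory must hold a full ImageNet val
# split plus meta.bin.
from pathlib import Path

prepare_imagenet_subset(Path('data'), Path('/data/imagenet'))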
def get_imagenet_valset(root=".data", transform=None, target_transform=None):
    if transform is None:
        transform = get_imagenet_val_transform()
    ds = ImageNet(root=root, split="val", transform=transform,
                  target_transform=target_transform)
    return ds
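# Hypothetical usage sketch; ".data" must already contain the ImageNet
# archives/metadata, and get_imagenet_val_transform is the helper assumed
# by the function above.
valset = get_imagenet_valset(root=".data")
image, label = valset[0]
print(len(valset), label)  # 50000 validation images, integer class id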
def setup_dataloader(batch_size: int) -> Tuple[DataLoader, DataLoader]:
    '''Setup dataloaders for training and validation.'''
    # torchvision cannot download ImageNet (download=True raises a
    # RuntimeError), so the dataset must already exist at the root dir.
    train_set = ImageNet(config.imagenet_dataset_root_dir,
                         split='train',
                         transform=image_transforms)
    val_set = ImageNet(config.imagenet_dataset_root_dir,
                       split='val',
                       transform=image_transforms)
    train_loader = DataLoader(train_set, batch_size=batch_size,
                              **config.dataloader_config)
    val_loader = DataLoader(val_set, batch_size=batch_size,
                            **config.dataloader_config)
    return train_loader, val_loader
def prepare_datasets(self, download=False):
    if download:
        raise RuntimeError("Can't download ImageNet, sry")

    # The user should already have ImageNet downloaded & extracted,
    # with meta.bin generated (this should change once the latest
    # version of pytorch, which knows you can't download ImageNet,
    # is released):
    #
    # dataset_dir
    # |-meta.bin
    # |-train
    # | |-wnid1
    # | | |-image1.jpg
    # | | ...
    # | |-wnid2
    # | ...
    # |-val
    # | |-wnid1
    # | |-wnid2
    # | ...
    #
    # Each wnid is already a client, so all we need to do in this
    # method is generate the stats.json file.

    # don't use self.vanilla_train/val since those don't exist yet
    vanilla_train = ImageNet(self.dataset_dir, split="train")
    vanilla_val = ImageNet(self.dataset_dir, split="val")

    # The clients must be sorted in the same order that looping over
    # vanilla_train will go through the classes.
    images_per_client = []
    target = -1
    for s in vanilla_train.samples:
        if s[1] != target:
            images_per_client.append(0)
            target = s[1]
        images_per_client[-1] += 1

    num_val_images = len(vanilla_val.samples)

    stats = {"images_per_client": images_per_client,
             "num_val_images": num_val_images}

    fn = self.stats_fn()
    if os.path.exists(fn):
        raise RuntimeError("won't overwrite existing stats file")
    with open(fn, "w") as f:
        json.dump(stats, f)
def imagenet_loader(batch_size, num_workers, datapath, cuda):
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform_train = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_val = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

    # The deprecated download flag is omitted; recent torchvision releases
    # removed it since ImageNet can no longer be downloaded automatically.
    trainset = ImageNet(root=datapath, split='train', transform=transform_train)
    valset = ImageNet(root=datapath, split='val', transform=transform_val)

    # pin_memory only helps when batches are copied to a CUDA device
    pin_memory = bool(cuda)
    train_loader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True,
        num_workers=num_workers, pin_memory=pin_memory)
    val_loader = torch.utils.data.DataLoader(
        valset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=pin_memory)

    return train_loader, val_loader
def benchmark(model: nn.Module, transform, batch_size=64, device=device, fast: bool = False):
    valid_dataset = ImageNet(root="/home/zuppif/Downloads/ImageNet", split="val",
                             transform=transform)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=12,
        pin_memory=True,
    )
    evaluator = ImageNetEvaluator(model_name="test",
                                  paper_arxiv_id="1905.11946")
    model.eval().to(device)
    num_batches = int(
        math.ceil(len(valid_loader.dataset) / float(valid_loader.batch_size)))
    start = time.time()
    with torch.no_grad():
        pbar = tqdm(np.arange(num_batches), leave=False)
        for i_val, (images, labels) in enumerate(valid_loader):
            images = images.to(device)
            labels = torch.squeeze(labels.to(device))
            net_out = model(images)
            image_ids = [
                get_img_id(img[0])
                for img in valid_loader.dataset.imgs[i_val * valid_loader.batch_size:
                                                     (i_val + 1) * valid_loader.batch_size]
            ]
            evaluator.add(dict(zip(image_ids, list(net_out.cpu().numpy()))))
            # running top-1 accuracy (the original "f1=" label was misleading)
            pbar.set_description(f"top1={evaluator.top1.avg:.2f}")
            pbar.update(1)
            if fast:
                break
        pbar.close()
    stop = time.time()
    if fast:
        return evaluator.top1.avg, None, None
    else:
        res = evaluator.get_results()
        return res["Top 1 Accuracy"], res["Top 5 Accuracy"], stop - start
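# Hypothetical usage sketch for benchmark(); the torchvision model, weights
# name, and eval transform are assumptions, and the dataset path is the one
# hard-coded in the function above.
from torchvision import models, transforms as T

transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
model = models.resnet50(weights="IMAGENET1K_V1")
top1, _, _ = benchmark(model, transform, batch_size=64, fast=True)  # single-batch smoke test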
def imagenet1k():
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    dataset_dir = os.path.expanduser('~/Datasets/imagenet')
    dataset_train = ImageNet(dataset_dir, split='train',
                             transform=transforms.Compose([
                                 transforms.RandomResizedCrop(224),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.ToTensor(),
                                 normalize,
                             ]))
    dataset_val = ImageNet(dataset_dir, split='val',
                           transform=transforms.Compose([
                               transforms.Resize(256),
                               transforms.CenterCrop(224),
                               transforms.ToTensor(),
                               normalize,
                           ]))
    return dataset_train, dataset_val
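# Hypothetical usage sketch: imagenet1k() returns datasets, not loaders, so
# wrap them yourself; batch size and worker count here are assumptions.
from torch.utils.data import DataLoader

dataset_train, dataset_val = imagenet1k()
train_loader = DataLoader(dataset_train, batch_size=256, shuffle=True, num_workers=8)
val_loader = DataLoader(dataset_val, batch_size=256, shuffle=False, num_workers=8)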
def make_loader(path, split, batch_size, num_workers):
    if split == 'train':
        transform = Compose([
            RandomResizedCrop(224, scale=(0.3, 4.0 / 3)),
            ColorJitter(0.4, 0.4, 0.4),
            RandomHorizontalFlip(),
            ToTensor(),
        ])
    else:
        transform = Compose([Resize(256), CenterCrop(224), ToTensor()])
    ds = ImageNet(path, split=split, transform=transform)
    loader = DataLoader(dataset=ds,
                        batch_size=batch_size,
                        shuffle=(split == 'train'),
                        num_workers=num_workers)
    return loader
def __init__(self, root='data', features_name='resnet50', train=True,
             subsample_to=-1, subsample_seed=123):
    """
    Parameters
    ----------
    root : string
        Path to the root of the data folder
    features_name : string
        Features expected to be in {root}/imagenet/{features_name}_features
    train : bool
        If True, loads the train set
    subsample_to : int
        Number of examples to sample
    subsample_seed : int
        Seed that decides the randomness of the subsampling.
    """
    split = 'train' if train else 'val'

    # load the base ImageNet object just for reading the class names
    imagenet_base = ImageNet(os.path.join(root, 'imagenet'), split='val')
    group_names = [
        '%03d_' % i + e[0].replace(' ', '_')
        for i, e in enumerate(imagenet_base.classes)
    ]

    features_path = os.path.join(root, 'imagenet',
                                 features_name + '_features', split + '.pt')
    data = torch.load(features_path)
    xy = data['features'], data['targets']
    if subsample_to > 0:
        xy = utils.subsample_arrays(xy, subsample_to, subsample_seed)

    group_ids = xy[1]
    inner_dataset = TensorDataset(*xy)
    self.num_classes = 1000
    super().__init__(inner_dataset, group_ids, group_names)
def __init__(self, params, batch_size, num_gpus, is_training):
    super(IMAGENETReader, self).__init__(
        params, batch_size, num_gpus, is_training)

    # Provide square images of this size.
    self.image_size = self.params.imagenet_image_size
    if 'efficientnet' in self.params.model:
        self.image_size = {
            'efficientnet-b0': 224,
            'efficientnet-b1': 240,
            'efficientnet-b2': 260,
            'efficientnet-b3': 300,
            'efficientnet-b4': 380,
            'efficientnet-b5': 456,
            'efficientnet-b6': 528,
            'efficientnet-b7': 600,
        }[self.params.model]

    self.eigval = [0.2175, 0.0188, 0.0045]
    self.eigvec = [
        [-0.5675, 0.7192, 0.4009],
        [-0.5808, -0.0045, -0.8140],
        [-0.5836, -0.6948, 0.4203],
    ]

    self.normalize_mean = [0.485, 0.456, 0.406]
    self.normalize_std = [0.229, 0.224, 0.225]
    self.imagenet_normalize = transforms.Normalize(
        mean=self.normalize_mean, std=self.normalize_std)

    self.height, self.width = self.image_size, self.image_size
    self.n_train_files = 1281167
    self.n_test_files = 50000
    self.n_classes = 1000
    # images are square, so (height, width) are interchangeable here
    self.img_size = (None, 3, self.height, self.width)

    split = 'train' if self.is_training else 'val'
    if 'efficientnet' in self.params.model:
        transform = self.efficientnet_transform()
    else:
        transform = self.transform()
    self.dataset = ImageNet(self.path, split=split, transform=transform)
def __init__(self, params, batch_size, num_gpus, is_training):
    super(IMAGENETReader, self).__init__(params, batch_size, num_gpus,
                                         is_training)

    # Provide square images of this size.
    self.image_size = self.params.imagenet_image_size
    if 'efficientnet' in self.params.model:
        self.image_size = {
            'efficientnet-b0': 224,
            'efficientnet-b1': 240,
            'efficientnet-b2': 260,
            'efficientnet-b3': 300,
            'efficientnet-b4': 380,
            'efficientnet-b5': 456,
            'efficientnet-b6': 528,
            'efficientnet-b7': 600,
        }[self.params.model]

    self.imagenet_pca = {
        'eigval': [0.2175, 0.0188, 0.0045],
        'eigvec': [
            [-0.5675, 0.7192, 0.4009],
            [-0.5808, -0.0045, -0.8140],
            [-0.5836, -0.6948, 0.4203],
        ]
    }

    self.height, self.width = self.image_size, self.image_size
    self.n_train_files = 1281167
    self.n_test_files = 50000
    self.n_classes = 1000
    # images are square, so (height, width) are interchangeable here
    self.batch_shape = (None, 3, self.height, self.width)

    split = 'train' if self.is_training else 'val'
    if 'efficientnet' in self.params.model:
        transform = self.efficientnet_transform()
    else:
        transform = self.transform()
    if self.add_noise:
        transform.transforms.append(AddNoise(self.params))
    # the deprecated download flag is omitted; ImageNet must already be local
    self.dataset = ImageNet(self.path, split=split, transform=transform)
def prepare_data(self):
    train_resize = transforms.Resize((self.hparams.image_size, self.hparams.image_size))
    train_normalize = transforms.Normalize(mean=[0.5], std=[0.5])
    train_transform = transforms.Compose([train_resize, transforms.ToTensor(), train_normalize])

    if self.hparams.dataset == "mnist":
        self.train_dataset = MNIST(self.hparams.dataset_path, train=True, download=True, transform=train_transform)
        # self.test_dataset = MNIST(self.hparams.dataset_path, train=False, download=True, transform=test_transform)
    elif self.hparams.dataset == "fashion_mnist":
        self.train_dataset = FashionMNIST(self.hparams.dataset_path, train=True, download=True, transform=train_transform)
        # self.test_dataset = FashionMNIST(self.hparams.dataset_path, train=False, download=True, transform=test_transform)
    elif self.hparams.dataset == "cifar10":
        self.train_dataset = CIFAR10(self.hparams.dataset_path, train=True, download=True, transform=train_transform)
        # self.test_dataset = CIFAR10(self.hparams.dataset_path, train=False, download=True, transform=test_transform)
    elif self.hparams.dataset == "image_net":
        # ImageNet takes split= (not train=) and cannot be downloaded by
        # torchvision; the data must already be at dataset_path.
        self.train_dataset = ImageNet(self.hparams.dataset_path, split="train", transform=train_transform)
        # self.test_dataset = ImageNet(self.hparams.dataset_path, split="val", transform=test_transform)
    elif self.hparams.dataset == "lsun":
        self.train_dataset = LSUN(self.hparams.dataset_path + "/lsun", classes=[cls + "_train" for cls in self.hparams.dataset_classes], transform=train_transform)
        # self.test_dataset = LSUN(self.hparams.dataset_path, classes=[cls + "_test" for cls in self.hparams.dataset_classes], transform=test_transform)
    elif self.hparams.dataset == "celeba_hq":
        self.train_dataset = CelebAHQ(self.hparams.dataset_path, image_size=self.hparams.image_size, transform=train_transform)
    else:
        raise NotImplementedError("Custom dataset is not implemented yet")
def main(args):
    model = Model()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=args.nesterov)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.step_size,
                                                gamma=args.gamma)
    if args.scheduler == 'multistep':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         args.milestones,
                                                         gamma=args.gamma)
    elif args.scheduler == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.step_size)
    criterion = torch.nn.CrossEntropyLoss()

    model = model.cuda()
    criterion = criterion.cuda()
    start_epoch = 0

    # Check the number of parameters in the model
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {pytorch_total_params}")

    if not os.path.exists('{}'.format(args.savepath)):
        os.makedirs('{}'.format(args.savepath))

    # resume
    if args.resume:
        model, optimizer, start_epoch = load_ckpt(model, optimizer, args)

    # Dataloader
    if args.dataset == 'cifar10':
        normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                         std=[0.2023, 0.1994, 0.2010])
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_train.transforms.insert(
            0, RandAugment(args.rand_n, args.rand_m))
        transform_val = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
        trainset = CIFAR10(root=args.datapath, train=True, download=True,
                           transform=transform_train)
        valset = CIFAR10(root=args.datapath, train=False, download=True,
                         transform=transform_val)
    elif args.dataset == 'cifar100':
        normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                         std=[0.2023, 0.1994, 0.2010])
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_val = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
        trainset = CIFAR100(root=args.datapath, train=True, download=True,
                            transform=transform_train)
        valset = CIFAR100(root=args.datapath, train=False, download=True,
                          transform=transform_val)
    elif args.dataset == 'ImageNet':
        # image_size is assumed to be defined elsewhere (module level or args)
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_val = transforms.Compose([
            transforms.Resize(image_size + 32),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ])
        # ImageNet cannot be downloaded by torchvision; it must be at datapath
        trainset = ImageNet(root=args.datapath, split='train',
                            transform=transform_train)
        valset = ImageNet(root=args.datapath, split='val',
                          transform=transform_val)
    elif args.dataset == 'tiny-imagenet-200':
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_val = transforms.Compose([
            transforms.Resize(image_size + 32),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ])
        # ImageFolder has no split/download arguments; point it at the
        # per-split subdirectories instead
        trainset = ImageFolder(root=os.path.join(args.datapath, 'train'),
                               transform=transform_train)
        valset = ImageFolder(root=os.path.join(args.datapath, 'val'),
                             transform=transform_val)

    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=False)
    val_loader = torch.utils.data.DataLoader(valset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers,
                                             pin_memory=False)

    # start training
    last_top1_acc = 0
    acc1_valid = 0
    best_acc1 = 0
    is_best = False
    for epoch in range(start_epoch, args.epochs):
        print("\n----- epoch: {}, lr: {} -----".format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        start_time = time.time()
        last_top1_acc = train(train_loader, epoch, model, optimizer, criterion)
        elapsed_time = time.time() - start_time
        print('==> {:.2f} seconds to train this epoch\n'.format(elapsed_time))

        # validate for one epoch
        start_time = time.time()
        acc1_valid = validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        print('==> {:.2f} seconds to validate this epoch\n'.format(elapsed_time))

        # learning rate scheduling
        scheduler.step()

        summary = [epoch, last_top1_acc, acc1_valid.item()]

        is_best = acc1_valid > best_acc1
        best_acc1 = max(acc1_valid, best_acc1)
        save_summary('rexnetv1', args.dataset, args.name, summary)

        checkpoint = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_ckpt(checkpoint, is_best, args)
        # if is_best:
        #     torch.save(model.state_dict(), args.savepath + 'model_weight_best.pth')
        # Save model each epoch
        # torch.save(model.state_dict(), args.savepath + 'model_weight_epoch{}.pth'.format(epoch))

    print(f"Last Top-1 Accuracy: {last_top1_acc}")
    print(f"Best valid Top-1 Accuracy: {best_acc1}")
    print(f"Number of parameters: {pytorch_total_params}")
def get_test_split(self, transform=None, num_samples=None):
    if num_samples is None:
        num_samples = self.NUM_TST_SAMPLES
    return Subset(
        ImageNet(root=self.DATASET_FOLDER, split='val', transform=transform),
        range(num_samples))
def get_train_split(self, transform=None, num_samples=None):
    if num_samples is None:
        num_samples = self.TOT_TRN_SAMPLES
    return Subset(
        ImageNet(root=self.DATASET_FOLDER, split='train', transform=transform),
        self.indices[:num_samples])
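# Hypothetical usage sketch for the three split helpers above; `splits`
# stands in for an instance of the (unnamed) class that defines them, and
# the eval transform here is an assumption.
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
train_subset = splits.get_train_split(transform=eval_transform)
val_subset = splits.get_val_split(transform=eval_transform, num_samples=5000)
test_subset = splits.get_test_split(transform=eval_transform)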
def get_dataset(path: Path, transforms: Optional[Callable] = None) -> Dataset:
    return ImageNet(str(path), split='val', transform=transforms)
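# Hypothetical usage sketch for get_dataset; the path and transform are
# assumptions.
from pathlib import Path
from torchvision import transforms as T

ds = get_dataset(Path('/data/imagenet'),
                 transforms=T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()]))
print(len(ds))  # 50000 validation images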
def _make_data_loader_imagenet(root, batch_size, workers=4, is_train=True,
                               download=False, distributed=False):
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    logger = logging.getLogger('octconv')

    if is_train:
        logger.info("Loading ImageNet training data")
        st = time.time()
        scale = (0.08, 1.0)
        transform = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=scale),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        # download must stay False: torchvision cannot fetch ImageNet
        dataset = ImageNet(root=root, split='train', download=download,
                           transform=transform)
        logger.info("Took: {}".format(time.time() - st))

        if distributed:
            sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        else:
            sampler = torch.utils.data.RandomSampler(dataset)

        loader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=workers,
                            sampler=sampler,
                            pin_memory=True)
    else:
        logger.info("Loading ImageNet validation data")
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])
        dataset = ImageNet(root=root, split='val', download=download,
                           transform=transform)

        if distributed:
            sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)

        loader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=workers,
                            sampler=sampler,
                            pin_memory=True)

    return loader
def load_dataset(dataset_name, **kwargs):
    """
    Loads the specified dataset and returns a PyTorch dataset object.
    Applies the standard transformations for said dataset by default.
    """
    data_path = pathlib.Path('data').resolve()

    if dataset_name == 'cifar10':
        from torchvision.datasets import CIFAR10
        # This is the standard normalization transformation
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        # User can specify to load the training set; loads the test set by default.
        train = kwargs.pop('train', False)
        dataset = CIFAR10(data_path, train=train, transform=transform, download=True)
    elif dataset_name == 'cifar100':
        from torchvision.datasets import CIFAR100
        # This is the standard normalization transformation
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        # User can specify to load the training set; loads the test set by default.
        train = kwargs.pop('train', False)
        dataset = CIFAR100(data_path, train=train, transform=transform, download=True)
    elif dataset_name == 'imagenet':
        # Requires ImageNet to be downloaded locally
        from torchvision.datasets import ImageNet
        # Standard transformation
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
        dataset = ImageNet(data_path / 'imagenet', split='val', transform=transform)
    elif dataset_name == 'cifar10r':
        from data.nonstationary_datasets import CIFAR10R
        dataset = CIFAR10R()
    elif dataset_name == 'cifar100r':
        from data.nonstationary_datasets import CIFAR100R
        dataset = CIFAR100R()
    elif dataset_name == 'cifar10gb':
        from data.nonstationary_datasets import CIFAR10GB
        dataset = CIFAR10GB()
    elif dataset_name == 'cifar100gb':
        from data.nonstationary_datasets import CIFAR100GB
        dataset = CIFAR100GB()
    elif dataset_name == 'cifar10imba':
        from data.imbalanced_datasets import CIFAR10Imba
        dataset = CIFAR10Imba(class_ratios=kwargs['class_ratios'])
    else:
        raise NotImplementedError

    return dataset
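# Hypothetical usage sketch for load_dataset; dataset names mirror the
# branches above, and 'data/imagenet' must already hold a local ImageNet
# copy for the 'imagenet' branch.
cifar_train = load_dataset('cifar10', train=True)   # train split via kwargs
imagenet_val = load_dataset('imagenet')             # always the val split
print(len(cifar_train), len(imagenet_val))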
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = getattr(resnet, args.arch)()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    # Instantiating ImageNet parses the local archives (generating meta.bin
    # if needed) before the ImageFolder-based loaders below are built.
    img_net = ImageNet(args.data)
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import ImageNet

print("Pytorch version: {}".format(torch.__version__))

# useful links
# https://pytorch.org/docs/stable/_modules/torchvision/datasets/imagenet.html#ImageNet
# https://pytorch.org/docs/stable/torchvision/datasets.html#imagenet

# %%
"""
Dataloader for ImageNet
-----------------------

The source code of the ImageNet class from pytorch says:

    The dataset is no longer publicly accessible. You need to download the
    archives externally and place them in the root directory.

I couldn't download the images because it takes 5 days to verify a new
account on the ImageNet website, so I just implemented the dataloader with
the ImageNet class from pytorch.
"""

root_dir = "path_to_ImageNet_directory_file"
data = ImageNet(root=root_dir)
data_loader = DataLoader(data, batch_size=1)
# input_transform = transforms.Compose([
#     transforms.Resize(224, PIL.Image.BICUBIC),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])
input_transform = transforms.Compose([
    transforms.Resize(256, PIL.Image.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# download=True would raise a RuntimeError: torchvision cannot download
# ImageNet, so the archives must already be present under DATA_ROOT.
test_dataset = ImageNet(
    DATA_ROOT,
    split="val",
    transform=input_transform,
    target_transform=None,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

model = model.cuda()
model.eval()
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=3)

################################################
# Dataset and train-test helpers
################################################

transform_val = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

data_root = args.IMAGENET_DIR
train_ds = ImageNet(data_root, split='train', transform=transform_val)
train_loader = torch.utils.data.DataLoader(train_ds,
                                           batch_size=args.BATCHSIZE,
                                           shuffle=True,
                                           drop_last=False,
                                           num_workers=args.NUM_WORKERS,
                                           pin_memory=True)

val_ds = ImageNet(data_root, split='val', transform=transform_val)
# loader arguments mirror the train loader above
val_loader = torch.utils.data.DataLoader(val_ds,
                                         batch_size=args.BATCHSIZE,
                                         shuffle=True,
                                         drop_last=False,
                                         num_workers=args.NUM_WORKERS,
                                         pin_memory=True)
def create_dataset(name, root, split='validation', search_split=True,
                   class_map=None, load_bytes=False, is_training=False,
                   download=False, batch_size=None, repeats=0, **kwargs):
    """ Dataset factory method

    In parenthesis after each arg are the types of dataset supported for each
    arg, one of:
      * folder - default, timm folder (or tar) based ImageDataset
      * torch - torchvision based datasets
      * TFDS - Tensorflow-datasets wrapper in IterableDataset interface via
        IterableImageDataset
      * all - any of the above

    Args:
        name: dataset name, empty is okay for folder based datasets
        root: root folder of dataset (all)
        split: dataset split (all)
        search_split: search for split specific child fold from root so one
            can specify `imagenet/` instead of `/imagenet/val`, etc on cmd
            line / config. (folder, torch/folder)
        class_map: specify class -> index mapping via text file or dict (folder)
        load_bytes: load data, return images as undecoded bytes (folder)
        download: download dataset if not present and supported (TFDS, torch)
        is_training: create dataset in train mode, this is different from the
            split. For Iterable / TFDS it enables shuffle, ignored for other
            datasets. (TFDS)
        batch_size: batch size hint for (TFDS)
        repeats: dataset repeats per iteration i.e. epoch (TFDS)
        **kwargs: other args to pass to dataset

    Returns:
        Dataset object
    """
    name = name.lower()
    if name.startswith('torch/'):
        name = name.split('/', 2)[-1]
        torch_kwargs = dict(root=root, download=download, **kwargs)
        if name in _TORCH_BASIC_DS:
            ds_class = _TORCH_BASIC_DS[name]
            use_train = split in _TRAIN_SYNONYM
            ds = ds_class(train=use_train, **torch_kwargs)
        elif name == 'inaturalist' or name == 'inat':
            assert has_inaturalist, 'Please update to PyTorch 1.10, torchvision 0.11+ for Inaturalist'
            target_type = 'full'
            split_split = split.split('/')
            if len(split_split) > 1:
                target_type = split_split[0].split('_')
                if len(target_type) == 1:
                    target_type = target_type[0]
                split = split_split[-1]
            if split in _TRAIN_SYNONYM:
                split = '2021_train'
            elif split in _EVAL_SYNONYM:
                split = '2021_valid'
            ds = INaturalist(version=split, target_type=target_type, **torch_kwargs)
        elif name == 'places365':
            assert has_places365, 'Please update to a newer PyTorch and torchvision for Places365 dataset.'
            if split in _TRAIN_SYNONYM:
                split = 'train-standard'
            elif split in _EVAL_SYNONYM:
                split = 'val'
            ds = Places365(split=split, **torch_kwargs)
        elif name == 'imagenet':
            if split in _EVAL_SYNONYM:
                split = 'val'
            ds = ImageNet(split=split, **torch_kwargs)
        elif name == 'image_folder' or name == 'folder':
            # in case torchvision ImageFolder is preferred over timm ImageDataset for some reason
            if search_split and os.path.isdir(root):
                # look for split specific sub-folder in root
                root = _search_split(root, split)
            ds = ImageFolder(root, **kwargs)
        else:
            assert False, f"Unknown torchvision dataset {name}"
    elif name.startswith('tfds/'):
        ds = IterableImageDataset(
            root, parser=name, split=split, is_training=is_training,
            download=download, batch_size=batch_size, repeats=repeats, **kwargs)
    else:
        # FIXME support more advanced split cfg for ImageFolder/Tar datasets in the future
        if search_split and os.path.isdir(root):
            # look for split specific sub-folder in root
            root = _search_split(root, split)
        ds = ImageDataset(root, parser=name, class_map=class_map,
                          load_bytes=load_bytes, **kwargs)
    return ds
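# Hypothetical usage sketch for the factory above; the 'torch/' prefix
# routes to torchvision datasets ('imagenet' maps split synonyms to 'val'),
# an empty name falls through to the timm folder dataset, and the root
# path is a placeholder.
val_ds = create_dataset('torch/imagenet', root='/data/imagenet', split='validation')
folder_ds = create_dataset('', root='/data/imagenet', split='val')
print(len(val_ds), len(folder_ds))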