import numpy as np
import torch
from skimage.filters import gaussian as gblur


def blobs(data_dir, batch_size, mode="base", normalize=True, norm_layer=None,
          size=32):
    """Blob-noise loader. Minimal version since we use this dataset only for
    OOD evaluation; `data_dir` and `mode` are unused here."""
    # Binary noise, blurred and thresholded into blobs, shape (N, H, W, C).
    # NB: newer scikit-image releases use `channel_axis` instead of the
    # deprecated `multichannel` argument.
    data = np.float32(
        np.random.binomial(n=1, p=0.7, size=(10000, size, size, 3)))
    for i in range(10000):
        data[i] = gblur(data[i], sigma=1.5, multichannel=False)
        data[i][data[i] < 0.75] = 0.0
    dummy_targets = torch.ones(10000)
    data = torch.from_numpy(data.transpose((0, 3, 1, 2)))  # NHWC -> NCHW
    # Apply the normalization layer per image only if one is provided
    # (the original applied it unconditionally, which crashes with the
    # default norm_layer=None).
    if normalize and norm_layer is not None:
        data = torch.cat([norm_layer(x).unsqueeze(0) for x in data])
    dataset = torch.utils.data.TensorDataset(data, dummy_targets)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=4,
                                         pin_memory=True)
    # The zeros are placeholders; only the OOD loader is built here.
    return 0, loader, 0
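# Hypothetical usage sketch (not part of the original module): only the
# loader return value is meaningful, and the normalization layer chosen
# here is an assumption.
from torchvision import transforms

_, ood_loader, _ = blobs(data_dir=None, batch_size=128,
                         norm_layer=transforms.Normalize(mean=[0.5] * 3,
                                                         std=[0.5] * 3))
images, _ = next(iter(ood_loader))
print(images.shape)  # expected: torch.Size([128, 3, 32, 32])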
def _DataGenerator(self, dataName, num_examples):
    """Create synthetic OOD data using numpy."""
    if dataName == "gaussian":
        # Gaussian noise, clipped to [-1, 1].
        data = np.clip(
            np.random.normal(size=(num_examples, self.size, self.size, 3),
                             scale=0.5), -1, 1).astype(np.float32)
    elif dataName == "rademacher":
        # Rademacher noise: each pixel is -1 or +1 with equal probability.
        data = np.random.binomial(
            n=1, p=0.5,
            size=(num_examples, self.size, self.size, 3)).astype(
                np.float32) * 2 - 1
    elif dataName == "blob":
        # Blob noise: binary noise blurred with a Gaussian filter, then
        # thresholded.
        data = np.random.binomial(
            n=1, p=0.7,
            size=(num_examples, self.size, self.size, 3)).astype(np.float32)
        for i in range(num_examples):
            data[i] = gblur(data[i], sigma=1.5, multichannel=False)
            data[i][data[i] < 0.75] = 0.0
    else:
        # Guard against silently returning undefined data.
        raise ValueError("Unknown dataName: {}".format(dataName))
    # Random labels; this data is only used for OOD evaluation.
    label = np.random.randint(low=0, high=10, size=num_examples,
                              dtype=np.int64)
    return data, label
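# Standalone sanity check (illustrative, not part of the original class):
# the Rademacher branch maps Bernoulli(0.5) samples from {0, 1} onto {-1, +1}.
import numpy as np

samples = np.random.binomial(n=1, p=0.5,
                             size=(4, 4)).astype(np.float32) * 2 - 1
assert set(np.unique(samples)).issubset({-1.0, 1.0})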
def __init__(self, size=32, batch_sizes={'ood': 100}, noise_type='Gaussian',
             dataset_len=2000):
    super(Noise, self).__init__(None, size, batch_sizes, {'ood': None})
    if isinstance(size, tuple):
        h, w = size
    else:
        h = w = size
    dummy_targets = torch.ones(dataset_len)
    # Original source of the noise formulation:
    # https://github.com/hendrycks/outlier-exposure
    if noise_type == 'Gaussian':
        noise_data = torch.from_numpy(
            np.float32(
                np.clip(
                    np.random.normal(size=(dataset_len, 3, h, w), scale=0.5),
                    -1, 1)))
    elif noise_type == 'Rademacher':
        noise_data = torch.from_numpy(
            np.random.binomial(
                n=1, p=0.5,
                size=(dataset_len, 3, h, w)).astype(np.float32)) * 2 - 1
    elif noise_type == 'Blob':
        from skimage.filters import gaussian as gblur
        noise_data = np.float32(
            np.random.binomial(n=1, p=0.7, size=(dataset_len, h, w, 3)))
        for i in range(dataset_len):
            noise_data[i] = gblur(noise_data[i], sigma=1.5,
                                  multichannel=False)
            noise_data[i][noise_data[i] < 0.75] = 0.0
        noise_data = torch.from_numpy(
            noise_data.transpose((0, 3, 1, 2))) * 2 - 1  # NHWC -> NCHW
    else:
        # Guard against silently leaving noise_data undefined.
        raise ValueError("Unknown noise_type: {}".format(noise_type))
    noise_data = torch.utils.data.TensorDataset(noise_data, dummy_targets)
    self.loader = torch.utils.data.DataLoader(noise_data,
                                              batch_size=batch_sizes['ood'],
                                              shuffle=True,
                                              num_workers=12,
                                              pin_memory=True)
    self.noise_type = noise_type
    self.name = "{}_{}".format(self.name, noise_type)
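# Hypothetical usage sketch (the parent class and its constructor are not
# shown here, so the inherited behaviour and `self.name` are assumptions):
ood = Noise(size=32, batch_sizes={'ood': 100}, noise_type='Blob')
images, _ = next(iter(ood.loader))
print(ood.name, images.shape)  # e.g. "<base-name>_Blob torch.Size([100, 3, 32, 32])"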
ood_data = torch.utils.data.TensorDataset(ood_data, dummy_targets)
ood_loader = torch.utils.data.DataLoader(ood_data,
                                         batch_size=args.test_bs,
                                         shuffle=True)
print('\n\nRademacher Noise Calibration')
get_and_print_results(ood_loader)

# /////////////// Blob ///////////////

ood_data = np.float32(
    np.random.binomial(n=1, p=0.7,
                       size=(ood_num_examples * args.num_to_avg, 32, 32, 3)))
for i in range(ood_num_examples * args.num_to_avg):
    ood_data[i] = gblur(ood_data[i], sigma=1.5, multichannel=False)
    ood_data[i][ood_data[i] < 0.75] = 0.0

dummy_targets = torch.ones(ood_num_examples * args.num_to_avg)
ood_data = torch.from_numpy(ood_data.transpose((0, 3, 1, 2))) * 2 - 1
ood_data = torch.utils.data.TensorDataset(ood_data, dummy_targets)
ood_loader = torch.utils.data.DataLoader(ood_data,
                                         batch_size=args.test_bs,
                                         shuffle=True,
                                         num_workers=args.prefetch,
                                         pin_memory=True)

print('\n\nBlob Calibration')
get_and_print_results(ood_loader)

# /////////////// Textures ///////////////
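# `get_and_print_results` is defined elsewhere in this script. A minimal
# sketch of what such a helper typically computes, assuming hypothetical
# `in_scores` (per-sample OOD scores on the in-distribution test set) and a
# `score_fn` mapping a batch to per-sample scores (higher = more anomalous):
import numpy as np
from sklearn.metrics import roc_auc_score

def demo_auroc(in_scores, ood_loader, score_fn):
    ood_scores = np.concatenate([score_fn(x) for x, _ in ood_loader])
    labels = np.concatenate([np.zeros_like(in_scores),
                             np.ones_like(ood_scores)])
    return roc_auc_score(labels, np.concatenate([in_scores, ood_scores]))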
def get_dataset(name,
                split='train',
                transform=None,
                target_transform=None,
                download=True,
                datasets_path=__DATASETS_DEFAULT_PATH,
                limit=None,
                shuffle_before_limit=False,
                limit_shuffle_seed=None,
                class_ids=None,
                per_class_limit=True,
                split_start=0):
    train = (split == 'train')
    # Concatenation of several datasets, e.g. "cifar10+svhn".
    if '+' in name:
        ds = None
        for ds_name in name.split('+'):
            ds_ = get_dataset(ds_name, split, transform, target_transform,
                              download, limit=limit,
                              shuffle_before_limit=shuffle_before_limit,
                              datasets_path=__DATASETS_DEFAULT_PATH,
                              split_start=split_start)
            if ds is None:
                ds = ds_
            else:
                ds += ds_
        # Note: assumes exactly two concatenated datasets.
        ds.targets = ds.datasets[0].targets + ds.datasets[1].targets
        if limit or class_ids:
            ds = limit_ds(ds, limit, per_class=per_class_limit,
                          shuffle=shuffle_before_limit,
                          seed=limit_shuffle_seed,
                          allowed_classes=class_ids,
                          split_start=split_start)
        return ds
    # "_3c" suffix: replicate a single channel to three channels.
    if name.endswith('_3c'):
        transform = tv.transforms.Compose([
            tv.transforms.ToTensor(),
            lambda x: x.repeat(3, 1, 1),
            tv.transforms.ToPILImage(), transform
        ])
        name = name[:-3]
    # Resolve the on-disk directory name for each naming convention.
    if name.endswith('-raw'):
        ds_dir_name = name[:-4]
    elif name.startswith('folder-'):
        ds_dir_name = name[7:]
    elif name == 'places365_standard-lsun':
        ds_dir_name = 'places365_standard'
        name = ds_dir_name
        # Keep all Places365 classes except the excluded ids (materialized
        # as a list rather than a lazy filter object, which can only be
        # iterated once).
        class_ids = [x for x in range(365) if x not in
                     [52, 66, 91, 92, 102, 121, 203, 215, 284, 334]]
    elif name.startswith('DomainNet-'):
        parts = name.split('-')
        domain = parts[1]
        if name.endswith('-measure') and train:
            ds_dir_name = os.path.join('DomainNet', 'measure', domain)
        else:
            ds_dir_name = os.path.join('DomainNet',
                                       'train' if train else 'test', domain)
        name = name.replace('-measure', '')
        if name.endswith('-A') or name.endswith('-B'):
            subset = parts[2]  # renamed from `set` to avoid shadowing the builtin
            class_ids = range(173) if subset == 'A' else range(173, 345)
    elif name.endswith('-dogs') or name.endswith('-cats'):
        if name.startswith('imagenet-'):
            if name.endswith('dogs'):
                _ids = _imagenet_dogs.keys()
            else:
                _ids = _imagenet_cats.keys()
        else:
            _ids = [1] if name.endswith('dogs') else [0]
        return get_dataset(name[:-5], split, transform, target_transform,
                           download, limit=limit, shuffle_before_limit=True,
                           datasets_path=__DATASETS_DEFAULT_PATH,
                           class_ids=_ids, per_class_limit=False,
                           limit_shuffle_seed=0, split_start=split_start)
    elif name.startswith('imagine-'):
        if train:
            ds_dir_name = None
            for i_cfg in _IMAGINE_CONFIGS:
                idx = name.find(i_cfg)
                if idx > 0:
                    ds_dir_name = os.path.join(name[:idx - 1], i_cfg,
                                               name[idx + len(i_cfg) + 1:])
                    break
            assert ds_dir_name is not None
        else:
            # Fall back to the underlying real dataset at evaluation time.
            return get_dataset(name.split('-')[1], split, transform,
                               target_transform, download, limit=limit,
                               shuffle_before_limit=shuffle_before_limit,
                               datasets_path=__DATASETS_DEFAULT_PATH,
                               split_start=split_start)
    else:
        ds_dir_name = name
    root = os.path.join(datasets_path, ds_dir_name)
    if name == 'cifar10':
        return datasets.CIFAR10(root=root, train=train, transform=transform,
                                target_transform=target_transform,
                                download=download)
    elif name == 'cifar100':
        return datasets.CIFAR100(root=root, train=train, transform=transform,
                                 target_transform=target_transform,
                                 download=download)
    elif name == 'cifar10.1':
        return CIFAR10_1(root=root, transform=transform,
                         target_transform=target_transform,
                         download=download)
    elif name.startswith('cifar10_custom'):
        # Build a combination of CIFAR-10 splits named inside `split`.
        ds_ = []
        if 'val' in split:
            ds_.append(get_dataset('cifar10', split='val',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        if '10.1' in split:
            ds_.append(get_dataset('cifar10.1', split='',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        if 'train' in split:
            ds_.append(get_dataset('cifar10', split='train',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        if 'ext' in split:
            ds_.append(get_dataset('folder-cifar10_ext', split='val',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        ds = ds_[0]
        for d in ds_[1:]:
            ds += d
        return limit_ds(ds, limit=limit, split_start=split_start,
                        per_class=per_class_limit,
                        shuffle=shuffle_before_limit,
                        seed=limit_shuffle_seed, allowed_classes=class_ids)
    elif name.lower().startswith('svhn_custom'):
        ds_ = []
        if 'val' in split or 'test' in split:
            ds_.append(get_dataset('SVHN', split='test',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        if 'train' in split:
            ds_.append(get_dataset('SVHN', split='train',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        ds = ds_[0]
        for d in ds_[1:]:
            ds += d
        return limit_ds(ds, limit=limit, split_start=split_start,
                        per_class=per_class_limit,
                        shuffle=shuffle_before_limit,
                        seed=limit_shuffle_seed, allowed_classes=class_ids)
    elif name.startswith('cifar100_custom'):
        ds_ = []
        if 'val' in split or 'test' in split:
            ds_.append(get_dataset('cifar100', split='test',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        if 'train' in split:
            ds_.append(get_dataset('cifar100', split='train',
                                   transform=transform, download=download,
                                   target_transform=target_transform))
        ds = ds_[0]
        for d in ds_[1:]:
            ds += d
        return limit_ds(ds, limit=limit, split_start=split_start,
                        per_class=per_class_limit,
                        shuffle=shuffle_before_limit,
                        seed=limit_shuffle_seed, allowed_classes=class_ids)
    elif name == 'mnist' or name == 'mnist_3c':
        return datasets.MNIST(root=root, train=train, transform=transform,
                              target_transform=target_transform,
                              download=download)
    elif name == 'SVHN':
        return datasets.SVHN(root=root,
                             split='train' if train else 'test',
                             transform=transform,
                             target_transform=target_transform,
                             download=download)
    elif 'stl10' in name:
        if train and name.endswith('train_test'):
            return (datasets.STL10(root=root, split='train',
                                   transform=transform,
                                   target_transform=target_transform,
                                   download=download) +
                    datasets.STL10(root=root, split='test',
                                   transform=transform,
                                   target_transform=target_transform,
                                   download=download))
        return datasets.STL10(root=root, split=split, transform=transform,
                              target_transform=target_transform,
                              download=download)
    elif name == 'LSUN':
        return datasets.LSUN(root=root, classes=split, transform=transform,
                             target_transform=target_transform)
    elif name.startswith('folder'):
        ds = datasets.ImageFolder(root=root, transform=transform,
                                  target_transform=target_transform)
        if limit or class_ids:
            ds = limit_ds(ds, limit, per_class=per_class_limit,
                          shuffle=shuffle_before_limit,
                          seed=limit_shuffle_seed,
                          allowed_classes=class_ids,
                          split_start=split_start)
        return ds
    elif name in ['imagenet', 'cats_vs_dogs', 'places365_standard'] or any(
            i in name for i in ['imagine-', '-raw']):
        root = os.path.join(root, 'train' if train else 'val')
        ds = datasets.ImageFolder(root=root, transform=transform,
                                  target_transform=target_transform)
        if limit or class_ids:
            if 'no_dd' in name:
                ds = limit_ds(ds, limit * len(ds.classes), per_class=False,
                              shuffle=shuffle_before_limit,
                              seed=limit_shuffle_seed,
                              split_start=split_start)
            else:
                ds = limit_ds(ds, limit, per_class=per_class_limit,
                              shuffle=shuffle_before_limit,
                              seed=limit_shuffle_seed,
                              allowed_classes=class_ids,
                              split_start=split_start)
        return ds
    elif name.startswith('DomainNet-'):
        ds = datasets.ImageFolder(root=root, transform=transform,
                                  target_transform=target_transform)
        if limit or class_ids:
            return limit_ds(ds, limit=limit, per_class=per_class_limit,
                            shuffle=shuffle_before_limit,
                            seed=limit_shuffle_seed,
                            allowed_classes=class_ids)
        return ds
    elif name.startswith('random-'):
        np.random.seed(limit_shuffle_seed)
        n_samples = limit or 10000
        dummy_targets = torch.ones(n_samples)
        ds_name = name[7:]
        if name.endswith('-normal'):
            mean, std = [0., 0., 0.], [1., 1., 1.]
            return RandomDatasetGenerator([3, 512, 512], mean, std,
                                          limit=n_samples,
                                          transform=transform, train=train)
        use_random_test = False
        if name.endswith('-rt'):
            use_random_test = True
            ds_name = ds_name[:-3]
        if ds_name in _DATASET_META_DATA:
            meta = _DATASET_META_DATA[ds_name]
            nclasses, data_shape, mean, std = meta.get_attrs().values()
        # Noise datasets borrowed from
        # https://github.com/hendrycks/outlier-exposure/
        elif ds_name == 'gaussian':
            samples = torch.from_numpy(
                np.clip(
                    np.random.normal(size=(n_samples, 3, 32, 32), loc=0.5,
                                     scale=0.5).astype(np.float32), 0, 1))
            return torch.utils.data.TensorDataset(samples, dummy_targets)
        elif ds_name == 'rademacher' or ds_name == 'bernoulli':
            samples = torch.from_numpy(
                np.random.binomial(
                    n=1, p=0.5,
                    size=(n_samples, 3, 32, 32)).astype(np.float32))
            if ds_name == 'rademacher':
                samples = samples * 2 - 1  # {0, 1} -> {-1, +1}
            return torch.utils.data.TensorDataset(samples, dummy_targets)
        elif ds_name == 'blob':
            from skimage.filters import gaussian as gblur
            samples = np.float32(
                np.random.binomial(n=1, p=0.7, size=(n_samples, 32, 32, 3)))
            for i in range(n_samples):
                samples[i] = gblur(samples[i], sigma=1.5, multichannel=False)
                samples[i][samples[i] < 0.75] = 0.0
            samples = torch.from_numpy(samples.transpose((0, 3, 1, 2)))
            return torch.utils.data.TensorDataset(samples, dummy_targets)
        else:
            raise NotImplementedError
        limit = limit or 1000
        if per_class_limit:
            limit = limit * nclasses
        if train or use_random_test:
            return RandomDatasetGenerator(data_shape, mean, std, limit=limit,
                                          transform=transform, train=train)
        else:
            return get_dataset(ds_name, split, transform, target_transform,
                               download, limit=limit,
                               shuffle_before_limit=shuffle_before_limit,
                               datasets_path=__DATASETS_DEFAULT_PATH,
                               split_start=split_start)
    elif hasattr(datasets, name):
        # Fall back to any torchvision dataset with the standard signature.
        return getattr(datasets, name)(root=datasets_path, train=train,
                                       transform=transform,
                                       target_transform=target_transform,
                                       download=download)
    else:
        # Fail loudly instead of implicitly returning None.
        raise ValueError("Unknown dataset: {}".format(name))
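# Hypothetical usage sketch (dataset names follow the conventions above; the
# CIFAR-10 call downloads data on first use, and 'blob' is assumed not to be
# a key of _DATASET_META_DATA, so the noise branch is taken):
from torchvision import transforms

train_ds = get_dataset('cifar10', split='train',
                       transform=transforms.ToTensor())
blob_ds = get_dataset('random-blob', split='test', limit=5000)
print(len(train_ds), len(blob_ds))  # expected: 50000 5000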