def download(self):
    if self._check_integrity():
        return
    super(FC100ClassDataset, self).download()

    subfolder = os.path.join(self.root, self.subfolder)
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)

    filename_fine_names = os.path.join(self.root, self.filename_fine_names)
    with open(filename_fine_names, 'r') as f:
        fine_names = json.load(f)

    for split in ['train', 'val', 'test']:
        split_filename_labels = os.path.join(
            subfolder, self.filename_labels.format(split))
        if os.path.isfile(split_filename_labels):
            continue

        data = get_asset(self.folder, self.subfolder,
                         '{0}.json'.format(split), dtype='json')
        with open(split_filename_labels, 'w') as f:
            labels = [[coarse_name, fine_name] for coarse_name in data
                      for fine_name in fine_names[coarse_name]]
            json.dump(labels, f)
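# Usage sketch (not part of the class above): reading back one of the label
# files it writes. Each entry is a [coarse_name, fine_name] pair from the
# CIFAR-100 label hierarchy. The path below is a hypothetical example.
import json

with open('/data/cifar100/fc100/train_labels.json', 'r') as f:  # hypothetical path
    labels = json.load(f)
print(labels[0])  # e.g. ['aquatic_mammals', 'dolphin']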
def download(self):
    import zipfile
    import shutil

    if self._check_integrity():
        return

    for name in self.zips_md5:
        zip_filename = '{0}.zip'.format(name)
        filename = os.path.join(self.root, zip_filename)
        if os.path.isfile(filename):
            continue

        url = '{0}/{1}'.format(self.download_url_prefix, zip_filename)
        download_url(url, self.root, zip_filename, self.zips_md5[name])
        with zipfile.ZipFile(filename, 'r') as f:
            f.extractall(self.root)

    filename = os.path.join(self.root, self.filename)
    with h5py.File(filename, 'w') as f:
        for name in self.zips_md5:
            group = f.create_group(name)

            alphabets = list_dir(os.path.join(self.root, name))
            characters = [(name, alphabet, character) for alphabet in alphabets
                          for character in list_dir(os.path.join(self.root, name, alphabet))]

            # 'images_background' is the train split; the other zip is the test split.
            split = 'train' if name == 'images_background' else 'test'
            labels_filename = os.path.join(self.root,
                                           self.filename_labels.format('', split))
            with open(labels_filename, 'w') as f_labels:
                labels = sorted(characters)
                json.dump(labels, f_labels)

            for _, alphabet, character in characters:
                filenames = glob.glob(os.path.join(self.root, name,
                                                   alphabet, character, '*.png'))
                dataset = group.create_dataset('{0}/{1}'.format(alphabet, character),
                                               (len(filenames), 105, 105), dtype='uint8')

                for i, char_filename in enumerate(filenames):
                    image = Image.open(char_filename, mode='r').convert('L')
                    # Invert the black-on-white drawings to white-on-black.
                    dataset[i] = ImageOps.invert(image)

            shutil.rmtree(os.path.join(self.root, name))

    # Label files for the Vinyals et al. splits, built from the bundled assets.
    for split in ['train', 'val', 'test']:
        filename = os.path.join(self.root,
                                self.filename_labels.format('vinyals_', split))
        data = get_asset(self.folder, '{0}.json'.format(split), dtype='json')

        with open(filename, 'w') as f:
            labels = sorted([('images_{0}'.format(name), alphabet, character)
                             for (name, alphabets) in data.items()
                             for (alphabet, characters) in alphabets.items()
                             for character in characters])
            json.dump(labels, f)
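# Usage sketch (not part of the class above): reading one character class back
# from the HDF5 file it creates. The layout is one group per zip name and one
# (N, 105, 105) uint8 dataset per 'alphabet/character'. The file path and the
# 'Latin/character01' key below are hypothetical examples.
import h5py

with h5py.File('/data/omniglot/data.hdf5', 'r') as f:  # hypothetical path
    images = f['images_background/Latin/character01'][()]
    print(images.shape, images.dtype)  # e.g. (20, 105, 105) uint8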
def download(self):
    import zipfile
    import shutil
    import glob
    from tqdm import tqdm

    if self._check_integrity():
        return

    zip_filename = os.path.join(self.root, self.zip_filename)
    if not os.path.isfile(zip_filename):
        download_file_from_google_drive(self.gdrive_id, self.root,
                                        self.zip_filename, md5=self.zip_md5)

    zip_foldername = os.path.join(self.root, self.image_folder)
    if not os.path.isdir(zip_foldername):
        with zipfile.ZipFile(zip_filename, 'r') as f:
            for member in tqdm(f.infolist(), desc='Extracting '):
                try:
                    f.extract(member, self.root)
                except zipfile.BadZipFile:
                    print('Error: Zip file is corrupted')

    for split in ['train', 'val', 'test']:
        filename = os.path.join(self.root, self.filename.format(split))
        if os.path.isfile(filename):
            continue

        labels = get_asset(self.folder, '{0}.json'.format(split))
        labels_filename = os.path.join(self.root,
                                       self.filename_labels.format(split))
        with open(labels_filename, 'w') as f:
            json.dump(labels, f)

        image_folder = os.path.join(zip_foldername, split)

        with h5py.File(filename, 'w') as f:
            group = f.create_group('datasets')
            # Store each image as its raw PNG bytes (variable-length uint8).
            dtype = h5py.special_dtype(vlen=np.uint8)
            for label in tqdm(labels, desc=filename):
                images = glob.glob(os.path.join(image_folder, label, '*.png'))
                images.sort()
                dataset = group.create_dataset(label, (len(images),), dtype=dtype)
                for i, image in enumerate(images):
                    with open(image, 'rb') as f_image:
                        array = bytearray(f_image.read())
                    dataset[i] = np.asarray(array, dtype=np.uint8)

    if os.path.isdir(zip_foldername):
        shutil.rmtree(zip_foldername)
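# Usage sketch (not part of the class above): since the HDF5 file stores each
# image as its raw PNG bytes in a variable-length uint8 dataset, reading one
# back means decoding the buffer with PIL. The path and the 'some_class' key
# below are hypothetical examples.
import io
import h5py
from PIL import Image

with h5py.File('/data/dataset/train_data.hdf5', 'r') as f:  # hypothetical path
    buffer = f['datasets/some_class'][0]  # raw PNG bytes of the first image
    image = Image.open(io.BytesIO(buffer.tobytes()))
    print(image.size, image.mode)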
def download(self):
    import tarfile
    import shutil
    import glob
    from tqdm import tqdm

    if self._check_integrity():
        return

    tgz_filename = os.path.join(self.root, self.tgz_filename)
    if not os.path.isfile(tgz_filename):
        download_file_from_google_drive(self.gdrive_id, self.root,
                                        self.tgz_filename, md5=self.tgz_md5)

    with tarfile.open(tgz_filename, 'r') as f:
        f.extractall(self.root)
    image_folder = os.path.join(self.root, self.image_folder)

    for split in ['train', 'val', 'test']:
        filename = os.path.join(self.root, self.filename.format(split))
        if os.path.isfile(filename):
            continue

        labels = get_asset(self.folder, '{0}.json'.format(split))
        labels_filename = os.path.join(self.root,
                                       self.filename_labels.format(split))
        with open(labels_filename, 'w') as f:
            json.dump(labels, f)

        with h5py.File(filename, 'w') as f:
            group = f.create_group('datasets')
            # Store each image as its raw JPEG bytes (variable-length uint8).
            dtype = h5py.special_dtype(vlen=np.uint8)
            for label in tqdm(labels, desc=filename):
                images = glob.glob(os.path.join(image_folder, label, '*.jpg'))
                images.sort()
                dataset = group.create_dataset(label, (len(images),), dtype=dtype)
                for i, image in enumerate(images):
                    with open(image, 'rb') as f_image:
                        array = bytearray(f_image.read())
                    dataset[i] = np.asarray(array, dtype=np.uint8)

    tar_folder, _ = os.path.splitext(tgz_filename)
    if os.path.isdir(tar_folder):
        shutil.rmtree(tar_folder)

    attributes_filename = os.path.join(self.root, 'attributes.txt')
    if os.path.isfile(attributes_filename):
        os.remove(attributes_filename)
def download(self):
    if self._check_integrity():
        return

    from sklearn.datasets import fetch_openml

    data = fetch_openml(data_id=self.open_ml_id)
    features = data.data
    targets = data.target

    os.makedirs(self.root, exist_ok=True)

    # For each meta-split: load its class labels from the assets, build a mask
    # selecting the data points whose target belongs to the split, then write
    # the selected features to an HDF5 file (one dataset per class).
    for split in ['train', 'val', 'test']:
        labels_assets_split = get_asset(self.folder, '{0}.json'.format(split))

        is_in_split = [t in labels_assets_split for t in targets]
        features_split = features.loc[is_in_split]
        targets_split = targets.loc[is_in_split]
        assert targets_split.shape[0] == features_split.shape[0]

        unique_targets_split = np.unique(targets_split)
        if len(labels_assets_split) > unique_targets_split.shape[0]:
            print(f'unique set of labels ({unique_targets_split.shape[0]}) is '
                  f'smaller than set of labels given by assets '
                  f'({len(labels_assets_split)}). Proceeding with unique set of labels.')

        # Write the unique targets of this split to a JSON file.
        labels_filename = os.path.join(self.root,
                                       self.filename_labels.format(split))
        with open(labels_filename, 'w') as f:
            json.dump(unique_targets_split.tolist(), f)

        # Write the data (features, grouped by class label).
        filename = os.path.join(self.root, self.filename.format(split))
        with h5py.File(filename, 'w') as f:
            group = f.create_group('datasets')
            for label in tqdm(unique_targets_split, desc=filename):
                data_class = features_split.loc[targets_split == label]
                group.create_dataset(label, data=data_class)
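# A minimal, self-contained sketch of the masking logic above on toy data,
# assuming the pandas objects returned by fetch_openml. All values here are
# made up for illustration.
import numpy as np
import pandas as pd

features = pd.DataFrame({'x': [0.1, 0.2, 0.3, 0.4]})
targets = pd.Series(['a', 'b', 'a', 'c'])
labels_assets_split = ['a', 'c']  # stands in for the '{split}.json' asset

is_in_split = [t in labels_assets_split for t in targets]
features_split = features.loc[is_in_split]
targets_split = targets.loc[is_in_split]
print(np.unique(targets_split))  # -> ['a' 'c']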
def download(self):
    if self._check_integrity():
        return
    super(CIFARFSClassDataset, self).download()

    subfolder = os.path.join(self.root, self.subfolder)
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)

    for split in ['train', 'val', 'test']:
        split_filename_labels = os.path.join(
            subfolder, self.filename_labels.format(split))
        if os.path.isfile(split_filename_labels):
            continue

        data = get_asset(self.folder, self.subfolder,
                         '{0}.json'.format(split), dtype='json')
        with open(split_filename_labels, 'w') as f:
            json.dump(data, f)
def get_task_id_splits(meta_split):
    return get_asset(TCGA.folder, '{}.json'.format(meta_split), dtype='json')

def get_task_variables():
    return get_asset(TCGA.folder, 'task_variables.json', dtype='json')

def get_cancers():
    return get_asset(TCGA.folder, 'cancers.json', dtype='json')
def download(self, chunksize=100):
    try:
        import gzip
        import shutil
        import pandas as pd
        from six.moves import urllib
        import academictorrents as at
    except ImportError as exception:
        # Note: `exception.message` does not exist in Python 3; format the
        # exception itself instead.
        raise ImportError('{0}. To use the TCGA dataset, you need to '
                          'install the necessary dependencies with '
                          '`pip install torchmeta[tcga]`.'.format(exception))

    clinical_matrices_folder = os.path.join(self.root, 'clinicalMatrices')
    if not os.path.exists(clinical_matrices_folder):
        os.makedirs(clinical_matrices_folder)

    # Download and extract the clinical matrix of each cancer.
    for cancer in self.cancers:
        filename = self.clinical_matrix_filename.format(cancer)
        rawpath = os.path.join(clinical_matrices_folder, '{0}.gz'.format(filename))
        filepath = os.path.join(clinical_matrices_folder, '{0}.tsv'.format(filename))

        if os.path.isfile(filepath):
            continue

        if not os.path.exists(rawpath):
            print('Downloading `{0}.gz`...'.format(filename))
            url = self.clinical_matrix_url.format(cancer)
            urllib.request.urlretrieve(url, rawpath)

        print('Extracting `{0}.gz`...'.format(filename))
        with gzip.open(rawpath, 'rb') as gzf:
            with open(filepath, 'wb') as f:
                shutil.copyfileobj(gzf, f)

    gene_expression_file = os.path.join(self.root, self.gene_expression_filename)
    if not os.path.isfile(gene_expression_file):
        from tqdm import tqdm

        print('Downloading `{0}` using `academictorrents`...'.format(
            self.gene_expression_filename))
        csv_file = at.get(self.gene_expression_torrent, datastore=self.root)
        print('Downloaded to: `{0}`'.format(csv_file))

        print('Converting TCGA CSV dataset to HDF5. This may take a while, '
              'but only happens on the first run.')
        # The CSV has one row per gene and one column per sample; convert it
        # chunk by chunk into a (num_samples, num_genes) float32 matrix.
        reader = pd.read_csv(csv_file, compression='gzip', sep='\t',
                             header=0, index_col=0, chunksize=chunksize)
        shape = (10459, 20530)

        with tqdm(total=shape[1]) as pbar:
            with h5py.File(gene_expression_file, 'w') as f:
                dataset = f.create_dataset('expression_data',
                                           shape=shape, dtype='f4')
                gene_ids = []
                for idx, chunk in enumerate(reader):
                    slice_ = slice(idx * chunksize, (idx + 1) * chunksize)
                    dataset[:, slice_] = chunk.T
                    gene_ids.extend(chunk.index)
                    pbar.update(chunk.shape[0])
                # The columns are identical across chunks; take them from the last.
                all_sample_ids = chunk.columns.tolist()

        gene_ids_file = os.path.join(self.root, 'gene_ids.json')
        with open(gene_ids_file, 'w') as f:
            json.dump(gene_ids, f)

        all_sample_ids_file = os.path.join(self.root, 'all_sample_ids.json')
        with open(all_sample_ids_file, 'w') as f:
            json.dump(all_sample_ids, f)

        if os.path.isfile(csv_file):
            os.remove(csv_file)
        print('Done')

    self._process_clinical_matrices()

    # Create label files
    for split in ['train', 'val', 'test']:
        filename = os.path.join(self.root, self.filename_tasks.format(split))
        data = get_asset(self.folder, '{0}.json'.format(split), dtype='json')

        with open(filename, 'w') as f:
            labels = sorted([key.split('|', 1) for key in data])
            json.dump(labels, f)

    # Clean up
    for cancer in self.cancers:
        filename = self.clinical_matrix_filename.format(cancer)
        rawpath = os.path.join(clinical_matrices_folder, '{0}.gz'.format(filename))
        if os.path.isfile(rawpath):
            os.remove(rawpath)
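# Usage sketch (not part of the class above): reading a slice of the gene
# expression matrix written during download. The dataset is laid out as
# (num_samples, num_genes) float32, with row/column ids in the side-car JSON
# files. The paths below are hypothetical examples.
import json
import h5py

with h5py.File('/data/tcga/gene_expression.hdf5', 'r') as f:  # hypothetical path
    with open('/data/tcga/gene_ids.json', 'r') as g:
        gene_ids = json.load(g)
    values = f['expression_data'][:5, :3]  # first 5 samples, first 3 genes
    print(gene_ids[:3], values.shape)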
def download(self, normalize):
    if self._check_integrity():
        return

    from sklearn.datasets import fetch_openml

    data = fetch_openml(data_id=self.open_ml_id)
    features = data.data
    targets = data.target

    os.makedirs(self.root, exist_ok=True)

    # For each meta-split: load its class labels from the assets, build a mask
    # selecting the data points whose target belongs to the split, then write
    # the selected features to an HDF5 file (one dataset per class).
    for split in ['train', 'val', 'test']:
        targets_assets_split = get_asset(self.folder, '{0}.json'.format(split))

        is_in_split = [t in targets_assets_split for t in targets]
        features_split = features.loc[is_in_split]
        targets_split = targets.loc[is_in_split]
        assert targets_split.shape[0] == features_split.shape[0]

        unique_targets_split = np.unique(targets_split)
        if len(targets_assets_split) > unique_targets_split.shape[0]:
            print(f'unique set of labels ({unique_targets_split.shape[0]}) is '
                  f'smaller than set of labels given by assets '
                  f'({len(targets_assets_split)}). Proceeding with unique set of labels.')

        # Write the unique targets of this split to a JSON file.
        labels_filename = os.path.join(self.root,
                                       self.filename_labels.format(split))
        with open(labels_filename, 'w') as f:
            json.dump(unique_targets_split.tolist(), f)

        # Normalize features to [0, 1] with statistics from the 'train' split only.
        if split == 'train':
            lower, upper = np.zeros(features.shape[1]), np.ones(features.shape[1])
            if normalize:
                lower = np.min(features_split, axis=0)
                upper = np.max(features_split, axis=0)
            self._lower_upper = {'lower': lower.tolist(), 'upper': upper.tolist()}
            lower_upper_filename = os.path.join(self.root,
                                                self.filename_lower_upper)
            with open(lower_upper_filename, 'w') as f:
                json.dump(self._lower_upper, f)

        lower_upper = self.lower_upper
        lower = np.array(lower_upper['lower'])
        upper = np.array(lower_upper['upper'])
        features_split = np.true_divide(features_split - lower, upper - lower)

        # Write the data (features, grouped by class label).
        filename = os.path.join(self.root, self.filename.format(split))
        with h5py.File(filename, 'w') as f:
            group = f.create_group('datasets')
            for label in tqdm(unique_targets_split, desc=filename):
                data_class = features_split.loc[targets_split == label]
                group.create_dataset(label, data=data_class)
def download(self):
    import tarfile

    if self._check_integrity():
        return

    # Download the images and the devkit, with a progress bar.
    chunk_size = 1024
    r = requests.get(self.train_tar_url, stream=True)
    with open(os.path.join(self.root, 'cars_train.tgz'), 'wb') as f:
        with tqdm(unit='B', total=int(r.headers['Content-Length'])) as pbar:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)

    r = requests.get(self.devkit_tar_url, stream=True)
    with open(os.path.join(self.root, 'car_devkit.tgz'), 'wb') as f:
        with tqdm(unit='B', total=int(r.headers['Content-Length'])) as pbar:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)

    filename = os.path.join(self.root, 'cars_train.tgz')
    with tarfile.open(filename, 'r') as f:
        f.extractall(self.root)
    filename = os.path.join(self.root, 'car_devkit.tgz')
    with tarfile.open(filename, 'r') as f:
        f.extractall(self.root)

    # Map each image to its bounding box, and each class name to its images.
    annos_path = os.path.join(self.root, 'devkit', 'cars_train_annos.mat')
    cars_meta_path = os.path.join(self.root, 'devkit', 'cars_meta.mat')
    annos = loadmat(annos_path)['annotations'][0]
    cars_meta = loadmat(cars_meta_path)['class_names'][0]
    cars_meta = [c[0] for c in cars_meta]

    names_to_bboxes = {}
    clss_to_names = collections.defaultdict(list)
    for xmin, ymin, xmax, ymax, label, filename in annos:
        bbox = (int(xmin[0][0]), int(ymin[0][0]),
                int(xmax[0][0]), int(ymax[0][0]))
        label = int(label[0][0]) - 1
        filename = str(filename[0])
        names_to_bboxes[filename] = bbox
        clss_to_names[cars_meta[label]].append(filename)

    for split in ['train', 'val', 'test']:
        filename = os.path.join(self.root, self.filename.format(split))
        labels = get_asset(self.folder, '{}.json'.format(split))
        labels_filename = os.path.join(self.root,
                                       self.filename_labels.format(split))
        with open(labels_filename, 'w') as f:
            json.dump(labels, f)

        with h5py.File(filename, 'w') as f:
            group = f.create_group('datasets')
            for label in tqdm(labels, desc=filename):
                images = []
                for file in clss_to_names[label]:
                    file_path = os.path.join(self.root, 'cars_train', file)
                    img = Image.open(file_path).convert('RGB')
                    # Crop to the bounding box, then resize to 84x84.
                    bbox = names_to_bboxes[file]
                    img = np.asarray(img.crop(bbox).resize((84, 84)),
                                     dtype=np.uint8)
                    images.append(img)

                dataset = group.create_dataset(label, (len(images), 84, 84, 3))
                for j, image in enumerate(images):
                    dataset[j] = image
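# A minimal, self-contained sketch of the crop-then-resize step above, using a
# synthetic image so it runs without the Cars archive. The bounding box values
# are made up for illustration.
import numpy as np
from PIL import Image

img = Image.fromarray(np.zeros((300, 400, 3), dtype=np.uint8))  # stand-in photo
bbox = (30, 40, 250, 200)  # (xmin, ymin, xmax, ymax), as read from the .mat file
crop = np.asarray(img.crop(bbox).resize((84, 84)), dtype=np.uint8)
print(crop.shape)  # -> (84, 84, 3)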
def download(self, normalize):
    if self._check_integrity():
        return

    from sklearn.datasets import fetch_openml

    data = fetch_openml(data_id=self.open_ml_id)
    features = data.data
    targets = data.target

    os.makedirs(self.root, exist_ok=True)

    # For each meta-split: load its class labels from the assets, build a mask
    # selecting the data points whose target belongs to the split, then write
    # the selected features to an HDF5 file (one dataset per class).
    for split in ['train', 'val', 'test']:
        targets_assets_split = get_asset(self.folder, '{0}.json'.format(split))

        is_in_split = [t in targets_assets_split for t in targets]
        features_split = features.loc[is_in_split]
        targets_split = targets.loc[is_in_split]
        assert targets_split.shape[0] == features_split.shape[0]

        unique_targets_split = np.sort(np.unique(targets_split))
        if len(targets_assets_split) > unique_targets_split.shape[0]:
            print(f'unique set of labels ({unique_targets_split.shape[0]}) is '
                  f'smaller than set of labels given by assets '
                  f'({len(targets_assets_split)}). Proceeding with unique set of labels.')

        # Write the unique targets of this split to a JSON file.
        labels_filename = os.path.join(self.root,
                                       self.filename_labels.format(split))
        with open(labels_filename, 'w') as f:
            json.dump(unique_targets_split.tolist(), f)

        # Normalize to zero mean and unit standard deviation with statistics
        # from the 'train' split only.
        if split == 'train':
            mean, std = np.zeros(features.shape[1]), np.ones(features.shape[1])
            if normalize:
                mean = np.mean(features_split, axis=0)
                std = np.std(features_split, axis=0)
            self._mean_std = {'mean': mean.tolist(), 'std': std.tolist()}
            mean_std_filename = os.path.join(self.root, self.filename_mean_std)
            with open(mean_std_filename, 'w') as f:
                json.dump(self._mean_std, f)

        mean_std = self.mean_std
        mean = np.array(mean_std['mean'])
        std = np.array(mean_std['std'])
        # The epsilon guards against zero-variance (constant) features.
        features_split = (features_split - mean) / (std + 1e-10)

        # Write the data (features, grouped by class label).
        filename = os.path.join(self.root, self.filename.format(split))
        with h5py.File(filename, 'w') as f:
            group = f.create_group('datasets')
            for label in tqdm(unique_targets_split, desc=filename):
                data_class = features_split.loc[targets_split == label]
                group.create_dataset(label, data=data_class)
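# A minimal, self-contained sketch of re-applying the saved train statistics,
# assuming the JSON layout written above ({'mean': [...], 'std': [...]}). The
# numbers are made up for illustration.
import numpy as np

mean_std = {'mean': [0.5, 2.0], 'std': [0.1, 0.4]}  # e.g. json.load(...)
mean = np.array(mean_std['mean'])
std = np.array(mean_std['std'])
features = np.array([[0.6, 2.4], [0.4, 1.6]])
normalized = (features - mean) / (std + 1e-10)  # same epsilon as the download code
print(normalized)  # approximately [[ 1.  1.] [-1. -1.]]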
def download(self):
    import tarfile

    if self._check_integrity():
        return

    # Download the archive with a progress bar.
    chunk_size = 1024
    r = requests.get(self.tar_url, stream=True)
    with open(os.path.join(self.root, 'fgvc-aircraft-2013b.tar.gz'), 'wb') as f:
        with tqdm(unit='B', total=int(r.headers['Content-Length'])) as pbar:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)

    filename = os.path.join(self.root, 'fgvc-aircraft-2013b.tar.gz')
    with tarfile.open(filename, 'r') as f:
        f.extractall(self.root)

    # Crop images with their bounding boxes, same as Meta-Dataset.
    bboxes_path = os.path.join(self.root, 'fgvc-aircraft-2013b', 'data',
                               'images_box.txt')
    with open(bboxes_path, 'r') as f:
        names_to_bboxes = [line.split('\n')[0].split(' ')
                           for line in f.readlines()]
    # `tuple(...)` is required here: `map` returns a single-use iterator in
    # Python 3, which cannot be passed to `Image.crop` below.
    names_to_bboxes = dict((name, tuple(map(int, (xmin, ymin, xmax, ymax))))
                           for name, xmin, ymin, xmax, ymax in names_to_bboxes)

    # Retrieve the mapping from filename to class name.
    cls_trainval_path = os.path.join(self.root, 'fgvc-aircraft-2013b', 'data',
                                     'images_variant_trainval.txt')
    with open(cls_trainval_path, 'r') as f:
        filenames_to_clsnames = [line.split('\n')[0].split(' ', 1)
                                 for line in f.readlines()]
    cls_test_path = os.path.join(self.root, 'fgvc-aircraft-2013b', 'data',
                                 'images_variant_test.txt')
    with open(cls_test_path, 'r') as f:
        filenames_to_clsnames += [line.split('\n')[0].split(' ', 1)
                                  for line in f.readlines()]

    filenames_to_clsnames = dict(filenames_to_clsnames)
    clss_to_names = collections.defaultdict(list)
    for filename, cls in filenames_to_clsnames.items():
        clss_to_names[cls].append(filename)

    for split in ['train', 'val', 'test']:
        filename = os.path.join(self.root, self.filename.format(split))
        labels = get_asset(self.folder, '{}.json'.format(split))
        labels_filename = os.path.join(self.root,
                                       self.filename_labels.format(split))
        with open(labels_filename, 'w') as f:
            json.dump(labels, f)

        with h5py.File(filename, 'w') as f:
            group = f.create_group('datasets')
            for label in tqdm(labels, desc=filename):
                images = []
                for file in clss_to_names[label]:
                    file_path = os.path.join(self.root, 'fgvc-aircraft-2013b',
                                             'data', 'images',
                                             '{}.jpg'.format(file))
                    # `convert('RGB')` guards against non-RGB images, which
                    # would not fit the (32, 32, 3) slots below.
                    img = Image.open(file_path).convert('RGB')
                    bbox = names_to_bboxes[file]
                    img = np.asarray(img.crop(bbox).resize((32, 32)),
                                     dtype=np.uint8)
                    images.append(img)

                dataset = group.create_dataset(label, (len(images), 32, 32, 3))
                for j, image in enumerate(images):
                    dataset[j] = image
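# A minimal, self-contained sketch of the 'images_variant_*.txt' parsing above:
# each line is '<image id> <variant name>', split once on the first space so
# variant names may themselves contain spaces. The lines below are made-up
# examples.
lines = ['0034309 707-320\n', '1840101 A340 300\n']
filenames_to_clsnames = dict(line.split('\n')[0].split(' ', 1) for line in lines)
print(filenames_to_clsnames)  # {'0034309': '707-320', '1840101': 'A340 300'}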