import codecs
import gzip
import io
import os
import shutil
import urllib.request
import zipfile

import numpy as np
import scipy.io
import torch
import torchvision
from PIL import Image


def svhn(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the SVHN dataset to the
    location specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is
        necessary to provide output_folder as String containing the path where
        the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.SVHN(root=args.output_folder, split='train', download=True)
    torchvision.datasets.SVHN(root=args.output_folder, split='test', download=True)

    # Load the data into memory
    train = scipy.io.loadmat(os.path.join(args.output_folder, 'train_32x32.mat'))
    train_data, train_labels = train['X'], train['y'].astype(np.int64).squeeze()
    # SVHN stores the digit zero with label 10; remap it to label 0
    np.place(train_labels, train_labels == 10, 0)
    train_data = np.transpose(train_data, (3, 0, 1, 2))

    test = scipy.io.loadmat(os.path.join(args.output_folder, 'test_32x32.mat'))
    test_data, test_labels = test['X'], test['y'].astype(np.int64).squeeze()
    np.place(test_labels, test_labels == 10, 0)
    test_data = np.transpose(test_data, (3, 0, 1, 2))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'SVHN')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    _make_folder_if_not_exists(dataset_root)
    _make_folder_if_not_exists(train_folder)
    _make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            _make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Remove the original .mat files
    os.remove(os.path.join(args.output_folder, 'train_32x32.mat'))
    os.remove(os.path.join(args.output_folder, 'test_32x32.mat'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
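# NOTE: ``_make_folder_if_not_exists`` and ``split_dataset`` are project-local
# helpers used throughout this file; their definitions are not part of this
# section (in upstream DeepDIVA, split_dataset lives in the dataset-splitter
# utility). The definition below is a minimal sketch of the assumed folder
# helper, included only so the section is self-contained; the upstream helper
# may differ.
def _make_folder_if_not_exists(folder_path):
    # Create the folder (including missing parents) only if it does not exist yet
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)


# Some routines below call the same helper without the leading underscore;
# alias it here under the assumption that both names share one behaviour.
make_folder_if_not_exists = _make_folder_if_not_exists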
def kmnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Kuzushiji-MNIST
    dataset to the location specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is
        necessary to provide output_folder as String containing the path where
        the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.KMNIST(root=args.output_folder, download=True)

    # Load the data into memory
    train_data, train_labels = torch.load(
        os.path.join(args.output_folder, 'KMNIST', 'processed', 'training.pt'))
    test_data, test_labels = torch.load(
        os.path.join(args.output_folder, 'KMNIST', 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'KMNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    _make_folder_if_not_exists(dataset_root)
    _make_folder_if_not_exists(train_folder)
    _make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            _make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Remove the intermediate torchvision files
    shutil.rmtree(os.path.join(args.output_folder, 'KMNIST', 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'KMNIST', 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def cifar10(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the CIFAR-10 dataset to
    the location specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is
        necessary to provide output_folder as String containing the path where
        the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    cifar_train = torchvision.datasets.CIFAR10(root=args.output_folder, train=True, download=True)
    cifar_test = torchvision.datasets.CIFAR10(root=args.output_folder, train=False, download=True)

    # Load the data into memory
    train_data, train_labels = cifar_train.train_data, cifar_train.train_labels
    test_data, test_labels = cifar_test.test_data, cifar_test.test_labels

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'CIFAR10')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    _make_folder_if_not_exists(dataset_root)
    _make_folder_if_not_exists(train_folder)
    _make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            _make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Remove the original archive and extraction folder
    os.remove(os.path.join(args.output_folder, 'cifar-10-python.tar.gz'))
    shutil.rmtree(os.path.join(args.output_folder, 'cifar-10-batches-py'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
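# NOTE (assumption): the ``train_data`` / ``train_labels`` and ``test_data`` /
# ``test_labels`` attributes used above were removed from torchvision's CIFAR10
# in later releases in favour of ``data`` / ``targets``. A version-tolerant
# accessor could look like the sketch below; it is illustrative only and not
# part of the original routine.
def _cifar_arrays(dataset):
    # Newer torchvision exposes .data / .targets on both splits
    if hasattr(dataset, 'data'):
        return dataset.data, dataset.targets
    # Older torchvision used split-specific attribute names
    if dataset.train:
        return dataset.train_data, dataset.train_labels
    return dataset.test_data, dataset.test_labels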
def mnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the MNIST dataset to
    the location specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is
        necessary to provide output_folder as String containing the path where
        the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.MNIST(root=args.output_folder, download=True)

    # Load the data into memory
    train_data, train_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'training.pt'))
    test_data, test_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    _make_folder_if_not_exists(dataset_root)
    _make_folder_if_not_exists(train_folder)
    _make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            _make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Remove the intermediate torchvision files
    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
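# Usage note: each routine in this file only relies on ``args.output_folder``,
# so outside of a CLI it can be driven with any object exposing that attribute
# (illustrative, not executed here):
#
#     from types import SimpleNamespace
#     mnist(SimpleNamespace(output_folder='./data'))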
def kmnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the K-MNIST dataset to
    the location specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is
        necessary to provide output_folder as String containing the path where
        the dataset will be downloaded

    Returns
    -------
    None
    """
    def get_int(b):
        return int(codecs.encode(b, 'hex'), 16)

    def read_image_file(path):
        with open(path, 'rb') as f:
            data = f.read()
        assert get_int(data[:4]) == 2051
        length = get_int(data[4:8])
        num_rows = get_int(data[8:12])
        num_cols = get_int(data[12:16])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
        return torch.from_numpy(parsed).view(length, num_rows, num_cols)

    def read_label_file(path):
        with open(path, 'rb') as f:
            data = f.read()
        assert get_int(data[:4]) == 2049
        length = get_int(data[4:8])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
        return torch.from_numpy(parsed).view(length).long()

    try:
        # Newer torchvision versions ship a KMNIST downloader
        torchvision.datasets.KMNIST(root=args.output_folder, download=True)
    except AttributeError:
        # Older torchvision versions lack KMNIST: download and unpack manually
        url_list = [
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz'
        ]

        raw_folder = os.path.join(args.output_folder, 'raw')
        processed_folder = os.path.join(args.output_folder, 'processed')
        make_folder_if_not_exists(raw_folder)
        make_folder_if_not_exists(processed_folder)

        training_file = 'training.pt'
        test_file = 'test.pt'

        for url in url_list:
            print('Downloading ' + url)
            data = urllib.request.urlopen(url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_folder, filename)
            with open(file_path, 'wb') as f:
                f.write(data.read())
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # Process and save as torch files
        print('Processing...')
        training_set = (
            read_image_file(os.path.join(raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 'train-labels-idx1-ubyte')))
        test_set = (
            read_image_file(os.path.join(raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 't10k-labels-idx1-ubyte')))
        with open(os.path.join(processed_folder, training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_folder, test_file), 'wb') as f:
            torch.save(test_set, f)
        print('Done!')

    # Load the data into memory
    train_data, train_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'training.pt'))
    test_data, test_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'KMNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Remove the intermediate files
    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
    print("The KMNIST dataset is ready for you at {}".format(dataset_root))
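# ``split_dataset`` comes from the surrounding DeepDIVA codebase and is not
# shown in this section. As an illustration only, the assumed behaviour --
# moving a fraction ``split`` of each class folder from ``train`` into a
# sibling ``val`` folder -- could be sketched as below; the real helper also
# supports ``symbolic=True`` to symlink files instead of moving them.
def _split_dataset_sketch(dataset_folder, split=0.2):
    train_dir = os.path.join(dataset_folder, 'train')
    val_dir = os.path.join(dataset_folder, 'val')
    for class_name in sorted(os.listdir(train_dir)):
        src = os.path.join(train_dir, class_name)
        dst = os.path.join(val_dir, class_name)
        _make_folder_if_not_exists(dst)
        # Move the first ``split`` fraction of each class into the validation set
        images = sorted(os.listdir(src))
        for filename in images[:int(len(images) * split)]:
            shutil.move(os.path.join(src, filename), os.path.join(dst, filename))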
def icdar2017_clamm(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the ICDAR2017 CLaMM
    dataset (manuscript dating and style classification tasks) to the location
    specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is
        necessary to provide output_folder as String containing the path where
        the dataset will be downloaded

    Returns
    -------
    None
    """
    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_Training.zip"
    print("Downloading " + url)
    zip_name = "ICDAR2017_CLaMM_Training.zip"
    local_filename, headers = urllib.request.urlretrieve(url, zip_name)
    zfile = zipfile.ZipFile(local_filename)

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'ICDAR2017-CLAMM')
    dataset_manuscriptDating = os.path.join(dataset_root, 'ManuscriptDating')
    dataset_md_train = os.path.join(dataset_manuscriptDating, 'train')
    dataset_styleClassification = os.path.join(dataset_root, 'StyleClassification')
    dataset_sc_train = os.path.join(dataset_styleClassification, 'train')
    test_sc_folder = os.path.join(dataset_styleClassification, 'test')
    test_md_folder = os.path.join(dataset_manuscriptDating, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(dataset_manuscriptDating)
    make_folder_if_not_exists(dataset_styleClassification)
    make_folder_if_not_exists(dataset_md_train)
    make_folder_if_not_exists(dataset_sc_train)
    make_folder_if_not_exists(test_sc_folder)
    make_folder_if_not_exists(test_md_folder)

    def _write_data_to_folder(zip_file, filenames, labels, folder, start_index, isTest):
        print("Writing data\n")
        sorted_labels = [None] * len(labels)
        if isTest == 1:
            # This image appears in the test archive but has no label entry
            for entry in zip_file.infolist():
                if "IRHT_P_009793.tif" in entry.filename:
                    zip_file.infolist().remove(entry)
                    break
        # Align the labels with the order of the entries in the archive
        zip_infolist = zip_file.infolist()[1:]
        for i, entry in enumerate(zip_infolist):
            entry_index_infilenames = filenames.index(entry.filename[start_index:])
            sorted_labels[i] = labels[entry_index_infilenames]
        for i, (entry, label) in enumerate(zip(zip_file.infolist()[1:], sorted_labels)):
            with zip_file.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                img.save(os.path.join(dest, str(i) + '.png'), "PNG", quality=100)

    def getLabels(zfile):
        print("Extracting labels\n")
        filenames, md_labels, sc_labels = [], [], []
        zip_infolist = zfile.infolist()[1:]
        for entry in zip_infolist:
            if '.csv' in entry.filename:
                with zfile.open(entry) as file:
                    cf = file.read()
                    c = io.StringIO(cf.decode())
                    next(c)  # Skip the first line, which is the header of the csv file
                    for row in c:
                        md_label_strt_ind = row.rfind(';')
                        md_label_end_ind = row.rfind("\r")
                        md_labels.append(row[md_label_strt_ind + 1:md_label_end_ind])
                        sc_labels_strt_ind = row[:md_label_strt_ind].rfind(';')
                        sc_labels.append(row[sc_labels_strt_ind + 1:md_label_strt_ind])
                        filename_ind = row[:sc_labels_strt_ind].rfind(';')
                        if filename_ind > -1:
                            f_name = row[filename_ind + 1:sc_labels_strt_ind]
                        else:
                            f_name = row[:sc_labels_strt_ind]
                        if isTest == 1 and f_name == 'IRHT_P_009783.tif':
                            print('No file named ' + f_name + ". This filename will not be added!")
                        else:
                            filenames.append(f_name)
                zfile.infolist().remove(entry)  # Remove the csv file from the infolist
            if '.db' in entry.filename:
                zfile.infolist().remove(entry)  # Remove the db file from the infolist
        return filenames, sc_labels, md_labels

    isTest = 0
    filenames, sc_labels, md_labels = getLabels(zfile)
    start_index_training = len("ICDAR2017_CLaMM_Training/")
    print("Training data is being prepared for style classification!\n")
    _write_data_to_folder(zfile, filenames, sc_labels, dataset_sc_train,
                          start_index_training, isTest)
    print("Training data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile, filenames, md_labels, dataset_md_train,
                          start_index_training, isTest)
    os.remove(zfile.filename)

    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_task1_task3.zip"
    print("Downloading " + url)
    zip_name_test = "ICDAR2017_CLaMM_task1_task3.zip"
    local_filename_test, headers_test = urllib.request.urlretrieve(url, zip_name_test)
    zfile_test = zipfile.ZipFile(local_filename_test)

    isTest = 1
    filenames_test, sc_test_labels, md_test_labels = getLabels(zfile_test)
    start_index_test = len("ICDAR2017_CLaMM_task1_task3/")
    print("Test data is being prepared for style classification!\n")
    _write_data_to_folder(zfile_test, filenames_test, sc_test_labels,
                          test_sc_folder, start_index_test, 1)
    print("Test data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile_test, filenames_test, md_test_labels,
                          test_md_folder, start_index_test, 1)
    os.remove(zfile_test.filename)

    print("Training-Validation splitting\n")
    split_dataset(dataset_folder=dataset_manuscriptDating, split=0.2, symbolic=False)
    split_dataset(dataset_folder=dataset_styleClassification, split=0.2, symbolic=False)
    print("ICDAR2017 CLaMM data is ready!")
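# A minimal command-line entry point, sketched as an assumption about how these
# routines are wired together; the upstream project may expose them differently.
# Example: python <this script> --dataset mnist --output-folder ./data
if __name__ == "__main__":
    import argparse

    downloadable = {
        'mnist': mnist,
        'svhn': svhn,
        'cifar10': cifar10,
        'kmnist': kmnist,
        'icdar2017_clamm': icdar2017_clamm,
    }
    parser = argparse.ArgumentParser(
        description='Download a dataset and prepare it in a DeepDIVA friendly folder format.')
    parser.add_argument('--dataset', required=True, choices=sorted(downloadable),
                        help='Name of the dataset to fetch')
    parser.add_argument('--output-folder', default='./data',
                        help='Path where the dataset will be downloaded and prepared')
    cli_args = parser.parse_args()
    downloadable[cli_args.dataset](cli_args)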