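# The routines below rely on a handful of helpers defined elsewhere in the
# package (not shown in this excerpt). A minimal sketch of what two of them
# are assumed to look like -- hypothetical reconstructions, not necessarily
# the project's exact implementations:

def make_folder_if_not_exists(path):
    # Create the folder (and any missing parents) only if it does not exist yet.
    if not os.path.exists(path):
        os.makedirs(path)


def _loadmat(path):
    # Thin wrapper around scipy.io.loadmat, returning the raw MATLAB dict.
    import scipy.io
    return scipy.io.loadmat(path)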
def svhn(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the SVHN dataset to the location specified
    on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.SVHN(root=args.output_folder, split='train', download=True)
    torchvision.datasets.SVHN(root=args.output_folder, split='test', download=True)

    # Load the data into memory
    train = _loadmat(os.path.join(args.output_folder, 'train_32x32.mat'))
    train_data, train_labels = train['X'], train['y'].astype(np.int64).squeeze()
    # SVHN stores the digit 0 with label 10; remap it to 0
    np.place(train_labels, train_labels == 10, 0)
    train_data = np.transpose(train_data, (3, 0, 1, 2))

    test = _loadmat(os.path.join(args.output_folder, 'test_32x32.mat'))
    test_data, test_labels = test['X'], test['y'].astype(np.int64).squeeze()
    np.place(test_labels, test_labels == 10, 0)
    test_data = np.transpose(test_data, (3, 0, 1, 2))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'SVHN')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Clean up the raw .mat files and carve a validation split out of train
    os.remove(os.path.join(args.output_folder, 'train_32x32.mat'))
    os.remove(os.path.join(args.output_folder, 'test_32x32.mat'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
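# Example invocation (a sketch): these prepare-routines expect an argparse-style
# namespace carrying `output_folder`, e.g.
#
#     from argparse import Namespace
#     svhn(Namespace(output_folder='/tmp/datasets'))   # path is illustrative
#
# which leaves /tmp/datasets/SVHN/{train,val,test}/<label>/<i>.png behind
# (the val split is carved out of train by split_dataset above).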
def cifar10(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the CIFAR-10 dataset to the location
    specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    cifar_train = torchvision.datasets.CIFAR10(root=args.output_folder, train=True, download=True)
    cifar_test = torchvision.datasets.CIFAR10(root=args.output_folder, train=False, download=True)

    # Load the data into memory
    train_data, train_labels = cifar_train.data, cifar_train.targets
    test_data, test_labels = cifar_test.data, cifar_test.targets

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'CIFAR10')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    # Remove the downloaded archives and carve a validation split out of train
    os.remove(os.path.join(args.output_folder, 'cifar-10-python.tar.gz'))
    shutil.rmtree(os.path.join(args.output_folder, 'cifar-10-batches-py'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def mnist(output_folder, **kwargs):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the MNIST dataset to the location
    specified on the file system

    Parameters
    ----------
    output_folder : str
        Path to folder where to put the dataset

    Returns
    -------
    None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.MNIST(root=output_folder, download=True)

    # Load the data into memory
    train_data, train_labels = torch.load(
        os.path.join(output_folder, 'MNIST', 'processed', 'training.pt'))
    test_data, test_labels = torch.load(
        os.path.join(output_folder, 'MNIST', 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(output_folder, 'MNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels.detach().numpy())):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            # MNIST images are single-channel; save as 8-bit grayscale PNG
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    shutil.rmtree(os.path.join(output_folder, 'MNIST', 'raw'))
    shutil.rmtree(os.path.join(output_folder, 'MNIST', 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def cifar10(output_folder, **kwargs):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the CIFAR-10 dataset to the location
    specified on the file system

    Parameters
    ----------
    output_folder : str
        Path to folder where to put the dataset

    Returns
    -------
    None
    """
    # Make output folders
    dataset_root = os.path.join(output_folder, 'CIFAR10')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    if Path(dataset_root).exists():
        print(f"Path ({dataset_root}) already exists. Nothing done")
        return

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    # Use torchvision to download the dataset
    cifar_train = torchvision.datasets.CIFAR10(root=output_folder, train=True, download=True)
    cifar_test = torchvision.datasets.CIFAR10(root=output_folder, train=False, download=True)

    # Load the data into memory
    train_data, train_labels = cifar_train.data, cifar_train.targets
    test_data, test_labels = cifar_test.data, cifar_test.targets

    # Replace numbers with text for class names
    class_names_mapping = {
        0: 'plane',
        1: 'car',
        2: 'bird',
        3: 'cat',
        4: 'deer',
        5: 'dog',
        6: 'frog',
        7: 'horse',
        8: 'ship',
        9: 'truck'
    }
    train_labels = [class_names_mapping[l] for l in train_labels]
    test_labels = [class_names_mapping[l] for l in test_labels]

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    os.remove(os.path.join(output_folder, 'cifar-10-python.tar.gz'))
    shutil.rmtree(os.path.join(output_folder, 'cifar-10-batches-py'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
def glas(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the gland dataset from the GlaS challenge
    for semantic segmentation to the location specified on the file system

    See also: https://github.com/choosehappy/public/tree/master/DL%20tutorial%20Code/3-tubule

    Output folder structure: ../HisDB/GlaS/train
                             ../HisDB/GlaS/val
                             ../HisDB/GlaS/test

                             ../HisDB/GlaS/test/data -> images
                             ../HisDB/GlaS/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """

    def groupby_patient(list_to_group, index=3):
        """
        Split images by patient.

        :param list_to_group: list of image names
        :param index: position of split by '-' in the image name to obtain the patient ID
        :return: dictionary where keys are patient IDs and values are lists of images
                 from that patient
        """
        return {
            '-'.join(filename.split('-')[:index]): [
                file for file in list_to_group
                if '-'.join(file.split('-')[:index]) == '-'.join(filename.split('-')[:index])
            ]
            for filename in list_to_group
        }

    def convert_gt(img_path):
        img = np.array(pil_loader(img_path))
        out_img = np.zeros(img.shape, dtype=np.uint8)
        out_img[:, :, 2] = 1                     # set everything to background in blue channel
        out_img[:, :, 2][img[:, :, 2] != 0] = 2  # set glands to 2 in blue channel
        out = Image.fromarray(out_img)
        out.save(img_path)

    # make the root folder
    output_folder = args.output_folder
    dataset_root = os.path.join(output_folder, 'GlaS')
    make_folder_if_not_exists(dataset_root)

    # link to the GlaS challenge data set
    link_tubules = urllib.parse.urlparse(
        'https://warwick.ac.uk/fac/sci/dcs/research/tia/glascontest/download/warwick_qu_dataset_released_2016_07_08.zip'
    )
    download_path_tubules = os.path.join(dataset_root, link_tubules.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_tubules.geturl()))
    urllib.request.urlretrieve(link_tubules.geturl(), download_path_tubules)
    print('Download complete. Unpacking files...')

    # unpack the folder that contains images, annotations and text files with lists of
    # benign and malignant samples
    zip_file = zipfile.ZipFile(download_path_tubules)
    zip_file.extractall(path=dataset_root)

    # the official GlaS file-name prefixes: 'train_' for training, 'testA_' for
    # validation and 'testB_' for testing
    sets_dict = {}
    sets_dict['train'] = ['train_']
    sets_dict['val'] = ['testA_']
    sets_dict['test'] = ['testB_']

    print('Splitting the dataset into train, val and test')
    img_file_path = os.path.join(dataset_root, "Warwick QU Dataset (Released 2016_07_08)")
    for s in ['train', 'test', 'val']:
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'gt'))
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'data'))
        for pattern in sets_dict[s]:
            for img_file in os.listdir(img_file_path):
                if pattern in img_file:
                    if 'anno' in img_file:
                        # convert gt into the correct data format
                        convert_gt(os.path.join(img_file_path, img_file))
                        out_file = os.path.join('gt', img_file.replace('_anno', ''))
                    else:
                        out_file = os.path.join('data', img_file)
                    shutil.move(os.path.join(img_file_path, img_file),
                                os.path.join(dataset_root, s, out_file))
def miml(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Multi-Instance Multi-Label Image
    Dataset on the file system. Dataset available at: http://lamda.nju.edu.cn/data_MIMLimage.ashx

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # Download the files
    url = 'http://lamda.nju.edu.cn/files/miml-image-data.rar'
    if not os.path.exists(os.path.join(args.output_folder, 'miml-image-data.rar')):
        print('Downloading file!')
        filename = wget.download(url, out=args.output_folder)
    else:
        print('File already downloaded!')
        filename = os.path.join(args.output_folder, 'miml-image-data.rar')

    # Extract the files
    path_to_rar = filename
    path_to_output = os.path.join(args.output_folder, 'tmp_miml')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'original.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'processed.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    print('Extracted files...')

    # Load the mat file
    mat = _loadmat(os.path.join(path_to_output, 'miml data.mat'))
    targets = mat['targets'].T
    classes = [item[0][0] for item in mat['class_name']]
    # Add filename at 0-index to correctly format the CSV headers
    classes.insert(0, 'filename')

    # Get a sorted list of all image files in the folder
    images = [
        item for item in _get_all_files_in_folders_and_subfolders(path_to_output)
        if item.endswith('jpg')
    ]
    images = sorted(images, key=lambda e: int(os.path.basename(e).split('.')[0]))

    # Make splits
    train_data, test_data, train_labels, test_labels = _train_test_split(
        images, targets, test_size=0.2, random_state=42)
    train_data, val_data, train_labels, val_labels = _train_test_split(
        train_data, train_labels, test_size=0.2, random_state=42)

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MIML')
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(data, labels, folder, classes):
        dest = os.path.join(folder, 'images')
        make_folder_if_not_exists(dest)
        for image, label in zip(data, labels):
            shutil.copy(image, dest)
        # One row per image: relative path followed by its multi-label target vector
        rows = np.column_stack(
            ([os.path.join('images', os.path.basename(item)) for item in data], labels))
        rows = sorted(rows, key=lambda e: int(e[0].split('/')[1].split('.')[0]))
        output_csv = pd.DataFrame(rows)
        output_csv.to_csv(os.path.join(folder, 'labels.csv'), header=classes, index=False)
        return

    # Write the images to the correct folders
    print('Writing the data to the filesystem')
    _write_data_to_folder(train_data, train_labels, train_folder, classes)
    _write_data_to_folder(val_data, val_labels, val_folder, classes)
    _write_data_to_folder(test_data, test_labels, test_folder, classes)

    os.remove(filename)
    shutil.rmtree(path_to_output)
    print('All done!')
    return
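# `_train_test_split` above is assumed to be scikit-learn's splitter imported
# under a private alias, e.g.
#
#     from sklearn.model_selection import train_test_split as _train_test_split
#
# Applying test_size=0.2 twice yields roughly a 64/16/20 train/val/test split
# of the images.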
def kmnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the K-MNIST dataset to the location
    specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """

    def get_int(b):
        # Interpret a big-endian byte string as an integer
        return int(codecs.encode(b, 'hex'), 16)

    def read_image_file(path):
        # Parse an IDX3 file: magic number, count, rows, cols, then raw pixels
        with open(path, 'rb') as f:
            data = f.read()
            assert get_int(data[:4]) == 2051
            length = get_int(data[4:8])
            num_rows = get_int(data[8:12])
            num_cols = get_int(data[12:16])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
            return torch.from_numpy(parsed).view(length, num_rows, num_cols)

    def read_label_file(path):
        # Parse an IDX1 file: magic number, count, then raw labels
        with open(path, 'rb') as f:
            data = f.read()
            assert get_int(data[:4]) == 2049
            length = get_int(data[4:8])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
            return torch.from_numpy(parsed).view(length).long()

    try:
        torchvision.datasets.KMNIST(root=args.output_folder, download=True)
    except AttributeError:
        # Older torchvision versions do not ship KMNIST; download and process it manually
        url_list = [
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz'
        ]
        raw_folder = os.path.join(args.output_folder, 'raw')
        processed_folder = os.path.join(args.output_folder, 'processed')
        make_folder_if_not_exists(raw_folder)
        make_folder_if_not_exists(processed_folder)
        training_file = 'training.pt'
        test_file = 'test.pt'

        for url in url_list:
            print('Downloading ' + url)
            data = urllib.request.urlopen(url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_folder, filename)
            with open(file_path, 'wb') as f:
                f.write(data.read())
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')
        training_set = (
            read_image_file(os.path.join(raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 'train-labels-idx1-ubyte')))
        test_set = (
            read_image_file(os.path.join(raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 't10k-labels-idx1-ubyte')))
        with open(os.path.join(processed_folder, training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_folder, test_file), 'wb') as f:
            torch.save(test_set, f)
        print('Done!')

    # Load the data into memory
    train_data, train_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'training.pt'))
    test_data, test_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'KMNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        # use the integer value, not the tensor repr, as the class-folder name
        for i, (img, label) in enumerate(zip(arr, labels.numpy())):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(), mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
    print("The KMNIST dataset is ready for you at {}".format(dataset_root))
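# The IDX header parsing in kmnist() can be checked by hand: get_int reads a
# big-endian 32-bit field, so the magic numbers of the image and label files
# come out as
#
#     get_int(b'\x00\x00\x08\x03')  # -> 2051, images (idx3) magic
#     get_int(b'\x00\x00\x08\x01')  # -> 2049, labels (idx1) magic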
def historical_wi(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the ICDAR2017 Historical Writer
    Identification dataset, in both its binarized and color variants, to the location specified
    on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    train_binarized_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-binarized.zip"
    train_colored_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-color.zip"
    test_binarized_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-binarized.zip?download=1"
    test_colored_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-color.zip?download=1"
    urls = [train_binarized_url, train_colored_url, test_binarized_url, test_colored_url]

    zip_name_train_binarized = "icdar17-historicalwi-training-binarized.zip"
    zip_name_train_color = "icdar17-historicalwi-training-color.zip"
    zip_name_test_binarized = "ScriptNet-HistoricalWI-2017-binarized.zip"
    zip_name_test_color = "ScriptNet-HistoricalWI-2017-color.zip"
    zip_names = [zip_name_train_binarized, zip_name_train_color,
                 zip_name_test_binarized, zip_name_test_color]

    # Length of the top-level folder prefix inside each archive
    start_indices = [
        len("icdar2017-training-binary/"),
        len("icdar2017-training-color/"),
        len("ScriptNet-HistoricalWI-2017-binarized/"),
        len("ScriptNet-HistoricalWI-2017-color/")
    ]

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'historical_wi')
    binarized_dataset = os.path.join(dataset_root, "BinarizedDataset")
    train_binarized_folder = os.path.join(binarized_dataset, 'train')
    test_binarized_folder = os.path.join(binarized_dataset, 'test')
    colored_dataset = os.path.join(dataset_root, "ColoredDataset")
    train_colored_folder = os.path.join(colored_dataset, 'train')
    test_colored_folder = os.path.join(colored_dataset, 'test')
    folders = [train_binarized_folder, train_colored_folder,
               test_binarized_folder, test_colored_folder]

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(binarized_dataset)
    make_folder_if_not_exists(colored_dataset)
    make_folder_if_not_exists(train_binarized_folder)
    make_folder_if_not_exists(train_colored_folder)
    make_folder_if_not_exists(test_binarized_folder)
    make_folder_if_not_exists(test_colored_folder)

    def _write_data_to_folder(zipfile, labels, folder, isTrainingset):
        print("Writing data to folder\n")
        for i, (entry, label) in enumerate(zip(zipfile.infolist()[1:], labels)):
            with zipfile.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                # Training images are stored as PNG, test images as JPEG
                if isTrainingset == 1:
                    img.save(os.path.join(dest, str(i) + '.png'))
                else:
                    img.save(os.path.join(dest, str(i) + '.jpg'))

    def _get_labels(zipfile, start_index):
        print("Extracting labels\n")
        labels = []
        for zipinfo in zipfile.infolist()[1:]:
            file_name = zipinfo.filename
            # the writer ID sits between the archive prefix and the first '-'
            ind = file_name.find("-", start_index)
            labels.append(file_name[start_index:ind])
        return labels

    # Prepare the datasets
    for i in range(len(urls)):
        isTrainingset = 1 if i < 2 else 0
        print("Downloading " + urls[i])
        local_filename, headers = urllib.request.urlretrieve(urls[i], zip_names[i])
        zfile = zipfile.ZipFile(local_filename)
        labels = _get_labels(zfile, start_indices[i])
        _write_data_to_folder(zfile, labels, folders[i], isTrainingset)
        os.remove(os.path.join(zfile.filename))
        if i == 0:
            print("Binary training data is ready!")
        elif i == 1:
            print("Colored training data is ready!")
        elif i == 2:
            print("Binary test data is ready!")
        else:
            print("Colored test data is ready!")

    split_dataset_writerIdentification(dataset_folder=dataset_root, split=0.2)
    print("Historical WI dataset is ready!")
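# _get_labels above assumes archive entries named '<writer-id>-<rest>' below a
# single top-level folder; e.g. for a hypothetical entry
# 'icdar2017-training-color/30-IMG_MAX_12345.jpg', with start_index pointing
# just past the folder prefix, the extracted label is '30'.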
def icdar2017_clamm(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the ICDAR2017 CLaMM dataset, for both
    style classification and manuscript dating, to the location specified on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_Training.zip"
    print("Downloading " + url)
    zip_name = "ICDAR2017_CLaMM_Training.zip"
    local_filename, headers = urllib.request.urlretrieve(url, zip_name)
    zfile = zipfile.ZipFile(local_filename)

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'ICDAR2017-CLAMM')
    dataset_manuscriptDating = os.path.join(dataset_root, 'ManuscriptDating')
    dataset_md_train = os.path.join(dataset_manuscriptDating, 'train')
    dataset_styleClassification = os.path.join(dataset_root, 'StyleClassification')
    dataset_sc_train = os.path.join(dataset_styleClassification, 'train')
    test_sc_folder = os.path.join(dataset_styleClassification, 'test')
    test_md_folder = os.path.join(dataset_manuscriptDating, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(dataset_manuscriptDating)
    make_folder_if_not_exists(dataset_styleClassification)
    make_folder_if_not_exists(test_sc_folder)

    def _write_data_to_folder(zipfile, filenames, labels, folder, start_index, isTest):
        print("Writing data\n")
        sorted_labels = [None] * len(labels)
        if isTest == 1:
            # IRHT_P_009793.tif has no entry in the ground-truth csv; drop it
            # so the label lookup below does not fail
            for i in range(len(zipfile.infolist())):
                entry = zipfile.infolist()[i]
                if "IRHT_P_009793.tif" in entry.filename:
                    zipfile.infolist().remove(entry)
                    break
        # Align the labels with the order of the entries in the archive
        zip_infolist = zipfile.infolist()[1:]
        for i in range(len(zip_infolist)):
            entry = zip_infolist[i]
            entry_index_infilenames = filenames.index(entry.filename[start_index:])
            sorted_labels[i] = labels[entry_index_infilenames]
        for i, (entry, label) in enumerate(zip(zipfile.infolist()[1:], sorted_labels)):
            with zipfile.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                img.save(os.path.join(dest, str(i) + '.png'), "PNG", quality=100)

    def getLabels(zfile):
        print("Extracting labels\n")
        filenames, md_labels, sc_labels = [], [], []
        zip_infolist = zfile.infolist()[1:]
        for entry in zip_infolist:
            if '.csv' in entry.filename:
                with zfile.open(entry) as file:
                    cf = file.read()
                    # parse the csv payload line by line (requires the
                    # standard-library io module)
                    c = io.StringIO(cf.decode())
                    next(c)  # Skip the first line, which is the header of the csv file
                    # Each row has the form <filename>;<style label>;<date label>
                    for row in c:
                        md_label_strt_ind = row.rfind(';')
                        md_label_end_ind = row.rfind("\r")
                        md_labels.append(row[md_label_strt_ind + 1:md_label_end_ind])
                        sc_labels_strt_ind = row[:md_label_strt_ind].rfind(';')
                        sc_labels.append(row[sc_labels_strt_ind + 1:md_label_strt_ind])
                        filename_ind = row[:sc_labels_strt_ind].rfind(';')
                        if filename_ind > -1:
                            f_name = row[filename_ind + 1:sc_labels_strt_ind]
                        else:
                            f_name = row[:sc_labels_strt_ind]
                        if isTest == 1 and f_name == 'IRHT_P_009783.tif':
                            print('No file named ' + f_name + ". This filename will not be added!")
                        else:
                            filenames.append(f_name)
                zfile.infolist().remove(entry)  # remove the csv file from infolist
            if '.db' in entry.filename:
                zfile.infolist().remove(entry)  # remove the db file from infolist
        return filenames, sc_labels, md_labels

    isTest = 0
    filenames, sc_labels, md_labels = getLabels(zfile)
    start_index_training = len("ICDAR2017_CLaMM_Training/")
    print("Training data is being prepared for style classification!\n")
    _write_data_to_folder(zfile, filenames, sc_labels, dataset_sc_train,
                          start_index_training, isTest)
    print("Training data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile, filenames, md_labels, dataset_md_train,
                          start_index_training, isTest)
    os.remove(os.path.join(zfile.filename))

    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_task1_task3.zip"
    print("Downloading " + url)
    zip_name_test = "ICDAR2017_CLaMM_task1_task3.zip"
    local_filename_test, headers_test = urllib.request.urlretrieve(url, zip_name_test)
    zfile_test = zipfile.ZipFile(local_filename_test)
    isTest = 1
    filenames_test, sc_test_labels, md_test_labels = getLabels(zfile_test)
    start_index_test = len("ICDAR2017_CLaMM_task1_task3/")
    print("Test data is being prepared for style classification!\n")
    _write_data_to_folder(zfile_test, filenames_test, sc_test_labels, test_sc_folder,
                          start_index_test, 1)
    print("Test data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile_test, filenames_test, md_test_labels, test_md_folder,
                          start_index_test, 1)
    os.remove(os.path.join(zfile_test.filename))

    print("Training-Validation splitting\n")
    split_dataset(dataset_folder=dataset_manuscriptDating, split=0.2, symbolic=False)
    split_dataset(dataset_folder=dataset_styleClassification, split=0.2, symbolic=False)
    print("ICDAR2017 CLaMM data is ready!")
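# The rfind-based parsing in getLabels can be traced on a sample ground-truth
# row (layout inferred from the code, values hypothetical):
#
#     row = 'IRHT_P_000001.tif;caroline;1001-1100\r\n'
#
#     md_labels gets '1001-1100'         (after the last ';', up to '\r')
#     sc_labels gets 'caroline'          (between the last two ';')
#     filenames gets 'IRHT_P_000001.tif' (everything before the first ';')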
def diva_hisdb(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the DIVA HisDB-all dataset for semantic
    segmentation to the location specified on the file system

    See also: https://diuf.unifr.ch/main/hisdoc/diva-hisdb

    Output folder structure: ../HisDB/CB55/train
                             ../HisDB/CB55/val
                             ../HisDB/CB55/test

                             ../HisDB/CB55/test/data -> images
                             ../HisDB/CB55/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """
    # make the root folder
    dataset_root = os.path.join(args.output_folder, 'HisDB')
    make_folder_if_not_exists(dataset_root)

    # links to HisDB data sets
    link_public = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/all.zip'
    )
    link_test_private = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/private-test/all-privateTest.zip'
    )
    download_path_public = os.path.join(dataset_root, link_public.geturl().rsplit('/', 1)[-1])
    download_path_private = os.path.join(dataset_root,
                                         link_test_private.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_public.geturl()))
    urllib.request.urlretrieve(link_public.geturl(), download_path_public)
    print('Downloading {}...'.format(link_test_private.geturl()))
    urllib.request.urlretrieve(link_test_private.geturl(), download_path_private)
    print('Download complete. Unpacking files...')

    # unpack relevant folders
    zip_file = zipfile.ZipFile(download_path_public)

    # unpack imgs and gt; each image archive has a matching pixel-level ground-truth archive
    data_gt_zip = {
        f: re.sub(r'img', 'pixel-level-gt', f)
        for f in zip_file.namelist() if 'img' in f
    }
    dataset_folders = [data_file.split('-')[-1][:-4] for data_file in data_gt_zip.keys()]

    for data_file, gt_file in data_gt_zip.items():
        dataset_name = data_file.split('-')[-1][:-4]
        dataset_folder = os.path.join(dataset_root, dataset_name)
        make_folder_if_not_exists(dataset_folder)

        for file in [data_file, gt_file]:
            zip_file.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file), "r") as zip_ref:
                zip_ref.extractall(dataset_folder)
            # delete zips
            os.remove(os.path.join(dataset_folder, file))

        # create folder structure
        for partition in ['train', 'val', 'test', 'test-public']:
            for folder in ['data', 'gt']:
                make_folder_if_not_exists(os.path.join(dataset_folder, partition, folder))

    # move the files to the correct place
    for folder in dataset_folders:
        for k1, v1 in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            for k2, v2 in {'public-test': 'test-public', 'training': 'train',
                           'validation': 'val'}.items():
                current_path = os.path.join(dataset_root, folder, k1, k2)
                new_path = os.path.join(dataset_root, folder, v2, v1)
                for f in [f for f in os.listdir(current_path)
                          if os.path.isfile(os.path.join(current_path, f))]:
                    shutil.move(os.path.join(current_path, f), os.path.join(new_path, f))
            # remove old folders
            shutil.rmtree(os.path.join(dataset_root, folder, k1))

    # fix naming issue
    for old, new in {'CS18': 'CSG18', 'CS863': 'CSG863'}.items():
        os.rename(os.path.join(dataset_root, old), os.path.join(dataset_root, new))

    # unpack private test folders
    zip_file_private = zipfile.ZipFile(download_path_private)
    data_gt_zip_private = {
        f: re.sub(r'img', 'pixel-level-gt', f)
        for f in zip_file_private.namelist() if 'img' in f
    }

    for data_file, gt_file in data_gt_zip_private.items():
        dataset_name = re.search('-(.*)-', data_file).group(1)
        dataset_folder = os.path.join(dataset_root, dataset_name)

        for file in [data_file, gt_file]:
            zip_file_private.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file), "r") as zip_ref:
                zip_ref.extractall(os.path.join(dataset_folder, file[:-4]))
            # delete zip
            os.remove(os.path.join(dataset_folder, file))

        # create folder structure
        for folder in ['data', 'gt']:
            make_folder_if_not_exists(os.path.join(dataset_folder, 'test', folder))

        for old, new in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            current_path = os.path.join(dataset_folder,
                                        "{}-{}-privateTest".format(old, dataset_name),
                                        dataset_name)
            new_path = os.path.join(dataset_folder, "test", new)
            for f in [f for f in os.listdir(current_path)
                      if os.path.isfile(os.path.join(current_path, f))]:
                # the ground truth files in the private test set have an additional
                # ending, which needs to be removed
                if new == "gt":
                    f_new = re.sub('_gt', r'', f)
                else:
                    f_new = f
                shutil.move(os.path.join(current_path, f), os.path.join(new_path, f_new))
            # remove old folders
            shutil.rmtree(os.path.dirname(current_path))

    print('Finished. Data set up at {}.'.format(dataset_root))
def split_dataset_segmentation(dataset_folder, split, symbolic, test=False):
    """
    Partition a dataset into train/val(/test) splits on the filesystem for segmentation datasets
    organized as dataset/data for the images and dataset/gt for the ground truth. Corresponding
    images need to have the same name.

    Parameters
    ----------
    dataset_folder : str
        Path to the dataset folder (see datasets.image_folder_dataset.load_dataset for details).
    split : float
        Specifies how much of the training set should be converted into the validation set.
    symbolic : bool
        Does not make a copy of the data, but only symbolic links to the original data
    test : bool
        If true, the validation set is split again (1:1) into a val and a test set. Default: False.

    Returns
    -------
    None
    """
    # Getting the train dir
    orig_dir = os.path.join(dataset_folder, 'train')

    # Rename the original train dir
    shutil.move(orig_dir, os.path.join(dataset_folder, 'original_train'))
    orig_dir = os.path.join(dataset_folder, 'original_train')

    # Sanity check on the training folder
    if not os.path.isdir(orig_dir):
        print("Train folder not found in the args.dataset_folder={}".format(dataset_folder))
        sys.exit(-1)

    # get the dataset splits
    path_data = os.path.join(orig_dir, "data")
    path_gt = os.path.join(orig_dir, "gt")
    file_names_data = sorted([f for f in os.listdir(path_data)
                              if os.path.isfile(os.path.join(path_data, f))])
    file_names_gt = sorted([f for f in os.listdir(path_gt)
                            if os.path.isfile(os.path.join(path_gt, f))])

    # Check data and ensure everything is cool
    assert len(file_names_data) == len(file_names_gt)
    for data, gt in zip(file_names_data, file_names_gt):
        assert data[:-3] == gt[:-3]  # exclude the extension, which should be jpg and png
        assert gt[-3:] == "png"

    # Split the data into two sets
    file_names = [(data, gt) for data, gt in zip(file_names_data, file_names_gt)]
    filenames_train, filenames_val, _, _ = train_test_split(file_names, file_names,
                                                            test_size=split, random_state=42)
    if test:
        # Split the validation set again into val and test
        filenames_val, filenames_test, _, _ = train_test_split(filenames_val, filenames_val,
                                                               test_size=0.5, random_state=42)

    # Make output folders
    dataset_root = os.path.join(dataset_folder)
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)

    if test:
        test_folder = os.path.join(dataset_root, 'test')
        make_folder_if_not_exists(test_folder)

    folders = [train_folder, val_folder, test_folder] if test else [train_folder, val_folder]
    file_splits = [filenames_train, filenames_val, filenames_test] if test \
        else [filenames_train, filenames_val]

    # Copying the splits into their folders
    for folder, split_files in zip(folders, file_splits):
        make_folder_if_not_exists(os.path.join(folder, 'data'))
        make_folder_if_not_exists(os.path.join(folder, 'gt'))
        for fdata, fgt in split_files:
            if symbolic:
                os.symlink(os.path.join(path_data, fdata), os.path.join(folder, 'data', fdata))
                os.symlink(os.path.join(path_gt, fgt), os.path.join(folder, 'gt', fgt))
            else:
                shutil.copy(os.path.join(path_data, fdata), os.path.join(folder, 'data', fdata))
                shutil.copy(os.path.join(path_gt, fgt), os.path.join(folder, 'gt', fgt))
    return
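# Example usage (a sketch, with a hypothetical dataset path): given a folder
# laid out as <root>/train/{data,gt}, carve out val and test splits with
# symlinks instead of copies:
#
#     split_dataset_segmentation(dataset_folder='/data/my_seg_dataset',
#                                split=0.2, symbolic=True, test=True)
#
# Afterwards the images stay under original_train/, while train/, val/ and
# test/ hold links into it.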
def glas(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the tubule dataset (from the GlaS
    challenge) for semantic segmentation to the location specified on the file system

    See also: https://github.com/choosehappy/public/tree/master/DL%20tutorial%20Code/3-tubule

    Output folder structure: ../HisDB/GlaS/train
                             ../HisDB/GlaS/val
                             ../HisDB/GlaS/test

                             ../HisDB/GlaS/test/data -> images
                             ../HisDB/GlaS/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular it is necessary to provide
        output_folder as String containing the path where the dataset will be downloaded

    Returns
    -------
    None
    """

    def groupby_patient(list_to_group, index=3):
        """
        Split images by patient.

        :param list_to_group: list of image names
        :param index: position of split by '-' in the image name to obtain the patient ID
        :return: dictionary where keys are patient IDs and values are lists of images
                 from that patient
        """
        return {
            '-'.join(filename.split('-')[:index]): [
                file for file in list_to_group
                if '-'.join(file.split('-')[:index]) == '-'.join(filename.split('-')[:index])
            ]
            for filename in list_to_group
        }

    def convert_gt(img_path):
        # load the annotation as a 2-D mask (pil_loader returns a PIL image, which
        # has no .shape attribute, hence the conversion to a grayscale numpy array)
        img = np.array(pil_loader(img_path).convert('L'))
        out_img = np.zeros((*img.shape, 3), dtype=np.uint8)
        out_img[:, :, 2] = 1            # set everything to background in blue channel
        out_img[:, :, 2][img != 0] = 2  # set glands to 2 in blue channel
        out = Image.fromarray(out_img)
        out.save(img_path)

    # make the root folder
    output_folder = args.output_folder
    dataset_root = os.path.join(output_folder, 'GlaS')
    make_folder_if_not_exists(dataset_root)

    # link to the tubule data set
    link_tubules = urllib.parse.urlparse('http://andrewjanowczyk.com/wp-static/tubule.tgz')
    download_path_tubules = os.path.join(dataset_root, link_tubules.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_tubules.geturl()))
    urllib.request.urlretrieve(link_tubules.geturl(), download_path_tubules)
    print('Download complete. Unpacking files...')

    # unpack tubule folder that contains images, annotations and text files with lists of
    # benign and malignant samples
    tar_file = tarfile.open(download_path_tubules)
    tar_file.extractall(path=dataset_root)

    sets_dict = {}
    # training: 20 benign + 20 malignant images
    train_ids_b = ['09-1339-01', '09-16566-03', '09-21631-03', '09-23232-02',
                   'm9_10741F-12T2N0', '10-13799-05']
    train_ids_m = ['09-322-02', '09-16566-02', '10-13799-06', '10-15247-02',
                   'm6_10719 T3N2a', 'm17_1421 IE-11 T3N2a', 'm18_1421 IE-11 1-86',
                   'm39_10-1273']
    sets_dict['train'] = train_ids_b + train_ids_m

    # validation has 29 images
    val_ids_b = ['10-12813-05', '10-13799-02', 'm2_10449-11E-T3N1b']
    val_ids_m = ['09-1339-02', '09-1339-05', '09-1646-01', '09-1646-02', '09-23757-01']
    sets_dict['val'] = val_ids_b + val_ids_m

    # test has an equal number of malignant and benign patients, 16 images
    test_ids_m = ['09-1646-03', '09-1646-05']
    test_ids_b = ['10-12813-01', '10-13799-01']
    sets_dict['test'] = test_ids_b + test_ids_m

    print('Splitting the dataset into train, val and test')
    for s in ['train', 'test', 'val']:
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'gt'))
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'data'))
        print('CREATING {} SET'.format(s))
        for patient in sets_dict[s]:
            for img_file in os.listdir(dataset_root):
                if patient in img_file:
                    if 'anno' in img_file:
                        # convert gt into the correct data format
                        convert_gt(os.path.join(dataset_root, img_file))
                        out_file = os.path.join('gt', img_file.replace('_anno', ''))
                    else:
                        out_file = os.path.join('data', img_file)
                    shutil.move(os.path.join(dataset_root, img_file),
                                os.path.join(dataset_root, s, out_file))