def download_cub():
    """ Download the birds dataset (CUB-200-2011) and its text captions """
    BIRDS_DATASET_URL = (
        "http://www.vision.caltech.edu/visipedia-data/CUB-200-2011/CUB_200_2011.tgz"
    )
    cub_download_location = pathlib.Path("data/CUB_200_2011.tgz")
    cub_backup_location = "data/backup/CUB_200_2011.tgz"

    if os.path.exists(cub_backup_location):
        # Reuse the locally cached archive rather than downloading it again
        print("Retrieving CUB dataset from: {}".format(cub_backup_location))
        shutil.copy(cub_backup_location, cub_download_location)
    else:
        print("Downloading CUB dataset from: {}".format(BIRDS_DATASET_URL))
        urllib.request.urlretrieve(BIRDS_DATASET_URL, cub_download_location)
        mkdir("data/backup")
        shutil.copy(cub_download_location, cub_backup_location)

    # Extract the images, then remove the archive to save disk space
    tar = tarfile.open(cub_download_location, "r:gz")
    tar.extractall("data/CUB_200_2011_with_text/images/")
    tar.close()
    os.remove(cub_download_location)

    download_captions(
        GDRIVE_ID="0B3y_msrWZaXLT1BZdVdycDY5TEE",
        text_download_location="data/birds.zip",
        backup_location="data/backup/birds.zip",
        res_subdir="CUB_200_2011_with_text",
    )
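# Note: `mkdir` (and `rmdir`, used further down) are assumed to be small
# repository helpers that create/remove directories without failing when the
# target already exists or is missing. A hypothetical equivalent, named
# differently here so it does not clash with the real helper:
import os


def _ensure_dir(path: str) -> None:
    # Create `path`, including parents, if it does not already exist
    os.makedirs(path, exist_ok=True)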
def download_captions(
    GDRIVE_ID: str,
    text_download_location: str,
    backup_location: str,
    res_subdir: str,
):
    """ Download and process the captions / text part of the dataset """
    extracted_text_dir = text_download_location[:-4]  # Strip the ".zip" suffix

    if os.path.exists(backup_location):
        # Reuse the locally cached archive rather than downloading it again
        print("Retrieving dataset from: {}".format(backup_location))
        shutil.copy(backup_location, text_download_location)
        with zipfile.ZipFile(backup_location, "r") as zipfd:
            zipfd.extractall("data/")
    else:
        print("Downloading text from Google Drive ID: {}".format(GDRIVE_ID))
        gdd.download_file_from_google_drive(
            file_id=GDRIVE_ID,
            dest_path=text_download_location,
            unzip=True,
        )
        mkdir("data/backup")
        shutil.copy(text_download_location, backup_location)

    # Move the extracted captions into the dataset directory and clean up
    if os.path.isdir(extracted_text_dir):
        os.rename(extracted_text_dir, f"data/{res_subdir}/text")
    else:
        raise Exception(
            "Expected to find directory {}, but it does not exist".format(
                extracted_text_dir
            )
        )
    os.remove(text_download_location)
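# Note: `gdd` above is assumed to be the GoogleDriveDownloader class from the
# third-party `googledrivedownloader` package (pip install googledrivedownloader);
# its download_file_from_google_drive(..., unzip=True) call both fetches the zip
# and extracts it next to dest_path. If the import is not already at the top of
# this module, it would look like:
from google_drive_downloader import GoogleDriveDownloader as gdd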
def compare_generated_to_real(
    dataloader,
    num_images: int,
    noise_size: int,
    model: tf.keras.Model,
    save_location: str,
    img_size: int,
    subsequent_model: Optional[tf.keras.Model] = None,
):
    """
    For a given number of images, generate the StackGAN Stage 1 output by
    randomly sampling a dataloader. The generated images and the real
    originals are saved side by side in save_location.
    """
    rmdir(save_location)
    mkdir(save_location)

    # One noise vector per image to be generated
    noise_list = [
        np.random.normal(0, 1, (1, noise_size)).astype("float32")
        for idx in range(num_images)
    ]
    samples = sample_data(dataloader, num_samples=num_images, img_size=img_size)
    real_tensors, real_embeddings = zip(*samples)

    # Stage 1: text embedding + noise -> low-resolution generated image
    stage1_tensors = [
        model.generator([embedding, noise], training=False)[0]
        for embedding, noise in zip(real_embeddings, noise_list)
    ]

    real_images = format_as_images(real_tensors, is_real=True)
    stage1_images = format_as_images(stage1_tensors, is_real=False)

    if subsequent_model is not None:
        # Stage 2: refine the Stage 1 output, conditioned on the same embedding
        stage2_tensors = [
            subsequent_model.generator([generated_image, embedding], training=False)[0]
            for generated_image, embedding in zip(stage1_tensors, real_embeddings)
        ]
        stage2_images = format_as_images(stage2_tensors, is_real=False)
        for i, (real_image, stage1_image, stage2_image) in enumerate(
            zip(real_images, stage1_images, stage2_images)
        ):
            image = concate_horizontallly(
                real_image, stage1_img=stage1_image, stage2_img=stage2_image
            )
            image.save(os.path.join(save_location, f"fake-vs-real-{i}.png"))
    else:
        for i, (real_image, stage1_image) in enumerate(
            zip(real_images, stage1_images)
        ):
            image = concate_horizontallly(real_image, stage1_img=stage1_image)
            image.save(os.path.join(save_location, f"fake-vs-real-{i}.png"))
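# Note: `format_as_images` and `concate_horizontallly` are repository helpers
# that are not shown in this section. The sketch below is a hypothetical PIL
# implementation of the horizontal-concatenation step, named differently so it
# does not clash with the real helper; it assumes the inputs are PIL images.
from PIL import Image


def _concat_horizontally_sketch(real_img, stage1_img, stage2_img=None):
    # Keep only the images that were actually provided
    images = [img for img in (real_img, stage1_img, stage2_img) if img is not None]
    # Canvas wide enough for all images side by side, tall enough for the tallest
    canvas = Image.new(
        "RGB",
        (sum(img.width for img in images), max(img.height for img in images)),
    )
    x_offset = 0
    for img in images:
        canvas.paste(img, (x_offset, 0))
        x_offset += img.width
    return canvas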
def check_for_xrays(directory: str):
    """
    Check to see if the xray dataset has been downloaded at all. Raise an
    exception if it hasn't. If it has, move it to raw.
    """
    train_location = os.path.join(directory, "train")
    valid_location = os.path.join(directory, "valid")
    raw_location = os.path.join(directory, "raw")

    if not os.path.isdir(train_location) or not os.path.isdir(valid_location):
        raise Exception("Please first download the CheXpert dataset")

    mkdir(raw_location)
    shutil.move(train_location, raw_location)
    shutil.move(valid_location, raw_location)
    shutil.move(f"{train_location}.csv", raw_location)
    shutil.move(f"{valid_location}.csv", raw_location)
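# For reference, check_for_xrays expects the manually downloaded CheXpert data
# to be laid out as <directory>/train, <directory>/valid, <directory>/train.csv
# and <directory>/valid.csv, and it leaves all four under <directory>/raw/ when
# it returns.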
def write_records_to_file(example_iterable: Iterable, subset_name: str,
                          tfrecords_dir: str):
    """
    Save the TFRecord dataset with each example in its own TFRecord file.

    Arguments:
        example_iterable: zip object (iterable)
            Each iteration yields a tuple of 7 objects: file name, small and
            large real images, small and large wrong (mismatched) images,
            text embedding and class label
        subset_name: str
            Name of the subset (train/test)
        tfrecords_dir: str
            Directory in which to save the TFRecords
    """
    # Create the subset directory once, before writing any records
    mkdir(os.path.join(tfrecords_dir, subset_name))

    for (
        i,
        (
            file_name,
            image_small,
            image_large,
            wrong_image_small,
            wrong_image_large,
            text_embedding,
            label,
        ),
    ) in enumerate(example_iterable):
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    "image_small": _bytes_feature(image_small),
                    "image_large": _bytes_feature(image_large),
                    "wrong_image_small": _bytes_feature(wrong_image_small),
                    "wrong_image_large": _bytes_feature(wrong_image_large),
                    "name": _bytes_feature(file_name),
                    "text": _bytes_feature(text_embedding),
                    "label": _int64_feature(label),
                }
            )
        )
        # Write a separate file to disk for each example
        record_path_name = os.path.join(
            tfrecords_dir, subset_name, "example-{}.tfrecord".format(i)
        )
        with tf.io.TFRecordWriter(record_path_name) as writer:
            writer.write(example.SerializeToString())
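# Hedged sketch (not part of the original module): one way the per-example
# TFRecord files written above could be read back with tf.data, assuming the
# feature keys used in write_records_to_file. The function name is illustrative.
import tensorflow as tf


def _parse_example_sketch(serialised_example):
    feature_spec = {
        "image_small": tf.io.FixedLenFeature([], tf.string),
        "image_large": tf.io.FixedLenFeature([], tf.string),
        "wrong_image_small": tf.io.FixedLenFeature([], tf.string),
        "wrong_image_large": tf.io.FixedLenFeature([], tf.string),
        "name": tf.io.FixedLenFeature([], tf.string),
        "text": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    return tf.io.parse_single_example(serialised_example, feature_spec)
# Example usage:
#   files = tf.io.gfile.glob(os.path.join(tfrecords_dir, "train", "*.tfrecord"))
#   dataset = tf.data.TFRecordDataset(files).map(_parse_example_sketch)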
def __init__(self, root_path: str):
    self.root_path = root_path
    # Directory where generated plots will be written
    self.plot_dir = os.path.join(self.root_path, "plots")
    mkdir(self.plot_dir)