def build_tfrecord(dataset_root_dir, tfrecord_save_path):
  """Builds the TFRecord files.

  Args:
    dataset_root_dir: The dataset directory where the dataset is stored.
    tfrecord_save_path: The directory to save the tfrecord files.
  """
  if not tf.gfile.Exists(tfrecord_save_path):
    tf.gfile.MakeDirs(tfrecord_save_path)

  photo_filenames, class_names = _get_filenames_and_classes(dataset_root_dir)

  # Shuffle and divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  valid_num = int((1 - _TRAIN_RATIO) * len(photo_filenames))
  training_filenames = photo_filenames[valid_num:]
  validation_filenames = photo_filenames[:valid_num]

  class_names_to_ids = dict(zip(class_names, range(len(class_names))))
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids)
  _convert_dataset('validation', validation_filenames, class_names_to_ids)

  # Finally, write the labels file:
  dataset_utils.write_label_file(labels_to_class_names, tfrecord_save_path)

  print('\nFinished converting the dataset!')
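# The `_convert_dataset` helpers called throughout these snippets are not shown.
# Below is a minimal sketch of what writing one image into a TFRecord shard
# typically looks like in this TF 1.x style; the helper name
# `_image_to_tfexample`, the feature keys, and the file paths are assumptions
# for illustration, not the exact helpers used above.
import tensorflow as tf


def _bytes_feature(value):
  # Wrap a bytes value in a tf.train.Feature.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
  # Wrap an int value in a tf.train.Feature.
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _image_to_tfexample(image_data, image_format, height, width, class_id):
  # Assumed feature keys; a given _convert_dataset may use different ones.
  return tf.train.Example(features=tf.train.Features(feature={
      'image/encoded': _bytes_feature(image_data),
      'image/format': _bytes_feature(image_format),
      'image/height': _int64_feature(height),
      'image/width': _int64_feature(width),
      'image/class/label': _int64_feature(class_id),
  }))


# Hypothetical usage: write one JPEG file into a shard.
with tf.python_io.TFRecordWriter('train-00000-of-00005.tfrecord') as writer:
  with tf.gfile.GFile('example.jpg', 'rb') as f:
    image_data = f.read()
  example = _image_to_tfexample(image_data, b'jpg', 224, 224, class_id=0)
  writer.write(example.SerializeToString())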
def run(dataset_dir):
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  validation_filenames = photo_filenames[:_NUM_VALIDATION]

  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Flowers dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'val') classes_train = sorted( list( filter(lambda x: os.path.isdir(join(dataset_dir, 'train', x)), os.listdir(join(dataset_dir, 'train'))))) classes_map = {} for idx, cls_train in enumerate(classes_train): classes_map[cls_train] = idx with contextlib2.ExitStack() as tf_record_close_stack: train_writer = dataset_utils.open_sharded_output_tfrecords( tf_record_close_stack, training_filename, _NUM_TRAIN_FILES) _create_tfrecord_train(dataset_dir, train_writer, classes_map) with contextlib2.ExitStack() as tf_record_close_stack: test_writer = dataset_utils.open_sharded_output_tfrecords( tf_record_close_stack, testing_filename, _NUM_TRAIN_FILES) _create_tfrecord_test(dataset_dir, test_writer, classes_map) labels_to_class_names = dict(zip(range(len(classes_train)), classes_train)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
def run(dataset_dir):
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  validation_filenames = photo_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  print('\nFinished converting the Flowers dataset!')
def convertToTfrecord(self, nValidations=None):
  if not nValidations:
    nValidations = self.nValidations
  self.nValidations = nValidations

  # Convert to TFRecords.
  photo_filenames, class_names = self._get_filenames_and_classes()
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(self.randomSeed)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[nValidations:]
  validation_filenames = photo_filenames[:nValidations]
  self.nTrains = len(photo_filenames) - nValidations

  # First, convert the training and validation sets.
  self._convert_dataset('train', training_filenames, class_names_to_ids)
  self._convert_dataset('validation', validation_filenames,
                        class_names_to_ids)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, self.datasetDir)
  self.labels_to_names = labels_to_class_names
  self.nClasses = len(labels_to_class_names)

  print('\nFinished converting the Matify dataset!')
def make_tfrecord(dataset_name, dataset_dir, train_fraction=0.9,
                  num_channels=3, num_shards=4):
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_name, dataset_dir, num_shards):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None, None

  random.seed(_RANDOM_SEED)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)

  # Divide into train and test:
  print("Now let's start converting the Koreans dataset!")
  random.shuffle(photo_filenames)
  num_train = int(len(photo_filenames) * train_fraction)
  num_validation = int(len(photo_filenames) * (1 - train_fraction))
  num_dataset = len(photo_filenames)
  training_filenames = photo_filenames[:num_train]
  validation_filenames = photo_filenames[num_train:]
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # First, convert the training and validation sets.
  _convert_dataset(dataset_name, 'train', training_filenames,
                   class_names_to_ids, dataset_dir, num_shards, num_channels)
  _convert_dataset(dataset_name, 'validation', validation_filenames,
                   class_names_to_ids, dataset_dir, num_shards, num_channels)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  # Write the split sizes so that they do not have to be recounted later.
  with open(os.path.join(dataset_dir, "metadata"), mode="w+") as metadata_file:
    json.dump({"num_train": num_train,
               "num_validation": num_validation,
               "num_classes": len(class_names)}, metadata_file)

  return num_dataset, len(class_names)
def run(): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(_OUTPUT_PATH): tf.gfile.MakeDirs(_OUTPUT_PATH) if _dataset_exists(_OUTPUT_PATH): print('Dataset files already exist. Exiting without re-creating them.') return # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(_INPUT_PATH) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids) _convert_dataset('validation', validation_filenames, class_names_to_ids) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, _OUTPUT_PATH) # _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Flowers dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Parameters ---------- dataset_dir : str The directory where the temporary files are stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = os.path.join(dataset_dir, 'train-{:05d}-of-{:05d}') testing_filename = os.path.join(dataset_dir, 'test-{:05d}-of-{:05d}') _download_and_uncompress_dataset(dataset_dir) # First, process the training data: filenames = [os.path.join(dataset_dir, 'cifar-10-batches-py', 'data_batch_{}'.format(i + 1)) for i in range(_NUM_TRAIN_FILES)] _build_shards(filenames, training_filename) # Next, process the testing data: filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'test_batch') _build_shards([filename], testing_filename) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the CIFAR10 dataset!')
def main():
  # ============================== CHECKS ==============================
  # Check if there is a tfrecord_filename entered.
  if not FLAGS.tfrecord_filename:
    raise ValueError(
        'tfrecord_filename is empty. Please state a tfrecord_filename argument.')

  # Check if there is a dataset directory entered.
  if not FLAGS.dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

  # If the TFRecord files already exist in the directory, then exit without
  # creating the files again.
  if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_filename=FLAGS.tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None
  # =========================== END OF CHECKS ===========================

  # Get a list of photo_filenames like ['123.jpg', '456.jpg', ...] and a list
  # of sorted class names from parsing the subdirectories.
  photo_filenames, class_names, test_set = permutate(FLAGS.dataset_dir, 10000, 400)

  # Refer each of the class names to a specific integer number for predictions later.
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Find the number of validation examples we need.
  # num_validation = int(FLAGS.validation_size * len(photo_filenames))

  # Divide the training datasets into train and test:
  random.seed(FLAGS.random_seed)
  random.shuffle(photo_filenames)
  random.shuffle(test_set)
  # training_filenames = photo_filenames[num_validation:]
  # validation_filenames = photo_filenames[:num_validation]
  training_filenames = photo_filenames
  validation_filenames = test_set

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  write_label_file(labels_to_class_names, FLAGS.dataset_dir)

  print('\nFinished converting the %s dataset!' % FLAGS.tfrecord_filename)
def run(dataset_dir, output_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    output_dir: The output directory.
  """
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Shuffle the image list with a fixed random seed.
  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  validation_filenames = photo_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids, output_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   output_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, output_dir)

  print('\nFinished converting the dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return dataset_dir_depth = "tmp/scenes/sc0013/depth" dataset_dir_rgb = "tmp/scenes/sc0013/rgb" photo_filenames_rgb, class_names_rgb = _get_filenames_and_classes( dataset_dir_rgb) photo_filenames_depth, class_names_depth = _get_filenames_and_classes( dataset_dir_depth) class_names_to_ids_rgb = dict( zip(class_names_rgb, range(len(class_names_rgb)))) class_names_to_ids_depth = dict( zip(class_names_depth, range(len(class_names_depth)))) # Divide into train and test: random.seed(_RANDOM_SEED) photo_filenames_rgb, photo_filenames_depth = shuffle( photo_filenames_rgb, photo_filenames_depth, random_state=_RANDOM_SEED) training_filenames_rgb = photo_filenames_rgb[_NUM_VALIDATION:] validation_filenames_rgb = photo_filenames_rgb[:_NUM_VALIDATION] training_filenames_depth = photo_filenames_depth[_NUM_VALIDATION:] validation_filenames_depth = photo_filenames_depth[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames_rgb, class_names_to_ids_rgb, dataset_dir_rgb) # _convert_dataset('validation', validation_filenames_rgb, class_names_to_ids_rgb, # dataset_dir_rgb) _convert_dataset('train', training_filenames_depth, class_names_to_ids_depth, dataset_dir_depth) # _convert_dataset('validation', validation_filenames_depth, class_names_to_ids_depth, # dataset_dir_depth) # Finally, write the labels file: labels_to_class_names_rgb = dict( zip(range(len(class_names_rgb)), class_names_rgb)) dataset_utils.write_label_file(labels_to_class_names_rgb, dataset_dir_rgb) labels_to_class_names_depth = dict( zip(range(len(class_names_depth)), class_names_depth)) dataset_utils.write_label_file(labels_to_class_names_depth, dataset_dir_depth) print('\nFinished converting the Objects dataset!')
def main():
  # ============================== CHECKS ==============================
  # Check if there is a tfrecord_filename entered.
  if not FLAGS.tfrecord_filename:
    raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')

  # Check if there is a dataset directory entered.
  if not FLAGS.dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

  # Check if there is a tfrecord directory entered.
  if not FLAGS.tfrecord_dir:
    raise ValueError('tfrecord_dir is empty. Please state a tfrecord_dir argument.')

  if not os.path.exists(FLAGS.tfrecord_dir):
    os.makedirs(FLAGS.tfrecord_dir)

  # If the TFRecord files already exist in the directory, then exit without
  # creating the files again.
  if _dataset_exists(dataset_dir=FLAGS.tfrecord_dir,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_filename=FLAGS.tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None
  # =========================== END OF CHECKS ===========================

  # Get a list of photo_filenames and a list of sorted class names from
  # parsing the subdirectories.
  photo_filenames, class_ids = _get_filenames_and_classes(FLAGS.dataset_dir)

  # Refer each of the class names to a specific integer number for predictions later.
  class_ids_to_serial = dict(zip(class_ids, range(len(class_ids))))

  # Write the labels file:
  serial_to_class_ids = dict(zip(range(len(class_ids)), class_ids))
  write_label_file(serial_to_class_ids, FLAGS.dataset_dir)

  # Find the number of validation examples we need.
  num_validation = int(float(FLAGS.validation_size) * len(photo_filenames))

  # Divide the training datasets into train and test:
  random.seed(FLAGS.random_seed)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  # Convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_ids_to_serial,
                   dataset_dir=FLAGS.tfrecord_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards,
                   simulate=FLAGS.simulate)
  _convert_dataset('validation', validation_filenames, class_ids_to_serial,
                   dataset_dir=FLAGS.tfrecord_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards,
                   simulate=FLAGS.simulate)

  print('\nFinished converting the %s dataset!' % FLAGS.tfrecord_filename)
def main():
  # ============================== CHECKS ==============================
  # Check if there is a tfrecord_filename entered.
  if not FLAGS.tfrecord_filename:
    raise ValueError(
        'tfrecord_filename is empty. Please state a tfrecord_filename argument.')

  # Check if there is a dataset directory entered.
  if not FLAGS.dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

  # If the TFRecord files already exist in the directory, then exit without
  # creating the files again.
  if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_filename=FLAGS.tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None
  # =========================== END OF CHECKS ===========================

  # Get a list of photo_filenames like ['123.jpg', '456.jpg', ...] and a list
  # of sorted class names from parsing the subdirectories.
  photo__train_filenames, photo__val_filenames, class_names = (
      _get_filenames_and_classes(FLAGS.dataset_dir))

  # Refer each of the class names to a specific integer number for predictions later.
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  training_filenames = photo__train_filenames
  validation_filenames = photo__val_filenames

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  write_label_file(labels_to_class_names, FLAGS.dataset_dir)

  print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main(argv):
  if not FLAGS.tfrecord_filename:
    raise ValueError(
        "tfrecord_filename is empty. Please state a tfrecord_filename argument.")

  if not FLAGS.dataset_dir:
    raise ValueError("dataset_dir is empty. Please state a dataset_dir argument.")

  if _dataset_exists(
      dataset_dir=FLAGS.dataset_dir,
      _NUM_SHARDS=FLAGS.num_shards,
      output_filename=FLAGS.tfrecord_filename,
  ):
    return None

  photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  num_validation = int(FLAGS.validation_size * len(photo_filenames))
  random.seed(FLAGS.random_seed)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  _convert_dataset(
      "train",
      training_filenames,
      class_names_to_ids,
      dataset_dir=FLAGS.dataset_dir,
      tfrecord_filename=FLAGS.tfrecord_filename,
      _NUM_SHARDS=FLAGS.num_shards,
  )
  _convert_dataset(
      "validation",
      validation_filenames,
      class_names_to_ids,
      dataset_dir=FLAGS.dataset_dir,
      tfrecord_filename=FLAGS.tfrecord_filename,
      _NUM_SHARDS=FLAGS.num_shards,
  )

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  write_label_file(labels_to_class_names, FLAGS.dataset_dir)
def main():
  if not FLAGS.tfrecord_filename:
    raise ValueError(
        'tfrecord_filename is empty. Please state a tfrecord_filename argument.')

  # Check if there is a dataset directory entered.
  if not FLAGS.dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

  # If the TFRecord files already exist in the directory, then exit without
  # creating the files again.
  if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_filename=FLAGS.tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None

  photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  num_validation = int(FLAGS.validation_size * len(photo_filenames))
  random.seed(FLAGS.random_seed)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  write_label_file(labels_to_class_names, FLAGS.dataset_dir)
  write_data_summary(num_validation, len(photo_filenames), FLAGS.dataset_dir)
def run(dataset_dir, train_file, val_file):
  """Runs the conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    train_file: Input file for the 'train' split.
    val_file: Input file for the 'validation' split.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  # First, convert the training and validation sets.
  _convert_dataset('train', train_file, dataset_dir)
  _convert_dataset('validation', val_file, dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = {0: 'background', 1: 'foreground'}
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  print('\nFinished converting the cityscapes dataset!')
def run(dataset_dir):
  training_filenames, class_names = _get_filenames_and_classes(
      os.path.join(dataset_dir, 'train'))
  validation_filenames, class_names = _get_filenames_and_classes(
      os.path.join(dataset_dir, 'validation'))
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  random.seed(0)
  random.shuffle(training_filenames)
  random.shuffle(validation_filenames)

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
def build_tfrecord(dataset_root_dir, tfrecord_save_path): """build the TF record files. Args: dataset_root_dir: The dataset directory where the dataset is stored. tfrecord_save_path: The directory to save the tfrecord files """ print('\nStart...') if not tf.gfile.Exists(tfrecord_save_path): tf.gfile.MakeDirs(tfrecord_save_path) else: print("tfrecord_save_path has exist, please check!") print("stop") return print('\nloading all images\' filename list...') photo_filenames, class_names = _get_filenames_and_classes(dataset_root_dir) # Shuffle and divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) train_ratio = _TRAIN_NUM_SHARDS / (_TRAIN_NUM_SHARDS + _VALID_NUM_SHARDS) valid_num = int((1 - train_ratio) * len(photo_filenames)) training_filenames = photo_filenames[valid_num:] validation_filenames = photo_filenames[:valid_num] print('the total size of training dataset: %d' % len(training_filenames)) print('the total size of validation dataset: %d' % len(validation_filenames)) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) labels_to_class_names = dict(zip(range(len(class_names)), class_names)) # First, convert the training and validation sets. print('\nStart converting the training dataset...') _convert_dataset('train', training_filenames, class_names_to_ids) print('\nStart converting the validation dataset...') _convert_dataset('validation', validation_filenames, class_names_to_ids) # Finally, write the labels file: dataset_utils.write_label_file(labels_to_class_names, tfrecord_save_path) print('\nFinished converting the dataset!')
def buildTfRecordFile(input_images_path, out_path):
  file_directory = input_images_path
  dataset_dir = file_directory
  if not os.path.exists(dataset_dir):
    print('The directory for dataset (i.e., "' + dataset_dir +
          '") does not exist.')
    exit()

  output_dir = out_path
  if output_dir[-1] != "/":
    output_dir += "/"
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)

  # STORK sets the validation percentage elsewhere in the code and
  # recommends setting it to zero during the conversion.
  validation_percentage = 0  # (float(0) / 100.0)

  if not os.path.exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  image_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  _NUM_VALIDATION = int(len(image_filenames) * validation_percentage)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(image_filenames)
  training_filenames = image_filenames[_NUM_VALIDATION:]
  validation_filenames = image_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir, output_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir, output_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, output_dir)

  print('\nFinished converting dataset!')
  print('The converted data is stored in the directory: "' + output_dir + '"')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) if len(photo_filenames) == 0: print(" no files detected") exit(-1) zz = zip(class_names, range(len(class_names))) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) labels_to_class_names = dict(zip(range(len(class_names)), class_names)) if os.path.exists(OUTPUT_PATH) is True: shutil.rmtree(OUTPUT_PATH) if os.path.exists(OUTPUT_PATH) is False: os.makedirs(OUTPUT_PATH) dataset_utils.write_label_file(labels_to_class_names, OUTPUT_PATH) print("all files:%d classes: %d " % (len(photo_filenames), len(class_names))) _NUM_VALIDATION = math.ceil(len(photo_filenames)/10) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] num_shards = math.ceil(len(training_filenames) / 1000) g_process.set_all_step(num_shards) _convert_dataset("train", training_filenames, class_names_to_ids, dataset_dir, num_shards) num_shards = math.ceil(len(validation_filenames) / 1000) g_process.reset() g_process.set_all_step(num_shards) _convert_dataset("validation", validation_filenames, class_names_to_ids, dataset_dir, num_shards)
def main(_): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ dataset_dir = FLAGS.dataset_dir if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: offset = 0 for i in range(_NUM_TRAIN_FILES): filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'data_batch_%d' % (i + 1)) # 1-indexed. offset = _add_to_tfrecord(filename, tfrecord_writer, offset) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'test_batch') _add_to_tfrecord(filename, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Cifar10 dataset!')
def run(args): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ dataset_dir = FLAGS.dataset_dir if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists( testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return # _download_dataset(dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the MNIST dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ # if not tf.gfile.Exists(dataset_dir): # tf.gfile.MakeDirs(dataset_dir) # if _dataset_exists(dataset_dir): # print('Dataset files already exist. Exiting without re-creating them.') # return # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] d = {'burgers': 0, 'notburgers': 0} for fn in training_filenames: if 'all/burgers' in fn: d['burgers'] += 1 else: d['notburgers'] += 1 print(d) validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
def run(argv): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ dataset_dir = FLAGS.dataset_dir if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return _download_dataset(dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the MNIST dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return # # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) # First, process the training data: # with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: # offset = 0 # # filename = 'D:/pig_recognize/pig_slim1/pig_data_all' # 1-indexed. # offset = _add_to_tfrecord(filename, tfrecord_writer, offset) #''' # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: filename = 'D:/pig_recognize/pig_slim1/pig_test_b_body_face' _add_to_tfrecord(filename, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) # _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Cifar10 dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, landmarks = _get_filenames_and_landmarks(dataset_dir) filenames_to_landmarks = dict(zip(photo_filenames, landmarks)) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, filenames_to_landmarks, dataset_dir) _convert_dataset('validation', validation_filenames, filenames_to_landmarks, dataset_dir) # Finally, write the labels file: landmark_names = ['left_eye', 'right_eye', 'nose'] labels_landmark_names = dict( list(zip(list(range(len(landmark_names))), landmark_names))) dataset_utils.write_label_file(labels_landmark_names, dataset_dir) print('\nFinished converting the dataset!')
def main():
  # ============================== CHECKS ==============================
  # Check if there is a tfrecord_filename entered.
  if not FLAGS.tfrecord_filename:
    raise ValueError(
        'tfrecord_filename is empty. Please state a tfrecord_filename argument.')

  # Check if there is a dataset directory entered.
  if not FLAGS.dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

  # If the TFRecord files already exist in the directory, then exit without
  # creating the files again.
  if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_filename=FLAGS.tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None
  # =========================== END OF CHECKS ===========================

  # Get a list of photo_filenames like ['123.jpg', '456.jpg', ...] and a list
  # of sorted class names from parsing the subdirectories.
  photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir)
  print('{}; {}'.format(len(photo_filenames), len(class_names)))

  # Refer each of the class names to a specific integer number for predictions later.
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Find the number of validation examples we need.
  num_validation = int(FLAGS.validation_size * len(photo_filenames))

  # Divide the training datasets into train and test:
  random.seed(FLAGS.random_seed)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  # Save the validation image list to a CSV file.
  with open("./data/validate.csv", 'w') as f:
    f.write('IMAGE_NAME\n')
    for file in validation_filenames:
      head, filename = os.path.split(file)
      class_name = os.path.basename(os.path.dirname(file))
      f.write(str(filename) + ',' + str(class_name) + '\n')

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  write_label_file(labels_to_class_names, FLAGS.dataset_dir)

  print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def run(dataset_dir, max_class_size=1000):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    max_class_size: Maximum number of ISIC training images kept per class.
  """
  if not tf.gfile.Exists(dataset_dir + "dataset_test/"):
    tf.gfile.MakeDirs(dataset_dir + "dataset_test/")

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  ##########################
  #          ISIC          #
  ##########################
  # Shuffle the data (the same seed for both lists keeps them aligned).
  filepaths_isic, fileclasses_isic = _get_filenames_and_classes_isic(dataset_dir)
  assert len(filepaths_isic) == len(fileclasses_isic)
  random.seed(_RANDOM_SEED)
  random.shuffle(filepaths_isic)
  random.seed(_RANDOM_SEED)
  random.shuffle(fileclasses_isic)

  ##########################
  #        TEST ISIC       #
  ##########################
  # Construct the test set: up to _NUM_TEST_ISIC_PER_CLASS images for each of
  # the two dermoscopy classes, capped at _NUM_TEST_ISIC overall.
  test_filepaths_isic = []
  test_fileclasses_isic = []
  index_isic_ = []
  count_isic = 0
  count_isic_ = {
      "pigmented-lesions-benign/melanocytic-nevi/dermoscopy/": 0,
      "pigmented-lesions-malignant/melanoma/dermoscopy/": 0,
  }
  for j, i in enumerate(filepaths_isic):
    cls = fileclasses_isic[j]
    if cls in count_isic_ and count_isic_[cls] < _NUM_TEST_ISIC_PER_CLASS:
      test_filepaths_isic.append(i)
      test_fileclasses_isic.append(cls)
      count_isic_[cls] += 1
      index_isic_.append(j)
      count_isic += 1
    if count_isic >= _NUM_TEST_ISIC:
      break

  # Keep at most max_class_size ISIC training images per class.
  val_dico_isic = [i for i in list(dico_isic.values()) if i is not None]
  count_training_isic = {i: 0 for i in val_dico_isic}
  training_filepaths_isic = []
  training_fileclasses_isic = []
  for i, j in enumerate(filepaths_isic):
    if (i not in index_isic_ and
        count_training_isic[fileclasses_isic[i]] < max_class_size):
      training_filepaths_isic.append(j)
      training_fileclasses_isic.append(fileclasses_isic[i])
      count_training_isic[fileclasses_isic[i]] += 1

  ##########################
  #          URLS          #
  ##########################
  filepaths, fileclasses = _get_filenames_and_classes(dataset_dir)

  ##########################
  #          MERGE         #
  ##########################
  filepaths = filepaths + training_filepaths_isic
  fileclasses = fileclasses + training_fileclasses_isic

  # Shuffle the data (the same seed for both lists keeps them aligned).
  assert len(filepaths) == len(fileclasses)
  random.seed(_RANDOM_SEED)
  random.shuffle(filepaths)
  random.seed(_RANDOM_SEED)
  random.shuffle(fileclasses)

  ##########################
  #           VAL          #
  ##########################
  # Take up to _NUM_VALIDATION_PER_CLASS images per top-level class, capped at
  # _NUM_VALIDATION overall.
  validation_filepaths = []
  validation_fileclasses = []
  index_ = []
  count = 0
  count_ = {
      "benign-dermal-tumors-cysts-sinuses": 0,
      "cutaneous-lymphoma-and-lymphoid-infiltrates": 0,
      "epidermal-tumors-hamartomas-milia-and-growths-benign": 0,
      "epidermal-tumors-pre-malignant-and-malignant": 0,
      "genodermatoses-and-supernumerary-growths": 0,
      "inflammatory": 0,
      "malignant-dermal-tumor": 0,
      "pigmented-lesions-benign": 0,
      "pigmented-lesions-malignant": 0,
  }
  for j, i in enumerate(filepaths):
    name_ = fileclasses[j].split('/')[0]
    if name_ in count_ and count_[name_] < _NUM_VALIDATION_PER_CLASS:
      validation_filepaths.append(i)
      validation_fileclasses.append(fileclasses[j])
      count_[name_] += 1
      index_.append(j)
      count += 1
    if count >= _NUM_VALIDATION:
      break

  ##########################
  #          TRAIN         #
  ##########################
  training_filepaths = [
      filepaths[i] for i in range(len(filepaths)) if i not in index_
  ]
  training_fileclasses = [
      fileclasses[i] for i in range(len(fileclasses)) if i not in index_
  ]

  ##########################
  #          CLASS         #
  ##########################
  class_names = sorted(list(set(fileclasses)))
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # First, convert the test, validation and training sets.
  _convert_dataset('test_isic', test_filepaths_isic, test_fileclasses_isic,
                   class_names_to_ids, dataset_dir)
  _convert_dataset('validation', validation_filepaths, validation_fileclasses,
                   class_names_to_ids, dataset_dir)
  _convert_dataset('train', training_filepaths, training_fileclasses,
                   class_names_to_ids, dataset_dir)

  # Finally, write the labels files:
  # We associate a number to the finest-level classes.
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  # We associate a number to the nine top-level classes.
  class_names_9 = sorted(
      list(set([name.split('/')[0] for name in class_names])))
  labels_to_class_names_9 = dict(zip(range(len(class_names_9)), class_names_9))
  # Mapping from finest-level classes to the nine top-level classes.
  labels_to_labels_9 = {}
  for k, v in labels_to_class_names.items():
    name = v.split('/')[0]
    label_9 = list(labels_to_class_names_9.values()).index(name)
    labels_to_labels_9[k] = label_9

  # Write the corresponding files.
  dataset_utils.write_label_file(labels_to_class_names,
                                 dataset_dir + "dataset_test/",
                                 filename='labels.txt')
  dataset_utils.write_label_file(labels_to_class_names_9,
                                 dataset_dir + "dataset_test/",
                                 filename='labels_9.txt')
  dataset_utils.write_label_file(labels_to_labels_9,
                                 dataset_dir + "dataset_test/",
                                 filename='labels_to_labels_9.txt')

  # _clean_up_temporary_files(dataset_dir)
  print('\nFinished!')
def main():
  args = get_args()

  # State your dataset directory.
  flags.DEFINE_string('dataset_dir', os.path.expanduser(args.dataset_dir),
                      'String: Your dataset directory')

  # The proportion of images placed in the validation set. This is essentially
  # your evaluation dataset.
  flags.DEFINE_float(
      'validation_size', 0.3,
      'Float: The proportion of examples in the dataset to be used for validation')

  # The number of shards to split the dataset into.
  flags.DEFINE_integer('num_shards', 2,
                       'Int: Number of shards to split the TFRecord files')

  # Seed for repeatability.
  flags.DEFINE_integer('random_seed', 0,
                       'Int: Random seed to use for repeatability.')

  # Output filename for naming the TFRecord file.
  flags.DEFINE_string('tfrecord_filename', args.tfrecord_filename,
                      'String: The output filename to name your TFRecord file')

  FLAGS = flags.FLAGS

  if not FLAGS.tfrecord_filename:
    raise ValueError(
        'tfrecord_filename is empty. Please state a tfrecord_filename argument.')

  if not FLAGS.dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

  if dataset_exists(dataset_dir=FLAGS.dataset_dir,
                    _NUM_SHARDS=FLAGS.num_shards,
                    output_filename=FLAGS.tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None

  # Get a list of photo_filenames like ['123.jpg', '456.jpg', ...] and a list
  # of sorted class names from parsing the subdirectories.
  photo_filenames, class_names = get_filenames_and_classes(FLAGS.dataset_dir)

  # Refer each of the class names to a specific integer number for predictions later.
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Find the number of validation examples we need.
  num_validation = int(FLAGS.validation_size * len(photo_filenames))

  # Divide the training datasets into train and test:
  random.seed(FLAGS.random_seed)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  # First, convert the training and validation sets.
  convert_dataset('train', training_filenames, class_names_to_ids,
                  dataset_dir=FLAGS.dataset_dir,
                  tfrecord_filename=FLAGS.tfrecord_filename,
                  _NUM_SHARDS=FLAGS.num_shards)
  convert_dataset('validation', validation_filenames, class_names_to_ids,
                  dataset_dir=FLAGS.dataset_dir,
                  tfrecord_filename=FLAGS.tfrecord_filename,
                  _NUM_SHARDS=FLAGS.num_shards)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  write_label_file(labels_to_class_names, FLAGS.dataset_dir)

  print('\nFinished converting the %s dataset!' % FLAGS.tfrecord_filename)
def get_split(split_name, dataset_dir, labels_dir=None, file_pattern=None):
  """Retrieves an InputData object with the parameters for reading ImageNet data.

  Args:
    split_name: A train/test split name.
    dataset_dir: The base directory of the dataset sources.
    labels_dir: The folder where the labels file is located, and where it will
      be eventually written if missing.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.

  Returns:
    An `InputData` object.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
  if split_name not in _SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  if not labels_dir:
    labels_dir = dataset_dir

  if not file_pattern:
    file_pattern = _FILE_PATTERN
  file_pattern = file_pattern % split_name

  files = []
  # Allow for filename expansion w/out using Glob().
  # Example: 'train-[0,1023,05d]-of-01024' to generate:
  #   train-00000-of-01024
  #   train-00001-of-01024
  #   ...
  #   train-01023-of-01024
  m = re.match(r'(.*)\[(\d+),(\d+),([a-zA-Z0-9]+)\](.*)', file_pattern)
  if m:
    format_string = '%' + m.group(4)
    for n in range(int(m.group(2)), int(m.group(3)) + 1):
      seqstr = format_string % n
      files.append(
          os.path.join(dataset_dir, m.group(1) + seqstr + m.group(5)))
  else:
    path = os.path.join(dataset_dir, file_pattern)
    # If the file_pattern ends with '.list', then the file is supposed to be a
    # file which lists the input files one per line.
    if path.endswith('.list'):
      with gfile.Open(path, 'r') as list_file:
        for fpath in list_file:
          fpath = fpath.strip()
          if fpath:
            files.append(fpath)
    elif path.find('*') < 0:
      # If the path does not contain any glob pattern, assume it is a single
      # input file. Detection for glob patterns might be more complex, but all
      # the examples seen so far use '*' only.
      files.append(path)
    else:
      # Otherwise we assume it is a glob-able path.
      files = gfile.Glob(path)

  keys_to_features = {
      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
      'image/class/label': tf.FixedLenFeature([], dtype=tf.int64,
                                              default_value=-1),
      'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
                                             default_value=''),
      'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/class/label': tf.VarLenFeature(dtype=tf.int64),
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
      'label': slim.tfexample_decoder.Tensor('image/class/label'),
      'label_text': slim.tfexample_decoder.Tensor('image/class/text'),
      'object/bbox': slim.tfexample_decoder.BoundingBox(
          ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
      'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'),
  }

  decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                    items_to_handlers)

  labels_to_names = None
  if dataset_utils.has_labels(labels_dir):
    labels_to_names = dataset_utils.read_label_file(labels_dir)
  else:
    labels_to_names = create_readable_names_for_imagenet_labels()
    dataset_utils.write_label_file(labels_to_names, labels_dir)

  return InputData(data_sources=files,
                   decoder=decoder,
                   num_samples=_SPLITS_TO_SIZES[split_name],
                   items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
                   num_classes=_NUM_CLASSES,
                   labels_to_names=labels_to_names)
def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
  """Gets a dataset tuple with instructions for reading ImageNet.

  Args:
    split_name: A train/test split name.
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.
    reader: The TensorFlow reader type.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
  if split_name not in _SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  if not file_pattern:
    file_pattern = _FILE_PATTERN
  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

  # Allowing None in the signature so that dataset_factory can use the default.
  if reader is None:
    reader = tf.TFRecordReader

  keys_to_features = {
      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
      'image/class/label': tf.FixedLenFeature([], dtype=tf.int64,
                                              default_value=-1),
      'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
                                             default_value=''),
      'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/class/label': tf.VarLenFeature(dtype=tf.int64),
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
      'label': slim.tfexample_decoder.Tensor('image/class/label'),
      'label_text': slim.tfexample_decoder.Tensor('image/class/text'),
      'object/bbox': slim.tfexample_decoder.BoundingBox(
          ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
      'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'),
  }

  decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                    items_to_handlers)

  labels_to_names = None
  if dataset_utils.has_labels(dataset_dir):
    labels_to_names = dataset_utils.read_label_file(dataset_dir)
  else:
    labels_to_names = create_readable_names_for_imagenet_labels()
    dataset_utils.write_label_file(labels_to_names, dataset_dir)

  return slim.dataset.Dataset(data_sources=file_pattern,
                              reader=reader,
                              decoder=decoder,
                              num_samples=_SPLITS_TO_SIZES[split_name],
                              items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
                              num_classes=_NUM_CLASSES,
                              labels_to_names=labels_to_names)
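# The `slim.dataset.Dataset` returned above is normally consumed through a
# DatasetDataProvider. This is a minimal usage sketch; the TFRecord path and
# the queue parameters are assumptions, not values taken from the code above.
import tensorflow as tf

slim = tf.contrib.slim

# Build the split description using the get_split defined above.
dataset = get_split('train', '/path/to/tfrecords')  # placeholder path

# Read and decode examples with slim's queue-based provider.
provider = slim.dataset_data_provider.DatasetDataProvider(
    dataset,
    num_readers=4,
    common_queue_capacity=512,
    common_queue_min=256,
    shuffle=True)
[image, label] = provider.get(['image', 'label'])

# From here, `image` and `label` would typically feed a preprocessing
# function and tf.train.batch, as in the slim training scripts.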
def main():
  # ============= CHECKS ==============
  # Check if there is a tfrecord_filename entered.
  if not FLAGS.tfrecord_filename:
    raise ValueError(
        'tfrecord_filename is empty. Please state a tfrecord_filename argument.')

  # Check if there is a dataset directory entered.
  if not FLAGS.dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

  # If the TFRecord files already exist in the directory, then exit without
  # creating the files again.
  if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_filename=FLAGS.tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return None
  # ========== END OF CHECKS ============

  # Get a list of photo_filenames like ['123.jpg', '456.jpg', ...] and a list
  # of sorted class names from parsing the subdirectories.
  photo_filenames, class_names = _get_filenames_and_classes(
      FLAGS.dataset_dir, output_filename=FLAGS.tfrecord_filename)

  # Some datasets have a "categories" file with the actual names those
  # photo_names correspond to, such as the UECFOOD256 dataset. Let's map it out.
  # if os.path.exists(os.path.join(FLAGS.dataset_dir, 'category.txt')):
  #   with open(os.path.join(FLAGS.dataset_dir, 'category.txt')) as cat_file:
  #     replacement_dict = [cat_name.split('\t') for cat_name in cat_file]
  #   class_names = [replacement_dict[int(class_name)][1].replace('\n', '')
  #                  for class_name in class_names]
  #   import pdb; pdb.set_trace()

  # Refer each of the class names to a specific integer number for predictions later.
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Find the number of validation examples we need.
  num_validation = int(FLAGS.validation_size * len(photo_filenames))

  # Divide the training datasets into train and test:
  random.seed(FLAGS.random_seed)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir=FLAGS.dataset_dir,
                   tfrecord_filename=FLAGS.tfrecord_filename,
                   _NUM_SHARDS=FLAGS.num_shards)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  write_label_file(labels_to_class_names, FLAGS.dataset_dir)

  print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))