def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Flowers dataset!')
def run(dataset_dir, dataset):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    dataset: The dataset to convert, e.g. 'cifar10' or 'cifar100'.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train',
                                           dataset=dataset)
  testing_filename = _get_output_filename(dataset_dir, 'test',
                                          dataset=dataset)

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  dataset_utils.download_and_uncompress_tarball(_DATA_URL[dataset],
                                                dataset_dir)

  # First, process the training data:
  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
    offset = 0
    for i in range(_NUM_TRAIN_FILES[dataset]):
      filename = os.path.join(dataset_dir, _DATA_DIR[dataset],
                              _batch_name('train', offset=i, dataset=dataset))
      offset = _add_to_tfrecord(filename, tfrecord_writer, dataset, offset)

  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    filename = os.path.join(dataset_dir, _DATA_DIR[dataset],
                            _batch_name('test', offset=0, dataset=dataset))
    _add_to_tfrecord(filename, tfrecord_writer, dataset)

  # Finally, write the labels file:
  labels_to_class_names = dict(enumerate(_CLASS_NAMES[dataset]))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  if dataset == 'cifar100':
    coarse_labels_to_class_names = dict(enumerate(_COARSE_CLASS_NAMES))
    dataset_utils.write_label_file(coarse_labels_to_class_names, dataset_dir,
                                   filename=_COARSE_LABELS_FILENAME)

  _clean_up_temporary_files(dataset_dir, dataset)
  print('\nFinished converting the %s dataset!' % dataset)
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: offset = 0 for i in range(_NUM_TRAIN_FILES): filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'data_batch_%d' % (i + 1)) # 1-indexed. offset = _add_to_tfrecord(filename, tfrecord_writer, offset) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'test_batch') _add_to_tfrecord(filename, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) #_clean_up_temporary_files(dataset_dir) print('\nFinished converting the Cifar10 dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ print(dataset_dir) if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes('/home/hoangtrunghieu/Medico2018/imdb/Medico_2018_development_set') # photo_filenames, class_names = _get_filenames_and_classes('/home/hoangtrunghieu/Medico2018/imdb/medico_full/images/kvasir-dataset-v2') class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) _NUM_VALIDATION = int(len(photo_filenames) * _SPLIT_VALIDATION) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] print(class_names) # # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) # _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Medico dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ ''' if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return ''' # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) # print(len(photo_filenames)) # print(class_names) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: # random.seed(_RANDOM_SEED) # random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) print(labels_to_class_names) # _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Flowers dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ print('running') if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists( testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return _download_dataset(dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the MNIST dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: offset = 0 for i in range(_NUM_TRAIN_FILES): filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'data_batch_%d' % (i + 1)) # 1-indexed. offset = _add_to_tfrecord(filename, tfrecord_writer, offset) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'test_batch') _add_to_tfrecord(filename, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Cifar10 dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) # if _dataset_exists(dataset_dir): # print('Dataset files already exist. Exiting without re-creating them.') # return # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes("/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/train") valid_photo_filenames, class_names_v = _get_filenames_and_classes("/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/validation") #valid_photo_filenames, class_names_v = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames #training_filenames = photo_filenames[_NUM_VALIDATION:] print(photo_filenames) #validation_filenames = photo_filenames[:_NUM_VALIDATION] validation_filenames = valid_photo_filenames # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, "/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/train") _convert_dataset('validation', validation_filenames, class_names_to_ids, "/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/validation") # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) # _clean_up_temporary_files(dataset_dir) print('\nFinished converting the 2T IRMA dataset!')
def run(dataset_dir, train_name_list, test_name_list):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    train_name_list: Filenames to convert into the training split.
    test_name_list: Filenames to convert into the validation split.
  """
  # if not tf.gfile.Exists(dataset_dir):
  #   tf.gfile.MakeDirs(dataset_dir)

  # if _dataset_exists(dataset_dir):
  #   print('Dataset files already exist. Exiting without re-creating them.')
  #   return

  print(dataset_dir)
  # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  print(photo_filenames[0])
  print('training size : ' + str(len(train_name_list)))
  print('testing size : ' + str(len(test_name_list)))

  # training_filenames = photo_filenames[_NUM_VALIDATION:]
  # validation_filenames = photo_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', train_name_list, class_names_to_ids, dataset_dir)
  print('finished converting training')
  _convert_dataset('validation', test_name_list, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the nucleui dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.shuffle(photo_filenames) validation_filenames = photo_filenames[:_NUM_VALIDATION] training_filenames = photo_filenames[_NUM_VALIDATION:] #f = open("tmp/colon/test.txt", "w") #for i in test_filenames: # f.write(i + "\n") #f.close() # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) #_clean_up_temporary_files(dataset_dir) print('\nFinished converting the colon dataset!')
def run(dataset_dir): """Runs the conversion operation Args: dataset_dir: the dataset directory where the dataset is stored """ photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[:_NUM_TRAIN] validation_filenames = photo_filenames[_NUM_TRAIN:] path = os.path.join(dataset_dir, 'pathology_splits.txt') print('Creating file: ' + path) f = open(path, 'w+') f.write('Training Files:\n') for training_filename in training_filenames: f.write(training_filename + '\n') f.write('Validation Files:\n') for validation_filename in validation_filenames: f.write(validation_filename + '\n') #f.write('Testing Files:\n') #for testing_filename in testing_filenames: # f.write(testing_filename + '\n') _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # _convert_dataset('test', testing_filenames, class_names_to_ids, dataset_dir) labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) print('\nFinished converting the Pathology dataset!')
def run(dataset_dir, tf_record_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    tf_record_dir: The directory where the converted TFRecord files are written.
  """
  if not tf.gfile.Exists(tf_record_dir):
    tf.gfile.MakeDirs(tf_record_dir)

  if _dataset_exists(tf_record_dir):
    print('TF Record Dataset files already exist. Exiting without re-creating them.')
    return

  # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))
  # print("\n\n", photo_filenames, "\n\n")
  print("class_names_to_ids : ", class_names_to_ids)

  # Find the number of validation examples we need.
  num_validation = int(_VALIDATION_SIZE * len(photo_filenames))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   tf_record_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   tf_record_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, tf_record_dir)
def run(dataset_name, dataset_dir, num_shards, ratio_val):
  """Runs the download and conversion operation.

  Args:
    dataset_name: The name used for the output TFRecord files.
    dataset_dir: The dataset directory where the dataset is stored.
    num_shards: The number of shards per data split.
    ratio_val: The fraction of images held out for validation.
  """
  tf_record_dir = os.path.join(dataset_dir, 'tfrecord')
  if not tf.gfile.Exists(tf_record_dir):
    tf.gfile.MakeDirs(tf_record_dir)

  if _dataset_exists(tf_record_dir, dataset_name, num_shards):
    print('TFRecord files already exist. Exiting without re-creating them.')
    return

  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Calculate the number of validation images proportional to ratio_val.
  num_validation = int(len(photo_filenames) * ratio_val)

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_validation:]
  validation_filenames = photo_filenames[:num_validation]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir, dataset_name, tf_record_dir, num_shards)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir, dataset_name, tf_record_dir, num_shards)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, tf_record_dir)

  print('\nFinished converting the dataset!')
def main(_):
  if not FLAGS.dataset_name:
    raise ValueError('You must supply the dataset name with --dataset_name')
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  dataset_dir = FLAGS.dataset_dir

  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  validation_filenames = photo_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  # _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the %s dataset!' % FLAGS.dataset_name)
def main(_):
  dataset_dir = FLAGS.dataset_dir
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if dataset_exists(dataset_dir):
    print('error: the dataset already exists')
    exit()

  photo_filenames, class_names = get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)

  # Convert the dataset.
  convert_dataset(photo_filenames, class_names_to_ids, dataset_dir)

  # Save the class labels.
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  print('\nDone')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) validation_filenames, training_filenames, class_names = _get_filenames_and_classes( ) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ photo_filenames = get_filenames(dataset_dir, jpg_or_tiff='jpg') labels_csv = read_labels(dataset_dir) class_names = get_classnames(labels_csv) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) output_dir = os.path.join(dataset_dir, 'tensorflow') random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] test_filenames = get_filenames(dataset_dir, jpg_or_tiff='jpg', test_or_train='test') fnames_to_class_ids = {} for fname, tags in zip(labels_csv.image_name, labels_csv.tags): fnames_to_class_ids[fname] = [ class_names_to_ids[i] for i in tags.split() ] convert_dataset('test', test_filenames, None, output_dir) convert_dataset('train', training_filenames, fnames_to_class_ids, output_dir) convert_dataset('validation', validation_filenames, fnames_to_class_ids, output_dir) labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, output_dir) print('\nFinished converting!')
def run(dataset_dir, output_dir, filename, data_type, num_tfrecords):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The directory where the dataset is stored.
    output_dir: The directory where the tfrecords should be stored.
    filename: Name of a txt file that stores all the training data details.
    data_type: The data split being converted; used to name the output file.
    num_tfrecords: The number of tfrecord files to write.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)
    print('\nNeed to download dataset first!')
    return

  # The name of the converted tfrecord.
  TF_filename = _get_output_filename(output_dir, data_type, 0, num_tfrecords)

  if tf.gfile.Exists(TF_filename):
    print('\nDataset files already exist. Remove them and recreate a new directory.')
    shutil.rmtree(output_dir)
    os.makedirs(output_dir)

  # Process the training data:
  filenames, tracklet_ids, cam_ids = _get_image_filenames_and_labels(
      filename, data_type)
  _write_to_tfrecord(filenames, tracklet_ids, cam_ids, dataset_dir,
                     output_dir, data_type, num_tfrecords)

  unique_labels = list(set(tracklet_ids))
  unique_labels.sort()
  labels_to_write = dict(zip(range(len(unique_labels)), unique_labels))
  dataset_utils.write_label_file(labels_to_write, output_dir)

  print('\nFinished converting the training data!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: print("len(class_names)", len(class_names)) labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) # Uncomment if cleaning the data files is desired. # # _clean_up_temporary_files(dataset_dir) # _clean_up_temporary_files(dataset_dir) print('\nFinished converting the CASIA_NDIRIS dataset!')
def run(training_dataset_dir, testing_dataset_dir, convert_dataset_dir):
  """Runs the conversion operation.

  Args:
    training_dataset_dir: The directory where the training photos are stored.
    testing_dataset_dir: The directory where the testing photos are stored.
    convert_dataset_dir: The directory where the converted dataset is stored.
  """
  training_photo_filenames, training_class_names = _get_filenames_and_classes(
      training_dataset_dir)
  testing_photo_filenames, testing_class_names = _get_filenames_and_classes(
      testing_dataset_dir)
  class_names_to_ids = dict(
      zip(training_class_names, range(len(training_class_names))))

  if len(training_class_names) != len(testing_class_names):
    raise ValueError(
        'The training and testing datasets must contain the same classes')

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(training_photo_filenames)
  random.shuffle(testing_photo_filenames)

  # First, convert the training and validation sets.
  _convert_dataset('train', training_photo_filenames, class_names_to_ids,
                   convert_dataset_dir)
  _convert_dataset('validation', testing_photo_filenames, class_names_to_ids,
                   convert_dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(
      zip(range(len(training_class_names)), training_class_names))
  dataset_utils.write_label_file(labels_to_class_names, convert_dataset_dir)

  print('\nFinished converting the ICWT dataset!')
def run(dataset_dirs): """Runs the download and conversion operation. Args: dataset_dirs list that contains: -train_dir: The dataset directory where the train dataset is stored. -test_dir: The dataset directory where the test dataset is stored. """ if os.path.isdir('tf_data') == False: os.makedirs('tf_data') for dir_index in range(len(dataset_dirs)): dataset_dir = dataset_dirs[dir_index] dataset_type = 'train' if dir_index == 1: dataset_type = 'validation' if not tf.gfile.Exists(dataset_dir): raise ValueError('train_dir does not exist') photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) myfile = open('tf_data/' + dataset_type + '.txt', 'w') myfile.write(str(len(photo_filenames))) myfile.close() class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # First, convert the training and validation sets. _convert_dataset(dataset_type, photo_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, 'tf_data') print('\nFinished converting the Mushroom dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return _download_dataset(dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the MNIST dataset!')
def run(np_dir, tf_dir): """Runs the download and conversion operation. """ if not tf.gfile.Exists(tf_dir): tf.gfile.MakeDirs(tf_dir) if _dataset_exists(tf_dir): print('Dataset files already exist. Exiting without re-creating them.') return photo_filenames, class_names = _get_filenames_and_classes(np_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, tf_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, tf_dir) image_count = os.path.join(tf_dir, 'image_count.txt') with tf.gfile.Open(image_count, 'w') as f: f.write('%d\n' % len(training_filenames)) f.write('%d\n' % len(validation_filenames)) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, tf_dir) _clean_up_temporary_files(tf_dir) print('\nFinished converting the dataset!')
def main(_): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ dataset_dir=FLAGS.dataset_dir if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return if FLAGS.grey==True: _change_grow(dataset_dir) #tranfrom grayscale #_change_grow(dataset_dir) # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) # First, convert the training and validation sets. _convert_dataset(photo_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, "./data_train") #_clean_up_temporary_files(dataset_dir) print('\nFinished converting the customized dataset at directory: {0}'.format(dataset_dir))
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): # 路径不存在 tf.gfile.MakeDirs(dataset_dir) # 新建 if _dataset_exists(dataset_dir): # 查看tfr数据是否已经存在 print('Dataset files already exist. Exiting without re-creating them.') return dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # 将类别名与id对应 # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) # 将文件路径名随机打乱,实现数据打乱 training_filenames = photo_filenames[_NUM_VALIDATION:] # 取其余做训练 validation_filenames = photo_filenames[:_NUM_VALIDATION] # 350个做验证 # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) # 转成tfr数据 _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # 转成tfr数据 # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) # 让id与文件名(类别)对应起来 dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Flowers dataset!')
def run(dataset_dir, create_label_dict):
  """Runs the conversion operation.

  Args:
    dataset_dir: The root directory.
    create_label_dict: Whether to also write a labels file mapping ids to
      class names.
  """
  if not tf.gfile.Exists(dataset_dir):
    raise ValueError('The dataset directory must exist.')

  # Process each of the data splits:
  for idx, split in enumerate(_SPLIT_NAMES):
    data_filename = os.path.join(dataset_dir, _DATA_FILENAMES[idx])
    label_filename = os.path.join(dataset_dir, _LABEL_FILENAMES[idx])
    _convert_to_tfrecord(dataset_dir, split, data_filename, label_filename)

  if create_label_dict == True:
    class_filename = os.path.join(dataset_dir, _CLASSNAMES_FILENAME)
    with open(class_filename) as fClassNames:
      class_names = fClassNames.read().splitlines()
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  print('\nFinished converting the NUSWIDE dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ tf_record_path = os.path.join(dataset_dir, 'tf_record') if not tf.gfile.Exists(tf_record_path): tf.gfile.MakeDirs(tf_record_path) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: total = len(photo_filenames) # 20% for validation num_val = int(0.2 * total) num_train = total - num_val random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[num_val:] validation_filenames = photo_filenames[:num_val] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, tf_record_path) dataset_utils.write_split_file(num_train, num_val, tf_record_path) # _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Flowers dataset!')
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ # TODO: train with our own dataset modified by sdukaka # if not tf.gfile.Exists(dataset_dir): # tf.gfile.MakeDirs(dataset_dir) # # if _dataset_exists(dataset_dir): # print('Dataset files already exist. Exiting without re-creating them.') # return # photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) # TODO: only generate train data modified by sdukaka training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) print('\nFinished converting the Chairs dataset!')
def run(dataset_dir, custom_binary_validation=None,
        custom_binary_validation_label=None,
        custom_binary_validation_ratio=None,
        output_suffix=None, is_other_dir=None):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  if is_other_dir:
    run_other_dir(dataset_dir, output_suffix)

  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  random.seed(_RANDOM_SEED)

  # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  if custom_binary_validation:
    tmp_photo_filenames, class_names = _get_filenames_and_classes_by_label(
        dataset_dir, 'apparelv_binary_without_dummy')
    if not custom_binary_validation_ratio:
      custom_binary_validation_ratio = 0.
    if custom_binary_validation_ratio > 1:
      custom_binary_validation_ratio = 1.
    validation_filenames = []
    training_filenames = []
    for key in tmp_photo_filenames:
      if key == custom_binary_validation_label:
        ratio = custom_binary_validation_ratio
      else:
        ratio = 1. - custom_binary_validation_ratio
      random.shuffle(tmp_photo_filenames[key])
      training_filenames += tmp_photo_filenames[key][int(_NUM_VALIDATION * ratio):]
      print(key, len(tmp_photo_filenames[key][:int(_NUM_VALIDATION * ratio)]))
      validation_filenames += tmp_photo_filenames[key][:int(_NUM_VALIDATION * ratio)]
  else:
    photo_filenames, class_names = _get_filenames_and_classes(
        dataset_dir, 'apparelv_binary_without_dummy')

    # Divide into train and test:
    print("Now let's start converting the Koreans dataset!")
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir, output_suffix)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir, output_suffix)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  if output_suffix:
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir,
                                   'labels_' + output_suffix + '.txt')
  else:
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  # _clean_up_temporary_files(dataset_dir, 'apparel')
  print('\nFinished converting the Koreans dataset!')
def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
  """Gets a dataset tuple with instructions for reading ImageNet.

  Args:
    split_name: A train/test split name.
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.
    reader: The TensorFlow reader type.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
  if split_name not in _SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  if not file_pattern:
    file_pattern = _FILE_PATTERN
  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

  # Allowing None in the signature so that dataset_factory can use the default.
  if reader is None:
    reader = tf.TFRecordReader

  keys_to_features = {
      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
      'image/class/label': tf.FixedLenFeature([], dtype=tf.int64,
                                              default_value=-1),
      'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
                                             default_value=''),
      'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/class/label': tf.VarLenFeature(dtype=tf.int64),
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
      'label': slim.tfexample_decoder.Tensor('image/class/label'),
      'label_text': slim.tfexample_decoder.Tensor('image/class/text'),
      'object/bbox': slim.tfexample_decoder.BoundingBox(
          ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
      'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'),
  }

  decoder = slim.tfexample_decoder.TFExampleDecoder(
      keys_to_features, items_to_handlers)

  labels_to_names = None
  if dataset_utils.has_labels(dataset_dir):
    labels_to_names = dataset_utils.read_label_file(dataset_dir)
  else:
    labels_to_names = create_readable_names_for_imagenet_labels()
    dataset_utils.write_label_file(labels_to_names, dataset_dir)

  return slim.dataset.Dataset(
      data_sources=file_pattern,
      reader=reader,
      decoder=decoder,
      num_samples=_SPLITS_TO_SIZES[split_name],
      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
      num_classes=_NUM_CLASSES,
      labels_to_names=labels_to_names)
""" if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') else: # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) 不用下载 photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) # 每一类别都转换成一个id class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) # 其实已经打乱了顺序 training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) # _clean_up_temporary_files(dataset_dir+"flowers_photos") print('\nFinished converting the Cottons dataset!')
def convert_img_to_tfrecord(project_dir, dataset_name, dataset_dir, image_dir,
                            train_percentage, validation_percentage,
                            test_percentage, image_height, image_width,
                            **kwargs):
  """Runs the conversion operation.

  Args:
    project_dir: Directory under which the newly created dataset with
      tfrecords is stored by default.
    dataset_name: The name of the dataset that is created from the input dataset.
    dataset_dir: (optional) Directory where the newly created dataset with
      tfrecords is stored.
    image_dir: The dataset directory where the input images are stored.
    train_percentage: Percentage of images used for the train split.
    validation_percentage: Percentage of images used for the validation split.
    test_percentage: Percentage of images used for the test split.
    image_height: Target image height for the tfrecord.
    image_width: Target image width for the tfrecord.
  """
  # print(dataset_dir)
  # if not os.listdir(image_dir):
  #   raise ValueError('No label folders found in image directory --image_dir')
  if not image_dir:
    raise ValueError('You must supply an image directory with --image_dir')

  if dataset_dir:
    dataset_dir = os.path.join(dataset_dir, dataset_name)
  else:
    # Initialize default directories.
    dataset_dir = os.path.join(os.path.join(project_dir, 'datasets'),
                               dataset_name)

  # Delete the dataset directory if it exists.
  if os.path.exists(dataset_dir):
    shutil.rmtree(dataset_dir)

  # Call the convert dataset function.
  if len(os.listdir(image_dir)):
    # Create a new dataset directory.
    if not tf.gfile.Exists(dataset_dir):
      tf.gfile.MakeDirs(dataset_dir)

    if train_percentage + validation_percentage + test_percentage > 100:
      raise ValueError('The sum of train, validation, and test percentages '
                       'can not be greater than 100')

    photo_filenames, class_names = _get_filenames_and_classes(image_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    class_id = [class_names_to_ids[x.split('/')[-2]] for x in photo_filenames]
    # print('############', len(class_id))

    # Divide into train, validation and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    dataset_split = dict()
    training_filenames = photo_filenames[:]

    if test_percentage > 0:
      training_filenames, test_filenames = train_test_split(
          training_filenames, test_size=test_percentage / 100,
          random_state=_RANDOM_SEED, stratify=class_id)
      test_size = len(test_filenames)
      print('Number of test images: ', test_size)
      num_samples_per_class = _convert_dataset(
          'test', test_filenames, class_names_to_ids, dataset_dir,
          dataset_name, image_height, image_width)
      dataset_split['test'] = test_size
      dataset_split['test_per_class'] = num_samples_per_class

    if validation_percentage > 0:
      training_filenames, validation_filenames = train_test_split(
          training_filenames, test_size=validation_percentage / 100,
          random_state=_RANDOM_SEED)
      validation_size = len(validation_filenames)
      print('Number of validation images: ', validation_size)
      num_samples_per_class = _convert_dataset(
          'validation', validation_filenames, class_names_to_ids, dataset_dir,
          dataset_name, image_height, image_width)
      dataset_split['validation'] = validation_size
      dataset_split['validation_per_class'] = num_samples_per_class

    if train_percentage > 0:
      training_filenames, train_filenames = train_test_split(
          training_filenames, test_size=train_percentage / 100,
          random_state=_RANDOM_SEED)
      train_size = len(train_filenames)
      print('Number of training images: ', train_size)
      num_samples_per_class = _convert_dataset(
          'train', train_filenames, class_names_to_ids, dataset_dir,
          dataset_name, image_height, image_width)
      dataset_split['train'] = train_size
      dataset_split['train_per_class'] = num_samples_per_class

    # Finally, write the label and dataset json files:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
    dataset_utils.write_dataset_config_json(dataset_name, dataset_dir,
                                            class_names, dataset_split)

    print('\nFinished converting the', dataset_name, 'dataset! It is stored '
          'under the following directory:', dataset_dir)
    return dataset_dir
  else:
    raise ValueError(
        'image directory --image_dir=[{}] is empty'.format(image_dir))
def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. Exiting without re-creating them.') return #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) plant_filenames = photo_filenames['plants'] no_plant_filenames = photo_filenames['no_plants'] # check no_plant_filenames for corrupted images corrupted_images = 0 no_plant_save_filenames = [] print(len(no_plant_filenames)) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(plant_filenames) random.shuffle(no_plant_save_filenames) training_filenames = plant_filenames[_NUM_PLANT_VALIDATION:] training_filenames.extend( no_plant_save_filenames[_NUM_NO_PLANT_VALIDATION:91758]) random.seed(_RANDOM_SEED) random.shuffle(training_filenames) validation_plant_filenames = plant_filenames[:_NUM_PLANT_VALIDATION] validation_no_plant_filenames = no_plant_save_filenames[: _NUM_NO_PLANT_VALIDATION] validation_filenames = validation_plant_filenames validation_filenames.extend(validation_no_plant_filenames) random.seed(_RANDOM_SEED) random.shuffle(validation_filenames) # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir, _NUM_SHARDS_TRAINING) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir, _NUM_SHARDS_VALIDATION) # Second, convert the validation sets for plant and no_plant separately #_convert_dataset('validation_plant', validation_plant_filenames, class_names_to_ids, #dataset_dir, _NUM_SHARDS_SPLIT) #_convert_dataset('validation_no_plant', validation_no_plant_filenames, class_names_to_ids, #dataset_dir, _NUM_SHARDS_SPLIT) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) #_clean_up_temporary_files(dataset_dir) print('\nFinished converting the PlantVsNoplant dataset!')
def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
  """Gets a dataset tuple with instructions for reading ImageNet.

  Args:
    split_name: A train/test split name.
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.
    reader: The TensorFlow reader type.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
  if split_name not in _SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  if not file_pattern:
    file_pattern = _FILE_PATTERN
  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

  # Allowing None in the signature so that dataset_factory can use the default.
  if reader is None:
    reader = tf.TFRecordReader

  keys_to_features = {
      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
      'image/class/label': tf.FixedLenFeature([], dtype=tf.int64,
                                              default_value=-1),
      'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
                                             default_value=''),
      'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
      'image/object/class/label': tf.VarLenFeature(dtype=tf.int64),
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
      'label': slim.tfexample_decoder.Tensor('image/class/label'),
      'label_text': slim.tfexample_decoder.Tensor('image/class/text'),
      'object/bbox': slim.tfexample_decoder.BoundingBox(
          ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
      'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'),
  }

  decoder = slim.tfexample_decoder.TFExampleDecoder(
      keys_to_features, items_to_handlers)

  labels_to_names = None
  if dataset_utils.has_labels(dataset_dir):
    labels_to_names = dataset_utils.read_label_file(dataset_dir)
  else:
    labels_to_names = create_readable_names_for_imagenet_labels()
    dataset_utils.write_label_file(labels_to_names, dataset_dir)

  return slim.dataset.Dataset(
      data_sources=file_pattern,
      reader=reader,
      decoder=decoder,
      num_samples=_SPLITS_TO_SIZES[split_name],
      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
      num_classes=_NUM_CLASSES,
      labels_to_names=labels_to_names)
def run(dataset_dir, dataset_type):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    dataset_type: Which set should be downloaded and decompressed; one of
      'train2014', 'test2014', 'validation2014', 'train2016', 'test2016' or
      'train2016_2'.
  """
  if dataset_type == "train2014":
    _DATA_URL = url_train_2014
  elif dataset_type == "test2014":
    _DATA_URL = url_test_2014_1
  elif dataset_type == "validation2014":
    _DATA_URL = url_test_2014_2
  elif dataset_type == "train2016":
    _DATA_URL = url_train_2016
  elif dataset_type == "test2016":
    _DATA_URL = url_test_2016_1
  elif dataset_type == "train2016_2":
    _DATA_URL = url_train_2016_2
  else:
    print("There exists no such dataset %s, please choose one of 'train2014', 'test2014', 'validation2014', 'train2016', 'test2016', 'train2016_2'." % dataset_type)

  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  # TODO: Downloading.
  # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)

  if dataset_type != "test2016":
    print("Getting filenames and class_id's")
    # Extract and save pictures and class names in a dictionary.
    photo_filenames, image_to_id, id_to_class_name, class_names = (
        _get_filenames_and_classes(dataset_dir))
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    print("All filenames and class_id's found")

    # Divide into train and test:
    # TODO: Is a division necessary if the packages (urls) already contain splits?
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training1_filenames = photo_filenames[:_NUM_TRAIN_SET1]
    training2_filenames = photo_filenames[_NUM_TRAIN_SET1:_NUM_TRAIN_SET2]
    training3_filenames = photo_filenames[_NUM_TRAIN_SET2:]

    # First, convert the training and validation sets.
    _convert_dataset('train_set1', training1_filenames, class_names_to_ids,
                     image_to_id, dataset_dir)
    _convert_dataset('train_set2', training2_filenames, class_names_to_ids,
                     image_to_id, dataset_dir)
    _convert_dataset('train_set3', training3_filenames, class_names_to_ids,
                     image_to_id, dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    # Writes "labels.txt" with id (1:1000) -> class_id (e.g. 17266).
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
    # Writes "labels2.txt" with class_id (e.g. 17266) -> class_name (e.g. rose ...).
    dataset_utils.write_label_file(id_to_class_name, dataset_dir,
                                   filename="labels2.txt")

    _clean_up_temporary_files(_DATA_URL, dataset_dir)
    print('\nFinished converting the Flowers dataset!')
  else:
    photo_filenames, image_to_media_id = _get_filenames_and_classes(
        dataset_dir, is_train_set=False)
    _convert_dataset('test_set', photo_filenames, None, image_to_media_id,
                     dataset_dir, is_train_set=False)
    print('\nFinished converting the Flowers test dataset!')