def prepare_dataset(converter, dataset_dir, num_images, folds):
    """Create the train/eval/test TFRecord files for the dataset.

    Skips all work when every record file already exists; deletes and
    re-creates everything when only some of them exist.

    Args:
        converter: converter object passed through to `convert_images`.
        dataset_dir: directory where the TFRecord files are written.
        num_images: forwarded to `filenames_by_classes`.
        folds: forwarded to `filenames_by_classes`.
    """
    train_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.TRAIN)
    eval_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.EVAL)
    test_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.PREDICT)
    filenames = [train_filename, eval_filename, test_filename]
    files_exist = [tf.gfile.Exists(f) for f in filenames]
    if all(files_exist):
        print('Dataset files already exist. Exiting without re-creating them.')
        return
    if any(files_exist):
        print('Some Dataset files already exist but not all of them. Re-creating them.')
        delete_datasets('.', filenames)
    # Renamed from the original `filesnames_by_classes`, which was one typo
    # away from shadowing the helper `filenames_by_classes` it calls.
    class_filenames = filenames_by_classes(dataset_dir, num_images, folds)
    # One (output file, split key, print label) triple per split. The test
    # split prints the literal 'test' label, exactly as the original's
    # hand-unrolled third stanza did.
    splits = [
        (train_filename, Modes.TRAIN, Modes.TRAIN),
        (eval_filename, Modes.EVAL, Modes.EVAL),
        (test_filename, Modes.PREDICT, 'test'),
    ]
    for record_filename, split_key, label in splits:
        with tf.python_io.TFRecordWriter(record_filename) as tfrecord_writer:
            with tf.Session('') as session:
                print('converting {} images.'.format(label))
                convert_images(session, tfrecord_writer, converter,
                               class_filenames[split_key])
def prepare(dataset_dir):
    """Runs download and conversion operation.

    Downloads the pickled IMDB data, converts train/eval/test splits to
    TFRecords, writes the meta-data JSON file, and deletes the download.

    Args:
        dataset_dir: The dataset directory where the dataset is stored.
    """
    make_dataset_dir(dataset_dir)
    if all([
            tf.gfile.Exists(
                RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.TRAIN)),
            tf.gfile.Exists(
                RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.EVAL)),
            tf.gfile.Exists(
                RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.PREDICT)),
    ]):
        # BUG FIX: the original never called .format(), so the literal
        # placeholder '`{}`' was printed instead of the directory name.
        print('`{}` Dataset files already exist.'.format(dataset_dir))
        return
    download_datasets(dataset_dir, _DATA_URL, [_FILENAME])
    # The pickle file holds two consecutive objects: train set, then test set.
    with open(os.path.join(dataset_dir, _FILENAME), 'rb') as f:
        train_set = pickle.load(f)
        test_set = pickle.load(f)
    converter = SequenceToTFExampleConverter(
        sequence_features_types={'source_token': 'int'},
        context_features_types={'label': 'int'})
    num_items = len(train_set[0])
    len_eval_data = int(num_items * 0.1)  # hold out 10% of train for eval
    len_test_data = len(test_set[0])
    # NOTE(review): these calls pass 6 and 5 positional args respectively —
    # confirm they match the signature of the `prepare_dataset` imported here.
    prepare_dataset(converter, dataset_dir, train_set, Modes.TRAIN, num_items,
                    len_eval_data)
    prepare_dataset(converter, dataset_dir, test_set, Modes.PREDICT,
                    len_test_data)
    # Finally, write the meta data:
    with open(META_DATA_FILENAME_FORMAT.format(dataset_dir),
              'w') as meta_data_file:
        meta_data = converter.get_meta_data()
        meta_data['num_samples'] = {
            Modes.TRAIN: num_items - len_eval_data,
            Modes.EVAL: len_eval_data,
            Modes.PREDICT: len_test_data
        }
        meta_data['items_to_descriptions'] = {
            'source_token': 'A sequence of word ids.',
            'label': 'A single integer 0 or 1',
        }
        meta_data['num_classes'] = 2
        json.dump(meta_data, meta_data_file)
    delete_datasets(dataset_dir, [_FILENAME])
    print('\nFinished converting the IMDB dataset!')
def prepare_dataset(converter, dataset_dir, data_name, num_images, num_eval=0):
    """Download one split, convert it to TFRecords, and delete the sources.

    Args:
        converter: converter whose `convert` method writes the records.
        dataset_dir: directory for downloads and TFRecord output.
        data_name: split identifier embedded in the record file name.
        num_images: number of images/labels to extract from the raw files.
        num_eval: if non-zero, carve the first `num_eval` items out into a
            separate eval record file.
    """
    record_path = RECORD_FILE_NAME_FORMAT.format(dataset_dir, data_name)
    if num_eval:
        eval_record_path = RECORD_FILE_NAME_FORMAT.format(
            dataset_dir, ModeKeys.EVAL)
    if tf.gfile.Exists(record_path):
        print('`{}` Dataset files already exist. '
              'Exiting without re-creating them.'.format(record_path))
        return
    # Choose the raw source files once; the original branched on
    # `data_name == ModeKeys.TRAIN` twice with identical outcomes.
    if data_name == ModeKeys.TRAIN:
        source_files = [_TRAIN_DATA_FILENAME, _TRAIN_LABELS_FILENAME]
    else:
        source_files = [_TEST_DATA_FILENAME, _TEST_LABELS_FILENAME]
    download_datasets(dataset_dir, _DATA_URL, source_files)
    data_path = os.path.join(dataset_dir, source_files[0])
    labels_path = os.path.join(dataset_dir, source_files[1])
    images = _extract_images(data_path, num_images)
    labels = _extract_labels(labels_path, num_images)
    if num_eval:
        # The leading `num_eval` items become the eval split.
        eval_images, images = images[:num_eval], images[num_eval:]
        eval_labels, labels = labels[:num_eval], labels[num_eval:]
    with tf.python_io.TFRecordWriter(record_path) as tfrecord_writer:
        with tf.Session('') as session:
            converter.convert(session=session, writer=tfrecord_writer,
                              images=images, labels=labels,
                              total_num_items=len(images))
    if num_eval:
        with tf.python_io.TFRecordWriter(eval_record_path) as tfrecord_writer:
            with tf.Session('') as session:
                converter.convert(session=session, writer=tfrecord_writer,
                                  images=eval_images, labels=eval_labels,
                                  total_num_items=len(eval_images))
    delete_datasets(dataset_dir, source_files)
def prepare_dataset(converter, dataset_dir, data_name, num_images, num_eval=0):
    """Download, convert, and clean up one split of the dataset.

    Args:
        converter: converter whose `convert` method writes the records.
        dataset_dir: directory for downloads and TFRecord output.
        data_name: split identifier embedded in the record file name.
        num_images: number of images/labels to extract from the raw files.
        num_eval: if non-zero, the first `num_eval` items are written to a
            separate eval record file.
    """

    def write_split(path, split_images, split_labels):
        # One TFRecordWriter + one Session per record file.
        with tf.python_io.TFRecordWriter(path) as writer:
            with tf.Session('') as session:
                converter.convert(session=session, writer=writer,
                                  images=split_images, labels=split_labels,
                                  total_num_items=len(split_images))

    filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, data_name)
    if num_eval:
        eval_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.EVAL)
    if tf.gfile.Exists(filename):
        print('`{}` Dataset files already exist. '
              'Exiting without re-creating them.'.format(filename))
        return
    is_train = data_name == Modes.TRAIN
    filenames = ([_TRAIN_DATA_FILENAME, _TRAIN_LABELS_FILENAME] if is_train
                 else [_TEST_DATA_FILENAME, _TEST_LABELS_FILENAME])
    download_datasets(dataset_dir, _DATA_URL, filenames)
    data_filename = os.path.join(dataset_dir, filenames[0])
    labels_filename = os.path.join(dataset_dir, filenames[1])
    images = _extract_images(data_filename, num_images)
    labels = _extract_labels(labels_filename, num_images)
    if num_eval:
        images, eval_images = images[num_eval:], images[:num_eval]
        labels, eval_labels = labels[num_eval:], labels[:num_eval]
    write_split(filename, images, labels)
    if num_eval:
        write_split(eval_filename, eval_images, eval_labels)
    delete_datasets(dataset_dir, filenames)
def prepare_dataset(converter, dataset_dir, num_images, folds):
    """Write the train/eval/test TFRecord files for the dataset.

    Does nothing when every record file already exists; deletes and rebuilds
    everything when only a subset of them exists.

    Args:
        converter: converter object passed through to `convert_images`.
        dataset_dir: directory where the TFRecord files are written.
        num_images: forwarded to `filenames_by_classes`.
        folds: forwarded to `filenames_by_classes`.
    """
    split_keys = [ModeKeys.TRAIN, ModeKeys.EVAL, 'test']
    record_files = [
        RECORD_FILE_NAME_FORMAT.format(dataset_dir, key)
        for key in split_keys
    ]
    existing = [tf.gfile.Exists(f) for f in record_files]
    if all(existing):
        print('Dataset files already exist. Exiting without re-creating them.')
        return
    if any(existing):
        print(
            'Some Dataset files already exist but not all of them. Re-creating them.'
        )
        delete_datasets('.', record_files)
    per_class_files = filenames_by_classes(dataset_dir, num_images, folds)
    # 'converting {} images.'.format('test') yields exactly the original's
    # 'converting test images.' message, so one loop covers all three splits.
    for record_file, key in zip(record_files, split_keys):
        with tf.python_io.TFRecordWriter(record_file) as tfrecord_writer:
            with tf.Session('') as session:
                print('converting {} images.'.format(key))
                convert_images(session, tfrecord_writer, converter,
                               per_class_files[key])