# Example 1
def prepare_dataset(converter, dataset_dir, num_images, folds):
    """Create the train/eval/test TFRecord files for the dataset.

    Skips all work when all three record files already exist; if only some
    exist, deletes them and re-creates the full set.

    Args:
        converter: converter object forwarded to `convert_images` to
            serialize each image into a TF Example.
        dataset_dir: directory the TFRecord files are written into.
        num_images: forwarded to `filenames_by_classes` — presumably the
            number of images per class (TODO confirm with that helper).
        folds: fold specification forwarded to `filenames_by_classes`.
    """
    train_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.TRAIN)
    eval_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.EVAL)
    test_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.PREDICT)

    filenames = [train_filename, eval_filename, test_filename]
    files_exist = [tf.gfile.Exists(f) for f in filenames]
    if all(files_exist):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    if any(files_exist):
        print('Some Dataset files already exist but not all of them. Re-creating them.')
        delete_datasets('.', filenames)

    # Local name deliberately differs from the `filenames_by_classes` helper
    # so it does not shadow the function.
    filesnames_by_classes = filenames_by_classes(dataset_dir, num_images, folds)

    def _write_split(record_filename, split_label, split_key):
        # One TFRecordWriter + Session per split; the logic was identical for
        # all three splits, so it lives here instead of being triplicated.
        with tf.python_io.TFRecordWriter(record_filename) as tfrecord_writer:
            with tf.Session('') as session:
                print('converting {} images.'.format(split_label))
                convert_images(
                    session, tfrecord_writer, converter,
                    filesnames_by_classes[split_key])

    _write_split(train_filename, Modes.TRAIN, Modes.TRAIN)
    _write_split(eval_filename, Modes.EVAL, Modes.EVAL)
    # The original printed the literal label 'test' for the predict split.
    _write_split(test_filename, 'test', Modes.PREDICT)
# Example 2
def prepare(dataset_dir):
    """Runs download and conversion operation.

    Downloads the raw IMDB pickle, converts the train/eval/test splits into
    TFRecord files, writes the dataset meta data as JSON, then deletes the
    raw download.

    Args:
        dataset_dir: The dataset directory where the dataset is stored.
    """
    make_dataset_dir(dataset_dir)
    record_files = [
        RECORD_FILE_NAME_FORMAT.format(dataset_dir, mode)
        for mode in (Modes.TRAIN, Modes.EVAL, Modes.PREDICT)
    ]
    if all(tf.gfile.Exists(f) for f in record_files):
        # BUG FIX: the original message left the `{}` placeholder unfilled.
        print('`{}` Dataset files already exist.'.format(dataset_dir))
        return

    download_datasets(dataset_dir, _DATA_URL, [_FILENAME])
    # The pickle contains two consecutive objects: train set then test set.
    with open(os.path.join(dataset_dir, _FILENAME), 'rb') as f:
        train_set = pickle.load(f)
        test_set = pickle.load(f)

    converter = SequenceToTFExampleConverter(
        sequence_features_types={'source_token': 'int'},
        context_features_types={'label': 'int'})

    num_items = len(train_set[0])
    # 10% of the training items are held out for evaluation.
    len_eval_data = int(num_items * 0.1)
    len_test_data = len(test_set[0])
    prepare_dataset(converter, dataset_dir, train_set, Modes.TRAIN, num_items,
                    len_eval_data)
    prepare_dataset(converter, dataset_dir, test_set, Modes.PREDICT,
                    len_test_data)

    # Finally, write the meta data:
    with open(META_DATA_FILENAME_FORMAT.format(dataset_dir),
              'w') as meta_data_file:
        meta_data = converter.get_meta_data()
        meta_data['num_samples'] = {
            Modes.TRAIN: num_items - len_eval_data,
            Modes.EVAL: len_eval_data,
            Modes.PREDICT: len_test_data
        }
        meta_data['items_to_descriptions'] = {
            'source_token': 'A sequence of word ids.',
            'label': 'A single integer 0 or 1',
        }
        meta_data['num_classes'] = 2
        json.dump(meta_data, meta_data_file)

    # The raw pickle is no longer needed once the records are written.
    delete_datasets(dataset_dir, [_FILENAME])
    print('\nFinished converting the IMDB dataset!')
# Example 3
def prepare_dataset(converter, dataset_dir, data_name, num_images, num_eval=0):
    """Download raw data, convert it to a TFRecord file, delete the raw data.

    Args:
        converter: object whose `convert` method serializes images/labels
            into the TFRecord file.
        dataset_dir: directory where raw files and records live.
        data_name: split identifier used in the record file name; compared
            against `ModeKeys.TRAIN` to pick the raw file names.
        num_images: number of items to extract from the raw files.
        num_eval: when non-zero, the first `num_eval` items are written to a
            separate EVAL record file.
    """
    filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, data_name)
    if num_eval:
        eval_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir,
                                                       ModeKeys.EVAL)

    if tf.gfile.Exists(filename):
        print('`{}` Dataset files already exist. '
              'Exiting without re-creating them.'.format(filename))
        return

    # Raw file names: index 0 is the image data, index 1 the labels.
    if data_name == ModeKeys.TRAIN:
        filenames = [_TRAIN_DATA_FILENAME, _TRAIN_LABELS_FILENAME]
    else:
        filenames = [_TEST_DATA_FILENAME, _TEST_LABELS_FILENAME]

    download_datasets(dataset_dir, _DATA_URL, filenames)

    data_filename, labels_filename = (
        os.path.join(dataset_dir, name) for name in filenames)

    images = _extract_images(data_filename, num_images)
    labels = _extract_labels(labels_filename, num_images)
    eval_images = eval_labels = None
    if num_eval:
        # Carve the eval split off the front of the extracted data.
        eval_images, images = images[:num_eval], images[num_eval:]
        eval_labels, labels = labels[:num_eval], labels[num_eval:]

    def _write_records(record_file, split_images, split_labels):
        # One writer + session per record file, as in the original flow.
        with tf.python_io.TFRecordWriter(record_file) as record_writer:
            with tf.Session('') as sess:
                converter.convert(session=sess,
                                  writer=record_writer,
                                  images=split_images,
                                  labels=split_labels,
                                  total_num_items=len(split_images))

    _write_records(filename, images, labels)
    if num_eval:
        _write_records(eval_filename, eval_images, eval_labels)

    # Raw downloads are no longer needed once the records exist.
    delete_datasets(dataset_dir, filenames)
# Example 4
def prepare_dataset(converter, dataset_dir, data_name, num_images, num_eval=0):
    """Download raw data, write TFRecord file(s), then delete the raw data.

    Args:
        converter: object whose `convert` method serializes images/labels.
        dataset_dir: directory where raw files and records live.
        data_name: split identifier for the record file name; compared
            against `Modes.TRAIN` to choose which raw files to download.
        num_images: number of items to extract from the raw files.
        num_eval: when non-zero, the first `num_eval` items go into a
            separate EVAL record file.
    """
    filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, data_name)
    if num_eval:
        eval_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, Modes.EVAL)

    if tf.gfile.Exists(filename):
        print('`{}` Dataset files already exist. '
              'Exiting without re-creating them.'.format(filename))
        return

    if data_name == Modes.TRAIN:
        raw_files = [_TRAIN_DATA_FILENAME, _TRAIN_LABELS_FILENAME]
    else:
        raw_files = [_TEST_DATA_FILENAME, _TEST_LABELS_FILENAME]

    download_datasets(dataset_dir, _DATA_URL, raw_files)

    # raw_files[0] holds the image data, raw_files[1] the labels.
    data_path = os.path.join(dataset_dir, raw_files[0])
    labels_path = os.path.join(dataset_dir, raw_files[1])

    all_images = _extract_images(data_path, num_images)
    all_labels = _extract_labels(labels_path, num_images)

    # Build (record file, images, labels) tuples for every split to write.
    # With num_eval == 0 the [0:] slices are the full data, as before.
    splits = [(filename, all_images[num_eval:], all_labels[num_eval:])]
    if num_eval:
        splits.append(
            (eval_filename, all_images[:num_eval], all_labels[:num_eval]))

    for record_file, split_images, split_labels in splits:
        with tf.python_io.TFRecordWriter(record_file) as record_writer:
            with tf.Session('') as sess:
                converter.convert(session=sess, writer=record_writer,
                                  images=split_images, labels=split_labels,
                                  total_num_items=len(split_images))

    delete_datasets(dataset_dir, raw_files)
# Example 5 — file: flowers17.py, project: vdt/polyaxon
def prepare_dataset(converter, dataset_dir, num_images, folds):
    """Create the train/eval/test TFRecord files for the Flowers17 dataset.

    Skips all work when all three record files already exist; if only some
    exist, deletes them and re-creates the full set.

    Args:
        converter: converter object forwarded to `convert_images` to
            serialize each image into a TF Example.
        dataset_dir: directory the TFRecord files are written into.
        num_images: forwarded to `filenames_by_classes` — presumably the
            number of images per class (TODO confirm with that helper).
        folds: fold specification forwarded to `filenames_by_classes`.
    """
    train_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir,
                                                    ModeKeys.TRAIN)
    eval_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, ModeKeys.EVAL)
    test_filename = RECORD_FILE_NAME_FORMAT.format(dataset_dir, 'test')

    filenames = [train_filename, eval_filename, test_filename]
    files_exist = [tf.gfile.Exists(f) for f in filenames]
    if all(files_exist):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    if any(files_exist):
        print(
            'Some Dataset files already exist but not all of them. Re-creating them.'
        )
        delete_datasets('.', filenames)

    # Local name deliberately differs from the `filenames_by_classes` helper
    # so it does not shadow the function.
    filesnames_by_classes = filenames_by_classes(dataset_dir, num_images,
                                                 folds)

    def _write_split(record_filename, split_key):
        # One TFRecordWriter + Session per split; the logic was identical for
        # all three splits, so it lives here instead of being triplicated.
        with tf.python_io.TFRecordWriter(record_filename) as tfrecord_writer:
            with tf.Session('') as session:
                print('converting {} images.'.format(split_key))
                convert_images(session, tfrecord_writer, converter,
                               filesnames_by_classes[split_key])

    _write_split(train_filename, ModeKeys.TRAIN)
    _write_split(eval_filename, ModeKeys.EVAL)
    _write_split(test_filename, 'test')