Exemplo n.º 1
0
def test_mnist_single_threaded() -> None:
    """Cache the MNIST test set in LFS storage and verify that a stream
    read back from the cache matches the original dataset exactly."""
    config = storage.LFSConfigurations(storage_dir_path="/tmp/")
    lfs_storage = storage.LFSStorage(configurations=config)

    dataset_id = "mnist"
    dataset_version = "1"

    # Start from a clean slate so the cacheable decorator actually builds the cache.
    util.cleanup_lfs_storage(
        configurations=config,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    @lfs_storage.cacheable(dataset_id=dataset_id,
                           dataset_version=dataset_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return util.make_mnist_test_dataset()  # type: ignore

    cached_stream = make_dataset().stream()
    streamed_dataset = tensorflow.make_tf_dataset(cached_stream)
    reference_dataset = util.make_mnist_test_dataset()

    matching_samples = util.compare_datasets(reference_dataset, streamed_dataset)
    assert matching_samples == 10000
    assert cached_stream.length == matching_samples

    # Remove the cache afterwards so repeated runs stay independent.
    util.cleanup_lfs_storage(
        configurations=config,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )
Exemplo n.º 2
0
            def _decorated_fn(*args: Any, **kwargs: Any) -> Any:
                """Build (or fetch) the cached dataset and return it as a
                TF dataset streamed from storage.

                Closure variables (``make_dataset_fn``, ``dataset_id``,
                ``dataset_version``, ``shuffle``, ``skip_shuffle_at_epoch_end``)
                come from the enclosing scope; shard/offset/seed state is read
                from ``self``. Also records the stream length on
                ``self._dataset_length`` as a side effect.
                """

                @self._storage.cacheable(  # type: ignore
                    dataset_id=dataset_id,
                    dataset_version=dataset_version,
                )
                def make_dataset() -> yogadl.DataRef:
                    return make_dataset_fn(*args, **kwargs)

                # Lazy %-style arguments: the message is only formatted when the
                # corresponding log level is enabled (f-strings always format).
                logging.info(
                    "Preparing dataset: %s:%s.", dataset_id, dataset_version)
                logging.debug(
                    "Calling make dataset for: %s:%s "
                    "with following start_offset: %s, "
                    "shuffle: %s shuffle_seed: %s "
                    "shard_rank: %s, world size: %s "
                    "training: %s.",
                    dataset_id,
                    dataset_version,
                    self._offset,
                    shuffle,
                    self._shuffle_seed,
                    self._shard_rank,
                    self._num_shards,
                    self._training,
                )

                stream_from_cache = make_dataset().stream(
                    start_offset=self._offset,
                    shuffle=shuffle,
                    skip_shuffle_at_epoch_end=skip_shuffle_at_epoch_end,
                    shuffle_seed=self._shuffle_seed,
                    shard_rank=self._shard_rank,
                    num_shards=self._num_shards,
                    # Only drop the ragged last shard during training.
                    drop_shard_remainder=bool(self._training),
                )
                self._dataset_length = len(stream_from_cache)
                logging.info(
                    "Dataset %s:%s preparation finished.",
                    dataset_id,
                    dataset_version,
                )

                return tensorflow.make_tf_dataset(stream_from_cache)
Exemplo n.º 3
0
def compare_performance_tf_record_dataset(data_dir: pathlib.Path) -> None:
    """Benchmark reading ImageNet TFRecords through the LFS cache against
    reading the original dataset directly, printing the timings."""
    config = storage.LFSConfigurations(storage_dir_path="/tmp/")
    lfs_storage = storage.LFSStorage(configurations=config)

    dataset_id = "imagenet-train"
    dataset_version = "0"
    training = True

    # Rebuild the cache from scratch so the creation timing is meaningful.
    cleanup_lfs_storage(
        configurations=config,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    @lfs_storage.cacheable(dataset_id=dataset_id,
                           dataset_version=dataset_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return make_dataset_from_tf_records(
            data_dir=data_dir, training=training)  # type: ignore

    creation_start = time.time()
    cached_stream = make_dataset().stream()
    cache_creation_time = time.time() - creation_start
    print(f"Cache creation took: {cache_creation_time} seconds.")

    cached_dataset = tensorflow.make_tf_dataset(cached_stream)
    cache_read_time, cache_data_items = read_dataset(dataset=cached_dataset)
    print(f"Cache read took: {cache_read_time} seconds.")

    original_dataset_read_time, original_data_items = read_dataset(
        dataset=make_dataset_from_tf_records(data_dir=data_dir,
                                             training=training))
    print(f"Original read took: {original_dataset_read_time} seconds.")

    # Both code paths must yield exactly the same number of records.
    assert cache_data_items == original_data_items
Exemplo n.º 4
0
def worker_using_cacheable(config: storage.S3Configurations, dataset_id: str,
                           dataset_version: str) -> None:
    """Stream the MNIST test set through an S3-backed cache and check that
    the streamed data matches the original dataset."""
    s3_storage = storage.S3Storage(configurations=config)

    @s3_storage.cacheable(dataset_id=dataset_id,
                          dataset_version=dataset_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return util.make_mnist_test_dataset()  # type: ignore

    cached_stream = make_dataset().stream()
    streamed_dataset = tensorflow.make_tf_dataset(cached_stream)
    reference_dataset = util.make_mnist_test_dataset()

    sample_count = util.compare_datasets(reference_dataset, streamed_dataset)
    assert sample_count == 10000
    assert cached_stream.length == sample_count