def test_s3_storage_submit() -> None:
    range_size = 10
    dataset_id = "range-dataset"
    dataset_version = "0"
    dataset = tf.data.Dataset.range(range_size)
    configurations = create_s3_configuration(access_server_port=15032)

    client = boto3.client("s3")
    aws_cache_filepath = get_s3_filepath(
        configurations=configurations,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    # Record the previous upload time (if the object already exists) so we can
    # verify that submit() writes a newer object.
    try:
        blob_info = client.head_object(Bucket=configurations.bucket, Key=str(aws_cache_filepath))
        previous_creation_time = blob_info.get("LastModified")
    except boto_client.ClientError:
        previous_creation_time = None

    s3_storage = storage.S3Storage(configurations=configurations)
    s3_storage.submit(
        data=dataset,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    blob_info = client.head_object(Bucket=configurations.bucket, Key=str(aws_cache_filepath))
    assert blob_info.get("LastModified") is not None
    assert previous_creation_time != blob_info.get("LastModified")
    if previous_creation_time is not None:
        assert previous_creation_time < blob_info.get("LastModified")

def worker_using_cacheable(
    config: storage.S3Configurations, dataset_id: str, dataset_version: str
) -> None:
    s3_storage = storage.S3Storage(configurations=config)

    @s3_storage.cacheable(dataset_id=dataset_id, dataset_version=dataset_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return util.make_mnist_test_dataset()  # type: ignore

    stream_from_cache = make_dataset().stream()
    dataset_from_stream = tensorflow.make_tf_dataset(stream_from_cache)
    original_dataset = util.make_mnist_test_dataset()

    data_samples = util.compare_datasets(original_dataset, dataset_from_stream)
    assert data_samples == 10000
    assert stream_from_cache.length == data_samples

def worker(configurations: storage.S3Configurations, dataset_id: str, dataset_version: str) -> None:
    range_size = 120
    s3_storage = storage.S3Storage(configurations=configurations)

    @s3_storage.cacheable(dataset_id, dataset_version)
    def make_dataref(input_range_size: int) -> dataref.LMDBDataRef:
        return tf.data.Dataset.range(input_range_size)  # type: ignore

    stream = make_dataref(input_range_size=range_size).stream()
    assert stream.length == range_size

    data_generator = stream.iterator_fn()
    generator_length = 0
    for idx, data in enumerate(data_generator):
        assert idx == data
        generator_length += 1
    assert generator_length == range_size

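# --- Hypothetical usage sketch (assumption, not part of the original suite) ---
# `worker` and `worker_using_cacheable` above take plain, picklable arguments,
# so a multi-process test could exercise concurrent access to the same cache
# entry roughly as below. The driver itself is illustrative only;
# `create_s3_configuration` and `test_util.AccessServerHandler` are the helpers
# already used elsewhere in this file.
def _example_run_workers_in_parallel(num_workers: int = 2) -> None:
    import multiprocessing

    configurations = create_s3_configuration(access_server_port=15032)
    access_server_handler = test_util.AccessServerHandler(hostname="localhost", port=15032)
    access_server_handler.run_server_in_thread()
    try:
        processes = [
            multiprocessing.Process(target=worker, args=(configurations, "range-dataset", "0"))
            for _ in range(num_workers)
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
            assert process.exitcode == 0
    finally:
        access_server_handler.stop_server()
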
def test_s3_storage_local_metadata() -> None:
    range_size = 10
    dataset_id = "range-dataset"
    dataset_version = "0"
    dataset = tf.data.Dataset.range(range_size)
    configurations = create_s3_configuration(access_server_port=15032)

    client = boto3.client("s3")
    aws_cache_filepath = get_s3_filepath(
        configurations=configurations,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    s3_storage = storage.S3Storage(configurations=configurations)
    s3_storage.submit(
        data=dataset,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )

    local_metadata_filepath = get_local_metadata_filepath(
        configurations=configurations,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )
    with open(str(local_metadata_filepath), "r") as metadata_file:
        metadata = json.load(metadata_file)

    blob_info = client.head_object(Bucket=configurations.bucket, Key=str(aws_cache_filepath))
    creation_time = blob_info.get("LastModified")

    # The local metadata must record the creation time of the S3 object.
    assert metadata.get("time_created")
    assert creation_time.timestamp() == metadata["time_created"]

    # Deleting the local metadata and fetching again should regenerate it.
    local_metadata_filepath.unlink()
    _ = s3_storage.fetch(dataset_id=dataset_id, dataset_version=dataset_version)
    with open(str(local_metadata_filepath), "r") as metadata_file:
        metadata = json.load(metadata_file)

    assert metadata.get("time_created")
    assert creation_time.timestamp() == metadata["time_created"]

def test_s3_storage_cacheable_single_threaded() -> None:
    original_range_size = 120
    updated_range_size = 55
    dataset_id = "range-dataset"
    dataset_version = "0"
    configurations = create_s3_configuration(access_server_port=15032)

    access_server_handler = test_util.AccessServerHandler(hostname="localhost", port=15032)
    access_server_handler.run_server_in_thread()

    # Make sure the cache entry does not exist before the test starts.
    s3_cache_filepath = get_s3_filepath(
        configurations=configurations,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )
    client = boto3.client("s3")
    client.delete_object(Bucket=configurations.bucket, Key=str(s3_cache_filepath))

    s3_storage = storage.S3Storage(configurations=configurations)

    @s3_storage.cacheable(dataset_id, dataset_version)
    def make_dataref(range_size: int) -> dataref.LMDBDataRef:
        return tf.data.Dataset.range(range_size)  # type: ignore

    original_data_stream = make_dataref(range_size=original_range_size).stream()
    assert original_data_stream.length == original_range_size

    data_generator = original_data_stream.iterator_fn()
    generator_length = 0
    for idx, data in enumerate(data_generator):
        assert idx == data
        generator_length += 1
    assert generator_length == original_range_size

    # A second call with different arguments hits the existing cache entry, so
    # the stream still reflects the originally cached dataset.
    updated_data_stream = make_dataref(range_size=updated_range_size).stream()
    assert updated_data_stream.length == original_range_size

    access_server_handler.stop_server()

def test_s3_storage_submit_and_fetch() -> None:
    range_size = 20
    dataset_id = "range-dataset"
    dataset_version = "0"
    dataset = tf.data.Dataset.range(range_size)
    configurations = create_s3_configuration(access_server_port=15032)

    s3_storage = storage.S3Storage(configurations=configurations)
    s3_storage.submit(
        data=dataset,
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )
    dataref = s3_storage.fetch(dataset_id=dataset_id, dataset_version=dataset_version)
    stream = dataref.stream()
    assert stream.length == range_size

    data_generator = stream.iterator_fn()
    generator_length = 0
    for idx, data in enumerate(data_generator):
        assert idx == data
        generator_length += 1
    assert generator_length == range_size

def _configure_storage(self) -> None:
    session_config = None  # type: Optional[tf.compat.v1.ConfigProto]
    if self._hvd_config.use:
        # For multi-GPU training, we map processes to individual GPUs. TF requires
        # that for each instantiation of `tf.Session`, the process is mapped
        # to the same GPU.
        session_config = tf.compat.v1.ConfigProto()
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    scheme = "wss" if self._env.use_tls else "ws"
    rw_coordinator_url = (
        f"{scheme}://{self._env.master_addr}:{self._env.master_port}/ws/data-layer/"
    )
    data_layer_type = self._env.experiment_config.get_data_layer_type()

    if data_layer_type == StorageTypes.SHARED_FS.value:
        local_cache_dir_path = self._env.experiment_config["data_layer"].get(
            "container_storage_path"
        )
        local_cache_path = init_container_storage_path(
            configured_storage_path=local_cache_dir_path
        )

        storage_config = storage.LFSConfigurations(storage_dir_path=str(local_cache_path))
        self._storage = storage.LFSStorage(storage_config, tensorflow_config=session_config)

    elif data_layer_type == StorageTypes.S3.value:
        local_cache_dir_path = self._env.experiment_config["data_layer"].get(
            "local_cache_container_path"
        )
        local_cache_path = init_container_storage_path(
            configured_storage_path=local_cache_dir_path
        )

        storage_config = storage.S3Configurations(
            bucket=self._env.experiment_config["data_layer"]["bucket"],
            bucket_directory_path=self._env.experiment_config["data_layer"][
                "bucket_directory_path"
            ],
            url=rw_coordinator_url,
            local_cache_dir=str(local_cache_path),
            access_key=self._env.experiment_config["data_layer"].get("access_key"),
            secret_key=self._env.experiment_config["data_layer"].get("secret_key"),
            endpoint_url=self._env.experiment_config["data_layer"].get("endpoint_url"),
            coordinator_cert_file=self._env.master_cert_file,
            coordinator_cert_name=self._env.master_cert_name,
        )
        self._storage = storage.S3Storage(storage_config, tensorflow_config=session_config)

    elif data_layer_type == StorageTypes.GCS.value:
        local_cache_dir_path = self._env.experiment_config["data_layer"].get(
            "local_cache_container_path"
        )
        local_cache_path = init_container_storage_path(
            configured_storage_path=local_cache_dir_path
        )

        storage_config = storage.GCSConfigurations(
            bucket=self._env.experiment_config["data_layer"]["bucket"],
            bucket_directory_path=self._env.experiment_config["data_layer"][
                "bucket_directory_path"
            ],
            url=rw_coordinator_url,
            local_cache_dir=str(local_cache_path),
            coordinator_cert_file=self._env.master_cert_file,
            coordinator_cert_name=self._env.master_cert_name,
        )
        self._storage = storage.GCSStorage(storage_config, tensorflow_config=session_config)

    else:
        raise AssertionError(
            "Please select a supported data_layer type. Supported types include: "
            f"{[i.value for i in StorageTypes]}"
        )
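

# --- Illustrative `data_layer` configuration (assumption, for reference only) ---
# `_configure_storage` above only ever reads the keys shown here from
# `experiment_config["data_layer"]`; the exact type strings and any further
# schema/validation are assumptions, and the values are placeholders.
_EXAMPLE_S3_DATA_LAYER_CONFIG = {
    "type": "s3",  # StorageTypes.S3.value; shared_fs and gcs are the other branches
    "bucket": "my-dataset-cache-bucket",
    "bucket_directory_path": "data-layer-cache",
    "local_cache_container_path": "/tmp/data_layer_cache",  # passed to init_container_storage_path
    "access_key": None,    # optional (may be None)
    "secret_key": None,    # optional (may be None)
    "endpoint_url": None,  # optional; e.g. for S3-compatible object stores
}
# For the shared_fs branch, the method reads "container_storage_path" instead of
# "local_cache_container_path" and needs no bucket-related keys.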