Example No. 1
  def testCompression(self, element):
    element = element._obj

    compressed = compression_ops.compress(element)
    uncompressed = compression_ops.uncompress(
        compressed, structure.type_spec_from_value(element))
    self.assertValuesEqual(element, self.evaluate(uncompressed))
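
For reference, the compress/uncompress round-trip exercised by this test can be reproduced outside the test harness. This is a minimal sketch, assuming TensorFlow's internal module layout; `compression_ops` and `structure` are not part of the public API and may move between releases.

import tensorflow as tf
from tensorflow.python.data.experimental.ops import compression_ops
from tensorflow.python.data.util import structure

# Any nested structure of tensors can be compressed into a single scalar variant.
element = {"a": tf.constant([1, 2, 3]), "b": tf.constant("hello")}
spec = structure.type_spec_from_value(element)    # nested TypeSpec describing `element`
compressed = compression_ops.compress(element)    # scalar variant tensor
restored = compression_ops.uncompress(compressed, output_spec=spec)
# `restored` has the same structure and values as `element`.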
Example No. 2
 def testCompressionVariantMismatch(self):
   # Use a dataset as a variant.
   dataset = dataset_ops.Dataset.range(10)
   variant = dataset._variant_tensor
   with self.assertRaises(errors.InvalidArgumentError):
     uncompressed = compression_ops.uncompress(variant, dataset.element_spec)
     self.evaluate(uncompressed)
Example No. 3
 def testDatasetVariantMismatch(self):
   # Use a nested dataset as an example of a variant.
   dataset = dataset_ops.Dataset.from_tensors(dataset_ops.Dataset.range(10))
   with self.assertRaises(TypeError):
     dataset = dataset.map(
         lambda x: compression_ops.uncompress(x, dataset.element_spec))
     self.getDatasetOutput(dataset)
Example No. 4
 def testCompressionOutputDTypeMismatch(self):
   element = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
   compressed = compression_ops.compress(element)
   with self.assertRaisesRegex(errors.FailedPreconditionError,
                               "but got a tensor of type string"):
     uncompressed = compression_ops.uncompress(
         compressed, structure.type_spec_from_value(0))
     self.evaluate(uncompressed)
Example No. 5
  def testDatasetCompression(self, element):
    element = element._obj

    dataset = dataset_ops.Dataset.from_tensors(element)
    element_spec = dataset.element_spec

    dataset = dataset.map(lambda *x: compression_ops.compress(x))
    dataset = dataset.map(lambda x: compression_ops.uncompress(x, element_spec))
    self.assertDatasetProduces(dataset, [element])
Example No. 6
 def testCompressionInputShapeMismatch(self):
   element = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
   compressed = compression_ops.compress(element)
   compressed = [compressed, compressed]
   error = (
       errors.InvalidArgumentError
       if context.executing_eagerly() else ValueError)
   with self.assertRaises(error):
     uncompressed = compression_ops.uncompress(
         compressed, structure.type_spec_from_value(0))
     self.evaluate(uncompressed)
Example No. 7
    def _apply_fn(dataset):  # pylint: disable=missing-docstring
        external_state_policy = dataset.options(
        ).experimental_external_state_policy
        if external_state_policy is None:
            external_state_policy = ExternalStatePolicy.WARN

        uncompressed_spec = dataset.element_spec
        # Compress the dataset elements to reduce the amount of data that needs to
        # be sent over the network.
        # TODO(b/157105111): Make this an autotuned parallel map when we have a way
        # to limit memory usage.
        dataset = dataset.map(lambda *x: compression_ops.compress(x))
        # Prefetch one compressed element to reduce latency when requesting data
        # from tf.data workers.
        # TODO(b/157105111): Set this to autotune when we have a way to limit
        # memory usage
        dataset = dataset.prefetch(1)
        # Apply options so that the dataset executed in the tf.data service will
        # be optimized and support autotuning.
        dataset = dataset._apply_options()  # pylint: disable=protected-access
        dataset_id = gen_experimental_dataset_ops.register_dataset(
            dataset._variant_tensor,  # pylint: disable=protected-access
            address=address,
            protocol=protocol,
            external_state_policy=external_state_policy.value)
        dataset = _DataServiceDataset(
            input_dataset=dataset,
            dataset_id=dataset_id,
            processing_mode=processing_mode,
            address=address,
            protocol=protocol,
            job_name=job_name,
            max_outstanding_requests=max_outstanding_requests,
            task_refresh_interval_hint_ms=task_refresh_interval_hint_ms)
        # TODO(b/157105111): Make this an autotuned parallel map when we have a way
        # to limit memory usage.
        # The value 16 is chosen based on experience with pipelines that require
        # more than 8 parallel calls to prevent this stage from being a bottleneck.
        dataset = dataset.map(lambda x: compression_ops.uncompress(
            x, output_spec=uncompressed_spec),
                              num_parallel_calls=16)

        # Disable autosharding for shared jobs.
        if job_name:
            options = dataset_ops.Options()
            options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
            dataset = dataset.with_options(options)
        return dataset
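
The `_apply_fn` above is the inner function of a `distribute`-style transformation: it compresses elements, registers the dataset with the dispatcher, reads the compressed stream back through `_DataServiceDataset`, and finally uncompresses it. A hedged sketch of how such a transformation is consumed through the public `tf.data.experimental.service.distribute` API; the dispatcher address is a placeholder and a running tf.data service is assumed.

import tensorflow as tf

# Placeholder address; a tf.data service dispatcher and at least one worker
# must be running there for iteration to succeed.
SERVICE = "grpc://localhost:5000"

dataset = tf.data.Dataset.range(100)
dataset = dataset.apply(
    tf.data.experimental.service.distribute(
        processing_mode="parallel_epochs",
        service=SERVICE))

for element in dataset.take(3):
  print(element)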
Example No. 8
def _from_dataset_id(processing_mode,
                     service,
                     dataset_id,
                     element_spec,
                     job_name=None,
                     consumer_index=None,
                     num_consumers=None,
                     max_outstanding_requests=None,
                     task_refresh_interval_hint_ms=None,
                     data_transfer_protocol=None,
                     compression="AUTO"):
  """Creates a dataset which reads data from the tf.data service.

  This transformation is similar to `from_dataset_id`, but supports additional
  parameters which we do not yet want to add to the public Python API.

  Args:
    processing_mode: A string specifying the policy for how data should be
      processed by tf.data workers. Can be either "parallel_epochs" to have each
      tf.data worker process a copy of the dataset, or "distributed_epoch" to
      split a single iteration of the dataset across all the workers.
    service: A string indicating how to connect to the tf.data service. The
      string should be in the format "<protocol>://<address>", e.g.
      "grpc://localhost:5000".
    dataset_id: The id of the dataset to read from. This id is returned by
      `register_dataset` when the dataset is registered with the tf.data
      service.
    element_spec: A nested structure of `tf.TypeSpec`s representing the type of
      elements produced by the dataset. Use `tf.data.Dataset.element_spec` to
      see the element spec for a given dataset.
    job_name: (Optional.) The name of the job. This argument makes it possible
      for multiple datasets to share the same job. The default behavior is that
      the dataset creates anonymous, exclusively owned jobs.
    consumer_index: (Optional.) The index of the consumer in the range from `0`
      to `num_consumers`. Must be specified alongside `num_consumers`. When
      specified, consumers will read from the job in a strict round-robin order,
      instead of the default first-come-first-served order.
    num_consumers: (Optional.) The number of consumers which will consume from
      the job. Must be specified alongside `consumer_index`. When specified,
      consumers will read from the job in a strict round-robin order, instead of
      the default first-come-first-served order. When `num_consumers` is
      specified, the dataset must have infinite cardinality to prevent a
      producer from running out of data early and causing consumers to go out of
      sync.
    max_outstanding_requests: (Optional.) A limit on how many elements may be
      requested at the same time. You can use this option to control the amount
      of memory used, since `distribute` won't use more than `element_size` *
      `max_outstanding_requests` of memory.
    task_refresh_interval_hint_ms: (Optional.) A hint for how often to query the
      dispatcher for task changes.
    data_transfer_protocol: (Optional.) The protocol to use for transferring
      data with the tf.data service.
    compression: An indication of how the dataset's elements were compressed, so
      that `from_dataset_id` can uncompress them if necessary.

  Returns:
    A `tf.data.Dataset` which reads from the tf.data service.
  """
  ProcessingMode.validate(processing_mode)
  valid_compressions = [COMPRESSION_AUTO, COMPRESSION_NONE]
  if compression not in valid_compressions:
    raise ValueError(
        "Invalid compression argument: {}. Must be one of {}".format(
            compression, valid_compressions))
  if job_name is not None:
    if not isinstance(job_name, six.string_types):
      raise ValueError("job_name must be a string, but job_name was of type "
                       "{0}. job_name={1}".format(type(job_name), job_name))
    if not job_name:
      raise ValueError("job_name must not be empty")
  if element_spec is None:
    raise ValueError("element_spec must not be None")
  protocol, address = _parse_service(service)

  # If we compress, the data service side dataset will produce scalar variants.
  data_service_element_spec = (
      tensor_spec.TensorSpec(shape=(), dtype=dtypes.variant)
      if compression == COMPRESSION_AUTO else element_spec)

  dataset = _DataServiceDataset(
      dataset_id=dataset_id,
      processing_mode=processing_mode,
      address=address,
      element_spec=data_service_element_spec,
      protocol=protocol,
      data_transfer_protocol=data_transfer_protocol,
      job_name=job_name,
      consumer_index=consumer_index,
      num_consumers=num_consumers,
      max_outstanding_requests=max_outstanding_requests,
      task_refresh_interval_hint_ms=task_refresh_interval_hint_ms)
  if compression == COMPRESSION_AUTO:
    dataset = dataset.map(
        lambda x: compression_ops.uncompress(x, output_spec=element_spec),
        num_parallel_calls=dataset_ops.AUTOTUNE)

  # Disable autosharding for shared jobs.
  if job_name:
    options = dataset_ops.Options()
    options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
    dataset = dataset.with_options(options)
  return dataset
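
`_from_dataset_id` backs the public register/read split described in the docstring. A hedged sketch of that workflow using the public API, with a placeholder dispatcher address and a running tf.data service assumed:

import tensorflow as tf

SERVICE = "grpc://localhost:5000"  # placeholder dispatcher address

# Producer side: register a dataset with the tf.data service and keep its id.
dataset = tf.data.Dataset.range(10)
dataset_id = tf.data.experimental.service.register_dataset(SERVICE, dataset)

# Consumer side: read the registered dataset back by id. The element spec of
# the registered dataset is available as Dataset.element_spec on the producer.
consumed = tf.data.experimental.service.from_dataset_id(
    processing_mode="parallel_epochs",
    service=SERVICE,
    dataset_id=dataset_id,
    element_spec=dataset.element_spec)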
Example No. 9
 def testCompressionInputDTypeMismatch(self):
   uncompressed = list(range(10))
   with self.assertRaises(TypeError):
     uncompressed = compression_ops.uncompress(
         uncompressed, structure.type_spec_from_value(uncompressed))
     self.evaluate(uncompressed)
Example No. 10
def _from_dataset_id(processing_mode,
                     service,
                     dataset_id,
                     element_spec,
                     job_name=None,
                     consumer_index=None,
                     num_consumers=None,
                     max_outstanding_requests=None,
                     max_request_pipelining_per_worker=1,
                     task_refresh_interval_hint_ms=None,
                     data_transfer_protocol=None,
                     compression="AUTO",
                     target_workers="AUTO"):
    """Creates a dataset which reads data from the tf.data service.

  This transformation is similar to `from_dataset_id`, but supports additional
  parameters which we do not yet want to add to the public Python API.

  Args:
    processing_mode: A string specifying the policy for how data should be
      processed by tf.data workers. Can be either "parallel_epochs" to have each
      tf.data worker process a copy of the dataset, or "distributed_epoch" to
      split a single iteration of the dataset across all the workers.
    service: A string or a tuple indicating how to connect to the tf.data
      service. If it's a string, it should be in the format
      `[<protocol>://]<address>`, where `<address>` identifies the dispatcher
      address and `<protocol>` can optionally be used to override the default
      protocol to use. If it's a tuple, it should be (protocol, address).
    dataset_id: The id of the dataset to read from. This id is returned by
      `register_dataset` when the dataset is registered with the tf.data
      service.
    element_spec: A nested structure of `tf.TypeSpec`s representing the type of
      elements produced by the dataset. This argument is only required inside a
      tf.function. Use `tf.data.Dataset.element_spec` to get the element spec
      for a given dataset.
    job_name: (Optional.) The name of the job. If provided, it must be a
      non-empty string or tensor. This argument makes it possible
      for multiple datasets to share the same job. The default behavior is that
      the dataset creates anonymous, exclusively owned jobs.
    consumer_index: (Optional.) The index of the consumer in the range from `0`
      to `num_consumers`. Must be specified alongside `num_consumers`. When
      specified, consumers will read from the job in a strict round-robin order,
      instead of the default first-come-first-served order.
    num_consumers: (Optional.) The number of consumers which will consume from
      the job. Must be specified alongside `consumer_index`. When specified,
      consumers will read from the job in a strict round-robin order, instead of
      the default first-come-first-served order. When `num_consumers` is
      specified, the dataset must have infinite cardinality to prevent a
      producer from running out of data early and causing consumers to go out of
      sync.
    max_outstanding_requests: (Optional.) A limit on how many elements may be
      requested at the same time. You can use this option to control the amount
      of memory used, since `distribute` won't use more than `element_size` *
      `max_outstanding_requests` of memory.
    task_refresh_interval_hint_ms: (Optional.) A hint for how often to query the
      dispatcher for task changes.
    data_transfer_protocol: (Optional.) The protocol to use for transferring
      data with the tf.data service. By default, data is transferred using gRPC.
    compression: An indication of how the dataset's elements were compressed, so
      that `from_dataset_id` can uncompress them if necessary.
    target_workers: (Optional.) Which workers to read from. If `"AUTO"`, tf.data
      runtime decides which workers to read from. If `"ANY"`, reads from any
      tf.data service workers. If `"LOCAL"`, only reads from local in-process
      tf.data service workers. `"AUTO"` works well for most cases, while users
      can specify other targets. For example, `"LOCAL"` helps avoid RPCs and
      data copy if every TF worker colocates with a tf.data service worker.
      Defaults to `"AUTO"`.

    EASL:
    max_request_pipelining_per_worker: (Optional.) An EASL-specific parameter
      that increases the number of parallel requests a client can send to a
      single worker. Defaults to 1, which preserves the original behaviour.

  Returns:
    A `tf.data.Dataset` which reads from the tf.data service.
  """
    ProcessingMode.validate(processing_mode)
    valid_compressions = [COMPRESSION_AUTO, COMPRESSION_NONE]
    if isinstance(service, tuple):
        protocol, address = service
    else:
        protocol, address = _parse_service(service)

    if compression not in valid_compressions:
        raise ValueError(
            "Invalid compression argument: {}. Must be one of {}".format(
                compression, valid_compressions))
    if job_name is not None:
        if not isinstance(job_name, six.string_types) and not isinstance(
                job_name, ops.Tensor):
            raise ValueError(
                "job_name must be a string or Tensor, but job_name was of type "
                "{0}. job_name={1}".format(type(job_name), job_name))

    if element_spec is None:
        if not context.executing_eagerly():
            raise ValueError(
                "In graph mode element_spec must be provided manually.")

        dataset_id_val = tensor_util.constant_value(dataset_id)
        try:
            encoded_spec = _pywrap_server_lib.TF_DATA_GetElementSpec(
                dataset_id_val, address, protocol)

        except NotImplementedError as err:
            raise ValueError(
                "The tf.data service is running an earlier version of "
                "TensorFlow that requires specifying `element_spec` as "
                "an argument to `from_dataset_id`. Please either supply "
                "an element spec or update the tf.data service to the "
                "latest version.") from err

        except RuntimeError as err:
            raise ValueError(
                "Failed to fetch element spec for dataset id " +
                str(dataset_id_val) + " from tf.data service. If the "
                "dataset was registered in graph mode or inside a "
                "tf.function, the `element_spec` must be specified as "
                "an argument to `from_dataset_id`.") from err

        struct_pb = nested_structure_coder.struct_pb2.StructuredValue()
        struct_pb.ParseFromString(encoded_spec)
        coder = nested_structure_coder.StructureCoder()
        element_spec = coder.decode_proto(struct_pb)

    # If we compress, the data service side dataset will produce scalar variants.
    data_service_element_spec = (tensor_spec.TensorSpec(
        shape=(), dtype=dtypes.variant) if compression == COMPRESSION_AUTO else
                                 element_spec)

    dataset = _DataServiceDataset(
        dataset_id=dataset_id,
        processing_mode=processing_mode,
        address=address,
        element_spec=data_service_element_spec,
        protocol=protocol,
        data_transfer_protocol=data_transfer_protocol,
        job_name=job_name,
        consumer_index=consumer_index,
        num_consumers=num_consumers,
        max_outstanding_requests=max_outstanding_requests,
        max_request_pipelining_per_worker=max_request_pipelining_per_worker,
        task_refresh_interval_hint_ms=task_refresh_interval_hint_ms,
        target_workers=target_workers)
    if compression == COMPRESSION_AUTO:
        dataset = dataset.map(
            lambda x: compression_ops.uncompress(x, output_spec=element_spec),
            num_parallel_calls=dataset_ops.AUTOTUNE)

    # Disable autosharding for shared jobs.
    if job_name is not None:
        options = dataset_ops.Options()
        options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
        dataset = dataset.with_options(options)
    return dataset
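
The `consumer_index`/`num_consumers` arguments documented above enable coordinated round-robin reads. A hedged sketch of two consumers sharing one job this way, assuming a TensorFlow release that exposes these arguments on the public `from_dataset_id`; the address is a placeholder, and the dataset is repeated so that it has infinite cardinality, as the docstring requires.

import tensorflow as tf

SERVICE = "grpc://localhost:5000"  # placeholder dispatcher address

# Round-robin reads require a dataset with infinite cardinality.
dataset = tf.data.Dataset.range(100).repeat()
dataset_id = tf.data.experimental.service.register_dataset(SERVICE, dataset)

def make_consumer(index):
  # Each consumer passes its own index; all consumers share the same job name
  # and the same num_consumers so the service can hand out elements in strict
  # round-robin order.
  return tf.data.experimental.service.from_dataset_id(
      processing_mode="parallel_epochs",
      service=SERVICE,
      dataset_id=dataset_id,
      element_spec=dataset.element_spec,
      job_name="round_robin_job",
      consumer_index=index,
      num_consumers=2)

consumer_0 = make_consumer(0)
consumer_1 = make_consumer(1)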
Example No. 11
def _from_dataset_id(processing_mode,
                     service,
                     dataset_id,
                     element_spec,
                     job_name=None,
                     max_outstanding_requests=None,
                     task_refresh_interval_hint_ms=None):
    """Creates a dataset which reads data from the tf.data service.

  This transformation is similar to `from_dataset_id`, but supports additional
  parameters which we do not yet want to add to the public Python API.

  Args:
    processing_mode: A string specifying the policy for how data should be
      processed by tf.data workers. Currently, the only supported value is
      "parallel_epochs".
    service: A string indicating how to connect to the tf.data service. The
      string should be in the format "<protocol>://<address>", e.g.
      "grpc://localhost:5000".
    dataset_id: The id of the dataset to read from. This id is returned by
      `register_dataset` when the dataset is registered with the tf.data
      service.
    element_spec: A nested structure of `tf.TypeSpec`s representing the type of
      elements produced by the dataset. Use `tf.data.Dataset.element_spec` to
      see the element spec for a given dataset.
    job_name: (Optional.) The name of the job. This argument makes it possible
      for multiple datasets to share the same job. The default behavior is that
      the dataset creates anonymous, exclusively owned jobs.
    max_outstanding_requests: (Optional.) A limit on how many elements may be
      requested at the same time. You can use this option to control the amount
      of memory used, since `distribute` won't use more than `element_size` *
      `max_outstanding_requests` of memory.
    task_refresh_interval_hint_ms: (Optional.) A hint for how often to query the
      dispatcher for task changes.

  Returns:
    A `tf.data.Dataset` which reads from the tf.data service.
  """
    ProcessingMode.validate(processing_mode)
    if job_name is not None:
        if not isinstance(job_name, six.string_types):
            raise ValueError(
                "job_name must be a string, but job_name was of type "
                "{0}. job_name={1}".format(type(job_name), job_name))
        if not job_name:
            raise ValueError("job_name must not be empty")
    if element_spec is None:
        raise ValueError("element_spec must not be None")
    protocol, address = _parse_service(service)

    dataset = _DataServiceDataset(
        dataset_id=dataset_id,
        processing_mode=processing_mode,
        address=address,
        protocol=protocol,
        job_name=job_name,
        max_outstanding_requests=max_outstanding_requests,
        task_refresh_interval_hint_ms=task_refresh_interval_hint_ms)
    # TODO(b/157105111): Make this an autotuned parallel map when we have a way
    # to limit memory usage.
    # The value 16 is chosen based on experience with pipelines that require
    # more than 8 parallel calls to prevent this stage from being a bottleneck.
    dataset = dataset.map(
        lambda x: compression_ops.uncompress(x, output_spec=element_spec),
        num_parallel_calls=16)

    # Disable autosharding for shared jobs.
    if job_name:
        options = dataset_ops.Options()
        options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
        dataset = dataset.with_options(options)
    return dataset
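
The `job_name` parameter described in this docstring lets separately constructed pipelines draw from a single shared job. A hedged sketch using the public `distribute` transformation, with a placeholder dispatcher address and a running tf.data service assumed:

import tensorflow as tf

def make_shared_dataset():
  # Both callers pass the same job_name, so the service creates one job and
  # splits its elements between the consumers on a first-come-first-served
  # basis instead of giving each consumer its own copy of the data.
  ds = tf.data.Dataset.range(1000)
  return ds.apply(
      tf.data.experimental.service.distribute(
          processing_mode="parallel_epochs",
          service="grpc://localhost:5000",  # placeholder dispatcher address
          job_name="shared_job"))

ds_a = make_shared_dataset()
ds_b = make_shared_dataset()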