Пример #1
0
  def testParameterServerMultiExecutors(self):
    """Runs one tf.function from two executors while the server def is updated.

    Two threads call the same function, each under its own synchronous
    executor and device, while a third thread repeatedly re-applies the same
    server def.
    """
    context.update_server_def(server_def=self.server_def_s1_s2_s3_s4)

    with ops.device(self.device_t1):
      var_a = variables.Variable(initial_value=0.)
    with ops.device(self.device_t2):
      var_b = variables.Variable(initial_value=10.)

    @def_function.function
    def worker_fn():
      read_a = var_a.read_value()
      read_b = var_b.read_value()
      delta = (read_a + read_b) * 0.1
      var_a.assign_add(delta)
      var_b.assign_sub(delta)
      return var_a + var_b

    # Trace the function once up front, before any worker thread runs it.
    worker_fn.get_concrete_function()

    sync_executor_t3 = executor.new_executor(enable_async=False)
    sync_executor_t4 = executor.new_executor(enable_async=False)

    num_calls = 10
    self._coord = coordinator.Coordinator()

    def thread_fn(executor_obj, device, results):
      with self._coord.stop_on_exception():
        for call_idx in range(num_calls):
          with context.executor_scope(executor_obj), ops.device(device):
            results[call_idx] = worker_fn()

    def update_server_def_fn():
      with self._coord.stop_on_exception():
        for _ in range(30):
          context.update_server_def(self.server_def_s1_s2_s3_s4)

    t3_results = [None] * num_calls
    t4_results = [None] * num_calls
    worker_args = (
        (sync_executor_t3, self.device_t3, t3_results),
        (sync_executor_t4, self.device_t4, t4_results),
    )
    threads = [
        threading.Thread(target=thread_fn, args=call_args)
        for call_args in worker_args
    ]
    threads.append(threading.Thread(target=update_server_def_fn))
    for thread in threads:
      thread.start()
    self._coord.join(threads)

    # Individual values are non-deterministic, so they cannot be asserted one
    # by one. Summing all of them checks that every result is a reasonable and
    # valid number (not `None` or `NaN`).
    total = np.sum(t3_results + t4_results)
    self.assertGreater(total, 0)
Пример #2
0
    def testTwoExecutors(self):
        """Checks that an error on one executor is not reported by another."""
        worker_cpu = 'job:worker/replica:0/task:0/device:CPU:0'

        # The main executor uses StreamingEnqueue by default, which schedules
        # the op on the remote async executor. The division by zero below is
        # therefore an error that is not caught immediately.
        with ops.device(worker_cpu):
            numerator = constant_op.constant(3)
            zero = constant_op.constant(0)
            math_ops.div(numerator, zero)

        # A second executor with streaming enqueue disabled runs its op on the
        # tf_compute thread pool of the remote worker. Because that op does
        # not go through the same remote async executor, it does not carry
        # back the error produced above, even though it executes
        # synchronously.
        with context.executor_scope(
                executor.new_executor(enable_async=False,
                                      enable_streaming_enqueue=False)):
            with ops.device(worker_cpu):
                quotient = math_ops.div(constant_op.constant(4),
                                        constant_op.constant(2))
                self.assertEqual(quotient.numpy(), 2)

        # Syncing on the context forces the error of the first op to surface.
        with self.assertRaises(errors.InvalidArgumentError) as raised:
            context.async_wait()
        self.assertIn('division by zero', raised.exception.message)
Пример #3
0
    def __call__(self, device, token, args):
        """Passes `args` to `self._func`, which is executed eagerly.

        The call runs under a dedicated executor whose async setting mirrors
        `context.is_async()`, and under a `GradientTape` that watches every
        floating-point tensor found in `args`.

        Args:
          device: device scope under which the returned tensors are passed
            through `tf.identity`.
          token: key (converted to bytes) under which `(tape, args, outputs)`
            is stored in `tape_cache`.
          args: positional arguments forwarded to `self._func`.

        Returns:
          A list of tensors if `self._func` returned a tuple or list, `None`
          if it returned `None`, and a single tensor otherwise. Each tensor is
          converted with `self._convert` to the matching entry of
          `self._out_dtypes`.
        """
        func_executor = executor.new_executor(context.is_async())
        with context.executor_scope(func_executor):
            with context.eager_mode(), backprop.GradientTape() as tape:
                # Only watch tensors with a floating dtype.
                for tensor in args:
                    for t in nest.flatten(tensor):
                        if t.dtype.is_floating:
                            tape.watch(t)
                ret = self._func(*args)
                # Use tf.identity to copy the returned tensors to device if
                # necessary.
                with ops.device(device):
                    if isinstance(ret, (tuple, list)):
                        outputs = [
                            array_ops.identity(self._convert(x, dtype=dtype))
                            for (x, dtype) in zip(ret, self._out_dtypes)
                        ]
                    elif ret is None:
                        outputs = None
                    else:
                        outputs = array_ops.identity(
                            self._convert(ret, dtype=self._out_dtypes[0]))
            tape_cache[compat.as_bytes(token)] = (tape, args, outputs)
        # Wait for the dedicated executor before handing the outputs back.
        # This call used to sit after the `return` statement and was therefore
        # never executed.
        func_executor.wait()
        return outputs
Пример #4
0
  def __init__(self, worker_index, device_name, cluster):
    """Stores the worker settings and starts its queue-processing thread.

    Args:
      worker_index: index of this worker; also used in the thread name.
      device_name: stored as `self.device_name`.
      cluster: object exposing `failure_handler`; kept as `self._cluster`.
    """
    self._cluster = cluster
    self._resource_remote_value_refs = []
    self.worker_index = worker_index
    self.device_name = device_name
    self.executor = executor.new_executor(enable_async=False)
    self.failure_handler = cluster.failure_handler

    # The daemon thread running `self._process_queue` is started last, so that
    # it only ever sees a fully initialized `Worker`.
    processing_thread = threading.Thread(
        target=self._process_queue,
        name="WorkerClosureProcessingLoop-%d" % self.worker_index,
        daemon=True)
    processing_thread.start()
Пример #5
0
  def testCancelGetNextWithDevice(self, cls):
    """Cancels an in-flight `get_next` on one device of a multi-device iterator.

    Args:
      cls: callable used to build the multi-device iterator from a dataset, a
        list of devices and `prefetch_buffer_size`.
    """
    # `ping` and `pong` form a handshake between the test body and `map_fn`.
    ping = data_flow_ops.FIFOQueue(capacity=2, dtypes=dtypes.int64)
    pong = data_flow_ops.FIFOQueue(capacity=2, dtypes=dtypes.int64)

    @def_function.function
    def map_fn(v):
      # Consume one `ping` item and echo it to `pong`, then consume a second
      # `ping` item to produce the element. Each element therefore needs two
      # `ping.enqueue` calls from the test body.
      ball = ping.dequeue()
      with ops.control_dependencies([pong.enqueue(ball)]):
        return v + ping.dequeue()

    dataset = dataset_ops.Dataset.range(10)
    dataset = dataset.map(map_fn)

    # We need to set prefetch_buffer_size=0 so that we can cancel the
    # MultiDeviceIteratorGetNextFromShardOp from eager. If
    # prefetch_buffer_size>0, that op runs in the background threads of the
    # prefetch and can only be cancelled by deleting the iterator.
    multi_device_iterator = cls(
        dataset, [self._devices[1], self._devices[2]], prefetch_buffer_size=0)

    @def_function.function
    def get_next_device1():
      return multi_device_iterator.get_next(self._devices[1])

    # Launch the cancelable `get_next` under an async executor.
    async_executor = executor.new_executor(enable_async=True)
    with context.executor_scope(async_executor):
      cancel_mgr = cancellation.CancellationManager()
      cancel_mgr.get_cancelable_function(
          get_next_device1.get_concrete_function())()
    # Make sure we cancel in the middle of get_next.
    ping.enqueue(0)
    pong.dequeue()
    cancel_mgr.start_cancel()
    # The cancellation is expected to surface when the executor is waited on.
    with self.assertRaises(errors.CancelledError):
      async_executor.wait()
    # Note that fetching from upstream iterator is not cancelled with the
    # cancellation of get_next.
    ping.enqueue(0)

    # Cancelling a get_next on one device shouldn't cancel the
    # multi_device_iterator and iterators on other devices.
    ping.enqueue(0)
    ping.enqueue(0)
    self.assertEqual(1,
                     multi_device_iterator.get_next(self._devices[2]).numpy())
    # FIXME(b/209534797): Workaround an asan error caused by this test.
    # Remove the dangling reference from tf.function to ensure queue objects
    # are not freed before they are flushed.
    import gc  # pylint: disable=g-import-not-at-top
    del get_next_device1
    gc.collect()
Пример #6
0
    def testPyFunctionAsync(self):
        """Calls a tf.function wrapping `eager_py_func` under an async executor."""

        def add_one(value):
            return value + constant_op.constant(1.)

        @def_function.function
        def wrapped_py_func(value):
            return script_ops.eager_py_func(add_one, [value], dtypes.float32)

        async_executor = executor.new_executor(enable_async=True)
        with context.executor_scope(async_executor):
            variable = variables.Variable(2.)
            result = wrapped_py_func(variable)
            self.assertAllEqual(result, 3.0)
        async_executor.wait()
Пример #7
0
    def __init__(self,
                 devices,
                 group_size,
                 collective_keys=None,
                 communication=CollectiveCommunication.AUTO):
        """Initializes the object.

        Args:
          devices: a list of device strings to run collectives on.
          group_size: the global group size. For between-graph replicated
            training it's the total number of devices across all workers.
          collective_keys: an optional CollectiveKey object.
          communication: indicates which collective communication to use.

        Raises:
          ValueError: if `devices` is empty, or if `group_size` is not a
            multiple of the number of devices.
        """
        num_devices = len(devices)
        # Reject an empty device list explicitly; otherwise the modulo below
        # fails with an unrelated ZeroDivisionError.
        if num_devices == 0:
            raise ValueError("devices must not be empty.")
        if group_size % num_devices > 0:
            raise ValueError(
                "group_size must be divisible by the number of devices.")

        self._devices = tuple(device_util.canonicalize(d) for d in devices)
        self._group_size = group_size
        self._collective_keys = (collective_keys
                                 or cross_device_utils.CollectiveKeys())
        self._communication = communication
        # This lock guards all collective launches, i.e. calls to
        # cross_device_utils.build_collective_*.
        #
        # In a multi threaded eager program we need to ensure different groups
        # of collectives don't interleave each other, otherwise there could be
        # deadlocks. E.g. if two user threads both are launching collectives:
        #   user-thread-0  device0                 device1
        #   user-thread-1          device0 device1
        # In eager mode, we use one executor per device. Executors use single
        # FIFO queues, so the above launch sequences end up with the following
        # queues:
        #   device-0  collective-0  collective-1
        #   device-1  collective-1  collective-0
        # This deadlocks since neither collective is able to finish.
        self._lock = threading.Lock()

        # Collective ops requires all devices to participate and is blocking.
        # In eager, we need one async executor for each device to be able to
        # launch them altogether. Note that async doesn't imply concurrency.
        # Within an async executor operations are still executed sequentially.
        # In graph or function building, the executors are not used.
        self._executors = [
            executor.new_executor(enable_async=True)
            for _ in range(num_devices)
        ]

        super(CollectiveAllReduce, self).__init__()
Пример #8
0
  def testRemoteFunctionCancellation(self):
    """Cancels a remote call whose second collective can never complete."""
    context._reset_context()
    # Split the first physical CPU into two logical devices.
    logical_devices = [context.LogicalDeviceConfiguration() for _ in range(2)]
    first_cpu = framework_config.list_physical_devices("CPU")[0]
    framework_config.set_logical_device_configuration(
        first_cpu, logical_devices)

    @function.Defun(dtypes.float32)
    def _remote_fn(v):
      # Two collectives are chained so that the cancellation happens in the
      # middle of the RemoteCall. The second one should never finish.
      first_reduce = collective_ops.all_reduce_v2(
          v, group_size=2, group_key=1, instance_key=1)
      with ops.control_dependencies([first_reduce]):
        second_reduce = collective_ops.all_reduce_v2(
            v, group_size=2, group_key=1, instance_key=2)
      return second_reduce

    @eager_def_function.function
    def run():
      with ops.device("/cpu:0"):
        remote_outputs = functional_ops.remote_call(
            args=[constant_op.constant([1.])],
            Tout=[dtypes.float32],
            f=_remote_fn,
            target="/cpu:1")
        return remote_outputs[0]

    async_executor = executor.new_executor(enable_async=True)
    cancel_mgr = cancellation.CancellationManager()
    with context.executor_scope(async_executor):
      # This should never finish.
      cancel_mgr.get_cancelable_function(run.get_concrete_function())()
    # Issue the collective with instance_key=1 from "/cpu:0", using the same
    # group key as the first collective inside `_remote_fn`.
    with ops.device("/cpu:0"):
      collective_ops.all_reduce_v2(
          [1.], group_size=2, group_key=1, instance_key=1)
    cancel_mgr.start_cancel()
    with self.assertRaises(errors.CancelledError):
      async_executor.wait()
Пример #9
0
  def __init__(self,
               devices,
               group_size,
               collective_keys=None,
               communication=CollectiveCommunication.AUTO):
    """Initializes the object.

    Args:
      devices: a list of device strings to run collectives on.
      group_size: the global group size. For between-graph replicated training
        it's the total number of devices across all workers.
      collective_keys: an optional CollectiveKey object.
      communication: indicates which collective communication to use.

    Raises:
      ValueError: if `devices` is empty, or if `group_size` is not a multiple
        of the number of devices.
    """
    num_devices = len(devices)
    # Reject an empty device list explicitly; otherwise the modulo below fails
    # with an unrelated ZeroDivisionError.
    if num_devices == 0:
      raise ValueError("devices must not be empty.")
    if group_size % num_devices > 0:
      raise ValueError("group_size must be divisible by the number of devices.")

    self._devices = tuple(device_util.canonicalize(d) for d in devices)
    self._group_size = group_size
    self._collective_keys = (collective_keys or
                             cross_device_utils.CollectiveKeys())
    self._communication = communication
    # In a multi threaded eager program we need to ensure different groups of
    # collectives don't interleave each other, otherwise there will be deadlock.
    self._lock = threading.Lock()

    # Collective ops requires all devices to participate and is blocking. In
    # eager, we need one async executor for each device to be able to launch
    # them altogether. Note that async doesn't imply concurrency. Within an
    # async executor operations are still executed sequentially. In graph or
    # function building, the executors are not used.
    self._executors = [
        executor.new_executor(enable_async=True) for _ in range(num_devices)
    ]

    super(CollectiveAllReduce, self).__init__()