def testRemoteDeviceInPartitionedCallOp(self):
    workers, _ = test_util.create_local_cluster(2, 0)

    worker0_device = "/job:worker/replica:0/task:0/cpu:0"
    worker1_device = "/job:worker/replica:0/task:1/cpu:0"

    @eager_def_function.function
    def f(a, b):
      return a + b

    with session.Session(workers[0].target) as sess:
      with ops.device(worker0_device):
        a = variable_scope.get_variable(
            "a", initializer=constant_op.constant(1.), use_resource=True)
      with ops.device(worker1_device):
        b = variable_scope.get_variable(
            "b", initializer=constant_op.constant(1.), use_resource=True)

      sess.run(variables.global_variables_initializer())

    config = config_pb2.ConfigProto()
    config.experimental.share_cluster_devices_in_session = True

    with session.Session(workers[0].target, config=config) as sess:
      res = sess.run(f(a, b))

    self.assertEqual(res, 2)
  def testDistributedOOM(self):
    if not test.is_gpu_available():
      return
    ops.reset_default_graph()

    workers, _ = test_util.create_local_cluster(2, 0)

    with ops.device('/job:worker/replica:0/task:0/gpu:0'):
      a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
    with ops.device('/job:worker/replica:0/task:1/gpu:0'):
      b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
      c = a * b

    try:
      with session.Session(workers[1].target) as sess:
        sess.run(c, options=config_pb2.RunOptions(
            report_tensor_allocations_upon_oom=True))
    except Exception as e:  # pylint: disable=broad-except
      exception_str = '%s' % e
      # test_random2 is reported because it's allocated in worker 1.
      self.assertTrue('Current usage from device: '
                      '/job:worker/replica:0/task:1/device:GPU:0, '
                      'allocator: GPU_0_bfc' in exception_str)
      mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
                      exception_str)
      self.assertGreater(float(mat.group(1)), 0.0)
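      # Rough size check (float32, 4 bytes per element): test_random2 has shape
      # [30000, 10000, 1], i.e. 3e8 elements ~= 1.12 GiB, hence the GiB figure
      # above; test_random1 has shape [1, 10000, 20000], i.e. 2e8 elements
      # ~= 763 MiB, matching the MiB pattern below.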
      # test_random1 is not reported because it's allocated in worker 0.
      mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
                      exception_str)
      self.assertTrue(mat is None)
  def testImplicitDisposeParallelMapDataset(self):
    # Tests whether a parallel map dataset will be cleaned up correctly when
    # the pipeline does not run it until exhaustion.
    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
    # RepeatDataset(None) -> PrefetchDataset(10000).
    worker, _ = test_util.create_local_cluster(1, 1)

    components = (np.arange(1000),
                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
                  np.array(37.0) * np.arange(1000))

    def _map_fn(x, y, z):
      return math_ops.square(x), math_ops.square(y), math_ops.square(z)

    dataset = (
        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
        .repeat(None).prefetch(10000))

    iterator = dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with session.Session(worker[0].target) as sess:
      self.evaluate(init_op)
      for _ in range(3):
        sess.run(get_next)
  def testRemoteFunction(self):
    worker_config = config_pb2.ConfigProto()
    worker_config.device_count["CPU"] = 2
    worker, _ = test_util.create_local_cluster(
        1, 1, worker_config=worker_config)

    @function.Defun(dtypes.int32, dtypes.int32)
    def _remote_fn(a, b):
      return math_ops.multiply(a, b)

    with ops.device("/job:ps/task:0"):
      a = variables.Variable(2, dtype=dtypes.int32)
      b = variables.Variable(3, dtype=dtypes.int32)

    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
      remote_op = functional_ops.remote_call(
          args=[a, b],
          Tout=[dtypes.int32],
          f=_remote_fn,
          target="/job:worker/replica:0/task:0/cpu:1")

    with session.Session(worker[0].target) as sess:
      self.evaluate(variables.global_variables_initializer())
      mul = self.evaluate(remote_op)
      self.assertEqual(mul, [6])
  def testCaptureHashTableInSharedIterator(self):
    worker, _ = test_util.create_local_cluster(1, 1)

    # NOTE(mrry): We must use the V2 variants of `HashTable`
    # etc. because these produce a `tf.resource`-typed output that is
    # compatible with the in-graph function implementation.
    default_val = -1
    keys = constant_op.constant(["brain", "salad", "surgery"])
    values = constant_op.constant([0, 1, 2], dtypes.int64)
    table = lookup_ops.HashTable(
        lookup_ops.KeyValueTensorInitializer(keys, values),
        default_val,
        shared_name="shared_table")

    input_sentences = dataset_ops.Dataset.from_tensor_slices(
        ["brain brain tank salad surgery", "surgery brain"])

    iterator = (
        input_sentences.map(lambda x: string_ops.string_split([x]).values).map(
            table.lookup)
        .make_initializable_iterator(shared_name="shared_iterator"))
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with session.Session(worker[0].target) as sess:
      self.evaluate(table.initializer)
      self.evaluate(init_op)
      self.assertAllEqual([0, 0, -1, 1, 2], self.evaluate(get_next))

    with session.Session(worker[0].target) as sess:
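      # Because the table and the iterator were created with shared names, this
      # second session continues from the iterator state left by the first one.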
      self.assertAllEqual([2, 0], self.evaluate(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
  def testRemoteIteratorUsingRemoteCallOp(self):
    worker_config = config_pb2.ConfigProto()
    worker_config.device_count["CPU"] = 2
    worker, _ = test_util.create_local_cluster(
        1, 1, worker_config=worker_config)

    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
                                   "/job:worker/replica:0/task:0/cpu:1",
                                   worker[0].target)
 @classmethod
 def setUpClass(cls):
   # We have to create a global in-process cluster because once an in-process
   # TensorFlow server is created, there is no way to terminate it. Please see
   # multi_worker_test_base.py for more details.
   cls._workers, cls._ps = test_util.create_local_cluster(
       NUM_WORKERS, num_ps=NUM_PS)
   cls._cluster_spec = {
       WORKER: [_bytes_to_str(w.target) for w in cls._workers],
       PS: [_bytes_to_str(ps.target) for ps in cls._ps]
   }
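
 # A minimal sketch of the `_bytes_to_str` helper referenced above. The helper
 # is not shown in this snippet, so this is an assumption about what it does:
 # server targets may come back as `bytes`, and the cluster spec needs strings.
 def _bytes_to_str(s):
   return s.decode("utf-8") if isinstance(s, bytes) else str(s)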
 def _test_device_and_input_device_are_colocated(self, strategy):
   if context.executing_eagerly():
     self.skipTest(
         "cross-device tests are not supported with eager execution.")
   workers, _ = test_util.create_local_cluster(2, 0)
   inputs = strategy.make_input_fn_iterator(
       lambda _: dataset_ops.Dataset.range(5))
   comm_fn = lambda x: x + 1
   run_op = strategy.experimental_run(comm_fn, inputs)
   with session_lib.Session(target=workers[1].target) as sess:
     sess.run(inputs.initialize())
     sess.run(run_op)
 def testSerializeListWithUnknownRank(self):
   worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
   with ops.Graph().as_default(), session.Session(target=worker.target):
     with ops.device("/job:worker"):
       t = constant_op.constant([[1.0], [2.0]])
       l = list_ops.tensor_list_from_tensor(t, element_shape=None)
     with ops.device("/job:ps"):
       l_ps = array_ops.identity(l)
       element_shape = list_ops.tensor_list_element_shape(
           l_ps, shape_type=dtypes.int32)
     with ops.device("/job:worker"):
       element_shape = array_ops.identity(element_shape)
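     # tensor_list_element_shape reports an unknown rank as the scalar -1.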
     self.assertEqual(self.evaluate(element_shape), -1)
 def testSerialize(self):
   worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
   with ops.Graph().as_default(), session.Session(target=worker.target):
     with ops.device("/job:worker"):
       t = constant_op.constant([[1.0], [2.0]])
       l = list_ops.tensor_list_from_tensor(t, element_shape=[1])
     with ops.device("/job:ps"):
       l_ps = array_ops.identity(l)
       l_ps, e = list_ops.tensor_list_pop_back(
           l_ps, element_dtype=dtypes.float32)
     with ops.device("/job:worker"):
       worker_e = array_ops.identity(e)
     self.assertAllEqual(self.evaluate(worker_e), [2.0])
  def testRemoteIteratorUsingRemoteCallOp(self):
    worker_config = config_pb2.ConfigProto()
    worker_config.device_count["CPU"] = 2
    worker, _ = test_util.create_local_cluster(
        1, 1, worker_config=worker_config)

    with ops.device("/job:worker/replica:0/task:0/cpu:1"):
      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
      iterator_3 = dataset_3.make_one_shot_iterator()
      iterator_3_handle = iterator_3.string_handle()

    @function.Defun(dtypes.string)
    def _remote_fn(h):
      remote_iterator = dataset_ops.Iterator.from_string_handle(
          h, dataset_3.output_types, dataset_3.output_shapes)
      return remote_iterator.get_next()

    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
      remote_op = functional_ops.remote_call(
          args=[iterator_3_handle],
          Tout=[dtypes.int32],
          f=_remote_fn,
          target=target_placeholder)

    with session.Session(worker[0].target) as sess:
      elem = sess.run(
          remote_op,
          feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"})
      self.assertEqual(elem, [1])
      # Fails when target is cpu:0 where the resource is not located.
      with self.assertRaises(errors.InvalidArgumentError):
        sess.run(
            remote_op,
            feed_dict={
                target_placeholder: "/job:worker/replica:0/task:0/cpu:0"
            })
      elem = sess.run(
          remote_op,
          feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"})
      self.assertEqual(elem, [2])
      elem = sess.run(
          remote_op,
          feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"})
      self.assertEqual(elem, [3])
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(
            remote_op,
            feed_dict={
                target_placeholder: "/job:worker/replica:0/task:0/cpu:1"
            })
 def testSerializeListWithInvalidTensors(self):
   worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
   with ops.Graph().as_default(), session.Session(target=worker.target):
     with ops.device("/job:worker"):
       l = list_ops.tensor_list_reserve(
           element_dtype=dtypes.float32, element_shape=[], num_elements=2)
       l = list_ops.tensor_list_set_item(l, 0, 1.)
     with ops.device("/job:ps"):
       l_ps = array_ops.identity(l)
       l_ps = list_ops.tensor_list_set_item(l_ps, 1, 2.)
       t = list_ops.tensor_list_stack(l_ps, element_dtype=dtypes.float32)
     with ops.device("/job:worker"):
       worker_t = array_ops.identity(t)
     self.assertAllEqual(self.evaluate(worker_t), [1.0, 2.0])
  @classmethod
  def setUpClass(cls):
    """Create a local cluster with 2 workers."""
    num_workers = 2
    # Leave some memory for the CUDA runtime.
    gpu_mem_frac = 0.7 / num_workers
    default_config = config_pb2.ConfigProto()
    default_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac

    # The local cluster takes some portion of the local GPUs, and there is no
    # way to terminate the cluster without using multiple processes. Therefore,
    # we create only one cluster throughout a test process.
    workers, _ = test_util.create_local_cluster(
        num_workers, num_ps=0, worker_config=default_config)
    cls._master_target = workers[0].target
 def _test_device_and_input_device_are_colocated_with_function(self, strategy):
   if context.executing_eagerly():
     self.skipTest(
         "cross-device tests are not supported with eager execution.")
   workers, _ = test_util.create_local_cluster(2, 0)
   inputs = strategy.make_input_fn_iterator(
       lambda _: dataset_ops.Dataset.range(5))
   comm_fn = lambda x: x + 1
   experimental_run = def_function.function()(strategy.experimental_run)
   with ops.device("/job:worker/replica:0/task:1/device:CPU:0"):
     # The tf.function must be defined on the right device as well.
     run_op = experimental_run(comm_fn, inputs)
   with session_lib.Session(target=workers[1].target) as sess:
     sess.run(inputs.initialize())
     sess.run(run_op)
  def testRemoteIteratorWithoutRemoteCallFail(self):
    worker_config = config_pb2.ConfigProto()
    worker_config.device_count["CPU"] = 2
    worker, _ = test_util.create_local_cluster(
        1, 1, worker_config=worker_config)

    with ops.device("/job:worker/replica:0/task:0/cpu:1"):
      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
      iterator_3 = dataset_3.make_one_shot_iterator()
      iterator_3_handle = iterator_3.string_handle()

    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
      remote_it = dataset_ops.Iterator.from_string_handle(
          iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes)
      get_next_op = remote_it.get_next()

    with session.Session(worker[0].target) as sess:
      with self.assertRaises(errors.InvalidArgumentError):
        sess.run(get_next_op)
 def testSerializeListWithMaxNumElements(self):
   if context.num_gpus():
     # TODO(b/119151861): Enable on GPU.
     return
   worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
   with ops.Graph().as_default(), session.Session(target=worker.target):
     with ops.device("/job:worker"):
       l = list_ops.empty_tensor_list(
           element_shape=-1, element_dtype=dtypes.float32, max_num_elements=2)
       l = list_ops.tensor_list_push_back(l, 1.)
     with ops.device("/job:ps"):
       l_ps = array_ops.identity(l)
       l_ps = list_ops.tensor_list_push_back(l_ps, 2.)
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "Tried to push item into a full list"):
       with ops.device("/job:worker"):
         l_worker = array_ops.identity(l_ps)
         l_worker = list_ops.tensor_list_push_back(l_worker, 3.0)
         self.evaluate(l_worker)
  def testRemoteFunctionCrossProcess(self):
    workers, _ = test_util.create_local_cluster(2, 1)

    @function.Defun(dtypes.float32, dtypes.float32)
    def _remote_fn(a, b):
      return math_ops.multiply(a, b)

    with ops.device("/job:ps/task:0"):
      a = variables.Variable(2, dtype=dtypes.float32)
      b = variables.Variable(3, dtype=dtypes.float32)

    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
      remote_op = functional_ops.remote_call(
          args=[a, b],
          Tout=[dtypes.float32],
          f=_remote_fn,
          target="/job:worker/replica:0/task:1/cpu:0")[0] + 3.0

    with session.Session(workers[0].target) as sess:
      self.evaluate(variables.global_variables_initializer())
      mul = self.evaluate(remote_op)
      self.assertEqual(mul, 9)
def create_in_process_cluster(num_workers, num_ps):
  """Create an in-process cluster that consists of only standard server."""
  # Leave some memory for the CUDA runtime.
  gpu_mem_frac = 0.7 / num_workers
  worker_config = config_pb2.ConfigProto()
  worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac

  ps_config = config_pb2.ConfigProto()
  ps_config.device_count['GPU'] = 0

  # Create in-process servers. Once an in-process TensorFlow server is created,
  # there is no way to terminate it, so we create one cluster per test process.
  # We could have started the server in another process and then killed that
  # process to terminate the server, but we avoid multiple processes because
  # 1) it is more difficult to manage these processes, and
  # 2) CUDA has global state, so if we initialize CUDA in the parent process,
  # the child process cannot initialize it again and thus cannot use GPUs
  # (https://stackoverflow.com/questions/22950047).
  return test_util.create_local_cluster(
      num_workers,
      num_ps=num_ps,
      worker_config=worker_config,
      ps_config=ps_config)
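

# A minimal usage sketch of create_in_process_cluster (an assumption, not part
# of the original file; the helper name below is hypothetical): each returned
# server exposes `.target`, which a Session can connect to, mirroring the
# `workers[0].target` pattern used throughout the tests above.
def _example_use_in_process_cluster():
  workers, _ = create_in_process_cluster(num_workers=2, num_ps=1)
  with ops.device("/job:worker/replica:0/task:0/cpu:0"):
    one = constant_op.constant(1.0)
  with session.Session(workers[0].target) as sess:
    return sess.run(one + one)  # == 2.0, executed on the in-process cluster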
  def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
    workers, _ = test_util.create_local_cluster(2, 1)

    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
                                   "/job:worker/replica:0/task:1/cpu:0",
                                   workers[0].target)
 def setUp(self):
     context.context().soft_device_placement = True
     context.context().log_device_placement = True
     workers, _ = test_util.create_local_cluster(2, 0)
     remote.connect_to_remote_host([workers[0].target, workers[1].target])
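     # A minimal sketch of what this setUp enables (an assumption, not part of
     # the original test): with soft placement enabled and the remote hosts
     # connected, eagerly executed ops placed on the remote job run on that
     # worker, e.g.:
     #
     #   with ops.device("/job:worker/replica:0/task:1/device:CPU:0"):
     #     y = constant_op.constant(2.0) * 3.0  # runs on worker 1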
  def test2Workers(self):
    num_workers = 2
    replicas_to_aggregate = 2
    num_ps = 2
    workers, _ = create_local_cluster(num_workers=num_workers, num_ps=num_ps)

    # Creates and returns all the workers.
    sessions, graphs, train_ops = get_workers(num_workers,
                                              replicas_to_aggregate, workers)

    # Chief should have already initialized all the variables.
    var_0_g_0 = graphs[0].get_tensor_by_name("v0:0")
    var_1_g_0 = graphs[0].get_tensor_by_name("v1:0")
    local_step_0 = graphs[0].get_tensor_by_name("sync_rep_local_step:0")
    self.assertAllEqual(0.0, sessions[0].run(var_0_g_0))
    self.assertAllEqual(1.0, sessions[0].run(var_1_g_0))
    self.assertAllEqual(0, sessions[0].run(local_step_0))

    # Will just use session 1 to verify all the variables later.
    var_0_g_1 = graphs[1].get_tensor_by_name("v0:0")
    var_1_g_1 = graphs[1].get_tensor_by_name("v1:0")
    var_sparse_g_1 = graphs[1].get_tensor_by_name("v_sparse:0")
    local_step_1 = graphs[1].get_tensor_by_name("sync_rep_local_step:0")
    global_step = graphs[1].get_tensor_by_name("global_step:0")

    # The steps should also be initialized.
    self.assertAllEqual(0, sessions[1].run(global_step))
    self.assertAllEqual(0, sessions[1].run(local_step_1))
    self.assertAllClose([[3.0], [4.0]], sessions[1].run(var_sparse_g_1))

    # We have initial tokens in the queue, so we can run these one at a time.
    # After the first step this no longer works, since there are no more extra
    # tokens in the queue.
    sessions[0].run(train_ops[0])
    sessions[1].run(train_ops[1])

    # The global step should have been updated, and the variables should now
    # have the new values after the average of the gradients is applied.
    while sessions[1].run(global_step) != 1:
      time.sleep(0.01)

    self.assertAllClose(0 - (0.1 + 0.3) / 2 * 2.0, sessions[1].run(var_0_g_1))
    self.assertAllClose(1 - (0.9 + 1.1) / 2 * 2.0, sessions[1].run(var_1_g_1))
    self.assertAllClose([[3.0], [4.0 - (0.1 + 0.3) / 2 * 2.0]],
                        sessions[1].run(var_sparse_g_1))
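    # A hedged reading of the expected values above (assuming get_workers, not
    # shown here, applies plain gradient descent with learning rate 2.0, which
    # matches the constant factor in these expressions): the update is the mean
    # of the two worker gradients times the learning rate, e.g. for v0:
    #   0 - ((0.1 + 0.3) / 2) * 2.0 = -0.4.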

    # The local step for both workers should still be 0 because the initial
    # tokens in the token queue are 0s. This means that the following gradient
    # computation will be wasted, as local_step is smaller than the current
    # global step. However, this only happens once when the system starts up,
    # and it is necessary to make the system robust to the chief being
    # restarted by errors/preemption/...
    self.assertAllEqual(0, sessions[0].run(local_step_0))
    self.assertAllEqual(0, sessions[1].run(local_step_1))

    sessions[0].run(train_ops[0])
    sessions[1].run(train_ops[1])
    # Although the global step should still be 1 as explained above, the local
    # step should now be updated to 1. The variables are still the same.
    self.assertAllEqual(1, sessions[1].run(global_step))
    self.assertAllEqual(1, sessions[0].run(local_step_0))
    self.assertAllEqual(1, sessions[1].run(local_step_1))
    self.assertAllClose(0 - (0.1 + 0.3) / 2 * 2.0, sessions[1].run(var_0_g_1))
    self.assertAllClose(1 - (0.9 + 1.1) / 2 * 2.0, sessions[1].run(var_1_g_1))

    # At this step, the token queue is empty. So the 2 workers need to work
    # together to proceed.
    threads = []
    threads.append(
        self.checkedThread(
            target=self._run, args=(train_ops[0], sessions[0])))
    threads.append(
        self.checkedThread(
            target=self._run, args=(train_ops[1], sessions[1])))

    # The two workers start to execute the train op.
    for thread in threads:
      thread.start()
    for thread in threads:
      thread.join()

    # The global step should now be 2 and the gradients should have been
    # applied twice.
    self.assertAllEqual(2, sessions[1].run(global_step))
    self.assertAllClose(0 - 2 * (0.1 + 0.3) / 2 * 2.0,
                        sessions[1].run(var_0_g_1))
    self.assertAllClose(1 - 2 * (0.9 + 1.1) / 2 * 2.0,
                        sessions[1].run(var_1_g_1))
  def test3Workers1Backup(self):
    num_workers = 3
    replicas_to_aggregate = 2
    num_ps = 2
    workers, _ = create_local_cluster(num_workers=num_workers, num_ps=num_ps)

    # Creates and returns all the workers.
    sessions, graphs, train_ops = get_workers(num_workers,
                                              replicas_to_aggregate, workers)

    # Chief should have already initialized all the variables.
    var_0_g_1 = graphs[1].get_tensor_by_name("v0:0")
    var_1_g_1 = graphs[1].get_tensor_by_name("v1:0")
    local_step_1 = graphs[1].get_tensor_by_name("sync_rep_local_step:0")
    global_step = graphs[1].get_tensor_by_name("global_step:0")

    # The steps should also be initialized.
    self.assertAllEqual(0, sessions[1].run(global_step))
    self.assertAllEqual(0, sessions[1].run(local_step_1))

    # We have initial tokens in the queue, so we can run these one at a time.
    # After the token queue becomes empty, they must be run concurrently.
    # Here worker 0 and worker 2 finished first.
    sessions[0].run(train_ops[0])
    sessions[2].run(train_ops[2])

    # The global step should have been updated since we only need to collect 2
    # gradients. The variables should now have the new values after the average
    # of the gradients from workers 0/2 is applied.
    while sessions[1].run(global_step) != 1:
      time.sleep(0.01)

    self.assertAllEqual(1, sessions[1].run(global_step))
    self.assertAllClose(0 - (0.1 + 0.5) / 2 * 2.0, sessions[1].run(var_0_g_1))
    self.assertAllClose(1 - (0.9 + 1.3) / 2 * 2.0, sessions[1].run(var_1_g_1))

    # Worker 1 finished later, so its gradients will now be dropped because
    # they are stale.
    sessions[1].run(train_ops[1])

    # As shown in the previous test, the local_step for all workers should
    # still be 0, so their next computation will also be dropped.
    sessions[0].run(train_ops[0])
    sessions[1].run(train_ops[1])
    sessions[2].run(train_ops[2])

    # Although the global step should still be 1 as explained above, the local
    # step should now be updated to 1. Just check worker 1 as an example.
    self.assertAllEqual(1, sessions[1].run(global_step))
    self.assertAllEqual(1, sessions[1].run(local_step_1))

    thread_0 = self.checkedThread(
        target=self._run, args=(train_ops[0], sessions[0]))
    thread_1 = self.checkedThread(
        target=self._run, args=(train_ops[1], sessions[1]))

    # Let worker 0 execute first.
    # It will wait, since we need 2 workers to finish this step, and the global
    # step should still be 1.
    thread_0.start()
    self.assertAllEqual(1, sessions[1].run(global_step))

    # Starts worker 1.
    thread_1.start()
    thread_1.join()
    thread_0.join()

    # The global step should now be 2 and the gradients should have been
    # applied again.
    self.assertAllEqual(2, sessions[1].run(global_step))
    self.assertAllClose(-0.6 - (0.1 + 0.3) / 2 * 2.0,
                        sessions[1].run(var_0_g_1))
    self.assertAllClose(-1.2 - (0.9 + 1.1) / 2 * 2.0,
                        sessions[1].run(var_1_g_1))
    def setUp(self):
        super(SingleWorkerTest, self).setUp()

        workers, _ = test_util.create_local_cluster(1, 0)
        remote.connect_to_remote_host(workers[0].target)
    def setUp(self):
        super(MultiWorkersTest, self).setUp()

        workers, _ = test_util.create_local_cluster(3, 0)
        remote.connect_to_remote_host(
            [workers[0].target, workers[1].target, workers[2].target])
 def setUp(self):
     super(SingleWorkerTestBaseEager, self).setUp()
     workers, _ = test_util.create_local_cluster(num_workers=1, num_ps=0)
     remote.connect_to_remote_host(workers[0].target)