        def fn(functions_scheduled_event, test_finished_event):
            # TODO(b/170664373): This is needed for TF2 parameter server training in
            # OSS. Remove this when resolved.
            os.environ["GRPC_FAIL_FAST"] = "use_caller"

            cluster_resolver = TFConfigClusterResolver()
            if cluster_resolver.task_type != "chief":
                utils.start_server(cluster_resolver, "grpc")
            strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
                cluster_resolver)
            ps_coordinator = coordinator_lib.ClusterCoordinator(strategy)

            with strategy.scope():
                v = variables.Variable(initial_value=0, dtype=dtypes.int32)

            @def_function.function
            def worker_fn():
                # An ever-running function.
                for _ in math_ops.range(100000):
                    v.assign_add(1)

            # Keep the two workers occupied.
            ps_coordinator.schedule(worker_fn)
            ps_coordinator.schedule(worker_fn)
            # Now the main process can terminate.
            functions_scheduled_event.set()

            # Verify that join and schedule indeed raise UnavailableError.
            try:
                if test_join:
                    ps_coordinator.join()
                if test_schedule:
                    while ps_coordinator.cluster._closure_queue._error is None:
                        time.sleep(1)
                    ps_coordinator.schedule(worker_fn)
            except errors.UnavailableError:
            # The following verifies that after PS fails, continuing to execute
            # functions on workers should fail and indicate a PS failure.
                for worker_id in range(3):
                    with ops.device(
                            "/job:worker/replica:0/task:{}".format(worker_id)):
                        try:
                            # Executing a function after PS fails should result in a PS
                            # failure.
                            worker_fn()
                        except Exception as e:  # pylint: disable=broad-except
                            if coordinator_lib._is_ps_failure(e):
                                if worker_id < 2:
                                    continue
                                logging.info(
                                    "_test_translate_ps_failure_error ends properly."
                                )
                                # Now we can safely exit the test.
                                test_finished_event.set()
                                return
                        raise RuntimeError(
                            "Executing a function after PS fails should "
                            "result in a PS failure.")

            raise RuntimeError("UnavailableError supposed to be raised.")
Example #2
  def __init__(self, container_strategy, devices=None, cross_device_ops=None):
    super(MirroredExtended, self).__init__(container_strategy)
    if context.executing_eagerly():
      if devices and not _is_device_list_single_worker(devices):
        raise RuntimeError("In-graph multi-worker training with "
                           "`MirroredStrategy` is not supported in eager mode.")
      else:
        if TFConfigClusterResolver().cluster_spec().as_dict():
          # If you are executing in eager mode, only the single-machine code
          # path is supported.
          logging.info("Initializing local devices since in-graph multi-worker "
                       "training with `MirroredStrategy` is not supported in "
                       "eager mode. TF_CONFIG will be ignored when "
                       "initializing `MirroredStrategy`.")
        devices = devices or all_local_devices()
    else:
      devices = devices or all_devices()

    assert devices, ("Got an empty `devices` list and unable to recognize "
                     "any local devices.")
    self._cross_device_ops = cross_device_ops
    self._communication_options = collective_util.Options()
    self._initialize_strategy(devices)

    # TODO(b/128995245): Enable last partial batch support in graph mode.
    if ops.executing_eagerly_outside_functions():
      self.experimental_enable_get_next_as_optional = True

    # Flag to turn on VariablePolicy.
    self._use_var_policy = False
Example #3
    def testParameterOverrides(self):
        os.environ['TF_CONFIG'] = """
    {
      "cluster": {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
      },
      "rpc_layer": "grpc",
      "task": {
        "type": "ps",
        "index": 1
      }
    }
    """

        cluster_resolver = TFConfigClusterResolver(task_type='ps', task_id=0)

        self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
        self.assertEqual('ps', cluster_resolver.task_type)
        self.assertEqual(0, cluster_resolver.task_id)

        cluster_resolver.task_type = 'worker'
        cluster_resolver.task_id = 1
        cluster_resolver.rpc_layer = 'test'

        self.assertEqual('test://worker1:2222', cluster_resolver.master())
        self.assertEqual('worker', cluster_resolver.task_type)
        self.assertEqual(1, cluster_resolver.task_id)
        self.assertEqual('test', cluster_resolver.rpc_layer)
Example #4
    def testZeroItemsInClusterSpecMasterRead(self):
        os.environ['TF_CONFIG'] = """
    {}
    """

        cluster_resolver = TFConfigClusterResolver()
        self.assertEqual('', cluster_resolver.master())
Example #5
        def task_function(start_events, finish_events):
            cluster_resolver = TFConfigClusterResolver()
            cluster_spec = cluster_resolver.cluster_spec()
            task_type = cluster_resolver.task_type
            task_id = cluster_resolver.task_id
            rpc_layer = cluster_resolver.rpc_layer

            logging.info(
                'Starting server with cluster_spec = %r, task_type = %r, '
                'task_id = %r, rpc_layer = %r', cluster_spec, task_type,
                task_id, rpc_layer)

            # TODO(yuefengz): support GPU clusters.
            server_config = config_pb2.ConfigProto()
            server_config.device_count['GPU'] = 0

            # Set the environment variable to prevent hanging upon job failure and
            # restart. Note that it defaults to 'use_caller' at Google, but defaults
            # to False in OSS.
            os.environ['GRPC_FAIL_FAST'] = 'use_caller'

            server_lib.Server(cluster_spec,
                              job_name=task_type,
                              protocol=rpc_layer,
                              task_index=task_id,
                              config=server_config,
                              start=True)

            start_event = start_events[task_type][task_id]
            start_event.set()

            finish_event = finish_events[task_type][task_id]
            finish_event.wait()

            os._exit(0)  # pylint: disable=protected-access
Example #6
def get_num_workers():
    cluster_resolver = TFConfigClusterResolver()
    cluster_spec = cluster_resolver.cluster_spec().as_dict()
    if cluster_spec:
        task_type = cluster_resolver.task_type
        return int(multi_worker_util.worker_count(cluster_spec, task_type))
    return 1
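
A minimal usage sketch for get_num_workers() above (this sketch is an assumption added here, not part of the original example): with a two-worker TF_CONFIG like the ones shown elsewhere on this page it should report 2, and without TF_CONFIG it falls back to 1.

# Hypothetical usage of get_num_workers(); the TF_CONFIG value below is an
# assumption modeled on the other examples on this page.
import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["worker0:2222", "worker1:2222"]},
    "task": {"type": "worker", "index": 0},
})
print(get_num_workers())  # Expected to print 2 under this TF_CONFIG.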
Example #7
def all_devices():
  devices = []
  tfconfig = TFConfigClusterResolver()
  if tfconfig.cluster_spec().as_dict():
    devices = _cluster_spec_to_device_list(tfconfig.cluster_spec(),
                                           context.num_gpus())
  return devices if devices else all_local_devices()
Example #8
        def task_function(start_events, finish_events):
            cluster_resolver = TFConfigClusterResolver()
            cluster_spec = cluster_resolver.cluster_spec()
            task_type = cluster_resolver.task_type
            task_id = cluster_resolver.task_id
            rpc_layer = cluster_resolver.rpc_layer

            logging.info(
                'Starting server with cluster_spec = %r, task_type = %r, '
                'task_id = %r, rpc_layer = %r', cluster_spec, task_type,
                task_id, rpc_layer)

            # TODO(yuefengz): support GPU clusters.
            server_config = config_pb2.ConfigProto()
            server_config.device_count['GPU'] = 0

            server_lib.Server(cluster_spec,
                              job_name=task_type,
                              protocol=rpc_layer,
                              task_index=task_id,
                              config=server_config,
                              start=True)

            start_event = start_events[task_type][task_id]
            start_event.set()

            finish_event = finish_events[task_type][task_id]
            finish_event.wait()

            os._exit(0)  # pylint: disable=protected-access
Example #9
 def _from_local_devices(
     cls,
     devices,
     communication=cross_device_ops_lib.CollectiveCommunication.AUTO):
   """A convenience method to create an object with a list of devices."""
   obj = cls(communication)
   obj.extended._initialize_local(TFConfigClusterResolver(), devices=devices)  # pylint: disable=protected-access
   return obj
 def __init__(self,
              container_strategy,
              cluster_resolver=TFConfigClusterResolver()):
     distribute_lib.DistributionStrategyExtended.__init__(
         self, container_strategy)
     self._cross_device_ops = None
     self._initialize_strategy(cluster_resolver)
     assert isinstance(self._get_cross_device_ops(),
                       cross_device_ops_lib.CollectiveAllReduce)
  def __init__(self,
               container_strategy,
               cluster_resolver=TFConfigClusterResolver()):
    super(ParameterServerStrategyExtended, self).__init__(container_strategy)
    self._initialize_strategy(cluster_resolver)

    # We typically don't need to do all-reduce in this strategy.
    self._cross_device_ops = (
        cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))
 def __init__(self, container_strategy, communication, cluster_resolver):
     cluster_resolver = cluster_resolver or TFConfigClusterResolver()
     distribute_lib.StrategyExtendedV1.__init__(self, container_strategy)
     assert isinstance(communication,
                       cross_device_ops_lib.CollectiveCommunication)
     self._communication = communication
     self._initialize_strategy(cluster_resolver)
     assert isinstance(self._get_cross_device_ops(),
                       cross_device_ops_lib.CollectiveAllReduce)
Example #13
        def proc_func(functions_scheduled_event, test_finished_event):
            cluster_resolver = TFConfigClusterResolver()
            if cluster_resolver.task_type != "chief":
                utils.start_server(cluster_resolver, "grpc")
            ps_client = parameter_server_client.ParameterServerClient(
                cluster_resolver)
            with ps_client._strategy.scope():
                v = variables.Variable(initial_value=0, dtype=dtypes.int32)

            @def_function.function
            def worker_fn():
                # An ever-running function.
                for _ in math_ops.range(100000):
                    v.assign_add(1)

            # Keep the two workers occupied.
            ps_client.schedule(worker_fn)
            ps_client.schedule(worker_fn)
            # Now the main process can terminate.
            functions_scheduled_event.set()

            # Verify that join and schedule indeed raise
            # ParameterServerFailureError.
            try:
                if test_join:
                    ps_client.join()
                if test_schedule:
                    while ps_client.cluster._closure_queue._error is None:
                        time.sleep(1)
                    ps_client.schedule(worker_fn)
            except client.ParameterServerFailureError:
            # The following verifies that after PS fails, continuing to execute
            # functions on workers should fail and indicate a PS failure.
                for worker_id in range(3):
                    with ops.device(
                            "/job:worker/replica:0/task:{}".format(worker_id)):
                        try:
                            # Executing a function after PS fails should result in a PS
                            # failure.
                            worker_fn()
                        except Exception as e:  # pylint: disable=broad-except
                            if client._is_ps_failure(e):
                                if worker_id < 2:
                                    continue
                                logging.info(
                                    "_test_translate_ps_failure_error ends properly."
                                )
                                # Now we can safely exit the test.
                                test_finished_event.set()
                                return
                        raise RuntimeError(
                            "Executing a function after PS fails should "
                            "result in a PS failure.")

            raise RuntimeError(
                "ParameterServerFailureError was supposed to be raised.")
 def __init__(self, cluster_resolver=None):
     """Initializes this strategy."""
     # The `cluster_resolver` must be set so that
     # `ParameterServerStrategyExtended` will keep num_gpus for `configure`
     # method.
     if cluster_resolver is None:
         cluster_resolver = TFConfigClusterResolver()
     extended = parameter_server_strategy.ParameterServerStrategyExtended(
         self, cluster_resolver=cluster_resolver)
     super(ParameterServerStrategy, self).__init__(extended)
 def __init__(self, container_strategy, communication, cluster_resolver):
     self._cluster_resolver = cluster_resolver or TFConfigClusterResolver()
     distribute_lib.StrategyExtendedV1.__init__(self, container_strategy)
     assert isinstance(communication,
                       cross_device_ops_lib.CollectiveCommunication)
     self._communication = communication
     self._initialize_strategy(self._cluster_resolver)
     self._cfer_fn_cache = weakref.WeakKeyDictionary()
     assert isinstance(self._cross_device_ops,
                       cross_device_ops_lib.CollectiveAllReduce)
 def __init__(self, container_strategy, cluster_resolver,
              communication_options):
     self._cluster_resolver = cluster_resolver or TFConfigClusterResolver()
     distribute_lib.StrategyExtendedV1.__init__(self, container_strategy)
     self._communication_options = communication_options
     self._initialize_strategy(self._cluster_resolver)
     self._cfer_fn_cache = weakref.WeakKeyDictionary()
     self.experimental_enable_get_next_as_optional = True
     assert isinstance(self._cross_device_ops,
                       cross_device_ops_lib.CollectiveAllReduce)
Example #17
    def testOneItemInClusterSpecMasterRead(self):
        os.environ['TF_CONFIG'] = """
    {
      "cluster": {
        "worker": ["worker0:2222"]
      }
    }
    """

        cluster_resolver = TFConfigClusterResolver()
        self.assertEqual('', cluster_resolver.master())
Example #18
def maybe_shard_dataset(dataset):
    """Shard the dataset if running in multi-node environment."""
    cluster_resolver = TFConfigClusterResolver()
    cluster_spec = cluster_resolver.cluster_spec().as_dict()
    if cluster_spec:
        dataset = dataset.shard(
            multi_worker_util.worker_count(cluster_spec,
                                           cluster_resolver.task_type),
            multi_worker_util.id_in_cluster(cluster_spec,
                                            cluster_resolver.task_type,
                                            cluster_resolver.task_id))
    return dataset
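
A short usage sketch for maybe_shard_dataset() (assumed for illustration, not from the original source): the helper is a no-op when TF_CONFIG is unset, so the same code works both locally and in a multi-worker job.

# Hypothetical usage of maybe_shard_dataset(); assumes the tf.data API and a
# TF_CONFIG like the ones in the other examples when run in a cluster.
import tensorflow as tf

dataset = tf.data.Dataset.range(100)
dataset = maybe_shard_dataset(dataset)  # Returned unchanged when TF_CONFIG is unset.
dataset = dataset.batch(8)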
Example #19
 def __init__(self, container_strategy, num_gpus_per_worker):
   # Use TFConfigClusterResolver to parse TF_CONFIG. We don't want to change
   # the constructor's interface to allow a customized cluster resolver. Use
   # SimpleClusterResolver to override num_accelerators.
   tfconfig = TFConfigClusterResolver()
   cluster_resolver = SimpleClusterResolver(
       cluster_spec=tfconfig.cluster_spec(),
       task_type=tfconfig.task_type,
       task_id=tfconfig.task_id,
       num_accelerators=num_gpus_per_worker)
   super(CollectiveAllReduceExtended, self).__init__(
       container_strategy, cluster_resolver=cluster_resolver)
 def testTaskIndexOverride(self):
     os.environ['TF_CONFIG'] = """
 {
   "cluster": {
     "worker": ["worker0:2222", "worker1:2222"]
   },
   "task": {
     "type": "worker",
     "index": "0"
   }
 }
 """
     cluster_resolver = TFConfigClusterResolver(task_id=1)
     self.assertEqual(1, cluster_resolver.task_id)
Example #21
def batch_and_maybe_shard_dataset(dataset, global_batch_size):
    """Shard the dataset if running in multi-node environment."""

    cluster_resolver = TFConfigClusterResolver()
    cluster_spec = cluster_resolver.cluster_spec().as_dict()
    if cluster_spec:
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        num_workers = int(
            multi_worker_util.worker_count(cluster_spec, task_type))
        id_in_cluster = int(
            multi_worker_util.id_in_cluster(cluster_spec, task_type, task_id))
        dataset = dataset.shard(num_workers, id_in_cluster)
    return dataset.batch(global_batch_size)
Example #22
  def __init__(self, cluster_resolver=None):
    """Initializes this strategy with an optional `cluster_resolver`.

    Args:
      cluster_resolver: Optional
        `tf.distribute.cluster_resolver.ClusterResolver` object. Defaults to a
        `tf.distribute.cluster_resolver.TFConfigClusterResolver`.
    """
    if cluster_resolver is None:
      cluster_resolver = TFConfigClusterResolver()
    super(ParameterServerStrategyV1, self).__init__(
        ParameterServerStrategyExtended(
            self, cluster_resolver=cluster_resolver))
    distribute_lib.distribution_strategy_gauge.get_cell("V1").set(
        "ParameterServerStrategy")
Example #23
  def __init__(self, cluster_resolver=None):
    """Initializes this strategy.

    Args:
      cluster_resolver: Optional
        `tf.distribute.cluster_resolver.ClusterResolver` object. Defaults to a
        `tf.distribute.cluster_resolver.TFConfigClusterResolver`.
    """
    if cluster_resolver is None:
      cluster_resolver = TFConfigClusterResolver()
    if not cluster_resolver.cluster_spec():
      raise ValueError("Cluster spec must be non-empty in `cluster_resolver`.")
    extended = ParameterServerStrategyExtended(
        self, cluster_resolver=cluster_resolver)
    super(ParameterServerStrategy, self).__init__(extended)
Example #24
    def testSpecifiedTaskTypeAndIndexMasterRead(self):
        os.environ['TF_CONFIG'] = """
    {
      "cluster": {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
      },
      "task": {
        "type": "ps",
        "index": 0
      }
    }
    """

        cluster_resolver = TFConfigClusterResolver()
        self.assertEqual('worker1:2222', cluster_resolver.master('worker', 1))
 def testTaskTypeCastToString(self):
   os.environ['TF_CONFIG'] = """
   {
     "cluster": {
       "123456": ["ps0:2222", "ps1:2222"],
       "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     },
     "rpc_layer": "grpc",
     "task": {
       "type": 123456,
       "index": 0
     }
   }
   """
   cluster_resolver = TFConfigClusterResolver()
   self.assertEqual('123456', cluster_resolver.task_type)
 def testTaskIndexCastToInteger(self):
   os.environ['TF_CONFIG'] = """
   {
     "cluster": {
       "ps": ["ps0:2222", "ps1:2222"],
       "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     },
     "rpc_layer": "grpc",
     "task": {
       "type": "ps",
       "index": "1"
     }
   }
   """
   cluster_resolver = TFConfigClusterResolver()
   self.assertEqual(1, cluster_resolver.task_id)
Example #27
    def testAutomaticMasterRead(self):
        os.environ['TF_CONFIG'] = """
    {
      "cluster": {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
      },
      "task": {
        "type": "ps",
        "index": 0
      }
    }
    """

        cluster_resolver = TFConfigClusterResolver()
        self.assertEqual('ps0:2222', cluster_resolver.master())
Example #28
 def __init__(self, container_strategy, cluster_resolver,
              communication_options):
     if not isinstance(communication_options, collective_util.Options):
         raise ValueError("communication_options must be an instance of "
                          "tf.distribute.experimental.CommunicationOptions")
     self._cluster_resolver = cluster_resolver or TFConfigClusterResolver()
     if not isinstance(self._cluster_resolver, ClusterResolver):
         raise ValueError("cluster_resolver must be an instance of "
                          "tf.distribute.cluster_resolver.ClusterResolver")
     distribute_lib.StrategyExtendedV1.__init__(self, container_strategy)
     self._communication_options = communication_options
     self._collective_key_base = container_strategy._collective_key_base  # pylint: disable=protected-access
     self._initialize_strategy(self._cluster_resolver)
     self._cfer_fn_cache = weakref.WeakKeyDictionary()
     self.experimental_enable_get_next_as_optional = True
     assert isinstance(self._cross_device_ops,
                       cross_device_ops_lib.CollectiveAllReduce)
  def __init__(self, cluster_resolver=None):
    """Initializes this strategy with an optional `cluster_resolver`.

    Args:
      cluster_resolver: Optional
        `tf.distribute.cluster_resolver.ClusterResolver` object. Defaults to a
        `tf.distribute.cluster_resolver.TFConfigClusterResolver`.
    """
    if cluster_resolver is None:
      cluster_resolver = TFConfigClusterResolver()
    if not cluster_resolver.cluster_spec():
      raise ValueError("Cluster spec must be non-empty in `cluster_resolver`.")
    extended = ParameterServerStrategyExtended(
        self, cluster_resolver=cluster_resolver)
    super(ParameterServerStrategy, self).__init__(extended)
    distribute_lib.distribution_strategy_gauge.get_cell("V2").set(
        "ParameterServerStrategy")
    distribute_lib.distribution_strategy_replica_gauge.get_cell("num_ps").set(
        len(self.extended.parameter_devices))
Example #30
    def testTaskTypeIndexRpcRead(self):
        os.environ['TF_CONFIG'] = """
    {
      "cluster": {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
      },
      "rpc_layer": "grpc",
      "task": {
        "type": "ps",
        "index": 0
      }
    }
    """

        cluster_resolver = TFConfigClusterResolver()
        self.assertEqual('ps', cluster_resolver.task_type)
        self.assertEqual(0, cluster_resolver.task_id)
        self.assertEqual('grpc', cluster_resolver.rpc_layer)