Example #1
    def test_dataset_creator_usage_in_parameter_server_model_fit(self):
        cluster_def = multi_worker_test_base.create_in_process_cluster(
            num_workers=2, num_ps=1, rpc_layer="grpc")
        cluster_def["chief"] = [
            "localhost:%d" % multi_worker_test_base.pick_unused_port()
        ]
        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc"))
        with strategy.scope():
            model = sequential.Sequential([core_layers.Dense(10)])
        model.compile(gradient_descent.SGD(), loss="mse")

        def dataset_fn(input_context):
            global_batch_size = 64
            batch_size = input_context.get_per_replica_batch_size(
                global_batch_size)
            dataset = dataset_ops.DatasetV2.from_tensors(([1.], [1.])).repeat()
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)
            dataset = dataset.batch(batch_size)
            dataset = dataset.prefetch(2)
            return dataset

        history = model.fit(dataset_creator.DatasetCreator(dataset_fn),
                            epochs=10,
                            steps_per_epoch=10,
                            verbose=0)
        self.assertLen(history.history["loss"], 10)
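
A quick way to sanity-check the dataset_fn above outside of model.fit is to build a tf.distribute.InputContext by hand and call an equivalent function written against the public tf.data API. The sketch below is illustrative only; the pipeline count and the assumption of 2 in-sync replicas are not values the test itself uses.

import tensorflow as tf

# Standalone sketch of the sharding/batching logic from Example #1.
def dataset_fn(input_context):
    global_batch_size = 64
    batch_size = input_context.get_per_replica_batch_size(global_batch_size)
    dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat()
    dataset = dataset.shard(input_context.num_input_pipelines,
                            input_context.input_pipeline_id)
    return dataset.batch(batch_size).prefetch(2)

# Hypothetical context: 2 input pipelines, 2 replicas in sync.
ctx = tf.distribute.InputContext(
    num_input_pipelines=2, input_pipeline_id=0, num_replicas_in_sync=2)
features, labels = next(iter(dataset_fn(ctx)))
print(features.shape)  # (32, 1): the per-replica batch is 64 // 2 examples
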
Example #2
def make_parameter_server_cluster(num_workers, num_ps):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  cluster_def["chief"] = [
      "localhost:%d" % multi_worker_test_base.pick_unused_port()
  ]
  return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")
Example #3
    def testEagerCustomTrainingUnimplementedError(self):
        cluster_spec = multi_worker_test_base.create_in_process_cluster(
            num_workers=3, num_ps=2)
        cluster_resolver = SimpleClusterResolver(
            cluster_spec=multi_worker_util.normalize_cluster_spec(
                cluster_spec),
            task_type='worker',
            task_id=1,
            num_accelerators={'GPU': 0})
        strategy = parameter_server_strategy.ParameterServerStrategy(
            cluster_resolver)
        dataset = dataset_ops.DatasetV2.from_tensor_slices([5., 6., 7., 8.])

        def train_step(data):
            return math_ops.square(data)

        self.assertRaisesRegex(NotImplementedError, 'ParameterServerStrategy*',
                               strategy.experimental_distribute_dataset,
                               dataset.batch(2))

        self.assertRaisesRegex(
            NotImplementedError, 'ParameterServerStrategy*',
            strategy.experimental_distribute_datasets_from_function,
            lambda _: dataset)

        self.assertRaisesRegex(NotImplementedError, 'ParameterServerStrategy*',
                               strategy.scope)

        self.assertRaisesRegex(NotImplementedError, 'ParameterServerStrategy*',
                               strategy.run, train_step)
Example #4
 def setUpClass(cls):
     super(ParameterServerStrategyV2Test, cls).setUpClass()
     cluster_def = multi_worker_test_base.create_in_process_cluster(
         num_workers=2, num_ps=3)
     cls.cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def))
     remote.connect_to_cluster(cls.cluster_resolver.cluster_spec(),
                               job_name="chief")
def make_coordinator(num_workers, num_ps):
    cluster_def = multi_worker_test_base.create_in_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
    cluster_def["chief"] = [
        "localhost:%d" % multi_worker_test_base.pick_unused_port()
    ]
    cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def),
                                             rpc_layer="grpc")
    return tf.distribute.experimental.coordinator.ClusterCoordinator(
        tf.distribute.experimental.ParameterServerStrategy(cluster_resolver))
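
As a rough illustration of how the returned coordinator is typically driven, the sketch below schedules a function onto the workers; the variable, worker_fn, and the worker/ps counts are invented for the example and are not part of the snippet above.

import tensorflow as tf

coordinator = make_coordinator(num_workers=2, num_ps=1)
strategy = coordinator.strategy

with strategy.scope():
    counter = tf.Variable(0.0)  # placed on a parameter server

@tf.function
def worker_fn():
    counter.assign_add(1.0)
    return counter.read_value()

# schedule() dispatches the function to any available worker and returns a
# RemoteValue; join() blocks until every scheduled closure has finished.
result = coordinator.schedule(worker_fn)
coordinator.join()
print(result.fetch())
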
Example #6
    def _create_parameter_server():
        if framework_test_util.is_xla_enabled():
            # To address test failures when running XLA with MultiProcessRunner,
            # continue to use an in-process cluster for XLA tests.
            cluster_def = multi_worker_test_base.create_in_process_cluster(
                num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
            resolver = cluster_resolver.SimpleClusterResolver(
                server_lib.ClusterSpec(cluster_def),
                num_accelerators={"GPU": required_gpus},
                rpc_layer="grpc")
            return _create_ps_strategy(resolver, variable_partitioner)
        else:
            tf_config = cluster_resolver.TFConfigClusterResolver()
            cluster_def = tf_config.cluster_spec().as_dict()
            if not cluster_def:
                # When a MultiProcessRunner cluster is used, the cluster is not
                # created at the time the decorator is called. When the test
                # runs, this method is first invoked via the decorator, before
                # the MultiProcessRunner with worker and ps tasks is set up in
                # combinations.py. After that setup is done, each subprocess
                # invokes this method again to get the strategy object.
                # Returning None when the main thread calls this method before
                # the cluster exists is fine, since the main thread only
                # proceeds to create the MultiProcessRunner and run the
                # decorated tests inside subprocesses.
                return None
            # MultiProcessRunner is already set up and this method is invoked from a
            # subprocess running the actual test.
            resolver = cluster_resolver.SimpleClusterResolver(
                server_lib.ClusterSpec(cluster_def),
                num_accelerators={"GPU": required_gpus},
                task_type=tf_config.task_type,
                task_id=tf_config.task_id,
                environment=tf_config.environment,
                rpc_layer=tf_config.rpc_layer or "grpc")
            if tf_config.task_type in ("worker", "ps"):
                worker_config = config_pb2.ConfigProto()
                worker_config.inter_op_parallelism_threads = 4  # max num_workers + 1

                try:
                    server = server_lib.Server(cluster_def,
                                               job_name=tf_config.task_type,
                                               task_index=tf_config.task_id,
                                               protocol="grpc",
                                               config=worker_config)
                except errors.UnknownError as e:
                    if "Could not start gRPC server" in e.message:
                        raise unittest.SkipTest("Cannot start std servers.")
                    else:
                        raise

                # Blocking the process that starts a server from exiting.
                server.join()

            return _create_ps_strategy(resolver, variable_partitioner)
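
The comments in this example describe the split between the chief process (which builds the strategy) and the worker/ps processes (which start a server and block). A minimal sketch of the same dispatch using only public tf.distribute names, assuming each task has TF_CONFIG set, might look like the following; it is not the code the test framework actually runs.

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

if resolver.task_type in ("worker", "ps"):
    # Worker and ps tasks start a server and block; the chief drives them.
    server = tf.distribute.Server(
        resolver.cluster_spec(),
        job_name=resolver.task_type,
        task_index=resolver.task_id,
        protocol=resolver.rpc_layer or "grpc")
    server.join()
else:
    # The chief task builds the parameter server strategy object.
    strategy = tf.distribute.experimental.ParameterServerStrategy(resolver)
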
Example #7
def make_client(num_workers, num_ps):
    cluster_def = multi_worker_test_base.create_in_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
    cluster_def["chief"] = [
        "localhost:%d" % multi_worker_test_base.pick_unused_port()
    ]
    cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def),
                                             rpc_layer="grpc")
    return client_lib.Client(
        parameter_server_strategy_v2.ParameterServerStrategyV2(
            cluster_resolver))
Example #8
def make_coordinator(num_workers, num_ps):
  # TODO(rchao): Test the internal rpc_layer version.
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer='grpc')
  cluster_def['chief'] = [
      'localhost:%d' % multi_worker_test_base.pick_unused_port()
  ]
  cluster_resolver = SimpleClusterResolver(
      ClusterSpec(cluster_def), rpc_layer='grpc')
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      cluster_resolver)
  return coordinator_lib.ClusterCoordinator(strategy)
Example #9
    def _create_parameter_server():

        cluster_def = multi_worker_test_base.create_in_process_cluster(
            num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
        resolver = cluster_resolver.SimpleClusterResolver(
            ClusterSpec(cluster_def),
            num_accelerators={"GPU": required_gpus},
            rpc_layer="grpc")
        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            resolver,
            variable_partitioner=sharded_variable.FixedShardsPartitioner(2))
        return strategy
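
The FixedShardsPartitioner passed to the strategy also exists under the public API, and its behaviour is easy to inspect in isolation. The shape and shard count below are purely illustrative.

import tensorflow as tf

partitioner = tf.distribute.experimental.partitioners.FixedShardsPartitioner(
    num_shards=2)

# The partitioner maps a variable's shape and dtype to a shard count per axis.
print(partitioner(tf.TensorShape([10, 3]), tf.float32))  # [2, 1]

Under ParameterServerStrategyV2, variables created inside strategy.scope() are then split into that many shards, which are placed on different parameter servers.
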
Example #10
def get_cluster_def(num_workers, num_ps):
  if num_workers > MAX_NUM_WORKER or num_ps > MAX_NUM_PS:
    raise ValueError("Requesting more servers than the maximum, adjust"
                     "MAX_NUM_PS and MAX_NUM_WORKER")
  global _cluster
  if _cluster is None:
    _cluster = multi_worker_test_base.create_in_process_cluster(
        num_workers=MAX_NUM_WORKER, num_ps=MAX_NUM_PS)
  return {
      "worker": _cluster["worker"][:num_workers],
      "ps": _cluster["ps"][:num_ps],
  }
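
A hypothetical caller would wire the truncated cluster definition returned by get_cluster_def into a resolver and strategy the same way the earlier examples do; the chief entry and the port helper follow the pattern from Example #1, and multi_worker_test_base is assumed to be imported as in the snippets above.

import tensorflow as tf

cluster_def = get_cluster_def(num_workers=2, num_ps=1)
cluster_def["chief"] = [
    "localhost:%d" % multi_worker_test_base.pick_unused_port()
]
resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    tf.train.ClusterSpec(cluster_def), rpc_layer="grpc")
strategy = tf.distribute.experimental.ParameterServerStrategy(resolver)
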
Example #11
def get_cluster_def(cluster_params, num_workers, num_ps):
    if (num_workers > cluster_params.max_num_worker
            or num_ps > cluster_params.max_num_ps):
        raise ValueError("Requesting more servers than the maximum, adjust"
                         "cluster params' max_num_ps and max_num_worker")
    if cluster_params.cluster is None:
        cluster_params.cluster = multi_worker_test_base.create_in_process_cluster(
            num_workers=cluster_params.max_num_worker,
            num_ps=cluster_params.max_num_ps)
    return {
        "worker": cluster_params.cluster["worker"][:num_workers],
        "ps": cluster_params.cluster["ps"][:num_ps],
    }
Example #12
    def test_dataset_creator_usage_in_parameter_server_model_fit(self):
        cluster_def = multi_worker_test_base.create_in_process_cluster(
            num_workers=2, num_ps=1, rpc_layer="grpc")
        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc"))
        with strategy.scope():
            model = sequential.Sequential([core_layers.Dense(10)])
        model.compile(gradient_descent.SGD(), loss="mse")

        history = model.fit(dataset_creator.DatasetCreator(
            self._get_dataset_fn()),
                            epochs=10,
                            steps_per_epoch=10,
                            verbose=0)
        self.assertLen(history.history["loss"], 10)
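
Example #12 hides the input pipeline behind self._get_dataset_fn(). A sketch of what such a helper and the public DatasetCreator wrapper could look like is shown below; the batch size and dataset contents mirror Example #1 and are assumptions, not the helper's actual body.

import tensorflow as tf

def _get_dataset_fn():
    def dataset_fn(input_context):
        batch_size = input_context.get_per_replica_batch_size(64)
        dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat()
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)
        return dataset.batch(batch_size).prefetch(2)
    return dataset_fn

# The creator defers building the dataset until each worker supplies its own
# tf.distribute.InputContext during model.fit.
creator = tf.keras.utils.experimental.DatasetCreator(_get_dataset_fn())
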
Example #13
    def testClientMetrics(self):
        if sys.version_info >= (3, 8) and platform.system() == 'Windows':
            # TODO(b/165013260): Fix this
            self.skipTest(
                'Test is currently broken on Windows with Python 3.8')

        metric_utils.enable_metrics = True

        cluster_def = multi_worker_test_base.create_in_process_cluster(
            num_workers=1, num_ps=1, rpc_layer=self.get_rpc_layer())
        cluster_def['chief'] = [
            'localhost:%d' % multi_worker_test_base.pick_unused_port()
        ]
        cluster_resolver = SimpleClusterResolver(
            ClusterSpec(cluster_def), rpc_layer=self.get_rpc_layer())
        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            cluster_resolver)
        cluster = client.Cluster(strategy)

        @def_function.function
        def func():
            time.sleep(0.5)
            return 3

        result = cluster.schedule(func, args=None, kwargs=None)
        result = cluster.schedule(func, args=None, kwargs=None)
        cluster.join()
        self.assertEqual(result._get_value().numpy(), 3)

        # Tracing should happen exactly once, while closure execution and
        # remote_value fetching happen once per scheduled call (twice here).
        metric_tracing = metric_utils.get_metric_summary('function_tracing')
        self.assertEqual(metric_tracing['num'], 1)
        # Tracing time should be longer than the sleep time in Python function.
        self.assertGreater(metric_tracing['sum'], 0.5)
        metric_closure = metric_utils.get_metric_summary('closure_execution')
        self.assertEqual(metric_closure['num'], 2)
        metric_remote_value = metric_utils.get_metric_summary(
            'remote_value_fetch')
        self.assertEqual(metric_remote_value['num'], 2)
Example #14
def make_parameter_server_cluster(num_workers, num_ps):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")
Example #15
 def setUpClass(cls):
     super().setUpClass()
     cluster_def = multi_worker_test_base.create_in_process_cluster(
         num_workers=2, num_ps=2)
     cls.cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
         tf.train.ClusterSpec(cluster_def))
Example #16
 def setUpClass(cls):
   """Create a local cluster with 3 workers."""
   cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
       num_workers=NUM_WORKERS, num_ps=0)
Example #17
 def setUpClass(cls):
   """Create a local cluster with 3 workers and 1 chief."""
   cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
       num_workers=3, num_ps=0, has_chief=True)
Example #18
 def setUpClass(cls):
   """Create a local cluster with 2 workers."""
   super(KerasMultiWorkerTestStandaloneClient, cls).setUpClass()
   cls._cluster_spec = test_base.create_in_process_cluster(
       num_workers=2, num_ps=1, has_eval=False)
Example #19
 def _get_parameter_server_strategy(self):
     cluster_def = multi_worker_test_base.create_in_process_cluster(
         num_workers=2, num_ps=1, rpc_layer="grpc")
     return tf.distribute.experimental.ParameterServerStrategy(
         SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc"))
Example #20
 def setUpClass(cls):
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=2, has_chief=True)
     cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]
Example #21
 def setUpClass(cls):
   cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
       num_workers=3, num_ps=2)
   cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
Example #22
 def setUpClass(cls):
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=2)
     cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
Example #23
 def setUpClass(cls):
   cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
       num_workers=3, num_ps=2, has_chief=True)
   cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]
Example #24
 def setUpClass(cls):
   """Create a local cluster with 2 workers."""
   super(DistributeCoordinatorIntegrationTest, cls).setUpClass()
   cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
       num_workers=3, num_ps=2, has_eval=True)
Example #25
 def setUpClass(cls):
   super(VariablePartitioningTest, cls).setUpClass()
   cluster_def = multi_worker_test_base.create_in_process_cluster(
       num_workers=2, num_ps=2)
   cls.cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def))
Example #26
 def setUpClass(cls):
     """Create a local cluster with 2 workers."""
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=2, has_eval=True)
Example #27
 def setUpClass(cls):
     """Create a local cluster with 3 workers and 1 chief."""
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=0, has_chief=True)
Example #28
 def setUpClass(cls):
   """Create a local cluster with 2 workers and 1 chief."""
   cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
       num_workers=2, num_ps=0, has_chief=True)
   cls._default_target = "grpc://" + cls._cluster_spec["chief"][0]
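
The grpc:// target built here is what a TF1-style session in the test would connect to. The sketch below shows that use in isolation; the target string stands in for cls._default_target and the constant op is purely illustrative.

import tensorflow as tf

target = "grpc://localhost:12345"  # stands in for cls._default_target

graph = tf.Graph()
with graph.as_default():
    total = tf.constant(1.0) + tf.constant(2.0)

# Run the graph against the in-process chief via its gRPC target.
with tf.compat.v1.Session(target=target, graph=graph) as sess:
    print(sess.run(total))  # 3.0
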
Example #29
 def setUpClass(cls):
   """Create a local cluster with 2 workers."""
   super(DistributeCoordinatorIntegrationTest, cls).setUpClass()
   cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
       num_workers=3, num_ps=2, has_eval=True)
Example #30
 def setUpClass(cls):
   super(ParameterServerStrategyV2Test, cls).setUpClass()
   cluster_def = multi_worker_test_base.create_in_process_cluster(
       num_workers=2, num_ps=3)
   cls.cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def))
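
Unlike Example #4, this last setUpClass variant does not call remote.connect_to_cluster. A hedged sketch of the public-API call corresponding to that step is shown below; it would sit at the end of setUpClass, after cls.cluster_resolver is created.

import tensorflow as tf

# Register the current (test) process as the "chief" of the in-process
# cluster so that eager ops and functions can be placed on its workers/ps.
tf.config.experimental_connect_to_cluster(
    cls.cluster_resolver.cluster_spec(), job_name="chief")
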