    def testRetainSparseJobWithNoMerging(self):
        base_cluster_spec = server_lib.ClusterSpec({
            "worker": {
                1: "worker0:2222",
                3: "worker1:2222",
                5: "worker2:2222"
            }
        })

        base_cluster_resolver = SimpleClusterResolver(base_cluster_spec)
        union_cluster = UnionClusterResolver(base_cluster_resolver)
        cluster_spec = union_cluster.cluster_spec()

        expected_proto = """
    job { name: 'worker' tasks { key: 1 value: 'worker0:2222' }
                         tasks { key: 3 value: 'worker1:2222' }
                         tasks { key: 5 value: 'worker2:2222' } }
    """
        self._verifyClusterSpecEquality(cluster_spec, expected_proto)
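
As a quick reference, here is a minimal sketch, using only the ClusterSpec accessors exercised in Example #3 below, of how a sparse job like the one above can be built and queried directly:

from tensorflow.python.training import server_lib

# Task indices may be sparse; indices 0, 2 and 4 simply have no task.
sparse_spec = server_lib.ClusterSpec({
    "worker": {
        1: "worker0:2222",
        3: "worker1:2222",
        5: "worker2:2222"
    }
})

assert sparse_spec.num_tasks("worker") == 3
assert sparse_spec.task_indices("worker") == [1, 3, 5]
assert sparse_spec.task_address("worker", 3) == "worker1:2222"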
Example #2
    def test_update_config_proto(self):
        resolver = get_tpu_cluster_resolver()
        remote.connect_to_cluster(resolver)
        tpu_strategy_util.initialize_tpu_system(resolver)
        strategy = tpu_lib.TPUStrategyV2(resolver)

        config_proto = config_pb2.ConfigProto()
        cluster_spec = server_lib.ClusterSpec({"worker": ["fake1", "fake2"]})
        with test.mock.patch.object(resolver,
                                    "cluster_spec",
                                    return_value=cluster_spec):
            new_config = strategy.update_config_proto(config_proto)

        # Verify cluster_def.
        self.assertProtoEquals(cluster_spec.as_cluster_def(),
                               new_config.cluster_def)

        # Verify isolate_session_state
        self.assertTrue(new_config.isolate_session_state)
Example #3
    def testClusterSpecAccessors(self):
        original_dict = {
            "ps": ["ps0:2222", "ps1:2222"],
            "worker": ["worker0:2222", "worker1:2222", "worker2:2222"],
            "sparse": {
                0: "sparse0:2222",
                3: "sparse3:2222"
            }
        }
        cluster_spec = server_lib.ClusterSpec(original_dict)

        self.assertEqual(original_dict, cluster_spec.as_dict())

        self.assertEqual(2, cluster_spec.num_tasks("ps"))
        self.assertEqual(3, cluster_spec.num_tasks("worker"))
        self.assertEqual(2, cluster_spec.num_tasks("sparse"))
        with self.assertRaises(ValueError):
            cluster_spec.num_tasks("unknown")

        self.assertEqual("ps0:2222", cluster_spec.task_address("ps", 0))
        self.assertEqual("sparse0:2222",
                         cluster_spec.task_address("sparse", 0))
        with self.assertRaises(ValueError):
            cluster_spec.task_address("unknown", 0)
        with self.assertRaises(ValueError):
            cluster_spec.task_address("sparse", 2)

        self.assertEqual([0, 1], cluster_spec.task_indices("ps"))
        self.assertEqual([0, 1, 2], cluster_spec.task_indices("worker"))
        self.assertEqual([0, 3], cluster_spec.task_indices("sparse"))
        with self.assertRaises(ValueError):
            cluster_spec.task_indices("unknown")

        # NOTE(mrry): `ClusterSpec.job_tasks()` is not recommended for use
        # with sparse jobs.
        self.assertEqual(["ps0:2222", "ps1:2222"],
                         cluster_spec.job_tasks("ps"))
        self.assertEqual(["worker0:2222", "worker1:2222", "worker2:2222"],
                         cluster_spec.job_tasks("worker"))
        self.assertEqual(["sparse0:2222", None, None, "sparse3:2222"],
                         cluster_spec.job_tasks("sparse"))
        with self.assertRaises(ValueError):
            cluster_spec.job_tasks("unknown")
Example #4
    def testPS2TasksWithDevice(self):
        cluster_spec = server_lib.ClusterSpec({
            "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
            "moon": ["moon0:2222", "moon1:2222"]
        })

        with ops.device(
                device_setter.replica_device_setter(
                    ps_device="/job:moon",
                    worker_device="/job:sun",
                    cluster=cluster_spec.as_cluster_def())):
            v = variables.Variable([1, 2])
            w = variables.Variable([2, 1])
            a = v + w
            self.assertDeviceEqual("/job:moon/task:0", v.device)
            self.assertDeviceEqual("/job:moon/task:0", v.initializer.device)
            self.assertDeviceEqual("/job:moon/task:1", w.device)
            self.assertDeviceEqual("/job:moon/task:1", w.initializer.device)
            self.assertDeviceEqual("/job:sun", a.device)
Example #5
  def cluster_spec(self):
    """Returns a ClusterSpec object based on the latest info from Kubernetes.

    We retrieve the information from the Kubernetes master every time this
    method is called.

    Returns:
      A ClusterSpec containing host information returned from Kubernetes.

    Raises:
      RuntimeError: If any of the pods returned by the master is not in the
        `Running` phase.
    """
    if self._override_client:
      client = self._override_client
    else:
      from kubernetes import config as k8sconfig  # pylint: disable=g-import-not-at-top
      from kubernetes import client as k8sclient  # pylint: disable=g-import-not-at-top

      k8sconfig.load_kube_config()
      client = k8sclient.CoreV1Api()

    cluster_map = {}

    for tf_job in self._job_to_label_mapping:
      all_pods = []
      for selector in self._job_to_label_mapping[tf_job]:
        ret = client.list_pod_for_all_namespaces(label_selector=selector)
        selected_pods = []

        # Sort the list by name so the ordering doesn't change from call to call.
        for pod in sorted(ret.items, key=lambda x: x.metadata.name):
          if pod.status.phase == 'Running':
            selected_pods.append(
                '%s:%s' % (pod.status.host_ip, self._tf_server_port))
          else:
            raise RuntimeError('Pod "%s" is not running; phase: "%s"' %
                               (pod.metadata.name, pod.status.phase))
        all_pods.extend(selected_pods)
      cluster_map[tf_job] = all_pods

    return server_lib.ClusterSpec(cluster_map)
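
For context, a hedged usage sketch of the resolver this method belongs to; the constructor argument names (job_to_label_mapping, tf_server_port) are assumptions inferred from the attributes referenced above:

import tensorflow as tf

# Assumed constructor: map TF job names to Kubernetes label selectors; pods
# matching a selector become tasks of that job.
resolver = tf.distribute.cluster_resolver.KubernetesClusterResolver(
    job_to_label_mapping={"worker": ["job-name=tf-worker"]},
    tf_server_port=8470)

# Queries the Kubernetes master; raises RuntimeError if any matched pod is
# not in the `Running` phase.
print(resolver.cluster_spec().as_dict())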
Example #6
    def testOverrideSimpleClusterResolver(self):
        base_cluster_spec = server_lib.ClusterSpec({
            "ps": ["ps0:2222", "ps1:2222"],
            "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
        })

        simple_resolver = SimpleClusterResolver(base_cluster_spec,
                                                task_type="ps",
                                                task_index=1,
                                                environment="cloud",
                                                num_accelerators=8,
                                                rpc_layer="grpc")

        simple_resolver.task_type = "worker"
        simple_resolver.task_index = 2
        simple_resolver.rpc_layer = "http"

        self.assertEqual(simple_resolver.task_type, "worker")
        self.assertEqual(simple_resolver.task_index, 2)
        self.assertEqual(simple_resolver.rpc_layer, "http")
Example #7
  def testPS2TasksWithClusterSpecClass(self):
    cluster_spec = server_lib.ClusterSpec({
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
    })
    ea_coustom = ElasticAverageCustomGetter(
        worker_device="/job:worker/task:0")
    from tensorflow.python.training import device_setter
    with ops.device(
        device_setter.replica_device_setter(cluster=cluster_spec,
                                            worker_device="/job:worker/task:0",
                                            ps_device="/job:ps")), \
         variable_scope.variable_scope("", custom_getter=ea_coustom):
      v = variable_scope.get_variable(initializer=[1, 2], name="v")
      w = variable_scope.get_variable(initializer=[2, 1], name="w")
      v_g, w_g = ea_coustom._global_map[v], ea_coustom._global_map[w]
      self.assertDeviceEqual("/job:worker/task:0", v.device)
      self.assertDeviceEqual("job:ps/task:0", v_g.device)
      self.assertDeviceEqual("/job:worker/task:0", w.device)
      self.assertDeviceEqual("job:ps/task:1", w_g.device)
Example #8
def create_multi_process_cluster(num_workers,
                                 num_ps,
                                 has_chief=False,
                                 has_eval=False,
                                 rpc_layer='grpc',
                                 stream_output=False,
                                 collective_leader=None):
  cluster_spec = create_cluster_spec(
      has_chief=has_chief,
      num_workers=num_workers,
      num_ps=num_ps,
      has_eval=has_eval)

  cluster = MultiProcessCluster(
      SimpleClusterResolver(
          server_lib.ClusterSpec(cluster_spec), rpc_layer=rpc_layer),
      stream_output=stream_output,
      collective_leader=collective_leader)
  cluster.start()
  return cluster
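
A brief usage sketch for the helper above; the cluster_resolver attribute is used the same way in Example #10 below, and stop() is assumed to be the matching teardown call:

cluster = create_multi_process_cluster(num_workers=2, num_ps=1, rpc_layer='grpc')
print(cluster.cluster_resolver.cluster_spec().as_dict())
cluster.stop()  # assumed teardown counterpart to start()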
Example #9
    def benchmarkPartitionedVariables(self):
        def _input_fn():
            features = {
                'language':
                sparse_tensor.SparseTensor(values=('en', 'fr', 'zh'),
                                           indices=((0, 0), (0, 1), (2, 0)),
                                           dense_shape=(3, 2))
            }
            labels = constant_op.constant(((1, ), (0, ), (0, )))
            return features, labels

        # The given hash_bucket_size results in variables larger than the
        # default min_slice_size attribute, so the variables are partitioned.
        sparse_feature = feature_column.sparse_column_with_hash_bucket(
            'language', hash_bucket_size=2e7)
        embedding_feature = feature_column.embedding_column(sparse_feature,
                                                            dimension=1)

        tf_config = {
            'cluster': {
                run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
            }
        }
        with test.mock.patch.dict('os.environ',
                                  {'TF_CONFIG': json.dumps(tf_config)}):
            config = run_config.RunConfig()
            # Because we did not start a distributed cluster, we need to pass an
            # empty ClusterSpec, otherwise the device_setter will look for
            # distributed jobs, such as "/job:ps" which are not present.
            config._cluster_spec = server_lib.ClusterSpec({})

        classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=(sparse_feature, ),
            dnn_feature_columns=(embedding_feature, ),
            dnn_hidden_units=(3, 3),
            config=config)

        metrics = classifier.fit(input_fn=_input_fn,
                                 steps=_ITERS).evaluate(input_fn=_input_fn,
                                                        steps=100)
        self._assertCommonMetrics(metrics)
Example #10
  def setUp(self, num_workers, num_ps):
    super(BaseFaultToleranceTest, self).setUp()

    self._cluster = multi_worker_test_base.create_multi_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
    self._cluster_def = self._cluster.cluster_resolver.cluster_spec().as_dict()
    self._cluster_def["chief"] = [
        "localhost:%d" % multi_worker_test_base.pick_unused_port()
    ]
    cluster_resolver = SimpleClusterResolver(
        server_lib.ClusterSpec(self._cluster_def), rpc_layer="grpc")

    # The strategy's constructor would connect to the cluster.
    self.strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
        cluster_resolver)
    self.cluster_coord = cluster_coordinator.ClusterCoordinator(self.strategy)

    self.thread_coord = thread_coordinator.Coordinator(
        clean_stop_exception_types=[])
    self.num_workers = num_workers
    self.num_ps = num_ps
Example #11
def normalize_cluster_spec(cluster_spec):
    """Makes `cluster_spec` into a `ClusterSpec` object.

  Args:
    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
      cluster configurations.

  Returns:
    a `ClusterSpec` object.

  Raises:
    ValueError: if `cluster_spec` is not a dict or a `ClusterSpec` or a
      `ClusterDef`.
  """
    if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
        return server_lib.ClusterSpec(cluster_spec)
    elif not isinstance(cluster_spec, server_lib.ClusterSpec):
        raise ValueError(
            "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
            "`tf.train.ClusterDef` object")
    return cluster_spec
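
A short sketch of the three accepted input types; all of them normalize to equivalent ClusterSpec objects:

spec_from_dict = normalize_cluster_spec({"worker": ["worker0:2222"]})
spec_from_spec = normalize_cluster_spec(spec_from_dict)  # returned unchanged
spec_from_def = normalize_cluster_spec(spec_from_dict.as_cluster_def())
assert spec_from_dict == spec_from_spec == spec_from_def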
Example #12
def create_local_cluster(num_workers, num_ps, protocol="grpc"):
  """Create local GRPC servers and return them."""
  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
  cluster_dict = {
      "worker": ["localhost:%s" % port for port in worker_ports],
      "ps": ["localhost:%s" % port for port in ps_ports]
  }
  cs = server_lib.ClusterSpec(cluster_dict)

  workers = [
      server_lib.Server(
          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
      for ix in range(num_workers)
  ]
  ps_servers = [
      server_lib.Server(
          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
      for ix in range(num_ps)
  ]

  return cluster_dict, workers, ps_servers
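
A minimal usage sketch for the helper above: bring up two workers and one PS in-process and run a trivial graph against the first worker (imports follow the conventions of the other examples in this listing):

from tensorflow.python.client import session
from tensorflow.python.framework import ops
from tensorflow.python.ops import variables

cluster_dict, workers, ps_servers = create_local_cluster(num_workers=2, num_ps=1)

with ops.Graph().as_default(), session.Session(workers[0].target) as sess:
  with ops.device("/job:ps/task:0"):
    v = variables.Variable(1.0)
  sess.run(v.initializer)
  print(sess.run(v))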
Example #13
  def testSetConfiguration(self):
    config = config_pb2.ConfigProto(
        gpu_options=config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.1))

    # Configure a server using the default local server options.
    server = server_lib.Server.create_local_server(config=config, start=False)
    self.assertEqual(0.1, server.server_def.default_session_config.gpu_options.
                     per_process_gpu_memory_fraction)

    # Configure a server using an explicit ServerDef with an
    # overridden config.
    cluster_def = server_lib.ClusterSpec({
        "localhost": ["localhost:0"]
    }).as_cluster_def()
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_def,
        job_name="localhost",
        task_index=0,
        protocol="grpc")
    server = server_lib.Server(server_def, config=config, start=False)
    self.assertEqual(0.1, server.server_def.default_session_config.gpu_options.
                     per_process_gpu_memory_fraction)
Example #14
    def _setupCluster(self):
        def get_open_port():
            try:
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            except IOError:
                s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
            s.bind(("", 0))
            port = s.getsockname()[1]
            s.close()
            return port

        port1 = get_open_port()
        port2 = get_open_port()
        cs = server_lib.ClusterSpec({
            "worker": ["localhost:%s" % port1],
            "ps": ["localhost:%s" % port2]
        })

        worker = server_lib.Server(cs, job_name="worker", start=True)
        ps = server_lib.Server(cs, job_name="ps", start=True)

        return worker, ps
Example #15
  def testEq(self):
    self.assertEquals(server_lib.ClusterSpec({}), server_lib.ClusterSpec({}))
    self.assertEquals(
        server_lib.ClusterSpec({
            "job": ["host:2222"]
        }),
        server_lib.ClusterSpec({
            "job": ["host:2222"]
        }),)
    self.assertEquals(
        server_lib.ClusterSpec({
            "job": {
                0: "host:2222"
            }
        }), server_lib.ClusterSpec({
            "job": ["host:2222"]
        }))
Example #16
def connect_to_remote_host(remote_host=None, job_name="worker"):
    """Connects to a single machine to enable remote execution on it.

  Will make devices on the remote host available to use. Note that calling this
  more than once will work, but will invalidate any tensor handles on the old
  remote devices.

  Using the default job_name of worker, you can schedule ops to run remotely as
  follows:
  ```python
  # Enable eager execution, and connect to the remote host.
  tf.compat.v1.enable_eager_execution()
  tf.contrib.eager.connect_to_remote_host("exampleaddr.com:9876")

  with ops.device("job:worker/replica:0/task:1/device:CPU:0"):
    # The following tensors should be resident on the remote device, and the op
    # will also execute remotely.
    x1 = array_ops.ones([2, 2])
    x2 = array_ops.ones([2, 2])
    y = math_ops.matmul(x1, x2)
  ```

  Args:
    remote_host: a single remote server address, or a list of addresses, in
      host-port format.
    job_name: The job name under which the new server will be accessible.

  Raises:
    ValueError: if remote_host is None.
  """
    if not remote_host:
        raise ValueError("Must provide at least one remote_host")

    remote_hosts = nest.flatten(remote_host)
    cluster_spec = server_lib.ClusterSpec({
        job_name: [_strip_prefix(host, _GRPC_PREFIX) for host in remote_hosts]
    })

    connect_to_cluster(cluster_spec)
Example #17
    def testSerialize(self):
        # pylint: disable=g-import-not-at-top
        try:
            import portpicker
        except ImportError:
            return
        with context.graph_mode():
            worker_port = portpicker.pick_unused_port()
            ps_port = portpicker.pick_unused_port()
            cluster_dict = {
                "worker": ["localhost:%s" % worker_port],
                "ps": ["localhost:%s" % ps_port]
            }
            cs = server_lib.ClusterSpec(cluster_dict)

            worker = server_lib.Server(cs,
                                       job_name="worker",
                                       protocol="grpc",
                                       task_index=0,
                                       start=True)
            unused_ps = server_lib.Server(cs,
                                          job_name="ps",
                                          protocol="grpc",
                                          task_index=0,
                                          start=True)
            with ops.Graph().as_default(), session.Session(
                    target=worker.target):
                with ops.device("/job:worker"):
                    t = constant_op.constant([[1.0], [2.0]])
                    l = list_ops.tensor_list_from_tensor(t, element_shape=[1])
                with ops.device("/job:ps"):
                    l_ps = array_ops.identity(l)
                    l_ps, e = list_ops.tensor_list_pop_back(
                        l_ps, element_dtype=dtypes.float32)
                with ops.device("/job:worker"):
                    worker_e = array_ops.identity(e)
                self.assertAllEqual(self.evaluate(worker_e), [2.0])
Example #18
  def test_explicitly_specified_values(self):
    cluster_spec = {
        run_config_lib.TaskType.PS: ["localhost:9990"],
        "my_job_name": ["localhost:9991", "localhost:9992", "localhost:0"]
    }
    tf_config = {
        "cluster": cluster_spec,
        "task": {
            "type": run_config_lib.TaskType.WORKER,
            "index": 2
        }
    }
    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
      config = run_config_lib.RunConfig(
          master="localhost:0", evaluation_master="localhost:9991")

    self.assertEqual(config.master, "localhost:0")
    self.assertEqual(config.task_id, 2)
    self.assertEqual(config.num_ps_replicas, 1)
    self.assertEqual(config.num_worker_replicas, 0)
    self.assertEqual(config.cluster_spec, server_lib.ClusterSpec(cluster_spec))
    self.assertEqual(config.task_type, run_config_lib.TaskType.WORKER)
    self.assertFalse(config.is_chief)
    self.assertEqual(config.evaluation_master, "localhost:9991")
Example #19
    def cluster_spec(self):
        """Returns a ClusterSpec object based on the latest TPU information.

    We retrieve the information from the GCE APIs every time this method is
    called.

    Returns:
      A ClusterSpec containing host information returned from Cloud TPUs.

    Raises:
      RuntimeError: If the provided TPU is not healthy.
    """
        ############################################################################
        # There are 5 potential cases this code must handle:
        #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
        #      a. Create a ClusterSpec that includes the coordinator job
        #      b. Create a ClusterSpec without the coordinator job.
        #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
        #     tasks and
        #      a. Create a ClusterSpec with the coordinator
        #      b. Create a ClusterSpec without the coordinator
        #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
        ############################################################################

        if self._should_resolve():
            # Case 1.
            response = self._fetch_cloud_tpu_metadata()  # pylint: disable=protected-access

            if 'state' in response and response['state'] != 'READY':
                raise RuntimeError(
                    'TPU "%s" is not yet ready; state: "%s"' %
                    (compat.as_text(self._tpu), response['state']))

            if 'networkEndpoints' in response:
                worker_list = [
                    '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
                    for endpoint in response['networkEndpoints']
                ]
            else:
                # Fall back to the deprecated response format
                instance_url = '%s:%s' % (response['ipAddress'],
                                          response['port'])
                worker_list = [instance_url]

            cluster_spec = {self.task_type: worker_list}
        else:
            if self.rpc_layer is None:
                # Case 3.
                return None
            # Case 2.
            tpus = []
            for tpu in compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR):
                # Work around the fact that the GKE environment variable supplied
                # to us has the protocol string embedded in it; we want to strip it
                # out for the ClusterSpec.
                if (self.rpc_layer is not None
                        and tpu.startswith(self.rpc_layer + '://')):
                    tpus.append(tpu[len(self.rpc_layer + '://'):])
                else:
                    tpus.append(tpu)
            cluster_spec = {self.task_type: tpus}

        if self._coordinator_address:
            # {1, 2}.a
            cluster_spec[self._coordinator_name] = [self._coordinator_address]

        return server_lib.ClusterSpec(cluster_spec)
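
For orientation, a hedged sketch of the public entry point that lands in Case 2 above: when the TPU name already carries a gRPC address, no Cloud API lookup is performed (TPUClusterResolver is assumed to be exposed under tf.distribute.cluster_resolver):

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='grpc://10.0.0.2:8470')
# The address is taken verbatim (with the protocol prefix stripped) rather
# than resolved through the Cloud TPU API.
print(resolver.cluster_spec().as_dict())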
Example #20
def replica_device_setter(ps_tasks=0,
                          ps_device="/job:ps",
                          worker_device="/job:worker",
                          merge_devices=True,
                          cluster=None,
                          ps_ops=None,
                          ps_strategy=None):
    """Return a `device function` to use when building a Graph for replicas.

  Device functions are used in `with tf.device(device_function):` statements to
  automatically assign devices to `Operation` objects as they are constructed.
  Device constraints are added from the inner-most context first, working
  outwards. The merging behavior adds constraints to fields that are yet unset
  by a more inner context. Currently the fields are (job, task, cpu/gpu).

  If `cluster` is `None`, and `ps_tasks` is 0, the returned function is a no-op.
  Otherwise, the value of `ps_tasks` is derived from `cluster`.

  By default, only Variable ops are placed on ps tasks, and the placement
  strategy is round-robin over all ps tasks. A custom `ps_strategy` may be used
  to do more intelligent placement, such as
  `tf.contrib.training.GreedyLoadBalancingStrategy`.

  For example,

  ```python
  # To build a cluster with two ps jobs on hosts ps0 and ps1, and 3 worker
  # jobs on hosts worker0, worker1 and worker2.
  cluster_spec = {
      "ps": ["ps0:2222", "ps1:2222"],
      "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
  with tf.device(tf.replica_device_setter(cluster=cluster_spec)):
    # Build your graph
    v1 = tf.Variable(...)  # assigned to /job:ps/task:0
    v2 = tf.Variable(...)  # assigned to /job:ps/task:1
    v3 = tf.Variable(...)  # assigned to /job:ps/task:0
  # Run compute
  ```

  Args:
    ps_tasks: Number of tasks in the `ps` job.  Ignored if `cluster` is
      provided.
    ps_device: String.  Device of the `ps` job.  If empty no `ps` job is used.
      Defaults to `ps`.
    worker_device: String.  Device of the `worker` job.  If empty no `worker`
      job is used.
    merge_devices: `Boolean`. If `True`, merges device specifications rather
      than overriding them: a device is only set on a field whose constraint is
      completely unset.
    cluster: `ClusterDef` proto or `ClusterSpec`.
    ps_ops: List of strings representing `Operation` types that need to be
      placed on `ps` devices.  If `None`, defaults to `["Variable"]`.
    ps_strategy: A callable invoked for every ps `Operation` (i.e. matched by
      `ps_ops`), that takes the `Operation` and returns the ps task index to
      use.  If `None`, defaults to a round-robin strategy across all `ps`
      devices.

  Returns:
    A function to pass to `tf.device()`.

  Raises:
    TypeError if `cluster` is not a dictionary or `ClusterDef` protocol buffer,
    or if `ps_strategy` is provided but not a callable.
  """
    if cluster is not None:
        if isinstance(cluster, server_lib.ClusterSpec):
            cluster_spec = cluster.as_dict()
        else:
            cluster_spec = server_lib.ClusterSpec(cluster).as_dict()
        # Get ps_job_name from ps_device by stripping "/job:".
        ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
        if ps_job_name not in cluster_spec or cluster_spec[ps_job_name] is None:
            return None
        ps_tasks = len(cluster_spec[ps_job_name])

    if ps_tasks == 0:
        return None

    if ps_ops is None:
        # TODO(sherrym): Variables in the LOCAL_VARIABLES collection should not be
        # placed in the parameter server.
        ps_ops = ["Variable"]

    if not merge_devices:
        logging.warning(
            "DEPRECATION: It is recommended to set merge_devices=true in "
            "replica_device_setter")
    if ps_strategy is None:
        ps_strategy = _RoundRobinStrategy(ps_tasks)
    if not six.callable(ps_strategy):
        raise TypeError("ps_strategy must be callable")
    chooser = _ReplicaDeviceChooser(ps_tasks, ps_device, worker_device,
                                    merge_devices, ps_ops, ps_strategy)
    return chooser.device_function
Example #21
    def cluster_spec(self):
        """Returns a ClusterSpec object based on the latest TPU information.

    We retrieve the information from the GCE APIs every time this method is
    called.

    Returns:
      A ClusterSpec containing host information returned from Cloud TPUs.

    Raises:
      RuntimeError: If the provided TPU is not healthy.
    """
        ############################################################################
        # There are 5 potential cases this code must handle:
        #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
        #      a. Create a ClusterSpec that includes the coordinator job
        #      b. Create a ClusterSpec without the coordinator job.
        #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
        #     tasks and
        #      a. Create a ClusterSpec with the coordinator
        #      b. Create a ClusterSpec without the coordinator
        #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
        ############################################################################

        if self._shouldResolve():
            # Case 1.
            full_name = 'projects/%s/locations/%s/nodes/%s' % (
                self._project, self._zone, compat.as_text(self._tpu))
            request = self._service.projects().locations().nodes().get(
                name=full_name)
            response = request.execute()

            if 'state' in response and response['state'] != 'READY':
                raise RuntimeError(
                    'TPU "%s" is not yet ready; state: "%s"' %
                    (compat.as_text(self._tpu), response['state']))

            if 'health' in response and response['health'] != 'HEALTHY':
                raise RuntimeError(
                    'TPU "%s" is unhealthy: "%s"' %
                    (compat.as_text(self._tpu), response['health']))

            if 'networkEndpoints' in response:
                worker_list = [
                    '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
                    for endpoint in response['networkEndpoints']
                ]
            else:
                # Fall back to the deprecated response format
                instance_url = '%s:%s' % (response['ipAddress'],
                                          response['port'])
                worker_list = [instance_url]

            cluster_spec = {self._job_name: worker_list}
        else:
            if not self._tpu.startswith(compat.as_bytes('grpc://')):
                # Case 3.
                return None
            # Case 2.
            cluster_spec = {
                self._job_name: [
                    x[len(compat.as_bytes('grpc://')):]
                    for x in self._tpu.split(
                        compat.as_bytes(_ENDPOINTS_SEPARATOR))
                ]
            }

        if self._coordinator_address:
            # {1, 2}.a
            cluster_spec[self._coordinator_name] = [self._coordinator_address]

        return server_lib.ClusterSpec(cluster_spec)
Example #22
    def _init_distributed_setting_from_environment_var(self, tf_config):
        """Initialize distributed properties based on `tf_config`."""

        self._service = _validate_service(tf_config.get(_SERVICE_KEY))
        self._cluster_spec = server_lib.ClusterSpec(
            tf_config.get(_CLUSTER_KEY, {}))
        task_env = tf_config.get(_TASK_ENV_KEY, {})

        if self._cluster_spec and TaskType.MASTER in self._cluster_spec.jobs:
            return self._init_distributed_setting_from_environment_var_with_master(
                tf_config)

        if self._cluster_spec:
            # Distributed mode.
            self._task_type, self._task_id = _validate_task_type_and_task_id(
                self._cluster_spec, task_env, TaskType.CHIEF)

            self._evaluation_master = _get_eval_session_master(
                self._task_type, tf_config)

            if self._task_type != TaskType.EVALUATOR:
                self._master = _get_session_master(self._cluster_spec,
                                                   self._task_type,
                                                   self._task_id, tf_config)
                self._num_ps_replicas = _count_ps(self._cluster_spec)
                self._num_worker_replicas = _count_worker(
                    self._cluster_spec, chief_task_type=TaskType.CHIEF)
                self._global_id_in_cluster = _get_global_id_in_cluster(
                    self._cluster_spec,
                    self._task_type,
                    self._task_id,
                    chief_task_type=TaskType.CHIEF)
            else:
                # Evaluator is not part of the training cluster.
                self._cluster_spec = server_lib.ClusterSpec({})
                self._master = _LOCAL_MASTER
                self._num_ps_replicas = 0
                self._num_worker_replicas = 0
                self._global_id_in_cluster = None  # undefined

            self._is_chief = self._task_type == TaskType.CHIEF
        else:
            # Local mode.
            self._task_type = task_env.get(_TASK_TYPE_KEY, TaskType.WORKER)
            self._task_id = int(task_env.get(_TASK_ID_KEY, 0))
            self._global_id_in_cluster = 0

            if self._task_type != TaskType.WORKER:
                raise ValueError(
                    'If "cluster" is not set in TF_CONFIG, task type must be WORKER.'
                )
            if self._task_id != 0:
                raise ValueError(
                    'If "cluster" is not set in TF_CONFIG, task index must be 0.'
                )

            self._master = tf_config.get(_SESSION_MASTER_KEY, _LOCAL_MASTER)
            self._evaluation_master = tf_config.get(_EVAL_SESSION_MASTER_KEY,
                                                    _LOCAL_MASTER)
            self._is_chief = True
            self._num_ps_replicas = 0
            self._num_worker_replicas = 1
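
For reference, a hedged sketch of the kind of TF_CONFIG value this method parses; the key names follow the standard TF_CONFIG layout also shown in the ClusterConfig docstring later in this listing:

import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'chief': ['host0:2222'],
        'worker': ['host1:2222', 'host2:2222'],
        'ps': ['host3:2222']
    },
    'task': {'type': 'worker', 'index': 1}
})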
Example #23
def _create_cluster(num_workers,
                    num_ps,
                    has_chief=False,
                    has_eval=False,
                    protocol='grpc',
                    worker_config=None,
                    ps_config=None,
                    eval_config=None,
                    worker_name='worker',
                    ps_name='ps',
                    chief_name='chief'):
    """Creates and starts local servers and returns the cluster_spec dict."""
    if _portpicker_import_error:
        raise _portpicker_import_error  # pylint: disable=raising-bad-type
    worker_ports = [pick_unused_port() for _ in range(num_workers)]
    ps_ports = [pick_unused_port() for _ in range(num_ps)]

    cluster_dict = {}
    if num_workers > 0:
        cluster_dict[worker_name] = [
            'localhost:%s' % port for port in worker_ports
        ]
    if num_ps > 0:
        cluster_dict[ps_name] = ['localhost:%s' % port for port in ps_ports]
    if has_eval:
        cluster_dict['evaluator'] = ['localhost:%s' % pick_unused_port()]
    if has_chief:
        cluster_dict[chief_name] = ['localhost:%s' % pick_unused_port()]

    cs = server_lib.ClusterSpec(cluster_dict)

    for i in range(num_workers):
        server_lib.Server(cs,
                          job_name=worker_name,
                          protocol=protocol,
                          task_index=i,
                          config=worker_config,
                          start=True)

    for i in range(num_ps):
        server_lib.Server(cs,
                          job_name=ps_name,
                          protocol=protocol,
                          task_index=i,
                          config=ps_config,
                          start=True)

    if has_chief:
        server_lib.Server(cs,
                          job_name=chief_name,
                          protocol=protocol,
                          task_index=0,
                          config=worker_config,
                          start=True)

    if has_eval:
        server_lib.Server(cs,
                          job_name='evaluator',
                          protocol=protocol,
                          task_index=0,
                          config=eval_config,
                          start=True)

    return cluster_dict
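
A short usage sketch for _create_cluster; the returned dict can be wrapped in a ClusterSpec in the usual way:

cluster_dict = _create_cluster(num_workers=2, num_ps=1, has_chief=True)
cluster_spec = server_lib.ClusterSpec(cluster_dict)
print(cluster_spec.jobs)  # e.g. ['chief', 'ps', 'worker']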
Example #24
  def testNonEmptyClusterSpecIsTrue(self):
    self.assertTrue(server_lib.ClusterSpec({"job": ["host:port"]}))
Example #25
  def _get_distribution_strategy(self):
    return multi_worker_strategy.MultiWorkerMirroredStrategy(
        cluster=server_lib.ClusterSpec({
            'worker': ['/job:worker/task:0', '/job:worker/task:1']
        }),
        num_gpus_per_worker=context.num_gpus())
Example #26
  def _configure_distribution_strategy(self, distribution):
    cluster_spec = server_lib.ClusterSpec({
        "worker": ["/job:worker/task:0", "/job:worker/task:1"]
    })
    distribution.configure(cluster_spec=cluster_spec)
Example #27
  def __init__(self,
               num_gpus_per_worker=1,
               worker_job_name=None,
               num_workers=None,
               cluster=None,
               cross_tower_ops=None,
               prefetch_on_device=None):
    """Initialize the strategy object.

    Args:
      num_gpus_per_worker: number of GPUs per worker. If it is zero, the local
        CPU will be used.
      worker_job_name: the job name for `worker`, typically just 'worker'.
      num_workers: the number of workers. If it is 0, it degenerates into a
        single-worker MirroredStrategy.
      cluster: a `tf.train.ClusterSpec` object or a dict that can be used to
        construct a `tf.train.ClusterSpec` object or a `tf.train.ClusterDef`
        protocol buffer. It is an alternative way to initialize this object.
      cross_tower_ops: the cross tower ops to use. If None, a default one will
        be used. If configure method is called, a best one for the configuration
        will be chosen.
      prefetch_on_device: a boolean to specify whether to prefetch input to
        each worker's devices.

    Raises:
      ValueError: if got an unexpected `cluster`.
    """
    if cluster is None:
      self._workers = [
          '/job:%s/task:%d' % (worker_job_name, task_index)
          for task_index in range(num_workers)
      ]
    else:
      if isinstance(cluster, (dict, cluster_pb2.ClusterDef)):
        cluster_spec = server_lib.ClusterSpec(cluster)
      elif isinstance(cluster, server_lib.ClusterSpec):
        cluster_spec = cluster
      else:
        raise ValueError(
            "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
            '`tf.train.ClusterDef` object')

      self._workers = []
      for job in sorted(cluster_spec.jobs):
        for task in range(cluster_spec.num_tasks(job)):
          self._workers.append('/job:%s/task:%d' % (job, task))

    self._num_gpus_per_worker = num_gpus_per_worker
    if num_gpus_per_worker > 0:
      self._worker_device_map = {
          worker: [
              device_util.canonicalize(worker + '/device:GPU:%d' % gpu)
              for gpu in range(num_gpus_per_worker)
          ] for worker in self._workers
      }
    else:
      self._worker_device_map = {
          worker: [device_util.canonicalize(worker, '/device:CPU:0')]
          for worker in self._workers
      }
    self._devices = nest.flatten(self._worker_device_map)

    super(MultiWorkerMirroredStrategy, self).__init__(
        devices=self._devices, prefetch_on_device=prefetch_on_device)

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker. When
    # users specify device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the cpu device of its first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
    self._default_device = self._workers[0]
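
A hedged construction sketch based solely on the constructor above: passing a cluster dict yields one worker entry per task, and with num_gpus_per_worker=0 each worker maps to its CPU device:

strategy = MultiWorkerMirroredStrategy(
    num_gpus_per_worker=0,
    cluster={'worker': ['worker0:2222', 'worker1:2222']})
# Workers are enumerated as '/job:worker/task:0' and '/job:worker/task:1',
# each mapped to its '/device:CPU:0'.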
Example #28
    def __init__(self, master=None, evaluation_master=None):
        """Constructor.

    Sets the properties `cluster_spec`, `is_chief`, `master` (if `None` in the
    args), `num_ps_replicas`, `task_id`, and `task_type` based on the
    `TF_CONFIG` environment variable, if the pertinent information is
    present. The `TF_CONFIG` environment variable is a JSON object with
    attributes: `cluster`, `environment`, and `task`.

    `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from
    `server_lib.py`, mapping task types (usually one of the TaskType enums) to a
    list of task addresses.

    `environment` specifies the runtime environment for the job (usually one of
    the `Environment` enums). Defaults to `LOCAL`.

    `task` has two attributes: `type` and `index`, where `type` can be any of
    the task types in `cluster`. When `TF_CONFIG` contains said information, the
    following properties are set on this class:

    * `task_type` is set to `TF_CONFIG['task']['type']`. Defaults to `None`.
    * `task_id` is set to `TF_CONFIG['task']['index']`. Defaults to 0.
    * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}.
    * `master` is determined by looking up `task_type` and `task_id` in the
      `cluster_spec`. Defaults to ''.
    * `num_ps_replicas` is set by counting the number of nodes listed
      in the `ps` attribute of `cluster_spec`. Defaults to 0.
    * `num_worker_replicas` is set by counting the number of nodes listed
      in the `worker` attribute of `cluster_spec`. Defaults to 0.
    * `is_chief` is determined based on `task_type`, `task_id`, and
      `environment`.

    Example:
    ```
      cluster = {'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'worker', 'index': 1}})
      config = ClusterConfig()
      assert config.master == 'host4:2222'
      assert config.task_id == 1
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 3
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'worker'
      assert not config.is_chief
    ```

    Args:
      master: TensorFlow master. Defaults to empty string for local.
      evaluation_master: The master on which to perform evaluation.
    """
        # If not explicitly specified in the constructor and the TF_CONFIG
        # environment variable is present, load cluster_spec from TF_CONFIG.
        config = json.loads(os.environ.get('TF_CONFIG') or '{}')

        # Set task_type and task_id if the TF_CONFIG environment variable is
        # present.  Otherwise, use the respective default (None / 0).
        task_env = config.get('task', {})
        self._task_type = task_env.get('type', None)
        self._task_id = self.get_task_id()

        self._cluster_spec = server_lib.ClusterSpec(config.get('cluster', {}))
        self._master = (master if master is not None else _get_master(
            self._cluster_spec, self._task_type, self._task_id) or '')
        self._num_ps_replicas = _count_ps(self._cluster_spec) or 0
        self._num_worker_replicas = _count_worker(self._cluster_spec) or 0

        # Set is_chief.
        self._environment = config.get('environment', Environment.LOCAL)
        self._is_chief = None
        if self._task_type is None:
            self._is_chief = (self._task_id == 0)
        elif self._environment == Environment.CLOUD:
            # When the TF_CONFIG environment variable is set, default is_chief
            # to True when task_type is "master" and task_id is 0.
            self._is_chief = (self._task_type == TaskType.MASTER
                              and self._task_id == 0)
        else:
            # Legacy behavior is that is_chief is None if task_id == 0.
            self._is_chief = (self._task_type == TaskType.WORKER
                              and self._task_id == 0)

        self._evaluation_master = evaluation_master or ''
Example #29
def create_local_cluster(num_workers,
                         num_ps,
                         protocol="grpc",
                         worker_config=None,
                         ps_config=None):
    """Create and start local servers and return the associated `Server` objects.

  Example:
  ```python
  workers, _ = tf.test.create_local_cluster(num_workers=2, num_ps=2)

  worker_sessions = [tf.Session(w.target) for w in workers]

  with tf.device("/job:ps/task:0"):
    ...
  with tf.device("/job:ps/task:1"):
    ...
  with tf.device("/job:worker/task:0"):
    ...
  with tf.device("/job:worker/task:1"):
    ...

  worker_sessions[0].run(...)
  ```

  Args:
    num_workers: Number of worker servers to start.
    num_ps: Number of PS servers to start.
    protocol: Communication protocol.  Allowed values are documented in
      the documentation of `tf.train.Server`.
    worker_config: (optional) ConfigProto to initialize workers. Can be used
      to instantiate multiple devices etc.
    ps_config: (optional) ConfigProto to initialize PS servers.

  Returns:
    A tuple `(worker_servers, ps_servers)`.  `worker_servers` is a list
    of `num_workers` objects of type `tf.train.Server` (all running locally);
    and `ps_servers` is a list of `num_ps` objects of similar type.

  Raises:
    ImportError: if portpicker module was not found at load time
  """
    if _portpicker_import_error:
        raise _portpicker_import_error  # pylint: disable=raising-bad-type
    worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
    ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
    cluster_dict = {
        "worker": ["localhost:%s" % port for port in worker_ports],
        "ps": ["localhost:%s" % port for port in ps_ports]
    }
    cs = server_lib.ClusterSpec(cluster_dict)

    workers = [
        server_lib.Server(cs,
                          job_name="worker",
                          protocol=protocol,
                          task_index=ix,
                          config=worker_config,
                          start=True) for ix in range(num_workers)
    ]
    ps_servers = [
        server_lib.Server(cs,
                          job_name="ps",
                          protocol=protocol,
                          task_index=ix,
                          config=ps_config,
                          start=True) for ix in range(num_ps)
    ]

    return workers, ps_servers
Example #30
    def __init__(self,
                 iterations_per_loop=1,
                 profiling_config=None,
                 model_dir=None,
                 tf_random_seed=None,
                 save_summary_steps=0,
                 save_checkpoints_steps=None,
                 save_checkpoints_secs=None,
                 session_config=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 log_step_count_steps=100,
                 distribute=None,
                 enable_data_pre_proc=True,
                 precision_mode=None,
                 enable_reduce_precision=False,
                 variable_format_optimize=True,
                 mix_compile_mode=False,
                 hcom_parallel=False,
                 graph_memory_max_size=None,
                 variable_memory_max_size=None,
                 auto_tune_mode=None,
                 dump_config=None,
                 stream_max_parallel_num=None,
                 is_tailing_optimization=False,
                 horovod_mode=False,
                 graph_run_mode=1,
                 op_debug_level=0,
                 enable_scope_fusion_passes=None,
                 enable_exception_dump=0,
                 op_select_implmode=None,
                 optypelist_for_implmode=None,
                 dynamic_input_config=None,
                 mstune_mode=None,
                 work_path=None,
                 buffer_optimize="l2_optimize",
                 enable_small_channel=0,
                 fusion_switch_file=None,
                 enable_compress_weight=False,
                 compress_weight_conf=None,
                 op_compiler_cache_mode=None,
                 op_compiler_cache_dir=None,
                 debug_dir=None,
                 hcom_multi_mode=False,
                 dynamic_input=False,
                 dynamic_graph_execute_mode="dynamic_execute",
                 dynamic_inputs_shape_range=None,
                 train_distribute=None,
                 eval_distribute=None,
                 local_rank_id=None,
                 local_device_list=None,
                 session_device_id=None,
                 distribute_config=None,
                 op_tune_mode=None):
        """
        Constructs an NPURunConfig.

        Args:
        iterations_per_loop: This is the number of train steps running on the NPU
            system before returning to the CPU host for each `Session.run`. This means
            the global step is increased `iterations_per_loop` times in one `Session.run`.
            It is recommended to set this to the number of global steps until the next checkpoint.
        profiling_config: The profiling configuration.
        model_dir: Directory where model parameters, graph, etc are saved. If
            `PathLike` object, the path will be resolved. If `None`, will use a
            default value set by the Estimator.
        tf_random_seed: Random seed for TensorFlow initializers.
            Setting this value allows consistency between reruns.
        save_summary_steps: Save summaries every this many steps.
        save_checkpoints_steps: Save checkpoints every this many steps. Can not be
            specified with `save_checkpoints_secs`.
        save_checkpoints_secs: Save checkpoints every this many seconds. Can not
            be specified with `save_checkpoints_steps`. Defaults to 600 seconds if
            both `save_checkpoints_steps` and `save_checkpoints_secs` are not set
            in constructor.  If both `save_checkpoints_steps` and
            `save_checkpoints_secs` are None, then checkpoints are disabled.
        session_config: A ConfigProto used to set session parameters, or None.
        keep_checkpoint_max: The maximum number of recent checkpoint files to
            keep. As new files are created, older files are deleted. If None or 0,
            all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
            checkpoint files are kept.)
        keep_checkpoint_every_n_hours: Number of hours between each checkpoint
            to be saved. The default value of 10,000 hours effectively disables
            the feature.
        log_step_count_steps: The frequency, in number of global steps, that the
            global step/sec and the loss will be logged during training.
        enable_data_pre_proc: This is the switch of data preprocessing.
        precision_mode: Precision mode. For training the default is allow_fp32_to_fp16;
            for inference the default is force_fp16.
        variable_format_optimize: enable or disable variable format optimize while graph
            engineer optimize process.
        mix_compile_mode: This is the switch of mix_compile_mode. When the value is
            False, all graphs run on device. Otherwise, some graphs run on host.
        hcom_parallel: This is the switch of hcom parallel. When the value is True,
            hcom will execute with parallel mode. Otherwise, hcom will execute with
            serialize mode.
        graph_memory_max_size: The max size of ge graph memory size.
        variable_memory_max_size: The max size of ge variable memory size.
        auto_tune_mode: None, or `GA` ,or `RL` or `GA|RL`
        dump_config: The dump configuration.
        stream_max_parallel_num: Specify the degree of parallelism of the AICPU / AICORE engine
                                 to achieve parallel execution between AICPU / AICORE operators.
        op_select_implmode: Selecting whether the operator is implemented with high precision
                            or high performance.
        optypelist_for_implmode: Operator list.
        dynamic_input_config: Dynamic dims configuration.
        mstune_mode: Optimization Task Type."1": model tune; "2": optune;
                     "3": model tune & optune; "4": gradient split tune.
        work_path: Stores temporary files generated during optimization.
        buffer_optimize: Whether to enable buffer optimization.
        enable_small_channel: Whether to enable small channel optimization.
        fusion_switch_file: Fusion switch configuration file path.
        enable_compress_weight: Whether to enable global weight compression.
        compress_weight_conf: Path and file name of the node list configuration file to be compressed.
        dynamic_input: Whether the input is dynamic.
        dynamic_graph_execute_mode: Dynamic graph execute mode, lazy_recompile or dynamic_execute.
        dynamic_inputs_shape_range: Inputs shape range.
        local_rank_id: Local sequence number of the device in a group.
        local_device_list: Available devices.
        distribute_config: Specify the NCA configuration file path
        op_tune_mode: None, or `GA` ,or `RL` or `GA|RL`, use with mstune_mode.
        """

        # Check iterations_per_loop.
        util.check_positive_integer(iterations_per_loop, "iterations_per_loop")
        if not isinstance(mix_compile_mode, bool):
            raise ValueError('"mix_compile_mode" type must be bool')
        if mix_compile_mode is True and iterations_per_loop != 1:
            raise ValueError(
                '"iterations_per_loop" must be 1 when "mix_compile_mode" is True')
        tf_config = json.loads(
            os.environ.get(run_config_lib._TF_CONFIG_ENV, '{}'))
        tmp_cluster_spec = server_lib.ClusterSpec(
            tf_config.get(run_config_lib._CLUSTER_KEY, {}))
        if ((tmp_cluster_spec
             and not isinstance(distribute, ParameterServerStrategy))
                or (not tmp_cluster_spec
                    and isinstance(distribute, ParameterServerStrategy))):
            raise ValueError(
                '"cluster" and "distribute" must both be set in ps mode')
        if tmp_cluster_spec and mix_compile_mode is False:
            raise ValueError(
                '"mix_compile_mode" can only be True when "cluster" is set')

        self.iterations_per_loop = iterations_per_loop
        self.mix_compile_mode = mix_compile_mode
        self.enable_data_pre_proc = enable_data_pre_proc
        self.is_tailing_optimization = is_tailing_optimization
        if save_checkpoints_secs is None and save_checkpoints_steps is None:
            save_checkpoints_steps = 100

        self._profiling_config = profiling_config

        # mix precision configuration
        self._precision_mode = precision_mode
        self._enable_reduce_precision = enable_reduce_precision
        self._variable_format_optimize = variable_format_optimize
        self._hcom_parallel = hcom_parallel
        self._graph_memory_max_size = graph_memory_max_size
        self._variable_memory_max_size = variable_memory_max_size

        self._auto_tune_mode = auto_tune_mode

        if dump_config is not None and not isinstance(dump_config, DumpConfig):
            raise ValueError(
                '`dump_config` must be provided with type `DumpConfig`')
        self._dump_config = dump_config
        self._stream_max_parallel_num = stream_max_parallel_num

        if not isinstance(horovod_mode, bool):
            raise ValueError('"horovod_mode" type must be bool')
        self.horovod_mode = horovod_mode
        util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
        if graph_run_mode > 1:
            raise ValueError('"graph_run_mode" value must be 0 or 1')
        self.graph_run_mode = graph_run_mode
        self.op_debug_level = op_debug_level
        self.enable_scope_fusion_passes = enable_scope_fusion_passes
        experimental_distribute = None
        if tmp_cluster_spec and isinstance(distribute,
                                           ParameterServerStrategy):
            experimental_distribute = DistributeConfig(distribute, distribute,
                                                       None)
        util.check_nonnegative_integer(enable_exception_dump,
                                       "enable_exception_dump")
        self.enable_exception_dump = enable_exception_dump
        self._op_select_implmode = op_select_implmode
        self._optypelist_for_implmode = optypelist_for_implmode
        if dynamic_input_config is not None and not isinstance(
                dynamic_input_config, DynamicInputConfig):
            raise ValueError(
                'dynamic_input_config must be provided with type DynamicInputConfig'
            )
        self._dynamic_input_config = dynamic_input_config
        self._mstune_mode = mstune_mode
        self._work_path = work_path
        self._buffer_optimize = buffer_optimize
        self._enable_small_channel = enable_small_channel
        self._fusion_switch_file = fusion_switch_file
        self._enable_compress_weight = enable_compress_weight
        self._compress_weight_conf = compress_weight_conf
        self._op_compiler_cache_mode = op_compiler_cache_mode
        self._op_compiler_cache_dir = op_compiler_cache_dir
        self._debug_dir = debug_dir
        self._hcom_multi_mode = hcom_multi_mode
        self._dynamic_input = dynamic_input
        self._dynamic_graph_execute_mode = dynamic_graph_execute_mode
        self._dynamic_inputs_shape_range = dynamic_inputs_shape_range
        self._local_rank_id = local_rank_id
        self._local_device_list = local_device_list
        self._session_device_id = session_device_id
        self._distribute_config = distribute_config
        self._op_tune_mode = op_tune_mode

        super(NPURunConfig, self).__init__(
            model_dir=model_dir,
            tf_random_seed=tf_random_seed,
            save_summary_steps=save_summary_steps,
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=save_checkpoints_secs,
            session_config=session_config,
            keep_checkpoint_max=keep_checkpoint_max,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            log_step_count_steps=log_step_count_steps,
            experimental_distribute=experimental_distribute,
            train_distribute=train_distribute,
            eval_distribute=eval_distribute)
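
A hedged construction sketch for NPURunConfig, using only parameters documented above; the values are illustrative:

npu_config = NPURunConfig(
    model_dir='/tmp/npu_model',
    iterations_per_loop=100,
    save_checkpoints_steps=1000,
    precision_mode='allow_fp32_to_fp16',
    mix_compile_mode=False)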