def _create_tpu_strategy():
  resolver = tpu_cluster_resolver.TPUClusterResolver("")
  topology = tpu_strategy_util.initialize_tpu_system(resolver)
  device_assignment = None
  if use_single_core:
    device_assignment = device_assignment_lib.DeviceAssignment(
        topology,
        core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

  # Steps per run is only supported in TF 1.x.
  if tf2.enabled():
    return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs)
  else:
    return tpu_lib.TPUStrategyV1(resolver, steps_per_run,
                                 device_assignment, **kwargs)
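Note: use_single_core, steps_per_run, and kwargs are not defined in the snippet itself; they are closed over from an enclosing factory. A minimal sketch of such a factory, with the parameter list assumed from the free variables above:

def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs):
  def _create_tpu_strategy():
    ...  # body as above, closing over steps_per_run, use_single_core, kwargs
  return _create_tpu_strategy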
  def test_device_assignment_constants(self):
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)
    device_assignment = device_assignment_lib.DeviceAssignment(
        topology,
        core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)
    self.assertAllEqual([[[0, 0, 0, 0]]],
                        device_assignment.core_assignment)
    self.assertEqual(1, device_assignment.num_cores_per_replica)
    self.assertEqual(1, device_assignment.num_replicas)
    self.assertEqual("/task:0/device:TPU:0",
                     device_assignment.tpu_device())
    self.assertEqual("/task:0/device:CPU:0",
                     device_assignment.host_device())
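The assertions pin down SINGLE_CORE_ASSIGNMENT as [[[0, 0, 0, 0]]]: one replica with one core at coordinate (0, 0, 0, 0). A sketch of the same assignment built programmatically, assuming the default computation shape (Examples #5 and #8 below use this call for their first core):

device_assignment = device_assignment_lib.DeviceAssignment.build(
    topology, num_replicas=1)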
Example #3
    def _create_tpu_strategy():
        FLAGS = flags.FLAGS  # pylint: disable=invalid-name
        global _did_connect_to_cluster
        global _topology

        try:
            # Attempt to locally discover the TPU. This will fail for Cloud TPU, in
            # which case we fall back to the values passed as flags.
            resolver = tpu_cluster_resolver.TPUClusterResolver()
            did_automatically_resolve = True
        except ValueError:
            did_automatically_resolve = False

            # These flags will be defined by tpu_test_wrapper.py.
            resolver = tpu_cluster_resolver.TPUClusterResolver(
                tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "",
                zone=hasattr(FLAGS, "zone") and FLAGS.zone or None,
                project=hasattr(FLAGS, "project") and FLAGS.project or None,
            )

        # Only connect once per process, rather than per test method.
        if not _did_connect_to_cluster:
            if getattr(FLAGS, "tpu", "") or did_automatically_resolve:
                remote.connect_to_cluster(resolver)
                _did_connect_to_cluster = True
            _topology = tpu_strategy_util.initialize_tpu_system(resolver)

        device_assignment = None
        if use_single_core:
            device_assignment = device_assignment_lib.DeviceAssignment(
                _topology,
                core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

        # Packed variables are not compatible with SPMD mode, so validate
        # before building the strategy.
        if enable_packed_variable and enable_spmd_xla_partitioning:
            raise ValueError(
                "Packed Variable is not compatible with SPMD mode")

        # Steps per run is only supported in TF 1.x.
        if tf2.enabled():
            strategy = tpu_lib.TPUStrategyV2(
                resolver,
                device_assignment,
                experimental_spmd_xla_partitioning=enable_spmd_xla_partitioning,
                **kwargs)
        else:
            strategy = tpu_lib.TPUStrategyV1(resolver, steps_per_run,
                                             device_assignment, **kwargs)
        strategy._enable_packed_variable_in_eager_mode = enable_packed_variable  # pylint: disable=protected-access
        return strategy
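A hypothetical usage sketch; the factory name and keyword parameters are assumptions inferred from the free variables (steps_per_run, use_single_core, enable_packed_variable, enable_spmd_xla_partitioning, kwargs) in the snippet above:

create_strategy = _get_tpu_strategy_creator(
    steps_per_run=1,
    use_single_core=False,
    enable_packed_variable=True,
    enable_spmd_xla_partitioning=False)
strategy = create_strategy()  # connects to the cluster once per process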
Example #4
  def test_model_parallelism(self):
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)
    device_assignment = device_assignment_lib.DeviceAssignment(
        topology, core_assignment=[[[0, 0, 0, 0], [0, 0, 0, 1]]])
    strategy = tpu_lib.TPUStrategyV2(
        resolver,
        experimental_device_assignment=device_assignment)

    with strategy.scope():
      v = variables.Variable(2.)
      with strategy.extended.experimental_logical_device(1):
        w = variables.Variable(3.)

    self.assertLen(strategy.experimental_local_results(v), 1)
    self.assertLen(strategy.experimental_local_results(w), 1)
    self.assertEqual("/job:localhost/replica:0/task:0/device:TPU:0",
                     strategy.experimental_local_results(v)[0].device)
    self.assertEqual("/job:localhost/replica:0/task:0/device:TPU:1",
                     strategy.experimental_local_results(w)[0].device)

    logical_devices = []
    @def_function.function
    def f(x):
      replica_ctx = distribution_strategy_context.get_replica_context()
      with replica_ctx.experimental_logical_device(0):
        y = v * x
      with replica_ctx.experimental_logical_device(1):
        z = w * y
      logical_devices.append((y.device, z.device))
      return z

    result = strategy.run(f, args=(5.,))

    self.assertEqual(
        [("/device:TPU_REPLICATED_CORE:0", "/device:TPU_REPLICATED_CORE:1")],
        logical_devices)

    with self.cached_session():
      self.evaluate(variables.global_variables_initializer())
      self.assertEqual(30., self.evaluate(result))
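The asserted value follows directly from the two logical devices: y = v * x = 2 * 5 = 10 on logical device 0, then z = w * y = 3 * 10 = 30 on logical device 1. As a sketch, the same two-cores-per-replica layout could presumably also be built without spelling out core_assignment by hand, assuming the topology has two cores along its innermost dimension:

device_assignment = device_assignment_lib.DeviceAssignment.build(
    topology, computation_shape=[1, 1, 1, 2], num_replicas=1)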
Example #5
  def test_computation_on_subset_cores(self, enable_packed_var):
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)
    all_core_strategy = tpu_lib.TPUStrategyV2(resolver)
    all_core_strategy._enable_packed_variable_in_eager_mode = enable_packed_var

    with all_core_strategy.scope():
      v = variables.Variable(0.0,
                             aggregation=variables.VariableAggregation.MEAN)

    # Computation on the 1st core.
    device_assignment = device_assignment_lib.DeviceAssignment.build(
        topology, num_replicas=1)
    first_core_strategy = tpu_lib.TPUStrategyV2(
        resolver, experimental_device_assignment=device_assignment)
    first_core_strategy._enable_packed_variable_in_eager_mode = (
        enable_packed_var)

    # Computation on the 2nd core.
    device_assignment2 = device_assignment_lib.DeviceAssignment(
        topology, [[[0, 0, 0, 1]]])
    second_core_strategy = tpu_lib.TPUStrategyV2(
        resolver, experimental_device_assignment=device_assignment2)
    second_core_strategy._enable_packed_variable_in_eager_mode = (
        enable_packed_var)

    @def_function.function
    def train_step():

      def step_fn():
        return v + 1.0

      all_core_strategy.run(step_fn)
      r1 = first_core_strategy.run(step_fn)
      r2 = second_core_strategy.run(step_fn)
      return r1 + r2

    train_step()
    self.assertAllEqual(2., train_step())
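Since v is initialized to 0.0 and never assigned, each step_fn returns v + 1.0 = 1.0 on its single core, so r1 + r2 is 2.0. The first train_step() call traces the function; the second call's result is what the test asserts.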
Example #6
  def test_variables_mismatched_device_assignment(self):
    resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(resolver)
    topology = tpu_strategy_util.initialize_tpu_system(resolver)

    strategy0 = tpu_lib.TPUStrategyV2(resolver)
    self.assertEqual(
        ("/job:localhost/replica:0/task:0/device:TPU:0",
         "/job:localhost/replica:0/task:0/device:TPU:1"),
        strategy0.extended.worker_devices)

    with strategy0.scope():
      v = variables.Variable(1.)

    v1_assign_op = strategy0.experimental_local_results(v)[1].assign(42.)

    with self.cached_session():
      self.evaluate(variables.global_variables_initializer())
      self.evaluate(v1_assign_op)
      self.assertAllEqual([1., 42.],
                          self.evaluate(
                              strategy0.experimental_local_results(v)))

    # Second strategy has devices reversed relative to the first.
    device_assignment = device_assignment_lib.DeviceAssignment(
        topology, core_assignment=[[[0, 0, 0, 1]], [[0, 0, 0, 0]]])
    strategy1 = tpu_lib.TPUStrategyV2(
        resolver,
        experimental_device_assignment=device_assignment)
    self.assertEqual(
        ("/job:localhost/replica:0/task:0/device:TPU:1",
         "/job:localhost/replica:0/task:0/device:TPU:0"),
        strategy1.extended.worker_devices)

    v_read = strategy1.run(def_function.function(v.read_value))

    with self.cached_session():
      self.assertAllEqual([42., 1.],
                          self.evaluate(
                              strategy0.experimental_local_results(v_read)))
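The reversed read is expected: strategy1 maps replica 0 to TPU:1 and replica 1 to TPU:0, so reading v under strategy1 pairs replica 0 with the copy assigned 42. above and replica 1 with the copy still holding 1.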
Example #7
    def _create_tpu_strategy():
        global _did_connect_to_cluster

        try:
            # Attempt to locally discover the TPU. This will fail for Cloud TPU, in
            # which case we fall back to the values passed as flags.
            resolver = tpu_cluster_resolver.TPUClusterResolver()
            did_automatically_resolve = True
        except ValueError:
            did_automatically_resolve = False

            # These flags will be defined by tpu_test_wrapper.py.
            resolver = tpu_cluster_resolver.TPUClusterResolver(
                tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "",
                zone=hasattr(FLAGS, "zone") and FLAGS.zone or None,
                project=hasattr(FLAGS, "project") and FLAGS.project or None,
            )

        # Only connect once per process, rather than per test method.
        if getattr(FLAGS, "tpu", "") or did_automatically_resolve:
            if not _did_connect_to_cluster:
                remote.connect_to_cluster(resolver)
                _did_connect_to_cluster = True

        topology = tpu_strategy_util.initialize_tpu_system(resolver)
        device_assignment = None
        if use_single_core:
            device_assignment = device_assignment_lib.DeviceAssignment(
                topology,
                core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

        # Steps per run is only supported in TF 1.x
        if tf2.enabled():
            return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs)
        else:
            return tpu_lib.TPUStrategyV1(resolver, steps_per_run,
                                         device_assignment, **kwargs)
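Unlike Example #3, this variant does not cache the topology in a global: initialize_tpu_system runs on every call, so each created strategy re-initializes the TPU system. FLAGS and _did_connect_to_cluster are likewise assumed to be defined at module scope.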
Example #8
    def test_worker_devices_on_subset_cores(self):
        resolver = get_tpu_cluster_resolver()
        remote.connect_to_cluster(resolver)
        topology = tpu_strategy_util.initialize_tpu_system(resolver)

        # Strategy for the 1st core.
        device_assignment = device_assignment_lib.DeviceAssignment.build(
            topology, num_replicas=1)
        first_core_strategy = tpu_lib.TPUStrategy(
            resolver, device_assignment=device_assignment)

        # Strategy for the 2nd core.
        device_assignment2 = device_assignment_lib.DeviceAssignment(
            topology, [[[0, 0, 0, 1]]])
        second_core_strategy = tpu_lib.TPUStrategy(
            resolver, device_assignment=device_assignment2)

        self.assertLen(first_core_strategy.extended.worker_devices, 1)
        self.assertEndsWith(first_core_strategy.extended.worker_devices[0],
                            "device:TPU:0")

        self.assertLen(second_core_strategy.extended.worker_devices, 1)
        self.assertEndsWith(second_core_strategy.extended.worker_devices[0],
                            "device:TPU:1")
Example #9
def get_core_assignment(*core_ids):
  topology = get_topology()
  return device_assignment_lib.DeviceAssignment(
      topology,
      [[topology.device_coordinates[0][i]] for i in core_ids])
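A hypothetical usage sketch, assuming get_topology() returns the topology from initialize_tpu_system and resolver comes from get_tpu_cluster_resolver() as in the tests above:

# Two single-core replicas: replica 0 on core 0, replica 1 on core 1.
device_assignment = get_core_assignment(0, 1)
strategy = tpu_lib.TPUStrategyV2(
    resolver, experimental_device_assignment=device_assignment)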