Exemplo n.º 1
0
def strategy_and_input_combinations():
    """Build strategy/input test combinations: non-TPU (graph + one eager
    case) plus TPU strategies in graph mode only."""
    # Non-TPU strategies run every graph-mode variant plus a single
    # eager-mode configuration.
    graph_configs = combinations.combine(mode=['graph'],
                                         use_numpy=[True, False],
                                         use_validation_data=[True, False])
    eager_config = combinations.combine(mode=['eager'],
                                        use_numpy=[False],
                                        use_validation_data=[False])
    non_tpu_cases = combinations.times(
        combinations.combine(distribution=strategies_minus_tpu),
        graph_configs + eager_config)
    # TPU strategies are exercised in graph mode only.
    tpu_cases = combinations.times(
        combinations.combine(distribution=tpu_strategies),
        combinations.combine(mode=['graph'],
                             use_numpy=[True, False],
                             use_validation_data=[True, False]))
    return non_tpu_cases + tpu_cases
def strategy_and_input_combinations():
  """Returns strategy/input-mode combinations for distributed Keras tests.

  Non-TPU strategies get all graph-mode variants plus one eager case;
  TPU strategies get only the graph-mode variants.
  """
  graph_modes = combinations.combine(mode=['graph'],
                                     use_numpy=[True, False],
                                     use_validation_data=[True, False])
  eager_mode = combinations.combine(mode=['eager'],
                                    use_numpy=[False],
                                    use_validation_data=[False])
  cpu_gpu_cases = combinations.times(
      combinations.combine(distribution=strategies_minus_tpu),
      graph_modes + eager_mode)
  tpu_cases = combinations.times(
      combinations.combine(distribution=tpu_strategies),
      combinations.combine(mode=['graph'],
                           use_numpy=[True, False],
                           use_validation_data=[True, False]))
  return cpu_gpu_cases + tpu_cases
Exemplo n.º 3
0
class TestDistributionStrategyWithNormalizationLayer(
    test.TestCase, parameterized.TestCase):
  """Checks BatchNormalization correctness under distribution strategies."""

  @combinations.generate(combinations.times(
      all_strategy_combinations(),
      combinations.combine(fused=[True, False])))
  def test_batchnorm_correctness(self, distribution, fused):
    """Fits a lone BatchNormalization layer and verifies it normalizes.

    Args:
      distribution: distribution strategy under test.
      fused: whether to use the fused batch-norm implementation.
    """
    with self.cached_session():
      with distribution.scope():
        model = keras.models.Sequential()
        norm = keras.layers.BatchNormalization(
            input_shape=(10,), momentum=0.8, fused=fused)
        model.add(norm)
        model.compile(loss='mse',
                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))

      # centered on 5.0, variance 10.0
      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
      x = x.astype('float32')
      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
      dataset = dataset.repeat(100)
      dataset = batch_wrapper(dataset, 32, distribution)

      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
      predict_dataset = predict_dataset.repeat(100)
      predict_dataset = batch_wrapper(predict_dataset, 32, distribution)

      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
      out = model.predict(predict_dataset, steps=2)
      # Undo the learned beta/gamma shift so the output should be ~N(0, 1).
      out -= keras.backend.eval(norm.beta)
      out /= keras.backend.eval(norm.gamma)
      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
Exemplo n.º 4
0
class SingleLossStepTest(test.TestCase, parameterized.TestCase):
    """Tests that a single-loss training step converges under distribution."""

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers(),
            combinations.combine(mode=combinations.graph_and_eager_modes)))
    def testTrainNetwork(self, distribution, optimizer_fn):
        """Runs 10 steps and checks the training error never increases."""
        with distribution.scope():
            single_loss_step, layer = single_loss_example(optimizer_fn,
                                                          distribution,
                                                          use_bias=True)

            # In eager mode the step is called directly; in graph mode it is
            # built once and wrapped in a session callable.
            if context.executing_eagerly():
                run_step = single_loss_step
            else:
                with self.test_session() as sess:
                    run_step = sess.make_callable(single_loss_step())
            self.evaluate(variables.global_variables_initializer())

            weights, biases = [], []
            for _ in range(10):
                run_step()

                weights.append(self.evaluate(layer.kernel))
                biases.append(self.evaluate(layer.bias))

            # NOTE(review): assumes single_loss_example drives w + b toward 1;
            # the absolute error of w + b against 1 must be non-increasing.
            error = abs(
                numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
            is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
            self.assertTrue(is_not_increasing)
Exemplo n.º 5
0
 def test_times_variable_arguments(self):
     """times() over three factors yields their full Cartesian product."""
     modes = combinations.combine(mode=["graph", "eager"])
     optimizers = combinations.combine(optimizer=["adam", "gd"])
     distributions = combinations.combine(distribution=["d1", "d2"])
     product = combinations.times(distributions, modes, optimizers)
     # Distribution varies slowest, optimizer fastest.
     expected = [
         OrderedDict([("distribution", d), ("mode", m), ("optimizer", o)])
         for d in ["d1", "d2"]
         for m in ["graph", "eager"]
         for o in ["adam", "gd"]
     ]
     self.assertEqual(expected, product)
     # A single combine() with all keyword arguments is equivalent.
     self.assertEqual(
         combinations.combine(mode=["graph", "eager"],
                              optimizer=["adam", "gd"],
                              distribution=["d1", "d2"]), product)
def test_combinations_for_embedding_model():
  """Embedding-model strategies crossed with graph and eager test configs."""
  distributions = combinations.combine(
      distribution=strategies_for_embedding_models())
  configurations = (graph_mode_test_configuration()
                    + eager_mode_test_configuration())
  return combinations.times(distributions, configurations)
Exemplo n.º 7
0
 def test_times_variable_arguments(self):
   """Verify times() computes the Cartesian product of its arguments."""
   mode_combos = combinations.combine(mode=["graph", "eager"])
   opt_combos = combinations.combine(optimizer=["adam", "gd"])
   dist_combos = combinations.combine(distribution=["d1", "d2"])
   result = combinations.times(dist_combos, mode_combos, opt_combos)
   # Build the expected product explicitly: distribution varies slowest,
   # optimizer fastest.
   expected = []
   for dist in ["d1", "d2"]:
     for mode in ["graph", "eager"]:
       for opt in ["adam", "gd"]:
         expected.append(
             OrderedDict([("distribution", dist), ("mode", mode),
                          ("optimizer", opt)]))
   self.assertEqual(expected, result)
   # The same combinations can be produced by one combine() call.
   self.assertEqual(
       combinations.combine(mode=["graph", "eager"],
                            optimizer=["adam", "gd"],
                            distribution=["d1", "d2"]), result)
def test_combinations_for_embedding_model():
  """Test combinations pairing embedding-model strategies with both modes."""
  configs = graph_mode_test_configuration() + eager_mode_test_configuration()
  return combinations.times(
      combinations.combine(distribution=strategies_for_embedding_models()),
      configs)
Exemplo n.º 9
0
def test_combinations_with_tpu_strategies():
    """Graph-mode test combinations for both TPU strategy variants."""
    tpu_strategies = [combinations.tpu_strategy,
                      combinations.tpu_strategy_one_step]
    return combinations.times(
        combinations.combine(distribution=tpu_strategies),
        graph_mode_test_configuration())
def test_combinations_with_tpu_strategies():
  """TPU strategies (regular and one-step) crossed with graph-mode configs."""
  distributions = combinations.combine(distribution=[
      combinations.tpu_strategy,
      combinations.tpu_strategy_one_step,
  ])
  return combinations.times(distributions, graph_mode_test_configuration())
def strategy_and_optimizer_combinations():
  """All strategy combinations crossed with the v1 optimizers under test."""
  v1_optimizers = [
      combinations.adagrad_optimizer_v1_fn,
      combinations.adam_optimizer_v1_fn,
      combinations.gradient_descent_optimizer_v1_fn,
      combinations.rmsprop_optimizer_v1_fn,
  ]
  return combinations.times(all_strategy_combinations(),
                            combinations.combine(optimizer=v1_optimizers))
Exemplo n.º 12
0
def strategy_and_optimizer_combinations():
  """Cross every strategy combination with each supported v1 optimizer."""
  optimizer_fns = combinations.combine(optimizer=[
      combinations.adagrad_optimizer_v1_fn,
      combinations.adam_optimizer_v1_fn,
      combinations.gradient_descent_optimizer_v1_fn,
      combinations.rmsprop_optimizer_v1_fn,
  ])
  return combinations.times(all_strategy_combinations(), optimizer_fns)
Exemplo n.º 13
0
def strategy_and_optimizer_combinations():
    """All strategy combinations crossed with the currently-enabled optimizers.

    The commented-out entries are optimizers deliberately disabled pending
    the referenced bug; do not remove them.
    """
    # TODO(b/122372746): Uncomment optimizers after they pass tests.
    return combinations.times(
        all_strategy_combinations(),
        combinations.combine(optimizer=[
            combinations.adagrad_optimizer_v1_fn,
            # combinations.adagrad_optimizer_keras_v2_fn,
            combinations.adam_optimizer_v1_fn,
            combinations.adam_optimizer_keras_v2_fn,
            combinations.gradient_descent_optimizer_v1_fn,
            combinations.gradient_descent_optimizer_keras_v2_fn,
            combinations.rmsprop_optimizer_v1_fn,
            # combinations.rmsprop_optimizer_keras_v2_fn
        ]))
Exemplo n.º 14
0
class MonitorTest(test.TestCase, parameterized.TestCase):
    """Tests for the distribute Monitor training-loop helper."""

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers(),
            combinations.combine(mode=combinations.graph_and_eager_modes)))
    def testTrainNetwork(self, distribution, optimizer_fn):
        """Runs Monitor-driven steps and checks the error does not grow."""
        with distribution.scope():
            single_loss_step, layer = single_loss_example(
                optimizer_fn, distribution)

            # Monitor takes a session in graph mode and None in eager mode.
            if context.executing_eagerly():
                monitor = monitor_lib.Monitor(single_loss_step, None)
            else:
                with self.test_session() as sess:
                    monitor = monitor_lib.Monitor(single_loss_step, sess)

            monitor.run_steps(1)

            self.assertEqual(1, len(layer.trainable_variables))
            mirrored_weight_variable = layer.trainable_variables[0]
            start_error = self.evaluate(
                distribution.fetch(mirrored_weight_variable))
            # NOTE(review): assumes the example drives the weight toward 1.
            start_error = abs(numpy.array(start_error) - 1)

            monitor.run_steps(9)
            end_error = self.evaluate(
                distribution.fetch(mirrored_weight_variable))
            end_error = abs(numpy.array(end_error) - 1)
            self.assertGreaterEqual(start_error, end_error)

    def testPassingASessionInEager(self):
        """Passing a session while eager execution is active must raise."""
        distribution = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
        step_function, _ = single_loss_example(
            lambda: gradient_descent.GradientDescentOptimizer(0.2),
            distribution)

        with session.Session() as sess, context.eager_mode():
            with self.assertRaisesRegexp(ValueError, "Should not provide"):
                _ = monitor_lib.Monitor(step_function, sess)

    def testNotPassingASessionInGraph(self):
        """Omitting the session in graph mode must raise."""
        distribution = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
        step_function, _ = single_loss_example(
            lambda: gradient_descent.GradientDescentOptimizer(0.2),
            distribution)

        with context.graph_mode(), ops.Graph().as_default():
            with self.assertRaisesRegexp(ValueError, "Should provide"):
                _ = monitor_lib.Monitor(step_function, session=None)
Exemplo n.º 15
0
class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
    """Tests loss minimization with v2 optimizers under distribution."""

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v2_optimizers(),
            combinations.combine(mode=["graph"],
                                 use_callable_loss=[True, False]) +
            combinations.combine(mode=["eager"], use_callable_loss=[True])))
    def testTrainNetwork(self,
                         distribution,
                         optimizer_fn,
                         use_callable_loss=True):
        """Trains for 10 steps and checks the error is non-increasing."""
        with distribution.scope():
            model_fn, dataset_fn, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=use_callable_loss)

            ds = distribution.distribute_dataset(dataset_fn)
            if context.executing_eagerly():
                iterator = ds.make_one_shot_iterator()
            else:
                iterator = ds.make_initializable_iterator()

            def run_step():
                # Run model_fn on every replica and group the resulting ops.
                return control_flow_ops.group(
                    distribution.unwrap(
                        distribution.call_for_each_replica(
                            model_fn,
                            iterator.get_next(),
                            run_concurrently=layer.built)))

            # In graph mode, initialize the iterator and compile the step
            # into a session callable before running.
            if not context.executing_eagerly():
                with self.cached_session() as sess:
                    sess.run(iterator.initializer)
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables.global_variables_initializer())

            weights, biases = [], []
            for _ in range(10):
                run_step()

                weights.append(self.evaluate(layer.kernel))
                biases.append(self.evaluate(layer.bias))

            # NOTE(review): assumes the example drives w + b toward 1; the
            # absolute error of w + b against 1 must be non-increasing.
            error = abs(
                numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
            is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
            self.assertTrue(is_not_increasing)
Exemplo n.º 16
0
def strategy_and_input_combinations():
    """All strategies x input modes x the three benchmark models."""
    def cnn_model_with_batch_norm(**kwargs):
        # Same CNN constructor, but with batch normalization enabled.
        return _create_cnn_model(with_batch_norm=True, **kwargs)

    models = [
        ModelWithData('dnn', _create_dnn_model, _dnn_training_data),
        ModelWithData('cnn', _create_cnn_model, _cnn_training_data),
        ModelWithData('cnn_batch_norm',
                      cnn_model_with_batch_norm,
                      _cnn_training_data,
                      with_batch_norm=True),
    ]
    return combinations.times(
        combinations.combine(distribution=all_strategies),
        combinations.combine(mode=['graph', 'eager'],
                             use_numpy=[True, False],
                             use_validation_data=[True, False]),
        combinations.combine(model_with_data=models))
Exemplo n.º 17
0
def strategy_and_input_combinations():
  """Cross all strategies with input modes and the benchmark models."""
  def cnn_model_with_batch_norm(**kwargs):
    # Builds the CNN with batch normalization turned on.
    return _create_cnn_model(with_batch_norm=True, **kwargs)

  model_cases = combinations.combine(model_with_data=[
      ModelWithData('dnn', _create_dnn_model, _dnn_training_data),
      ModelWithData('cnn', _create_cnn_model, _cnn_training_data),
      ModelWithData('cnn_batch_norm',
                    cnn_model_with_batch_norm,
                    _cnn_training_data,
                    with_batch_norm=True),
  ])
  input_cases = combinations.combine(mode=['graph', 'eager'],
                                     use_numpy=[True, False],
                                     use_validation_data=[True, False])
  strategy_cases = combinations.combine(distribution=all_strategies)
  return combinations.times(strategy_cases, input_cases, model_cases)
Exemplo n.º 18
0
 def test_times(self):
   """times() distributes each distribution over the summed mode configs."""
   graph_cases = combinations.combine(mode=["graph"],
                                      loss=["callable", "tensor"])
   eager_cases = combinations.combine(mode=["eager"], loss=["callable"])
   dists = combinations.combine(distribution=["d1", "d2"])
   result = combinations.times(dists, graph_cases + eager_cases)
   # Each distribution is paired with every (loss, mode) case in order.
   expected = [
       OrderedDict([("distribution", d), ("loss", loss), ("mode", mode)])
       for d in ["d1", "d2"]
       for loss, mode in [("callable", "graph"), ("tensor", "graph"),
                          ("callable", "eager")]
   ]
   self.assertEqual(expected, result)
Exemplo n.º 19
0
 def test_times(self):
     """Check that times() expands a sum of combinations per distribution."""
     graph_combos = combinations.combine(mode=["graph"],
                                         loss=["callable", "tensor"])
     eager_combos = combinations.combine(mode=["eager"], loss=["callable"])
     dist_combos = combinations.combine(distribution=["d1", "d2"])
     actual = combinations.times(dist_combos, graph_combos + eager_combos)
     # Every distribution is crossed with each (loss, mode) case in order.
     expected = []
     for dist in ["d1", "d2"]:
         for loss, mode in (("callable", "graph"), ("tensor", "graph"),
                            ("callable", "eager")):
             expected.append(
                 OrderedDict([("distribution", dist), ("loss", loss),
                              ("mode", mode)]))
     self.assertEqual(expected, actual)
Exemplo n.º 20
0
class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
    def _get_iterator(self, ds):
        """Create a dataset iterator, initializing it when in graph mode."""
        if context.executing_eagerly():
            return ds.make_one_shot_iterator()
        iterator = ds.make_initializable_iterator()
        self.evaluate(iterator.initializer)
        return iterator

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers(),
            combinations.combine(mode=["graph"],
                                 use_callable_loss=[True, False]) +
            combinations.combine(mode=["eager"], use_callable_loss=[True])) +
        combinations.combine(distribution=[combinations.tpu_strategy],
                             optimizer_fn=combinations.optimizers_v1,
                             mode=["graph"],
                             use_callable_loss=[True, False]))
    def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss):
        """Trains via run_steps_on_dataset; error must be non-increasing."""
        with distribution.scope():
            model_fn, dataset_fn, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=use_callable_loss)

            def step_fn(ctx, inputs):
                del ctx  # Unused
                return distribution.group(
                    distribution.call_for_each_replica(model_fn,
                                                       args=(inputs, )))

            iterator = self._get_iterator(
                distribution.distribute_dataset(dataset_fn))

            def run_step():
                # Two dataset iterations per call.
                return distribution.run_steps_on_dataset(step_fn,
                                                         iterator,
                                                         iterations=2).run_op

            self.evaluate(distribution.initialize())
            # Graph mode: compile the step once into a session callable.
            if not context.executing_eagerly():
                with self.cached_session() as sess:
                    run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

            weights, biases = [], []
            for _ in range(5):
                run_step()

                weights.append(self.evaluate(layer.kernel))
                biases.append(self.evaluate(layer.bias))

            self.evaluate(distribution.finalize())

            # NOTE(review): assumes the example drives w + b toward 1; the
            # absolute error of w + b against 1 must be non-increasing.
            error = abs(
                numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
            is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
            self.assertTrue(is_not_increasing)

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers(),
            combinations.combine(mode=["graph"],
                                 use_callable_loss=[True, False]) +
            combinations.combine(mode=["eager"], use_callable_loss=[True])))
    def testTrainNetworkByCallForEachReplica(self, distribution, optimizer_fn,
                                             use_callable_loss):
        """Trains via call_for_each_replica; error must be non-increasing."""
        with distribution.scope():
            model_fn, dataset_fn, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=use_callable_loss)

            iterator = self._get_iterator(
                distribution.distribute_dataset(dataset_fn))

            def run_step():
                # Run model_fn on every replica and group the resulting ops.
                return distribution.group(
                    distribution.call_for_each_replica(
                        model_fn, args=(iterator.get_next(), )))

            # Graph mode: compile the step once into a session callable.
            if not context.executing_eagerly():
                with self.cached_session() as sess:
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            weights, biases = [], []
            for _ in range(10):
                run_step()

                weights.append(self.evaluate(layer.kernel))
                biases.append(self.evaluate(layer.bias))

            # NOTE(review): assumes the example drives w + b toward 1; the
            # absolute error of w + b against 1 must be non-increasing.
            error = abs(
                numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
            is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
            self.assertTrue(is_not_increasing)

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers() +
            combinations.distributions_and_v2_optimizers(),
            combinations.combine(mode=["graph", "eager"])) +
        combinations.combine(distribution=[combinations.tpu_strategy],
                             optimizer_fn=combinations.optimizers_v1 +
                             combinations.optimizers_v2,
                             mode=["graph"]))
    def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
        """Verifies the variables created when the optimizer is built inside
        model_fn match the expected per-parameter-device set."""
        created_variables = []
        trainable_variables = []

        def appending_creator(next_creator, *args, **kwargs):
            # Variable-creator hook that records every variable's name.
            v = next_creator(*args, **kwargs)
            created_variables.append(v.name)
            if "trainable" in kwargs and kwargs["trainable"]:
                trainable_variables.append(v.name)
            return v

        # Creator scope needs to be set before it's used inside
        # `distribution.scope`.
        with variable_scope.variable_creator_scope(
                appending_creator), distribution.scope():
            model_fn, dataset_fn, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=True,
                create_optimizer_inside_model_fn=True)

            def step_fn(ctx, inputs):
                del ctx  # Unused
                return distribution.group(
                    distribution.call_for_each_replica(model_fn,
                                                       args=(inputs, )))

            iterator = self._get_iterator(
                distribution.distribute_dataset(dataset_fn))

            def run_step():
                return distribution.run_steps_on_dataset(step_fn,
                                                         iterator,
                                                         iterations=1).run_op

            self.evaluate(distribution.initialize())
            if not context.executing_eagerly():
                with self.cached_session() as sess:
                    run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

            run_step()

            self.evaluate(distribution.finalize())

            def get_expected_variables(optimizer_fn, num_parameter_devices):
                # Expected names: the model's kernel/bias plus optimizer
                # slots, replicated once per extra parameter device.
                variables_map = {
                    "GradientDescent": ["dense/kernel", "dense/bias"],
                    "Adagrad": [
                        "dense/kernel/Adagrad", "dense/kernel",
                        "dense/bias/Adagrad", "dense/bias"
                    ]
                }
                variables = variables_map[optimizer_fn().get_name()]
                variables.extend([
                    v + "/replica_{}".format(replica) for v in variables
                    for replica in range(1, num_parameter_devices)
                ])
                return set([v + ":0" for v in variables])

            self.assertEqual(
                get_expected_variables(optimizer_fn,
                                       len(distribution.parameter_devices)),
                set(created_variables))

    @combinations.generate(
        combinations.times(
            combinations.combine(momentum=[0.8, 0.9, 0.99],
                                 renorm=[False, True]),
            combinations.times(
                combinations.distributions_and_v1_optimizers(),
                combinations.combine(
                    mode=["graph", "eager"],
                    # TODO(isaprykin):  Allow False here.  Currently subsequent
                    # replicas will re-execute UPDATE_OPS of previous replicas.
                    update_ops_in_cross_replica_mode=[True])) +
            combinations.combine(distribution=[combinations.tpu_strategy],
                                 optimizer_fn=combinations.optimizers_v1,
                                 mode=["graph"],
                                 update_ops_in_cross_replica_mode=[False])))
    def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn,
                                      momentum, renorm,
                                      update_ops_in_cross_replica_mode):
        """Verifies that moving mean updates are reduced across replicas."""
        with distribution.scope():
            num_replicas = distribution.num_replicas_in_sync
            model_fn, dataset_fn, batchnorm = batchnorm_example(
                optimizer_fn,
                batch_per_epoch=num_replicas,
                momentum=momentum,
                renorm=renorm,
                update_ops_in_replica_mode=not update_ops_in_cross_replica_mode
            )

            def step_fn(ctx, inputs):
                del ctx  # Unused
                fetches = distribution.unwrap(
                    distribution.call_for_each_replica(model_fn,
                                                       args=(inputs, )))
                # In cross-replica mode, also run the collected batch-norm
                # update ops alongside the per-replica fetches.
                if update_ops_in_cross_replica_mode:
                    fetches += tuple(
                        ops.get_collection(ops.GraphKeys.UPDATE_OPS))
                return control_flow_ops.group(fetches)

            iterator = self._get_iterator(
                distribution.distribute_dataset(dataset_fn))

            def run_step():
                return distribution.run_steps_on_dataset(step_fn,
                                                         iterator,
                                                         iterations=1).run_op

            self.evaluate(distribution.initialize())
            if not context.executing_eagerly():
                with self.cached_session() as sess:
                    run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

            # Track the expected moving mean per feature, starting from 0.
            expected_moving_means = [0.] * 8

            def averaged_batch_mean(i):
                # Each batch has shape [16, 8] where the ith element in jth list is
                # (8 * j + i + replica_id * 100). So the batch mean in each replica is
                # (60 + i + replica_id * 100). So here comes its batch mean over all
                # replicas:
                return 60. + i + (num_replicas - 1.) / 2. * 100.

            for _ in range(10):
                run_step()
                moving_means = self.evaluate(batchnorm.moving_mean)

                # We make sure that the moving_mean is updated as if the sample mean is
                # calculated over all replicas.
                for i, expected_moving_mean in enumerate(
                        expected_moving_means):
                    expected_moving_means[i] -= (
                        (expected_moving_mean - averaged_batch_mean(i)) *
                        (1.0 - momentum))
                    self.assertNear(expected_moving_means[i], moving_means[i],
                                    0.0001)

            self.evaluate(distribution.finalize())

    @combinations.generate(
        combinations.times(
            combinations.combine(
                optimizer_fn=[
                    combinations.gradient_descent_optimizer_v1_fn,
                    combinations.gradient_descent_optimizer_v2_fn
                ],
                loss_reduction=[
                    losses_impl.Reduction.SUM, losses_impl.Reduction.MEAN,
                    losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
                    losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS
                ]),
            combinations.times(
                combinations.combine(distribution=[
                    combinations.one_device_strategy,
                    combinations.mirrored_strategy_with_gpu_and_cpu,
                    combinations.mirrored_strategy_with_two_gpus,
                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
                    combinations.core_mirrored_strategy_with_two_gpus
                ]),
                combinations.combine(mode=["graph"],
                                     use_callable_loss=[True, False]) +
                combinations.combine(mode=["eager"], use_callable_loss=[True]))
            + combinations.combine(distribution=[combinations.tpu_strategy],
                                   mode=["graph"],
                                   use_callable_loss=[True, False])))
    def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                      use_callable_loss):
        """Checks the trained weight after one step for each loss reduction."""
        with distribution.scope():
            all_vars = []

            def model_fn(inputs):
                # One linear model: predict = x @ w, MSE loss against y.
                x, y = inputs

                def loss_fn():
                    # Use fixed initialization to make the steps deterministic.
                    w = variable_scope.get_variable("w", initializer=[[2.]])
                    all_vars.append(w)
                    predict = math_ops.matmul(x, w)
                    return losses_impl.mean_squared_error(
                        y, predict, reduction=loss_reduction)

                optimizer = optimizer_fn(
                )  # GradientDescent with 0.2 learning rate

                if use_callable_loss:
                    return optimizer.minimize(loss_fn)
                else:
                    return optimizer.minimize(loss_fn())

            def dataset_fn():
                # Fixed batch of two examples, repeated forever.
                features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
                labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
                return dataset_ops.Dataset.zip((features, labels)).repeat()

            def step_fn(ctx, inputs):
                del ctx  # Unused
                return distribution.group(
                    distribution.call_for_each_replica(model_fn,
                                                       args=(inputs, )))

            iterator = self._get_iterator(
                distribution.distribute_dataset(dataset_fn))

            def run_step():
                return distribution.run_steps_on_dataset(step_fn,
                                                         iterator,
                                                         iterations=1).run_op

            self.evaluate(distribution.initialize())
            if not context.executing_eagerly():
                with self.cached_session() as sess:
                    run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

            run_step()

            # Every replica must have resolved to the same variable object.
            v = all_vars[0]
            self.assertTrue(all(v is vi for vi in all_vars[1:]))
            weight = numpy.squeeze(self.evaluate(v))
            # Our model is:
            #   predict = x * w
            #   loss = (predict - y)^2
            #   dloss/dpredict = 2*(predict - y)
            #   dloss/dw = 2 * x^T @ (predict - y)
            # For our batch size of 2, assuming sum loss reduction:
            #   x = [2, 7]
            #   y = [6, 21]
            #   w_initial = 2
            #   predict = [4, 14]
            #   predict - y = [-2, -7]
            #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
            # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
            # with sum loss reduction, or 10.6 with mean.
            if loss_reduction == losses_impl.Reduction.SUM:
                # Note that the "distribution.num_replicas_in_sync" factor will go away
                # once we split the input across replicas, instead of pulling a complete
                # batch of input per replica.
                self.assertNear(weight,
                                2 + 21.2 * distribution.num_replicas_in_sync,
                                0.0001)
            else:
                # One of the mean loss reductions.
                self.assertNear(weight, 2 + 10.6, 0.0001)

            self.evaluate(distribution.finalize())

    @combinations.generate(
        combinations.times(combinations.distributions_and_v1_optimizers(),
                           combinations.combine(mode=["graph", "eager"]),
                           combinations.combine(is_tpu=[False])) +
        combinations.combine(distribution=[combinations.tpu_strategy],
                             optimizer_fn=combinations.optimizers_v1,
                             mode=["graph"],
                             is_tpu=[True]))
    def testRunStepsWithOutputContext(self, distribution, optimizer_fn,
                                      is_tpu):
        """Checks last-step and non-tensor outputs from `run_steps_on_dataset`.

        `is_tpu` only selects the generated combinations above; it is not read
        in the body.
        """
        with distribution.scope():

            def dataset_fn():
                dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
                # TODO(priyag): batch with drop_remainder=True causes shapes to be
                # fully defined for TPU. Remove this when XLA supports dynamic shapes.
                return dataset.batch(batch_size=1, drop_remainder=True)

            optimizer = optimizer_fn()
            layer = core.Dense(1, use_bias=True)

            # Key/value used to verify that non-tensor outputs round-trip
            # through the output context unchanged.
            key1 = "foo"
            value1 = "bar"

            def model_fn(output_context, x):
                """A very simple model written by the user."""
                def loss_fn():
                    y = array_ops.reshape(layer(x),
                                          []) - constant_op.constant(1.)
                    return y * y

                train_op = optimizer.minimize(loss_fn)
                loss = loss_fn()
                # Per-replica loss, reduced across replicas by the context.
                output_context.set_last_step_output(
                    name="replica_loss_reduced",
                    output=loss,
                    reduce_op=reduce_util.ReduceOp.MEAN)
                output_context.set_non_tensor_output(key1, value1)
                return (train_op, loss)

            def step_fn(output_context, inputs):
                (train_op, loss) = distribution.call_for_each_replica(
                    model_fn, args=(output_context, inputs))
                # Record the same loss both reduced and unreduced so that both
                # structures can be verified after the run.
                output_context.set_last_step_output(
                    name="cross_replica_loss_reduced",
                    output=loss,
                    reduce_op=reduce_util.ReduceOp.MEAN)
                output_context.set_last_step_output(
                    name="cross_replica_loss_not_reduced", output=loss)
                return distribution.group(train_op)

            iterator = self._get_iterator(
                distribution.distribute_dataset(dataset_fn))

            def run_step():
                initial_loss = lambda: constant_op.constant(1e7)
                # Initial values corresponding to reduced losses are just single
                # tensors. But for non reduced losses, we need to have initial
                # values that are of the same structure as non reduced losses. In
                # MirroredStrategy, this will be a list of losses, in TPUStrategy
                # it will be single tensor. Using `broadcast` followed by `unwrap`
                # gives us the desired initial value structure.
                initial_loop_values = {
                    "replica_loss_reduced":
                    initial_loss(),
                    "cross_replica_loss_reduced":
                    initial_loss(),
                    "cross_replica_loss_not_reduced":
                    distribution.unwrap(distribution.broadcast(initial_loss()))
                }
                ctx = distribution.run_steps_on_dataset(
                    step_fn,
                    iterator,
                    iterations=2,
                    initial_loop_values=initial_loop_values)

                # The non-tensor output is aggregated into a tuple per key.
                self.assertEqual({key1: (value1, )}, ctx.non_tensor_outputs)
                self._verify_loss_output(
                    initial_loss(),
                    loss_output=ctx.last_step_outputs["replica_loss_reduced"],
                    reduced=True,
                    distribution=distribution)
                self._verify_loss_output(
                    initial_loss(),
                    loss_output=ctx.
                    last_step_outputs["cross_replica_loss_reduced"],
                    reduced=True,
                    distribution=distribution)
                self._verify_loss_output(
                    initial_loss(),
                    loss_output=ctx.
                    last_step_outputs["cross_replica_loss_not_reduced"],
                    reduced=False,
                    distribution=distribution)
                return (ctx.run_op,
                        ctx.last_step_outputs["replica_loss_reduced"])

            self.evaluate(distribution.initialize())
            # Graph mode: build once, then call the compiled step repeatedly.
            if not context.executing_eagerly():
                with self.cached_session() as sess:
                    run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

            weights, biases, losses = [], [], []
            for _ in range(5):
                _, loss = run_step()
                losses.append(loss)
                weights.append(self.evaluate(layer.kernel))
                biases.append(self.evaluate(layer.bias))

            self.evaluate(distribution.finalize())

            # Gradient descent toward the target must never increase the loss
            # between consecutive steps.
            loss_is_not_increasing = all(y <= x
                                         for x, y in zip(losses, losses[1:]))
            self.assertTrue(loss_is_not_increasing)

            # kernel + bias converges toward 1, so |kernel + bias - 1| must be
            # non-increasing as well.
            error = abs(
                numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
            error_is_not_increasing = all(y <= x
                                          for x, y in zip(error, error[1:]))
            self.assertTrue(error_is_not_increasing)

    def _verify_loss_output(self, initial_loss, loss_output, reduced,
                            distribution):
        """Asserts `loss_output` has the expected structure, dtype and shape.

        A reduced output must unwrap to exactly one tensor; a non-reduced
        output must unwrap to one tensor per replica and is mean-reduced here
        before its dtype/shape are compared against `initial_loss`.
        """
        unwrapped = distribution.unwrap(loss_output)
        if reduced:
            self.assertLen(unwrapped, 1)
            loss_tensor = unwrapped[0]
        else:
            self.assertLen(unwrapped, distribution.num_replicas_in_sync)
            loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN,
                                              loss_output)
        self.assertEqual(initial_loss.dtype, loss_tensor.dtype)
        self.assertEqual(initial_loss.shape, loss_tensor.shape)
Exemplo n.º 21
0
class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
    """Tests loss minimization under the legacy tower-based strategy API.

    Covers: training convergence, variable creation when the optimizer is
    built inside the model_fn, batch-norm moving-statistic reduction across
    towers, and the SUM-vs-mean loss-reduction weight-update semantics.
    """

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers(),
            combinations.combine(mode=["graph"],
                                 use_callable_loss=[True, False]) +
            combinations.combine(mode=["eager"], use_callable_loss=[True])))
    def testTrainNetwork(self,
                         distribution,
                         optimizer_fn,
                         use_callable_loss=True):
        """Trains a tiny network and checks |kernel + bias - 1| never grows."""
        with distribution.scope():
            model_fn, dataset, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=use_callable_loss)

            iterator = distribution.distribute_dataset(dataset)

            def run_step():
                return distribution.group(
                    distribution.call_for_each_tower(
                        model_fn,
                        iterator.get_next(),
                        run_concurrently=layer.built))

            # Graph mode: compile the step once and reuse it as a callable.
            if not context.executing_eagerly():
                with self.test_session() as sess:
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            weights, biases = [], []
            for _ in range(10):
                run_step()

                weights.append(self.evaluate(distribution.fetch(layer.kernel)))
                biases.append(self.evaluate(distribution.fetch(layer.bias)))

            # kernel + bias converges toward 1, so the absolute error must be
            # non-increasing from one step to the next.
            error = abs(
                numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
            is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
            self.assertTrue(is_not_increasing)

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers() +
            combinations.distributions_and_v2_optimizers(),
            combinations.combine(mode=["graph", "eager"])))
    def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
        """Verifies the set of variables created with the optimizer in model_fn."""
        created_variables = []
        trainable_variables = []

        def appending_creator(next_creator, *args, **kwargs):
            # Records the name of every variable (and every trainable
            # variable) created under the scope below.
            v = next_creator(*args, **kwargs)
            created_variables.append(v.name)
            if "trainable" in kwargs and kwargs["trainable"]:
                trainable_variables.append(v.name)
            return v

        # Creator scope needs to be set before it's used inside
        # `distribution.scope`.
        with variable_scope.variable_creator_scope(
                appending_creator), distribution.scope():
            model_fn, dataset, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=True,
                create_optimizer_inside_model_fn=True)

            iterator = distribution.distribute_dataset(dataset)

            def run_step():
                return distribution.group(
                    distribution.call_for_each_tower(
                        model_fn,
                        iterator.get_next(),
                        run_concurrently=layer.built))

            if not context.executing_eagerly():
                with self.test_session() as sess:
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            run_step()

            def get_expected_variables(optimizer_fn, num_parameter_devices):
                """Returns the ":0"-suffixed names expected for the optimizer."""
                variables_map = {
                    "GradientDescent": ["dense/kernel", "dense/bias"],
                    "Adam": [
                        "dense/kernel", "dense/bias", "beta1_power",
                        "beta2_power", "dense/kernel/Adam",
                        "dense/kernel/Adam_1", "dense/bias/Adam",
                        "dense/bias/Adam_1"
                    ]
                }
                variables = variables_map[optimizer_fn().get_name()]
                # Every additional parameter device gets a "/replica_i" copy
                # of each base variable.
                variables.extend([
                    v + "/replica_{}".format(replica) for v in variables
                    for replica in range(1, num_parameter_devices)
                ])
                # Set comprehension instead of set([...]): avoids building an
                # intermediate list.
                return {v + ":0" for v in variables}

            self.assertEqual(
                get_expected_variables(optimizer_fn,
                                       len(distribution.parameter_devices)),
                set(created_variables))

    @combinations.generate(
        combinations.times(
            combinations.distributions_and_v1_optimizers(),
            combinations.combine(mode=["graph", "eager"],
                                 momentum=[0.8, 0.9, 0.99],
                                 renorm=[False, True])))
    def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn,
                                      momentum, renorm):
        """Verifies that moving mean updates are reduced across towers."""
        with distribution.scope():
            num_towers = len(distribution.worker_devices)
            model_fn, dataset, batchnorm = batchnorm_example(
                optimizer_fn,
                batch_per_epoch=num_towers,
                momentum=momentum,
                renorm=renorm)

            # Disable prefetching since that makes the specific input on each device
            # to be non deterministic, and this test relies on specific input being
            # on each device.
            if isinstance(distribution, mirrored_strategy.MirroredStrategy):
                distribution._prefetch_on_device = False
            iterator = distribution.distribute_dataset(dataset)

            def run_step():
                # Group the train ops with UPDATE_OPS so that the batch-norm
                # moving statistics are actually updated each step.
                return control_flow_ops.group(
                    distribution.unwrap(
                        distribution.call_for_each_tower(
                            model_fn,
                            iterator.get_next(),
                            run_concurrently=batchnorm.built)) +
                    ops.get_collection(ops.GraphKeys.UPDATE_OPS))

            if not context.executing_eagerly():
                with self.test_session() as sess:
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            expected_moving_means = [0.] * 8

            def averaged_batch_mean(i):
                # Each batch has shape [16, 8] where the ith element in jth list is
                # (8 * j + i + tower_id * 100). So the batch mean in each tower is
                # (60 + i + tower_id * 100). So here comes its batch mean over all
                # towers:
                return 60. + i + (num_towers - 1.) / 2. * 100.

            for _ in range(10):
                run_step()
                moving_means = self.evaluate(
                    distribution.fetch(batchnorm.moving_mean))

                # We make sure that the moving_mean is updated as if the sample mean is
                # calculated over all towers.
                for i, expected_moving_mean in enumerate(
                        expected_moving_means):
                    expected_moving_means[i] -= (
                        (expected_moving_mean - averaged_batch_mean(i)) *
                        (1.0 - momentum))
                    self.assertNear(expected_moving_means[i], moving_means[i],
                                    0.0001)

    @combinations.generate(
        combinations.times(
            combinations.combine(
                distribution=[
                    combinations.one_device_strategy,
                    combinations.mirrored_strategy_with_gpu_and_cpu,
                    combinations.mirrored_strategy_with_two_gpus
                ],
                optimizer_fn=[
                    combinations.gradient_descent_optimizer_v1_fn,
                    combinations.gradient_descent_optimizer_v2_fn
                ],
                loss_reduction=[
                    losses_impl.Reduction.SUM, losses_impl.Reduction.MEAN,
                    losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
                    losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS
                ]),
            combinations.combine(mode=["graph"],
                                 use_callable_loss=[True, False]) +
            combinations.combine(mode=["eager"], use_callable_loss=[True])))
    def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                      use_callable_loss):
        """Checks the weight update for SUM vs mean-style loss reductions."""
        with distribution.scope():
            # Collects `w` from every tower so we can assert they all alias
            # one variable.
            all_vars = []

            def model_fn(x, y):
                def loss_fn():
                    # Use fixed initialization to make the steps deterministic.
                    w = variable_scope.get_variable("w", initializer=[[2.]])
                    all_vars.append(w)
                    predict = math_ops.matmul(x, w)
                    return losses_impl.mean_squared_error(
                        y, predict, reduction=loss_reduction)

                optimizer = optimizer_fn(
                )  # GradientDescent with 0.2 learning rate

                if use_callable_loss:
                    return optimizer.minimize(loss_fn)
                else:
                    return optimizer.minimize(loss_fn())

            # One fixed batch of two examples, repeated forever, so the weight
            # after one step can be computed analytically below.
            features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
            labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
            dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
            iterator = distribution.distribute_dataset(dataset)

            def run_step():
                return distribution.group(
                    distribution.call_for_each_tower(model_fn,
                                                     *iterator.get_next(),
                                                     run_concurrently=False))

            if not context.executing_eagerly():
                with self.test_session() as sess:
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            run_step()

            self.assertEqual(distribution.num_towers, len(all_vars))
            v = all_vars[0]
            # `all` accepts the generator directly; no intermediate list.
            self.assertTrue(all(v is vi for vi in all_vars[1:]))
            weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
            # Our model is:
            #   predict = x * w
            #   loss = (predict - y)^2
            #   dloss/dpredict = 2*(predict - y)
            #   dloss/dw = 2 * x^T @ (predict - y)
            # For our batch size of 2, assuming sum loss reduction:
            #   x = [2, 7]
            #   y = [6, 21]
            #   w_initial = 2
            #   predict = [4, 14]
            #   predict - y = [-2, -7]
            #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
            # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
            # with sum loss reduction, or 10.6 with mean.
            if loss_reduction == losses_impl.Reduction.SUM:
                # Note that the "distribution.num_towers" factor will go away once
                # we split the input across towers, instead of pulling a complete
                # batch of input per tower.
                self.assertNear(weight, 2 + 21.2 * distribution.num_towers,
                                0.0001)
            else:
                # One of the mean loss reductions.
                self.assertNear(weight, 2 + 10.6, 0.0001)
Exemplo n.º 22
0
 def test_overlapping_keys(self):
   """`times` must reject operand combinations whose keys overlap."""
   c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
   c2 = combinations.combine(mode=["eager"], loss=["callable"])
   # assertRaisesRegexp is a deprecated alias removed in Python 3.12;
   # assertRaisesRegex is the supported spelling.
   with self.assertRaisesRegex(ValueError, ".*Keys.+overlap.+"):
     _ = combinations.times(c1, c2)
def all_strategy_and_input_config_combinations():
  """Pairs every strategy with each eager- and graph-mode test configuration."""
  strategy_combos = combinations.combine(distribution=all_strategies)
  test_configs = (eager_mode_test_configuration() +
                  graph_mode_test_configuration())
  return combinations.times(strategy_combos, test_configs)
def all_strategy_and_input_config_combinations():
    """Crosses every strategy with all mode/numpy/validation-data settings."""
    strategy_combos = combinations.combine(distribution=all_strategies)
    input_configs = combinations.combine(mode=['graph', 'eager'],
                                         use_numpy=[True, False],
                                         use_validation_data=[True, False])
    return combinations.times(strategy_combos, input_configs)
Exemplo n.º 25
0
 def test_overlapping_keys(self):
     """`times` must reject operand combinations whose keys overlap."""
     c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
     c2 = combinations.combine(mode=["eager"], loss=["callable"])
     # assertRaisesRegexp is a deprecated alias removed in Python 3.12;
     # assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(ValueError, ".*Keys.+overlap.+"):
         _ = combinations.times(c1, c2)
Exemplo n.º 26
0
def all_strategy_and_input_config_combinations():
    """Pairs every strategy with each eager- and graph-mode test configuration."""
    strategy_combos = combinations.combine(distribution=all_strategies)
    test_configs = (eager_mode_test_configuration() +
                    graph_mode_test_configuration())
    return combinations.times(strategy_combos, test_configs)