示例#1
0
 def test_wide_deep_model_backprop(self):
     """Fit a wide-and-deep model for one epoch with one optimizer per branch.

     Both kernels are zero-initialised and the target is
     ``wide_x + 2 * deep_x``. After the single epoch the linear kernel is
     asserted to be 0.6 and the DNN kernel 1.8.
     """
     with self.cached_session():
         wide_branch = linear.LinearModel(units=1,
                                          kernel_initializer='zeros')
         deep_branch = sequential.Sequential(
             [core.Dense(units=1, kernel_initializer='zeros')])
         model = wide_deep.WideDeepModel(wide_branch, deep_branch)

         wide_x = np.array([1.])
         deep_x = np.array([1.])
         target = wide_x + 2 * deep_x

         # Optimizers are passed as [linear, dnn]; each has its own rate.
         branch_optimizers = [
             gradient_descent.SGD(learning_rate=.1),
             gradient_descent.SGD(learning_rate=.3),
         ]
         model.compile(optimizer=branch_optimizers,
                       loss='mse',
                       metrics=[],
                       run_eagerly=testing_utils.should_run_eagerly())
         self.evaluate(tf.compat.v1.global_variables_initializer())
         model.fit([wide_x, deep_x], target, epochs=1)

         wide_kernel = self.evaluate(
             model.linear_model.dense_layers[0].kernel)
         self.assertAllClose([[0.6]], wide_kernel)
         deep_kernel = self.evaluate(model.dnn_model.layers[0].kernel)
         self.assertAllClose([[1.8]], deep_kernel)
 def test_error_if_policy_is_set(self):
   """Graph rewrite must raise only while a mixed dtype policy is active."""

   def rewrite_fresh_sgd():
     # Every check wraps a newly constructed SGD(1.0) optimizer.
     return enable_mixed_precision_graph_rewrite(gradient_descent_v2.SGD(1.0))

   expected_message = 'the global Keras dtype Policy has been set'
   with policy.policy_scope('mixed_float16'), self.assertRaisesRegex(
       ValueError, expected_message):
     rewrite_fresh_sgd()

   # With the default policy in effect, no error is expected.
   rewrite_fresh_sgd()

   # A non-mixed policy such as float64 must not raise either.
   with policy.policy_scope('float64'):
     rewrite_fresh_sgd()
示例#3
0
    def testConstructMomentumWithLR(self):
        """Check how the legacy ``lr`` keyword sets the SGD learning rate.

        Three optimizers are built: one with only ``lr``, one with both
        ``learning_rate`` and ``lr``, and one with only ``learning_rate``.
        The assertions expect the ``lr`` value to be used whenever it is given.
        """
        lr_only = gradient_descent.SGD(lr=1.0, momentum=0.9)
        both_given = gradient_descent.SGD(
            learning_rate=0.1, momentum=0.9, lr=1.0)
        learning_rate_only = gradient_descent.SGD(
            learning_rate=0.1, momentum=0.9)
        candidates = (lr_only, both_given, learning_rate_only)

        for candidate in candidates:
            self.assertIsInstance(candidate.lr, tf.Variable)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        for candidate, expected_lr in zip(candidates, (1.0, 1.0, 0.1)):
            self.assertAllClose(self.evaluate(candidate.lr), expected_lr)
    def test_custom_aggregation(self, distribution,
                                experimental_aggregate_gradients, expected):
        """Apply hand-built per-replica gradients and compare with `expected`.

        Args:
          distribution: strategy under which the variable and optimizer are
            created and the step is run.
          experimental_aggregate_gradients: forwarded unchanged to
            `apply_gradients`.
          expected: value the per-replica results are compared against.
        """
        with distribution.scope():
            target_var = tf.Variable([0., 0.])
            sgd = gradient_descent.SGD(0.1)

        @tf.function
        def optimize():
            worker_devices = distribution.extended.worker_devices
            # One gradient tensor is placed on each of the first two workers.
            with tf.compat.v1.device(worker_devices[0]):
                first_grad = tf.convert_to_tensor([1., 1.])
            with tf.compat.v1.device(worker_devices[1]):
                second_grad = tf.convert_to_tensor([2., 2.])
            per_replica_grads = values.PerReplica([first_grad, second_grad])

            def step_fn(replica_grads):
                sgd.apply_gradients(
                    [(replica_grads, target_var)],
                    experimental_aggregate_gradients=(
                        experimental_aggregate_gradients))
                return target_var.read_value()

            step_results = distribution.run(step_fn,
                                            args=(per_replica_grads, ))
            return distribution.experimental_local_results(step_results)

        self.assertAllClose(optimize(), expected)
示例#5
0
    def test_dataset_creator_usage_in_parameter_server_model_fit(self):
        """Fit under ParameterServerStrategy with a DatasetCreator input.

        An in-process cluster with two workers, one parameter server and a
        chief entry is created; the model is then fit for 10 epochs and the
        loss history is asserted to contain 10 entries.
        """
        cluster_def = multi_worker_test_base.create_in_process_cluster(
            num_workers=2, num_ps=1, rpc_layer="grpc")
        chief_address = (
            "localhost:%d" % multi_worker_test_base.pick_unused_port())
        cluster_def["chief"] = [chief_address]
        resolver = SimpleClusterResolver(ClusterSpec(cluster_def),
                                         rpc_layer="grpc")
        strategy = tf.distribute.experimental.ParameterServerStrategy(
            resolver)
        with strategy.scope():
            model = sequential.Sequential([core_layers.Dense(10)])
        model.compile(gradient_descent.SGD(), loss="mse")

        def dataset_fn(input_context):
            # The per-replica batch size is derived from a global batch of 64.
            per_replica_batch = input_context.get_per_replica_batch_size(64)
            repeated = tf.data.Dataset.from_tensors(([1.], [1.])).repeat()
            sharded = repeated.shard(input_context.num_input_pipelines,
                                     input_context.input_pipeline_id)
            return sharded.batch(per_replica_batch).prefetch(2)

        creator = dataset_creator.DatasetCreator(dataset_fn)
        history = model.fit(creator,
                            epochs=10,
                            steps_per_epoch=10,
                            verbose=0)
        self.assertLen(history.history["loss"], 10)
示例#6
0
 def test_linear_model(self, distribution, use_dataset_creator, data_fn):
     """Fit a LinearModel under `distribution` with the requested input kind.

     The test is skipped for ParameterServerStrategy when no dataset creator
     is used, or when a dataset creator is used while TF2 is not enabled.
     The final-epoch loss is only asserted on the non-dataset-creator paths.
     """
     uses_parameter_server = isinstance(
         distribution, tf.distribute.experimental.ParameterServerStrategy)
     if uses_parameter_server and not use_dataset_creator:
         self.skipTest(
             'Parameter Server strategy requires dataset creator to be used in '
             'model.fit.')
     if (uses_parameter_server and use_dataset_creator
             and not tf.__internal__.tf2.enabled()):
         self.skipTest(
             'Parameter Server strategy with dataset creator needs to be run when '
             'eager execution is enabled.')

     with distribution.scope():
         model = linear.LinearModel()
         model.compile(gradient_descent.SGD(learning_rate=0.1), 'mse')

         if use_dataset_creator:
             # This path only fits the model; the loss is not checked here.
             creator = dataset_creator.DatasetCreator(dataset_fn)
             model.fit(creator, epochs=5, steps_per_epoch=INPUT_SIZE)
             return

         if data_fn == 'numpy':
             inputs, output = get_numpy()
             history = model.fit(inputs, output, epochs=5)
         else:
             history = model.fit(get_dataset(), epochs=5)
         self.assertLess(history.history['loss'][4], 0.2)
 def testUnsupportedStrategy(self):
     """LossScaleOptimizer must raise ValueError under CentralStorageStrategy.

     The error is expected both when the wrapper is constructed inside the
     strategy scope and when a wrapper built outside the scope is run under
     the strategy.
     """
     strategy = tf.distribute.experimental.CentralStorageStrategy()
     expected_error = (
         'Loss scaling is not supported with the tf.distribute.Strategy: '
         'CentralStorageStrategy. Try using a different Strategy, e.g. a '
         'MirroredStrategy')

     # Case 1: construction inside the strategy scope.
     with strategy.scope():
         with self.assertRaisesRegex(ValueError, expected_error):
             loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())

     # Case 2: constructed outside the scope, then minimized under it.
     wrapped = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
     with strategy.scope():
         var = tf.Variable(1.0)

         def loss():
             return var * 2.0

         def run_fn():
             return wrapped.minimize(loss, [var])

         with self.assertRaisesRegex(ValueError, expected_error):
             strategy.experimental_run(run_fn)
示例#8
0
    def testMinimizeSparseResourceVariable(self):
        """Run one SGD minimize step on a loss built from an embedding lookup.

        For each dtype the loss is ``(lookup(var0)[0] @ x + var1) ** 2`` and
        the variables are compared with hand-computed values after one step
        at learning rate 1.0.
        """
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        with tf.Graph().as_default():
            for dtype in [tf.half, tf.float32, tf.float64]:
                table_var = tf.Variable([[1.0, 2.0]], dtype=dtype)
                bias_var = tf.Variable([3.0], dtype=dtype)
                column = tf.constant([[4.0], [5.0]], dtype=dtype)

                def loss():
                    looked_up = tf.compat.v1.nn.embedding_lookup(
                        [table_var], [0])  # pylint: disable=cell-var-from-loop
                    pred = tf.matmul(looked_up, column)  # pylint: disable=cell-var-from-loop
                    pred = pred + bias_var  # pylint: disable=cell-var-from-loop
                    return pred * pred

                optimizer = gradient_descent.SGD(1.0)
                train_op = optimizer.minimize(loss, [table_var, bias_var])
                self.evaluate(tf.compat.v1.global_variables_initializer())
                # Execute a single optimisation step.
                self.evaluate(train_op)

                # Reference values: prediction, then d(pred**2)/d(pred).
                expected_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
                expected_grad = 2 * expected_pred
                self.assertAllCloseAccordingToType(
                    [[1.0 - expected_grad * 4.0, 2.0 - expected_grad * 5.0]],
                    self.evaluate(table_var))
                self.assertAllCloseAccordingToType(
                    [3.0 - expected_grad], self.evaluate(bias_var))
示例#9
0
    def testSharing(self):
        """Check momentum slots when two update ops come from one optimizer.

        For each dtype, two `apply_gradients` ops are built from the same
        momentum-SGD instance over the same variables. The ops are run one
        after the other, and the "momentum" slots and the variables are
        compared with hand-computed values after each run.
        """
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        with tf.Graph().as_default():
            for dtype in [tf.half, tf.float32, tf.float64]:
                var0 = tf.Variable([1.0, 2.0], dtype=dtype)
                var1 = tf.Variable([3.0, 4.0], dtype=dtype)
                grads0 = tf.constant([0.1, 0.1], dtype=dtype)
                grads1 = tf.constant([0.01, 0.01], dtype=dtype)
                mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
                # Both update ops are created from the same optimizer instance.
                mom_update1 = mom_opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                mom_update2 = mom_opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

                # Each variable's "momentum" slot must match the variable's shape.
                slot0 = mom_opt.get_slot(var0, "momentum")
                self.assertEqual(slot0.shape, var0.shape)
                slot1 = mom_opt.get_slot(var1, "momentum")
                self.assertEqual(slot1.shape, var1.shape)

                # Fetch params to validate initial values
                self.assertAllClose([1.0, 2.0], self.evaluate(var0))
                self.assertAllClose([3.0, 4.0], self.evaluate(var1))
                # Step 1: the momentum accumulators were 0. So we should see a normal
                # update: v -= grad * learning_rate
                self.evaluate(mom_update1)
                # Check that the momentum accumulators have been updated.
                self.assertAllCloseAccordingToType(np.array([-0.2, -0.2]),
                                                   self.evaluate(slot0))
                self.assertAllCloseAccordingToType(np.array([-0.02, -0.02]),
                                                   self.evaluate(slot1))
                # Check that the parameters have been updated.
                self.assertAllCloseAccordingToType(
                    np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
                    self.evaluate(var0))
                self.assertAllCloseAccordingToType(
                    np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
                    self.evaluate(var1))
                # Step 2: run the second update op; the momentum accumulators
                # still contain the previous update.
                self.evaluate(mom_update2)
                # Check that the momentum accumulators have been updated.
                self.assertAllCloseAccordingToType(
                    np.array([(0.9 * (-0.2) - 2.0 * 0.1),
                              (0.9 * (-0.2) - 2.0 * 0.1)]),
                    self.evaluate(slot0))
                self.assertAllCloseAccordingToType(
                    np.array([(0.9 * (-0.02) - 2.0 * 0.01),
                              (0.9 * (-0.02) - 2.0 * 0.01)]),
                    self.evaluate(slot1))
                # Check that the parameters have been updated.
                self.assertAllCloseAccordingToType(
                    np.array([
                        1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                        2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
                    ]), self.evaluate(var0))
                self.assertAllCloseAccordingToType(
                    np.array([
                        2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
                        3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
                    ]), self.evaluate(var1))
    def testDynamicLossScaleWithSlots(self, strategy_fn):
        """Run two momentum-SGD steps through a dynamic LossScaleOptimizer.

        After each step the variable and the loss scale are checked; finally
        the wrapper is asserted to report 'momentum' as its only slot name.
        """
        strategy_obj = strategy_fn()
        known_broken = (isinstance(strategy_obj, tf.distribute.MirroredStrategy)
                        and tf.compat.v1.control_flow_v2_enabled()
                        and not tf.executing_eagerly())
        if known_broken:
            self.skipTest('b/138667997')

        with strategy_obj.scope() as strategy:
            var = tf.Variable([1.0, 2.0])
            # Momentum SGD is used because it creates slot variables.
            momentum_sgd = gradient_descent.SGD(1.0, momentum=1.)
            initial_scale = 2.
            opt = loss_scale_optimizer.LossScaleOptimizer(
                momentum_sgd,
                initial_scale=initial_scale,
                dynamic_growth_steps=1)

            def loss():
                return var / strategy.num_replicas_in_sync

            def run_fn():
                return opt.minimize(loss, var_list=[var])

            first_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(first_op)
            # Accumulator goes 0 -> 1 (gradient is 1), and the variable is
            # reduced by the accumulator, i.e. by 1.
            self.assertAllClose([0.0, 1.0], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2)

            second_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(second_op)
            # Accumulator goes 1 -> 2, so this time the variable drops by 2.
            self.assertAllClose([-2., -1.], self.evaluate(var))
            self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4)

            self.assertEqual(opt.get_slot_names(), ['momentum'])
    def testPassingV1LossScaleErrors(self):
        """LossScaleOptimizerV1 must reject unsupported LossScale objects.

        A DynamicLossScale with multiplier 4 is expected to raise ValueError,
        and a custom LossScale subclass is expected to raise TypeError.
        """
        sgd = gradient_descent.SGD()

        bad_multiplier = tf.mixed_precision.experimental.DynamicLossScale(
            multiplier=4)
        multiplier_message = (
            'When passing a DynamicLossScale to "loss_scale", '
            'DynamicLossScale.multiplier must be 2. Got: '
            'DynamicLossScale')
        with self.assertRaisesRegex(ValueError, multiplier_message):
            loss_scale_optimizer.LossScaleOptimizerV1(sgd, bad_multiplier)

        class MyLossScale(tf.mixed_precision.experimental.LossScale):
            # Minimal subclass that is neither Fixed- nor DynamicLossScale.

            def __call__(self):
                return 1.

            def update(self, grads):
                return None, True

            def get_config(self):
                return {}

        custom_message = (
            'Passing a LossScale that is not a FixedLossScale or a '
            'DynamicLossScale is no longer supported. Got:')
        with self.assertRaisesRegex(TypeError, custom_message):
            loss_scale_optimizer.LossScaleOptimizerV1(sgd, MyLossScale())
    def testDynamicUpdate(self, strategy_fn):
        """Check loss-scale updates for a finite step and then a NaN step.

        The scale starts at 2 with a growth period of one step. It is asserted
        to be 4 after a finite-gradient step, and 2 again after a
        NaN-gradient step that leaves the variable unchanged.
        """
        with strategy_fn().scope() as strategy:
            var = tf.Variable([1.0, 2.0])
            opt = loss_scale_optimizer.LossScaleOptimizer(
                gradient_descent.SGD(1.0),
                initial_scale=2,
                dynamic_growth_steps=1)

            # Phase 1: finite gradients.
            def finite_loss():
                return var * 2.0 / strategy.num_replicas_in_sync

            def finite_step():
                return opt.minimize(finite_loss, var_list=[var])

            finite_op = strategy.experimental_run(finite_step)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(finite_op)
            # The gradient is 2, so 2 is subtracted from each element.
            self.assertAllClose([-1.0, 0.0], self.evaluate(var))
            # The scale doubles from 2 to 4.
            self.assertEqual(4., self.evaluate(opt.loss_scale))

            # Phase 2: NaN gradients.
            def nan_loss():
                return var * float('NaN')

            def nan_step():
                return opt.minimize(nan_loss, var_list=[var])

            nan_op = strategy.experimental_run(nan_step)
            self._run_if_in_graph_mode(nan_op)
            # NaN gradients must leave the variable untouched.
            self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
            # The scale is halved back to 2.
            self.assertEqual(2., self.evaluate(opt.loss_scale))
    def testNanOnOneReplicaOnly(self):
        """A NaN gradient on a non-zero replica must skip the whole update.

        Requires a GPU. The variable is asserted to stay at its initial value
        and the loss scale to drop from 2 to 1.
        """
        if not tf.test.is_gpu_available():
            self.skipTest('Test requires GPU')
        if not (tf.executing_eagerly()
                or tf.compat.v1.control_flow_v2_enabled()):
            self.skipTest(
                'b/181283011: GradientTape does not work properly with '
                'V1 control flow, and opt.minimize uses GradientTape')

        with create_mirrored_strategy().scope() as strategy:
            var = tf.Variable([1.0, 2.0])
            opt = loss_scale_optimizer.LossScaleOptimizer(
                gradient_descent.SGD(1.0),
                initial_scale=2,
                dynamic_growth_steps=2)

            def loss():
                replica_ctx = tf.distribute.get_replica_context()
                rep_id = replica_ctx.replica_id_in_sync_group

                def finite_branch():
                    return var * 2.

                def nan_branch():
                    # The last element of this branch's gradient is NaN.
                    return var * tf.constant([1., float('NaN')])

                # Replica 0 takes the finite branch; others take the NaN one.
                return tf.compat.v1.cond(tf.constant(rep_id == 0),
                                         finite_branch, nan_branch)

            def run_fn():
                return opt.minimize(loss, var_list=[var])

            step_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(step_op)
            # NaN gradients must leave the variable at its initial value.
            self.assertAllClose(self.evaluate(var), [1.0, 2.0])
            # The loss scale is halved from 2 to 1.
            self.assertEqual(1., self.evaluate(opt.loss_scale))
 def testDynamicLossScaleDefaultValues(self):
     """Check the defaults of a LossScaleOptimizer built with no options.

     The initial scale is asserted to be 2**15 and the growth period 2000.
     """
     wrapped = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD())
     default_scale = 2**15
     self.assertEqual(wrapped.initial_scale, default_scale)
     self.assertEqual(wrapped.dynamic_growth_steps, 2000)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(wrapped.loss_scale), default_scale)
    def testDynamicLossScale(self, strategy_fn):
        """Run two steps with a dynamic loss scale and check the variable.

        The expected gradient is held in a variable so it can be doubled
        between the steps, and is passed to `_run_fn_with_grad_check`.
        """
        strategy = strategy_fn()
        learning_rate = 2.
        replicas = strategy.num_replicas_in_sync
        expected_gradient = tf.Variable(learning_rate / replicas)

        with strategy.scope():
            var = tf.Variable([5.0])
            opt = loss_scale_optimizer.LossScaleOptimizer(
                gradient_descent.SGD(learning_rate),
                initial_scale=2,
                dynamic_growth_steps=1)

            # The configured values are exposed as plain Python numbers.
            self.assertEqual(opt.initial_scale, 2.)
            self.assertIsInstance(opt.initial_scale, float)
            self.assertEqual(opt.dynamic_growth_steps, 1)
            self.assertIsInstance(opt.dynamic_growth_steps, int)

            remainder = opt.initial_scale % strategy.num_replicas_in_sync
            self.assertEqual(remainder, 0)

            run_fn = self._run_fn_with_grad_check(strategy, var, opt,
                                                  expected_gradient)
            first_op = strategy.experimental_run(run_fn)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self._run_if_in_graph_mode(first_op)
            # The loss is the identity of the variable, so the gradient is 1
            # and the variable becomes init_val - grad * lr == 5 - 1 * 2 == 3.
            self.assertAllClose([3.], self.evaluate(var))

            # The loss scale doubles, so the expected gradient doubles too.
            doubled = 2 * learning_rate / strategy.num_replicas_in_sync
            self.evaluate(expected_gradient.assign(doubled))
            second_op = strategy.experimental_run(run_fn)
            self._run_if_in_graph_mode(second_op)
            # Again 2 is subtracted from the variable, making its new value 1.
            self.assertAllClose([1.], self.evaluate(var))
 def testDynamicMustBeBool(self):
     """Passing the string 'dynamic' as `dynamic` must raise TypeError."""
     expected_message = (
         '"dynamic" argument to LossScaleOptimizer.__init__ must be '
         "a bool, but got: 'dynamic'")
     sgd = gradient_descent.SGD()
     with self.assertRaisesRegex(TypeError, expected_message):
         loss_scale_optimizer.LossScaleOptimizer(sgd, 'dynamic')
示例#17
0
  def test_gradient(self, strategy_fn):
    """Apply one SGD step to a mixed-precision layer with a tiny learning rate.

    The layer variable is asserted to equal ``1 - 2**-14`` afterwards.
    """
    x = tf.constant([1.])
    # The rate is small enough that applying it to a float16 variable would
    # leave the variable unchanged, so a changed value shows the update was
    # applied to the float32 variable instead.
    tiny_lr = 2**-14
    with strategy_fn().scope() as strategy, policy.policy_scope(
        'mixed_float16'):
      layer = mp_test_util.MultiplyLayer(assert_type=tf.float16)
      opt = gradient_descent.SGD(tiny_lr)

      def run_fn():
        with tf.GradientTape() as tape:
          # Dividing by the replica count keeps the effective total loss,
          # which is the sum of the per-replica losses, unchanged.
          scaled = layer(x) / strategy.num_replicas_in_sync

        grad = tape.gradient(scaled, layer.v)
        return opt.apply_gradients([(grad, layer.v)])

      step_op = strategy.experimental_run(run_fn)
      if not tf.executing_eagerly():
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(step_op)
      # The gradient w.r.t. the variable is 1 and the variable starts at 1,
      # so the new value is init_val - gradient * learning_rate.
      self.assertEqual(self.evaluate(layer.v), 1 - tiny_lr)
    def testSerializationWithBuiltInOptimizer(self, use_v1):
        """Round-trip a loss-scale-wrapped SGD through serialize/deserialize.

        Args:
          use_v1: when true, the optimizer is wrapped with
            `LossScaleOptimizerV1` and a `DynamicLossScale`; otherwise it is
            wrapped with `LossScaleOptimizer`.

        After the round trip the hyperparameters and loss-scale settings are
        checked, and one training step is run with the restored optimizer.
        """
        opt = gradient_descent.SGD(2., momentum=0.5)
        if use_v1:
            loss_scale = tf.mixed_precision.experimental.DynamicLossScale(
                initial_loss_scale=2., increment_period=3.)
            opt = loss_scale_optimizer.LossScaleOptimizerV1(opt, loss_scale)
        else:
            opt = loss_scale_optimizer.LossScaleOptimizer(
                opt, initial_scale=2., dynamic_growth_steps=3.)
        config = optimizers.serialize(opt)
        opt = optimizers.deserialize(config)
        # Force hyperparameters to be created
        opt.lr  # pylint: disable=pointless-statement
        self.evaluate(tf.compat.v1.global_variables_initializer())

        self.assertEqual(self.evaluate(opt.lr), 2.)
        self.assertEqual(self.evaluate(opt.inner_optimizer.momentum), 0.5)
        self.assertEqual(self.evaluate(opt.loss_scale), 2.)
        self.assertEqual(opt.dynamic_growth_steps, 3.)
        # The second positional argument of assertTrue is the failure message,
        # not an expected value, so the stray `4.` has been dropped.
        self.assertTrue(opt.dynamic)
        # Deserializing a LossScaleOptimizer always results in a V2
        # LossScaleOptimizer, even if serialized with a LossScaleOptimizerV1.
        self.assertAllEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)

        # Ensure the optimizer can be used
        var = tf.Variable([5.0])
        run_op = self._run_fn_with_grad_check(tf.distribute.get_strategy(),
                                              var, opt, 2)()
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self._run_if_in_graph_mode(run_op)
        self.assertEqual(self.evaluate(var), [3.])
        self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
示例#19
0
  def test_save_slot_variables_with_autocast_vars(self,
                                                  strategy_fn,
                                                  var_name='v'):
    """Save and restore weights, checking the momentum slot of `layer.v`.

    The slot value is captured after one fit and a save, asserted to differ
    after a second fit, and asserted to match the saved value again after
    the weights are loaded back.
    """
    mixed_policy = policy.Policy('mixed_float16')
    with strategy_fn().scope(), policy.policy_scope(mixed_policy):
      x = layers.Input(shape=(2,), batch_size=2)
      # A var_name other than 'v' guards against a fixed bug (b/134713714):
      # saving a checkpoint crashed when an AutoCastVariable with a slot
      # variable had a name different from the layer attribute's name
      # (layer.v in this case).
      layer = mp_test_util.MultiplyLayer(assert_type=tf.float16,
                                         var_name=var_name)
      model = models.Model(inputs=x, outputs=layer(x))
      momentum_sgd = gradient_descent.SGD(1., 1.)
      opt = loss_scale_optimizer.LossScaleOptimizer(momentum_sgd,
                                                    dynamic=False,
                                                    initial_scale=1)
      model.compile(
          optimizer=opt,
          loss='mse',
          run_eagerly=testing_utils.should_run_eagerly())

    def fit_once():
      model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2)

    def momentum_slot_value():
      return backend.get_value(opt.get_slot(layer.v, 'momentum'))

    fit_once()
    weights_file = os.path.join(self.get_temp_dir(), 'weights')
    model.save_weights(weights_file)
    saved_slot = momentum_slot_value()

    fit_once()
    new_slot = momentum_slot_value()
    self.assertNotEqual(new_slot, saved_slot)

    model.load_weights(weights_file)
    restored_slot = momentum_slot_value()
    self.assertEqual(restored_slot, saved_slot)
    def get_model(self,
                  initial_weights=None,
                  distribution=None,
                  input_shapes=None):
        """Build and compile a small conv classifier over 28x28x3 images.

        Args:
          initial_weights: if truthy, passed to `model.set_weights`.
          distribution: passed to `MaybeDistributionScope`, under which the
            model is built and compiled.
          input_shapes: accepted but unused.

        Returns:
          The compiled `keras.Model`.
        """
        del input_shapes
        kl = keras.layers
        with keras_correctness_test_base.MaybeDistributionScope(distribution):
            image = kl.Input(shape=(28, 28, 3), name='image')
            features = kl.Conv2D(
                name='conv1',
                filters=16,
                kernel_size=(3, 3),
                strides=(4, 4),
                kernel_regularizer=keras.regularizers.l2(1e-4))(image)

            norm_mode = self.with_batch_norm
            if norm_mode == 'regular':
                features = kl.BatchNormalization(name='bn1')(features)
            elif norm_mode == 'sync':
                # Two parallel sync batch norms verify all-reduce works OK.
                left = kl.SyncBatchNormalization(name='bn1')(features)
                right = kl.SyncBatchNormalization(name='bn2')(features)
                features = kl.Add()([left, right])

            pooled = kl.MaxPooling2D(pool_size=(2, 2))(features)
            head = kl.Dense(10, activation='softmax', name='pred')
            logits = head(kl.Flatten()(pooled))
            model = keras.Model(inputs=[image], outputs=[logits])

            if initial_weights:
                model.set_weights(initial_weights)

            model.compile(optimizer=gradient_descent.SGD(learning_rate=0.1),
                          loss='sparse_categorical_crossentropy',
                          metrics=['sparse_categorical_accuracy'])

        return model
示例#21
0
    def testSparseBasicWithLearningRateDecay(self):
        """Apply sparse gradients twice with SGD(3.0, decay=0.5).

        The same update op is run two times. The expected values use a rate
        of 3.0 for the first run and 2.0 for the second.
        """
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        with tf.Graph().as_default():
            for dtype in [tf.half, tf.float32, tf.float64]:
                first_var = tf.Variable([[1.0], [2.0]], dtype=dtype)
                second_var = tf.Variable([[3.0], [4.0]], dtype=dtype)
                # Sparse gradient for row 0 of the first variable.
                first_grad = tf.IndexedSlices(
                    tf.constant([0.1], shape=[1, 1], dtype=dtype),
                    tf.constant([0]), tf.constant([2, 1]))
                # Sparse gradient for row 1 of the second variable.
                second_grad = tf.IndexedSlices(
                    tf.constant([0.01], shape=[1, 1], dtype=dtype),
                    tf.constant([1]), tf.constant([2, 1]))
                optimizer = gradient_descent.SGD(3.0, decay=0.5)
                update_op = optimizer.apply_gradients(
                    zip([first_grad, second_grad], [first_var, second_var]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

                # First run of the update op.
                self.evaluate(update_op)
                self.assertAllCloseAccordingToType(
                    [[1.0 - 3.0 * 0.1], [2.0]], self.evaluate(first_var))
                self.assertAllCloseAccordingToType(
                    [[3.0], [4.0 - 3.0 * 0.01]], self.evaluate(second_var))

                # Second run of the same op.
                self.evaluate(update_op)
                self.assertAllCloseAccordingToType(
                    [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]],
                    self.evaluate(first_var))
                self.assertAllCloseAccordingToType(
                    [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]],
                    self.evaluate(second_var))
示例#22
0
 def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self):
     """Run the shared decay check on an SGD rebuilt from its own config.

     The optimizer uses an InverseTimeDecay schedule and is round-tripped
     through `get_config` / `from_config` before being tested per dtype.
     """
     for dtype in [tf.half, tf.float32, tf.float64]:
         schedule = learning_rate_schedule.InverseTimeDecay(
             3.0, decay_steps=1.0, decay_rate=0.5)
         original = gradient_descent.SGD(learning_rate=schedule)
         restored = gradient_descent.SGD.from_config(original.get_config())
         self._test_basic_sgd_with_learning_rate_decay(restored, dtype)
示例#23
0
 def testConfig(self):
     """Check that `get_config` / `from_config` preserve SGD hyperparameters.

     The round trip is done twice: once right after construction and once
     after `minimize` has been called on the original optimizer.
     """
     opt = gradient_descent.SGD(learning_rate=1.0,
                                momentum=0.9,
                                nesterov=True)

     def assert_hypers_match(other):
         # Compare momentum and decay of `other` with the original optimizer.
         for hyper_name in ("momentum", "decay"):
             self.assertAllClose(
                 self.evaluate(opt._get_hyper(hyper_name)),
                 self.evaluate(other._get_hyper(hyper_name)))

     # Round trip 1: config taken straight after construction.
     opt2 = gradient_descent.SGD.from_config(opt.get_config())
     lr = opt.lr
     lr2 = opt2.lr
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
     assert_hypers_match(opt2)

     # Round trip 2: config taken after the optimizer has been used.
     var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32)

     def loss():
         return 3 * var0

     # learning rate variable created when calling minimize.
     opt.minimize(loss, [var0])
     self.evaluate(tf.compat.v1.global_variables_initializer())
     opt3 = gradient_descent.SGD.from_config(opt.get_config())
     lr3 = opt3.lr
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertAllClose(self.evaluate(lr), self.evaluate(lr3))
     assert_hypers_match(opt3)
     self.assertTrue(opt3.nesterov)
示例#24
0
 def test_wide_deep_model_with_two_feature_columns(self):
     """Fit a wide-and-deep model fed by indicator and embedding columns.

     Random vocabulary samples are labelled with a per-word value plus small
     uniform noise. The test only fits the model; it asserts nothing.
     """
     vocab_list = ['alpha', 'beta', 'gamma']
     vocab_val = [0.4, 0.6, 0.9]
     samples = np.random.choice(vocab_list, size=256)
     labels = np.zeros_like(samples, dtype=np.float32)
     for word, base_value in zip(vocab_list, vocab_val):
         hits = np.where(samples == word)
         noise = np.random.uniform(low=-0.01, high=0.01, size=hits[0].shape)
         labels[hits] = base_value + noise

     cat_column = tf.feature_column.categorical_column_with_vocabulary_list(
         key='symbol', vocabulary_list=vocab_list)
     ind_column = tf.feature_column.indicator_column(cat_column)
     emb_column = tf.feature_column.embedding_column(cat_column, dimension=5)

     # Wide branch: indicator features feeding a bias-free linear model.
     wide_features = dense_features_v2.DenseFeatures([ind_column])
     wide_core = linear.LinearModel(use_bias=False, kernel_initializer='zeros')
     wide_branch = sequential.Sequential([wide_features, wide_core])

     # Deep branch: embedding features feeding a single Dense unit.
     deep_core = sequential.Sequential([core.Dense(units=1)])
     deep_features = dense_features_v2.DenseFeatures([emb_column])
     deep_branch = sequential.Sequential([deep_features, deep_core])

     model = wide_deep.WideDeepModel(wide_branch, deep_branch)
     sgd = gradient_descent.SGD(learning_rate=0.1)
     model.compile(sgd,
                   'mse', [],
                   run_eagerly=testing_utils.should_run_eagerly())
     model.fit(x={'symbol': samples}, y=labels, batch_size=32, epochs=10)
示例#25
0
 def testBasicWithLearningRateDecay(self):
     """Run the shared decay check with a fresh SGD for each float dtype.

     Every iteration builds a new optimizer with `learning_rate=3.0` and
     `decay=0.5` and hands it, together with the dtype, to
     `_test_basic_sgd_with_learning_rate_decay`.
     """
     float_dtypes = (tf.half, tf.float32, tf.float64)
     for dtype in float_dtypes:
         optimizer = gradient_descent.SGD(learning_rate=3.0, decay=0.5)
         self._test_basic_sgd_with_learning_rate_decay(optimizer, dtype)
示例#26
0
    def test_custom_aggregation(self, distribution,
                                experimental_aggregate_gradients, expected):
        """Apply hand-built per-replica gradients and compare the variable.

        Gradients of [1., 1.] and [2., 2.] are created on the first two
        worker devices, wrapped in a `DistributedValues` subclass and passed
        to `apply_gradients` inside `distribution.run`. The local results of
        reading the variable afterwards must be close to `expected`.

        Args:
          distribution: Strategy under test; the first two entries of its
            `extended.worker_devices` are used.
          experimental_aggregate_gradients: Forwarded unchanged to
            `optimizer.apply_gradients`.
          expected: Value the local results are compared against.
        """
        with distribution.scope():
            var = tf.Variable([0., 0.])
            sgd = gradient_descent.SGD(0.1)

        class PerReplica(values.DistributedValues):
            """Container exposing one unsynchronized value per replica."""

            @property
            def values(self):
                """The stored per-replica values."""
                return self._values

        @tf.function
        def optimize():
            devices = distribution.extended.worker_devices
            with tf.compat.v1.device(devices[0]):
                first_grad = tf.convert_to_tensor([1., 1.])
            with tf.compat.v1.device(devices[1]):
                second_grad = tf.convert_to_tensor([2., 2.])
            replica_grads = PerReplica([first_grad, second_grad])

            def apply_on_replica(grad):
                sgd.apply_gradients(
                    [(grad, var)],
                    experimental_aggregate_gradients=(
                        experimental_aggregate_gradients))
                return var.read_value()

            per_replica_result = distribution.run(
                apply_on_replica, args=(replica_grads,))
            return distribution.experimental_local_results(
                per_replica_result)

        self.assertAllClose(optimize(), expected)
示例#27
0
  def test_model_with_fixed_input_dim(self):
    """Check that the traced model call leaves the batch dimension unset.

    The model is first trained for one step inside a `tf.function` on a
    batch of exactly 8 examples. The signature produced afterwards by
    `saving_utils.trace_model_call` must still have shape `[None, 5]`, so
    that serving or retraining is not tied to that batch size. See
    b/132783590 for context.
    """
    model = testing_utils.get_small_mlp(10, 3, 5)

    mse = keras.losses.MeanSquaredError()
    sgd = gradient_descent.SGD()

    @tf.function
    def train_step(data, labels):
      with tf.GradientTape() as tape:
        loss = mse(labels, model(data))
      weights = model.trainable_variables
      sgd.apply_gradients(zip(tape.gradient(loss, weights), weights))

    features = np.random.random((8, 5))
    targets = np.random.random((8, 3))
    train_step(features, targets)

    traced_call = saving_utils.trace_model_call(model)
    traced_shape = traced_call.input_signature[0].shape.as_list()
    expected_shape = tf.TensorShape([None, 5]).as_list()
    self.assertEqual(traced_shape, expected_shape)
示例#28
0
  def test_variable_run_argument(self, distribution):
    """Pass the layer's variables to `run()` and use them in the step.

    Variables passed to `run()` should remain variables; previous behavior
    in TPUStrategy was to cast them to Tensor.

    Args:
      distribution: Strategy under test.
    """
    with distribution.scope():
      optimizer = gradient_descent.SGD(0.1)
      net = core.Dense(1, trainable=True)

    # Endless stream of identical examples, grouped into batches of two.
    dataset = (
        tf.data.Dataset.from_tensors([[1.]])
        .repeat()
        .batch(2, drop_remainder=True))

    def replica_step(variables, features):
      with tf.GradientTape() as tape:
        net_out = net(features[0], training=True)
        loss = (net_out - 1.0) * (net_out - 1.0)
      grads = tape.gradient(loss, variables)
      optimizer.apply_gradients(zip(grads, variables))
      return loss

    @tf.function
    def step(features):
      per_replica_losses = distribution.run(
          replica_step, args=(net.trainable_variables, features))
      return distribution.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

    first_batch = next(iter(dataset))
    step(first_batch)
示例#29
0
 def createTestModel(self, compile_model):
     """Build a Sequential model holding a single 10-unit Dense layer.

     Args:
       compile_model: When truthy, the model is compiled with a default
         SGD optimizer, 'mse' loss and a CategoricalAccuracy metric
         before it is returned; otherwise it is returned uncompiled.

     Returns:
       The constructed `keras.Sequential` model.
     """
     model = keras.Sequential([keras.layers.Dense(10)])
     if not compile_model:
         return model
     model.compile(optimizer=gradient_descent.SGD(),
                   loss='mse',
                   metrics=keras.metrics.CategoricalAccuracy())
     return model
示例#30
0
    def test_wide_deep_model(self, distribution, use_dataset_creator, data_fn):
        """Train a WideDeepModel under `distribution` and check the loss.

        The test is skipped for ParameterServerStrategy when no dataset
        creator is used, and also when a dataset creator is used but TF2
        behavior is not enabled. Otherwise the model is fitted for five
        epochs and the loss of the fifth epoch must be below 0.2.

        Args:
          distribution: Strategy whose scope wraps model creation, fitting
            and the final assertion.
          use_dataset_creator: When truthy, the input is a `DatasetCreator`
            wrapping `dataset_fn`.
          data_fn: Only consulted when `use_dataset_creator` is falsy;
            'numpy' selects `get_numpy()`, any other value selects
            `get_dataset()`.
        """
        uses_parameter_server = isinstance(
            distribution, tf.distribute.experimental.ParameterServerStrategy)
        if uses_parameter_server and not use_dataset_creator:
            self.skipTest(
                'Parameter Server strategy requires dataset creator to be used in '
                'model.fit.')
        if (uses_parameter_server and use_dataset_creator
                and not tf.__internal__.tf2.enabled()):
            self.skipTest(
                'Parameter Server strategy with dataset creator needs to be run when '
                'eager execution is enabled.')

        with distribution.scope():
            wide = linear.LinearModel(units=1)
            deep = sequential.Sequential([core.Dense(units=1)])
            model = wide_deep.WideDeepModel(wide, deep)
            # One optimizer per tower: SGD for the linear part, Adagrad for
            # the DNN part.
            wide_optimizer = gradient_descent.SGD(learning_rate=0.05)
            deep_optimizer = adagrad.Adagrad(learning_rate=0.1)
            model.compile(optimizer=[wide_optimizer, deep_optimizer],
                          loss='mse')

            if use_dataset_creator:
                creator = dataset_creator.DatasetCreator(dataset_fn)
                hist = model.fit(creator,
                                 epochs=5,
                                 steps_per_epoch=INPUT_SIZE)
            elif data_fn == 'numpy':
                inputs, output = get_numpy()
                hist = model.fit(inputs, output, epochs=5)
            else:
                hist = model.fit(get_dataset(), epochs=5)
            self.assertLess(hist.history['loss'][4], 0.2)