예제 #1
0
  def testAccumulator(self):
    # testAccumulator compares
    #   - explicit averaging of independently computed var_grads1 and
    #     var_grads2,
    #   - Accumulator(SGD) optimizer effectively doing this over 2 steps.
    np.random.seed(12345)
    np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
    np.random.seed(12346)
    np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

    tf.random.set_seed(123456)
    params = layers.ProjectionLayer.Params()
    params.name = 'proj'
    params.dtype = tf.float64
    params.input_dim = 3
    params.output_dim = 2
    params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)

    params.batch_norm = False
    proj_layer = layers.ProjectionLayer(params)
    inputs1 = np_input1
    in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
    inputs2 = np_input2
    in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)

    op = optimizer.SGD.Params()
    opt = op.Instantiate()
    # Get `snapshots` of the variables
    vars1 = [v.read_value() for v in proj_layer.vars.Flatten()]

    lr = lambda: 1e-1

    @tf.function
    def _Apply1(proj_layer, opt):
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
      loss1 = tf.reduce_sum(output1)
      loss2 = tf.reduce_sum(output2)
      var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
      var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)

      _ = opt.Apply(lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
      _ = opt.Apply(lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))

      vars1_1 = proj_layer.vars.Flatten()

      grads1_1 = var_grads1.Transform(tuple)
      grads1_2 = var_grads2.Transform(tuple)

      return vars1_1, grads1_1, grads1_2

    vars1_1, grads1_1, grads1_2 = _Apply1(proj_layer, opt)

    tf.random.set_seed(123456)
    params = layers.ProjectionLayer.Params()
    params.name = 'proj2'
    params.dtype = tf.float64
    params.input_dim = 3
    params.output_dim = 2
    params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)

    params.batch_norm = False
    proj_layer = layers.ProjectionLayer(params)
    in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)

    op = optimizer.Accumulator.Params().Set(
        accum_steps=2, dtype=tf.float64, optimizer_tpl=optimizer.SGD.Params())
    opt = op.Instantiate()
    # Get `snapshots` of the variables
    vars2 = [v.read_value() for v in proj_layer.vars.Flatten()]

    @tf.function
    def _Apply2(proj_layer, opt):
      inputs1 = np_input1
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      loss2_1 = tf.reduce_sum(output1)
      var_grads2_1 = py_utils.ComputeGradients(loss2_1, proj_layer.vars)
      grads2_1 = var_grads2_1.Transform(tuple)

      inputs1 = np_input2
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      loss2_2 = tf.reduce_sum(output1)
      var_grads2_2 = py_utils.ComputeGradients(loss2_2, proj_layer.vars)
      grads2_2 = var_grads2_2.Transform(tuple)

      with cluster_factory.ForTestingWorker(add_summary=True):
        _ = opt.Apply(lr, var_grads2_1)

      # Get `snapshots` of the intermediate variables
      vars2_intermediate = [v.read_value() for v in proj_layer.vars.Flatten()]
      tf.assign_add(py_utils.GetOrCreateGlobalStepVar(), 1)

      with cluster_factory.ForTestingWorker(add_summary=True):
        _ = opt.Apply(lr, var_grads2_2)

      vars2_1 = proj_layer.vars.Flatten()

      return vars2_intermediate, vars2_1, grads2_1, grads2_2

    vars2_intermediate, vars2_1, grads2_1, grads2_2 = _Apply2(proj_layer, opt)
    # Unlike Graph mode, grads2_1['w'][0]/grads2_2['w'][0] returned from
    # `tf.function` are variables after updates. As a result we cannot compare
    # them with e.g. `vars1`.

    self.assertAllClose(vars1, vars2)

    self.assertAllClose(grads1_1, grads2_1)
    self.assertAllClose(grads1_2, grads2_2)

    self.assertAllClose(vars1, vars2_intermediate)

    lr = lr()
    self.assertAllClose(
        vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]), vars1_1[0])
    self.assertAllClose(
        vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]), vars2_1[0])

    self.assertAllClose(vars2, vars2_intermediate)
    self.assertAllClose(vars1_1, vars2_1)
예제 #2
0
    def testAccumulator(self):
        # testAccumulator compares
        #   - explicit averaging of independently computed var_grads1 and
        #     var_grads2,
        #   - Accumulator(SGD) optimizer effectively doing this over 2 steps.
        np.random.seed(12345)
        np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
        np.random.seed(12346)
        np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.random.set_seed(123456)
            params = layers.ProjectionLayer.Params()
            params.name = 'proj'
            params.dtype = tf.float64
            params.input_dim = 3
            params.output_dim = 2
            params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)

            params.batch_norm = False
            proj_layer = layers.ProjectionLayer(params)
            inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
            inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
            output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
            output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
            loss1 = tf.reduce_sum(output1)
            loss2 = tf.reduce_sum(output2)
            var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
            var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
            op = optimizer.SGD.Params()
            opt = op.Instantiate()
            lr = 1e-1
            with tf.control_dependencies([loss1, loss2]):
                var_update_op1 = opt.Apply(
                    lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
                with tf.control_dependencies([var_update_op1]):
                    var_update_op2 = opt.Apply(
                        lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))

            self.evaluate(tf.global_variables_initializer())
            vars1 = self.evaluate(proj_layer.vars.Flatten())
            loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
                [
                    loss1,
                    var_grads1.Transform(tuple), loss2,
                    var_grads2.Transform(tuple)
                ],
                feed_dict={
                    inputs1: np_input1,
                    inputs2: np_input2,
                },
            )
            sess.run([var_update_op2],
                     feed_dict={
                         inputs1: np_input1,
                         inputs2: np_input2,
                     })
            vars1_1 = self.evaluate(proj_layer.vars.Flatten())

        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.random.set_seed(123456)
            params = layers.ProjectionLayer.Params()
            params.name = 'proj'
            params.dtype = tf.float64
            params.input_dim = 3
            params.output_dim = 2
            params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)

            params.batch_norm = False
            proj_layer = layers.ProjectionLayer(params)
            in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
            inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
            loss = tf.reduce_sum(output1)
            var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
            op = optimizer.Accumulator.Params().Set(
                accum_steps=2,
                dtype=tf.float64,
                optimizer_tpl=optimizer.SGD.Params())
            opt = op.Instantiate()
            lr = 1e-1
            with cluster_factory.ForTestingWorker(add_summary=True):
                var_update_op = opt.Apply(lr, var_grads)
            increment_global_step_op = tf.assign_add(
                py_utils.GetOrCreateGlobalStepVar(), 1)

            self.evaluate(tf.global_variables_initializer())
            vars2 = self.evaluate(proj_layer.vars.Flatten())
            loss2_1, grads2_1 = sess.run(
                [loss, var_grads.Transform(tuple)],
                feed_dict={
                    inputs1: np_input1,
                })
            loss2_2, grads2_2 = sess.run(
                [loss, var_grads.Transform(tuple)],
                feed_dict={
                    inputs1: np_input2,
                })
            acc_0 = self.evaluate([
                v for v in tf.global_variables()
                if 'grad_accumulator' in v.name
            ])[0]
            sess.run([var_update_op], feed_dict={
                inputs1: np_input1,
            })
            acc_1 = self.evaluate([
                v for v in tf.global_variables()
                if 'grad_accumulator' in v.name
            ])[0]
            vars2_intermediate = self.evaluate(proj_layer.vars.Flatten())
            self.evaluate(increment_global_step_op)
            sess.run([var_update_op], feed_dict={
                inputs1: np_input2,
            })
            acc_2 = self.evaluate([
                v for v in tf.global_variables()
                if 'grad_accumulator' in v.name
            ])[0]
            vars2_1 = self.evaluate(proj_layer.vars.Flatten())

            summary = tf.Summary.FromString(
                self.evaluate(tf.summary.merge_all()))
            tf.logging.info(f'summary: {summary}')
            self.assertEqual(summary.value[0].tag, 'sgd_lr')

        self.assertAllClose(vars1, vars2)

        self.assertAllClose(acc_0, np.zeros_like(acc_0))
        self.assertAllClose(acc_1, grads2_1['w'][1])
        self.assertAllClose(acc_2, np.zeros_like(acc_0))

        self.assertAllClose(loss1_1, loss2_1)
        self.assertAllClose(loss1_2, loss2_2)
        self.assertAllClose(grads1_1, grads2_1)
        self.assertAllClose(grads1_2, grads2_2)

        self.assertAllClose(vars1, vars2_intermediate)

        self.assertAllClose(vars2[0], grads2_1['w'][0])
        self.assertAllClose(vars2[0], grads2_2['w'][0])

        self.assertAllClose(
            vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]),
            vars1_1[0])

        self.assertAllClose(
            vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]),
            vars2_1[0])

        self.assertAllClose(vars2, vars2_intermediate)
        self.assertAllClose(vars1_1, vars2_1)
예제 #3
0
  def testAccumulator(self):
    # testAccumulator compares
    #   - explicit averaging of independently computed var_grads1 and
    #     var_grads2,
    #   - Accumulator(SGD) optimizer effectively doing this over 2 steps.
    np.random.seed(12345)
    np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
    np.random.seed(12346)
    np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

    g1 = tf.Graph()
    with g1.as_default():
      tf.set_random_seed(123456)
      params = layers.ProjectionLayer.Params()
      params.name = 'proj'
      params.dtype = tf.float64
      params.input_dim = 3
      params.output_dim = 2
      params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
      params.is_eval = False
      params.batch_norm = False
      proj_layer = layers.ProjectionLayer(params)
      inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
      loss1 = tf.reduce_sum(output1)
      loss2 = tf.reduce_sum(output2)
      var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
      var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
      op = optimizer.SGD.Params().Set(add_summary=False)
      opt = op.cls(op)
      lr = 1e-1
      with tf.control_dependencies([loss1, loss2]):
        var_update_op1 = opt.Apply(
            lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
        with tf.control_dependencies([var_update_op1]):
          var_update_op2 = opt.Apply(
              lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))
      init_op = tf.global_variables_initializer()

    with self.session(use_gpu=True, graph=g1) as sess:
      sess.run(init_op)
      vars1 = sess.run(proj_layer.vars.Flatten())
      loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
          [loss1, var_grads1, loss2, var_grads2],
          feed_dict={
              inputs1: np_input1,
              inputs2: np_input2,
          })
      sess.run(
          [var_update_op2], feed_dict={
              inputs1: np_input1,
              inputs2: np_input2,
          })
      vars1_1 = sess.run(proj_layer.vars.Flatten())

    g2 = tf.Graph()
    with g2.as_default():
      tf.set_random_seed(123456)
      params = layers.ProjectionLayer.Params()
      params.name = 'proj'
      params.dtype = tf.float64
      params.input_dim = 3
      params.output_dim = 2
      params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
      params.is_eval = False
      params.batch_norm = False
      proj_layer = layers.ProjectionLayer(params)
      in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      loss = tf.reduce_sum(output1)
      var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
      op = optimizer.Accumulator.Params().Set(
          accum_steps=2,
          dtype=tf.float64,
          optimizer_tpl=optimizer.SGD.Params().Set(add_summary=False))
      opt = op.cls(op)
      lr = 1e-1
      var_update_op = opt.Apply(lr, var_grads)
      init_op = tf.global_variables_initializer()
      global_step = py_utils.GetOrCreateGlobalStep()
      increment_global_step_op = tf.assign_add(global_step, 1)
    with self.session(use_gpu=True, graph=g2) as sess:
      sess.run(init_op)
      vars2, global_step = sess.run([proj_layer.vars.Flatten(), global_step])
      loss2_1, grads2_1 = sess.run(
          [loss, var_grads], feed_dict={
              inputs1: np_input1,
          })
      loss2_2, grads2_2 = sess.run(
          [loss, var_grads], feed_dict={
              inputs1: np_input2,
          })
      acc_0 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      sess.run(
          [var_update_op], feed_dict={
              inputs1: np_input1,
          })
      acc_1 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      vars2_intermediate = sess.run(proj_layer.vars.Flatten())
      sess.run(increment_global_step_op)
      sess.run(
          [var_update_op], feed_dict={
              inputs1: np_input2,
          })
      acc_2 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      vars2_1 = sess.run(proj_layer.vars.Flatten())

    self.assertAllClose(vars1, vars2)

    self.assertAllClose(acc_0, np.zeros_like(acc_0))
    self.assertAllClose(acc_1, grads2_1['w'][1])
    self.assertAllClose(acc_2, np.zeros_like(acc_0))

    self.assertAllClose(loss1_1, loss2_1)
    self.assertAllClose(loss1_2, loss2_2)
    self.assertAllClose(grads1_1, grads2_1)
    self.assertAllClose(grads1_2, grads2_2)

    self.assertAllClose(vars1, vars2_intermediate)

    self.assertAllClose(vars2[0], grads2_1['w'][0])
    self.assertAllClose(vars2[0], grads2_2['w'][0])

    self.assertAllClose(
        vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]), vars1_1[0])

    self.assertAllClose(
        vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]), vars2_1[0])

    self.assertAllClose(vars2, vars2_intermediate)
    self.assertAllClose(vars1_1, vars2_1)