Example #1
    def _Apply2(proj_layer, opt):
      inputs1 = np_input1
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      loss2_1 = tf.reduce_sum(output1)
      var_grads2_1 = py_utils.ComputeGradients(loss2_1, proj_layer.vars)
      grads2_1 = var_grads2_1.Transform(tuple)

      inputs1 = np_input2
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      loss2_2 = tf.reduce_sum(output1)
      var_grads2_2 = py_utils.ComputeGradients(loss2_2, proj_layer.vars)
      grads2_2 = var_grads2_2.Transform(tuple)

      with cluster_factory.ForTestingWorker(add_summary=True):
        _ = opt.Apply(lr, var_grads2_1)

      # Get `snapshots` of the intermediate variables
      vars2_intermediate = [v.read_value() for v in proj_layer.vars.Flatten()]
      tf.assign_add(py_utils.GetOrCreateGlobalStepVar(), 1)

      with cluster_factory.ForTestingWorker(add_summary=True):
        _ = opt.Apply(lr, var_grads2_2)

      vars2_1 = proj_layer.vars.Flatten()

      return vars2_intermediate, vars2_1, grads2_1, grads2_2
Example #2
 def testMaskGradient(self):
   with self.session(use_gpu=False) as sess:
     a = tf.get_variable('a', [])
     b = tf.get_variable('b', [])
     c = tf.get_variable('c', [])
     d = tf.get_variable('d', [])
     e = tf.get_variable('e', [])
     l = a + b + c + d
     zeros = tf.zeros(3, dtype=tf.float32)
     select = tf.one_hot(1, 3, dtype=tf.float32)
     vmap = py_utils.NestedMap(
         a=a, b=b, c=c, d=d, n=py_utils.NestedMap(aa=a, e=e))
     grad_mask = py_utils.NestedMap()
     grad_mask['a:0'] = zeros
     grad_mask['b:0'] = zeros
     grad_mask['c:0'] = select
     grad_mask['d:0'] = select
     grad_onehot = tf.one_hot(1, 3, dtype=tf.float32)
     var_grads = py_utils.ComputeGradients(l, vmap)
     var_grads_mask = py_utils.MaskGradients(var_grads, grad_mask, grad_onehot)
     sess.run(tf.global_variables_initializer())
     _, var_grads_mask_vals = sess.run([var_grads, var_grads_mask])
     # 'a' and 'b' are masked, while 'c' and 'd' are not.
     self.assertEqual(var_grads_mask_vals['a'][1], 0)
     self.assertEqual(var_grads_mask_vals['b'][1], 0)
     self.assertEqual(var_grads_mask_vals['c'][1], 1)
     self.assertEqual(var_grads_mask_vals['d'][1], 1)
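
For reference, the arithmetic those assertions imply can be reproduced with plain NumPy: each variable's mask vector is reduced against the one-hot selector, and the resulting 0/1 scalar multiplies that variable's gradient. A minimal sketch of that bookkeeping (an illustration, not the library implementation):

import numpy as np

onehot = np.eye(3, dtype=np.float32)[1]   # one_hot(1, 3), the selector above
zeros = np.zeros(3, dtype=np.float32)
select = onehot

# Per-variable masks, keyed by variable name as in grad_mask above.
masks = {'a:0': zeros, 'b:0': zeros, 'c:0': select, 'd:0': select}
grads = {'a:0': 1.0, 'b:0': 1.0, 'c:0': 1.0, 'd:0': 1.0}  # d(a+b+c+d)/dv = 1

masked = {name: float(np.dot(masks[name], onehot)) * g for name, g in grads.items()}
# masked == {'a:0': 0.0, 'b:0': 0.0, 'c:0': 1.0, 'd:0': 1.0}, matching the test.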
Example #3
 def testGradientMult(self):
     with self.session(use_gpu=False, graph=tf.Graph()):
         p = self._testParams()
         mdl = p.Instantiate()
         mdl.FPropDefaultTheta()
         var_grads = py_utils.ComputeGradients(mdl.loss, mdl.vars)
         py_utils.ApplyGradMultiplier(var_grads, -1.1)
Example #4
 def testCollectVarHistogram(self):
     with self.session(use_gpu=False, graph=tf.Graph()):
         p = self._testParams()
         mdl = p.Instantiate()
         mdl.FPropDefaultTheta()
         var_grads = py_utils.ComputeGradients(mdl.loss, mdl.vars)
         summary_utils.CollectVarHistogram(var_grads)
Example #5
  def testSkipL1Regularization(self):
    with self.session(use_gpu=False) as sess:
      beta = tf.get_variable(
          'beta',
          initializer=tf.constant(np.arange(10).reshape([1, 10]), tf.float32))
      tf.add_to_collection(py_utils.SKIP_LP_REGULARIZATION, beta)
      gamma = tf.get_variable(
          'gamma',
          initializer=tf.constant(np.arange(10).reshape([1, 10]), tf.float32))
      act = tf.constant(np.arange(10).reshape([1, 10]), tf.float32)
      pred = act * gamma + beta
      loss = tf.reduce_sum(pred)
      vmap = py_utils.NestedMap(beta=beta, gamma=gamma)
      var_grads = py_utils.ComputeGradients(loss, vmap)
      self.assertEqual(sorted(var_grads.keys()), ['beta', 'gamma'])
      l1_loss, var_grads_with_l1 = py_utils.AdjustGradientsWithLpLoss(
          var_grads, 0.1, p=1.0)

      sess.run(tf.global_variables_initializer())
      var_grads_vals, l1_loss_val, var_grads_with_l1_vals = sess.run(
          [var_grads, l1_loss, var_grads_with_l1])
      print('var_grads_vals = ', var_grads_vals)
      print('var_grads_with_l1_vals = ', var_grads_with_l1_vals)
      self.assertAllEqual(var_grads_vals.beta[0],
                          var_grads_with_l1_vals.beta[0])
      self.assertAllEqual(var_grads_vals.gamma[0],
                          var_grads_with_l1_vals.gamma[0])
      self.assertAllEqual(l1_loss_val,
                          0.1 * np.sum(np.abs(var_grads_vals.gamma[0])))
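
The value in the final assertion can be stated directly: beta is in the SKIP_LP_REGULARIZATION collection, so only gamma contributes to the L1 term, which is the regularizer weight times the sum of absolute variable values. A small NumPy restatement of that expected number (an illustration only):

import numpy as np

gamma_val = np.arange(10, dtype=np.float32).reshape([1, 10])  # gamma's initializer above
reg_weight = 0.1

# beta is skipped, so the expected L1 loss covers gamma only.
expected_l1_loss = reg_weight * np.sum(np.abs(gamma_val))  # 0.1 * 45 = 4.5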
Example #6
    def _Apply1(proj_layer, opt):
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
      loss1 = tf.reduce_sum(output1)
      loss2 = tf.reduce_sum(output2)
      var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
      var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)

      _ = opt.Apply(lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
      _ = opt.Apply(lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))

      vars1_1 = proj_layer.vars.Flatten()

      grads1_1 = var_grads1.Transform(tuple)
      grads1_2 = var_grads2.Transform(tuple)

      return vars1_1, grads1_1, grads1_2
Example #7
 def _FpropBprop(self, fc_layer, opt):
   inputs = tf.zeros(shape=[2, 4, 3], dtype=tf.float64)
   output = fc_layer.FPropDefaultTheta(inputs)
   loss = tf.reduce_sum(output)
   var_grads = py_utils.ComputeGradients(loss, fc_layer.vars)
   # Name becomes meaningless in Eager mode. Here we just check whether
   # errors get raised.
   update_op = opt.Apply(1e-1, var_grads)
   self.assertIn('composite_optimizer_train_op', update_op.name)
Example #8
 def testComputeGradient(self):
   with self.session(use_gpu=False):
     a = tf.get_variable('a', [])
     b = tf.get_variable('b', [], trainable=False)
     c = tf.get_variable('c', [])
     e = tf.get_variable('e', [])
     l = a + b + tf.stop_gradient(c)
     vmap = py_utils.NestedMap(
         a=a, b=b, c=c, d=None, n=py_utils.NestedMap(aa=a, e=e))
     var_grads = py_utils.ComputeGradients(l, vmap)
     print('var_grads = ', var_grads.DebugString())
      # Only 'a' matters: b is not trainable, c is behind stop_gradient, d
      # is None, e does not feed into l, and aa is a duplicate of a.
     self.assertEqual([_[0] for _ in var_grads.FlattenItems()], ['a'])
     self.assertEqual(var_grads.a[0].name, 'a:0')
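
As the assertions suggest, ComputeGradients returns a NestedMap parallel to vmap that keeps only trainable variables actually reachable from the loss, and each surviving entry pairs the variable (index 0) with its gradient (index 1). A short sketch of reading such a result, assuming the same tf and py_utils modules imported by the surrounding tests:

# Assumes `tf` and `py_utils` are the modules used by the tests above.
x = tf.get_variable('x', [])
loss = 2.0 * x
var_grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(x=x))

for name, vg in var_grads.FlattenItems():
  var, grad = vg[0], vg[1]   # the variable and its gradient tensor
  print(name, var.name, grad)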
Example #9
    def testCompositeOptimizer(self):
        adam_op = optimizer.Adam.Params()
        rmsprop_op = optimizer.RMSProp.Params()
        adam_rmsprop_opt = optimizer.CompositeOptimizer.Params().Set(
            optimizer_map={
                'fc/w': (adam_op, 1.),
                'fc/b': (rmsprop_op, 1.),
                'default_optimizer': (adam_op, 1.)
            }).Instantiate()

        adam_op_2 = optimizer.Adam.Params().Set(name='adam_2')
        unspecified_comp_opt = optimizer.CompositeOptimizer.Params().Set(
            optimizer_map={
                'fc/w': (adam_op_2, 1.),
                'default_optimizer': (adam_op_2, 1.)
            }).Instantiate()

        sgd_op = optimizer.SGD.Params()
        adagrad_op = optimizer.Adagrad.Params()
        overlapping_comp_opt = optimizer.CompositeOptimizer.Params().Set(
            optimizer_map={
                'fc/w': (sgd_op, 1.),
                '.': (adagrad_op, 1.),
                'default_optimizer': (adagrad_op, 1.)
            }).Instantiate()

        params = layers.FCLayer.Params()
        params.name = 'fc'
        params.dtype = tf.float64
        params.input_dim = 3
        params.output_dim = 2
        params.batch_norm = False
        fc_layer = layers.FCLayer(params)

        inputs = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
        output = fc_layer.FPropDefaultTheta(inputs)
        loss = tf.reduce_sum(output)
        var_grads = py_utils.ComputeGradients(loss, fc_layer.vars)

        self.assertIn('composite_optimizer_train_op',
                      adam_rmsprop_opt.Apply(1e-1, var_grads).name)
        self.assertIn('composite_optimizer_train_op',
                      unspecified_comp_opt.Apply(1e-1, var_grads).name)
        with self.assertRaisesRegex(
                Exception,
                'Variable fc/w/var:0 is matched 2 times by regex',
        ):
            overlapping_comp_opt.Apply(1e-1, var_grads)
Example #10
  def testAdjustGradientsWithL2Loss(self):
    with self.session(use_gpu=False) as sess:
      emb = tf.get_variable(
          'emb',
          initializer=tf.constant(np.arange(100).reshape([10, 10]), tf.float32))
      act = tf.gather(emb, [2, 5, 2, 2, 5])
      weight = tf.get_variable(
          'w', initializer=tf.constant(np.ones([10, 1]), tf.float32))
      bias = tf.get_variable('b', initializer=tf.constant([0.217]))
      pred = tf.matmul(act, weight) + tf.stop_gradient(bias)
      loss = tf.reduce_sum(pred)
      vmap = py_utils.NestedMap(emb=emb, weight=weight, bias=bias)
      var_grads = py_utils.ComputeGradients(loss, vmap)
      self.assertEqual(sorted(var_grads.keys()), ['emb', 'weight'])
      l2_loss, var_grads_with_l2 = py_utils.AdjustGradientsWithLpLoss(
          var_grads, 0.1, p=2.0)

      sess.run(tf.global_variables_initializer())
      var_grads_vals, l2_loss_val, var_grads_with_l2_vals = sess.run(
          [var_grads, l2_loss, var_grads_with_l2])
      print('var_grads_vals = ', var_grads_vals)
      print('var_grads_with_l2_vals = ', var_grads_with_l2_vals)
      self.assertAllEqual(var_grads_vals.emb[0], var_grads_with_l2_vals.emb[0])
      self.assertAllEqual(var_grads_vals.weight[0],
                          var_grads_with_l2_vals.weight[0])
      self.assertAllEqual(
          l2_loss_val,
          0.5 * 0.1 * (np.sum(np.square(var_grads_vals.weight[0])) + np.sum(
              np.square(var_grads_vals.emb[0][2, :])) + np.sum(
                  np.square(var_grads_vals.emb[0][5, :]))))

      # With l2, gradients of emb and weight are adjusted.
      self.assertAllClose(
          var_grads_with_l2_vals.weight[1],
          var_grads_vals.weight[1] + 0.1 * var_grads_vals.weight[0])
      self.assertAllClose(var_grads_with_l2_vals.emb[1].indices,
                          var_grads_vals.emb[1].indices)
      self.assertAllClose(var_grads_with_l2_vals.emb[1].indices,
                          [2, 5, 2, 2, 5])
      self.assertAllClose(
          var_grads_with_l2_vals.emb[1].values, var_grads_vals.emb[1].values +
          0.1 * np.array([[1 / 3.], [1 / 2.], [1 / 3.], [1 / 3.], [1 / 2.]
                         ]) * var_grads_vals.emb[0][[2, 5, 2, 2, 5], :])
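
The dense part of those checks is easy to restate: the L2 term is 0.5 times the regularizer weight times the sum of squared regularized values (each gathered emb row counted once), and each dense gradient is shifted by the regularizer weight times the variable value; the sparse emb update additionally divides the per-row regularizer term by how often that row was gathered, as the last assertAllClose shows. A NumPy sketch of the 'weight' case only (the gradient value here is hypothetical):

import numpy as np

w_val = np.ones([10, 1], dtype=np.float32)         # value of 'weight' above
w_grad = np.full([10, 1], 22.0, dtype=np.float32)  # hypothetical d(loss)/d(weight)
reg_weight = 0.1

l2_contrib = 0.5 * reg_weight * np.sum(np.square(w_val))
w_grad_with_l2 = w_grad + reg_weight * w_val       # the adjustment asserted for 'weight'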
Example #11
 def testRematerialize(self):
     # Test the dropout consistency between fprop and bprop.
     b = builder.Base.Params()
     b = b.Instantiate()
     start_block = layers.DeterministicDropoutLayer.Params().Set(
         name='start_dropout', keep_prob=0.7)
     # Build 4 dropout layers, each wrapped by RematerializeFn.
     num_blocks = 4
     blocks = []
     blocks_per_cell = 2
     for i in range(num_blocks):
         blocks.append(layers.DeterministicDropoutLayer.Params().Set(
             name='dropout_{}'.format(i), keep_prob=0.7))
     cells = []
     while blocks:
         heads, blocks = blocks[:blocks_per_cell], blocks[blocks_per_cell:]
         cell_name = 'cell_{}'.format(len(cells))
         cells.append(
             b._Rematerialize(name=cell_name,
                              body=b._Seq(cell_name, *heads)))
     with self.session(use_gpu=False, graph=tf.Graph()) as sess:
         tf.random.set_seed(12345)
         p = b._Seq('test', start_block, *cells)
         mdl = p.Instantiate()
          # y = mdl.FProp(x * w)
         # Fake input
         x = tf.ones([4, 5])
         # Construct weights.
         w = tf.get_variable('w',
                             shape=[4, 5],
                             initializer=tf.constant_initializer([[1] * 5] *
                                                                 4))
         y = mdl.FPropDefaultTheta(x * w)
         # Construct loss function such that gradients = final activation.
          # dy/dw = y = mdl.FProp(x * w) when w is 1.
         loss = tf.reduce_sum(y)
         grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
         tf.global_variables_initializer().run()
         y_val, grads_val = sess.run([y, grads.Transform(tuple)])
         grads_val = grads_val['w'][1]
         self.assertAllClose(y_val, grads_val)
         self.assertEqual(py_utils.GetStepSeed().eval(), 1553244033)
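
The "gradients = final activation" comments follow from the chain rule: y = mask * (x * w) / keep_prob, so d(sum(y))/dw = mask * x / keep_prob, and the two coincide exactly when w is all ones, as it is here. A tiny NumPy check of that identity for a single dropout layer (the mask here is arbitrary):

import numpy as np

keep_prob = 0.7
x = np.ones([4, 5])
w = np.ones([4, 5])
mask = (np.random.uniform(size=[4, 5]) < keep_prob).astype(np.float64)

y = mask * (x * w) / keep_prob   # forward pass of a dropout layer
dloss_dw = mask * x / keep_prob  # gradient of sum(y) with respect to w
assert np.allclose(y, dloss_dw)  # equal because w is all ones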
Example #12
 def testDropoutInRecurrent(self, splits=1, num_micro_batches=1):
   assert splits in [1, 2, 4]
   with self.session() as sess:
     tf.set_random_seed(12345)
     num_layers = 4
     py_utils.GetOrCreateGlobalStep()
     # Build a model with 4 dropout layers.
     layers = []
     for l in range(num_layers):
       layers.append(DeterministicDropoutLayer.Params().Set(
           name='dropout_{}'.format(l), keep_prob=0.7))
     # Divide the model into splits partitions.
     cell_tpl = []
     layers_per_split = num_layers // splits
     for i in range(splits):
       sub = layers[i * layers_per_split:(i + 1) * layers_per_split]
       cell_tpl.append(FeatureExtractionLayer.Params().Set(
           name='cell_{}'.format(i), sub=sub))
     # Parallelize partitions using pipeline.
     p = PipeliningLayer.Params().Set(
         name='pipeline',
         num_micro_batches=num_micro_batches,
         cell_tpl=cell_tpl)
     # Fake input
     x = tf.ones([2, 3])
     # Construct weights.
     w = tf.get_variable(
         'w', shape=[2, 3], initializer=tf.constant_initializer([[1] * 3] * 2))
     mdl = p.cls(p)
     y = mdl.FPropDefaultTheta(x * w)
     # Construct loss function such that gradients = final activation.
     loss = tf.reduce_sum(y)
     grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
     tf.global_variables_initializer().run()
     y_val = sess.run(y)
     grads_val = sess.run(grads)['w'][1]
     self.assertAllClose(y_val, grads_val)
Example #13
    def testDropoutInRecurrent(self, graph_seed):
        with self.session() as sess:
            if graph_seed:
                tf.random.set_seed(12345)
            l = lingvo_layers.DeterministicDropoutLayer.Params().Set(
                name='dropout', keep_prob=0.7).Instantiate()
            # Input variable.
            w = tf.get_variable('w',
                                shape=[9, 20],
                                initializer=tf.ones_initializer())
            sess.run(tf.global_variables_initializer())
            prev_sum = np.sum(np.isclose(sess.run(w), 0.0))

            def Step(theta, state0, unused_inputs):
                w = l.FProp(theta.l, state0.w)
                state1 = py_utils.NestedMap(w=w)
                return state1, py_utils.NestedMap()

            acc, final = recurrent.Recurrent(
                theta=py_utils.NestedMap(l=l.theta),
                state0=py_utils.NestedMap(w=w),
                inputs=py_utils.NestedMap(x=tf.zeros([4])),
                cell_fn=Step)

            acc_w = sess.run(acc.w)
            self.assertLen(acc_w, 4)
            for acc_w_i in acc_w:
                next_sum = np.sum(np.isclose(acc_w_i, 0.0))
                self.assertGreater(next_sum, prev_sum)
                prev_sum = next_sum

            # Construct loss function such that gradients = final activation.
            loss = tf.reduce_sum(final.w)
            grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
            w_val, grads_val = sess.run([final.w, grads.w.grad])
            self.assertAllClose(w_val, grads_val)
Example #14
    def testAccumulator(self):
        # testAccumulator compares
        #   - explicit averaging of independently computed var_grads1 and
        #     var_grads2,
        #   - Accumulator(SGD) optimizer effectively doing this over 2 steps.
        np.random.seed(12345)
        np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
        np.random.seed(12346)
        np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.random.set_seed(123456)
            params = layers.ProjectionLayer.Params()
            params.name = 'proj'
            params.dtype = tf.float64
            params.input_dim = 3
            params.output_dim = 2
            params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)

            params.batch_norm = False
            proj_layer = layers.ProjectionLayer(params)
            inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
            inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
            output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
            output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
            loss1 = tf.reduce_sum(output1)
            loss2 = tf.reduce_sum(output2)
            var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
            var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
            op = optimizer.SGD.Params()
            opt = op.Instantiate()
            lr = 1e-1
            with tf.control_dependencies([loss1, loss2]):
                var_update_op1 = opt.Apply(
                    lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
                with tf.control_dependencies([var_update_op1]):
                    var_update_op2 = opt.Apply(
                        lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))

            self.evaluate(tf.global_variables_initializer())
            vars1 = self.evaluate(proj_layer.vars.Flatten())
            loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
                [
                    loss1,
                    var_grads1.Transform(tuple), loss2,
                    var_grads2.Transform(tuple)
                ],
                feed_dict={
                    inputs1: np_input1,
                    inputs2: np_input2,
                },
            )
            sess.run([var_update_op2],
                     feed_dict={
                         inputs1: np_input1,
                         inputs2: np_input2,
                     })
            vars1_1 = self.evaluate(proj_layer.vars.Flatten())

        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.random.set_seed(123456)
            params = layers.ProjectionLayer.Params()
            params.name = 'proj'
            params.dtype = tf.float64
            params.input_dim = 3
            params.output_dim = 2
            params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)

            params.batch_norm = False
            proj_layer = layers.ProjectionLayer(params)
            in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
            inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
            output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
            loss = tf.reduce_sum(output1)
            var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
            op = optimizer.Accumulator.Params().Set(
                accum_steps=2,
                dtype=tf.float64,
                optimizer_tpl=optimizer.SGD.Params())
            opt = op.Instantiate()
            lr = 1e-1
            with cluster_factory.ForTestingWorker(add_summary=True):
                var_update_op = opt.Apply(lr, var_grads)
            increment_global_step_op = tf.assign_add(
                py_utils.GetOrCreateGlobalStepVar(), 1)

            self.evaluate(tf.global_variables_initializer())
            vars2 = self.evaluate(proj_layer.vars.Flatten())
            loss2_1, grads2_1 = sess.run(
                [loss, var_grads.Transform(tuple)],
                feed_dict={
                    inputs1: np_input1,
                })
            loss2_2, grads2_2 = sess.run(
                [loss, var_grads.Transform(tuple)],
                feed_dict={
                    inputs1: np_input2,
                })
            acc_0 = self.evaluate([
                v for v in tf.global_variables()
                if 'grad_accumulator' in v.name
            ])[0]
            sess.run([var_update_op], feed_dict={
                inputs1: np_input1,
            })
            acc_1 = self.evaluate([
                v for v in tf.global_variables()
                if 'grad_accumulator' in v.name
            ])[0]
            vars2_intermediate = self.evaluate(proj_layer.vars.Flatten())
            self.evaluate(increment_global_step_op)
            sess.run([var_update_op], feed_dict={
                inputs1: np_input2,
            })
            acc_2 = self.evaluate([
                v for v in tf.global_variables()
                if 'grad_accumulator' in v.name
            ])[0]
            vars2_1 = self.evaluate(proj_layer.vars.Flatten())

            summary = tf.Summary.FromString(
                self.evaluate(tf.summary.merge_all()))
            tf.logging.info(f'summary: {summary}')
            self.assertEqual(summary.value[0].tag, 'sgd_lr')

        self.assertAllClose(vars1, vars2)

        self.assertAllClose(acc_0, np.zeros_like(acc_0))
        self.assertAllClose(acc_1, grads2_1['w'][1])
        self.assertAllClose(acc_2, np.zeros_like(acc_0))

        self.assertAllClose(loss1_1, loss2_1)
        self.assertAllClose(loss1_2, loss2_2)
        self.assertAllClose(grads1_1, grads2_1)
        self.assertAllClose(grads1_2, grads2_2)

        self.assertAllClose(vars1, vars2_intermediate)

        self.assertAllClose(vars2[0], grads2_1['w'][0])
        self.assertAllClose(vars2[0], grads2_2['w'][0])

        self.assertAllClose(
            vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]),
            vars1_1[0])

        self.assertAllClose(
            vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]),
            vars2_1[0])

        self.assertAllClose(vars2, vars2_intermediate)
        self.assertAllClose(vars1_1, vars2_1)
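
The last two assertAllClose calls reduce to the same identity: applying SGD twice with each gradient scaled by 1/2 moves the variable exactly as far as one step with the averaged gradient, v - lr*(g1/2) - lr*(g2/2) = v - 0.5*lr*(g1 + g2), which is what the Accumulator optimizer reproduces over its two accumulation steps. A one-line NumPy check with made-up numbers:

import numpy as np

v, lr = 1.0, 1e-1
g1, g2 = 0.3, -0.7                                    # hypothetical gradients
two_half_steps = v - lr * (g1 / 2.) - lr * (g2 / 2.)
one_avg_step = v - 0.5 * lr * (g1 + g2)
assert np.isclose(two_half_steps, one_avg_step)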
Example #15
    def Apply(self, loss, vmap, gradient_mask=None, gradient_adjuster=None):
        """Computes updates on 'vmap' to optimize 'loss'.

    TODO(rpang): explore merging gradient_mask and gradient_adjuster.

    Args:
      loss: A scalar Tensor.
      vmap: A `.NestedMap` object containing variables to optimize.
      gradient_mask: if not None, a dict mapping variable names to a 0/1 scalar.
      gradient_adjuster: if not None, a function that mutates a given var_grads.

    Returns:
      (op, stats), where op is a tf.Operation to update variables and stats
      is a NestedMap containing 'has_nan_or_inf' and 'eval_metrics'.
    """
        # We apply gradients outside the name_scope to maintain backwards
        # compatibility on variables created by self.optimizer.Apply().
        p = self.params

        pos = re.compile(
            p.bprop_variable_filter) if p.bprop_variable_filter else None
        neg = re.compile(
            p.bprop_variable_exclusion) if p.bprop_variable_exclusion else None

        def VariableFilter(v):
            """Returns True if variable v should be optimized by this learner."""
            if pos and not pos.search(v.name):
                tf.logging.info('%s: disabled by bprop_variable_filter: %s',
                                p.name, v.name)
                return False
            if neg and neg.search(v.name):
                tf.logging.info('%s: disabled by bprop_variable_exclusion: %s',
                                p.name, v.name)
                return False
            return True

        vmap = vmap.Filter(VariableFilter)
        for v in vmap.Flatten():
            tf.logging.info('%s: bprop variable: %s', p.name, v.name)

        # Compute gradients.
        var_grads = py_utils.ComputeGradients(loss, vmap,
                                              p.grad_aggregation_method,
                                              p.colocate_gradients_with_ops,
                                              p.gate_gradients)

        # L2 regularizer.
        if p.l2_regularizer_weight is not None:
            l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, p.l2_regularizer_weight, p=2.0)
            self._AddScalarSummary('l2_loss', l2_loss)

        # L1 regularizer.
        if p.l1_regularizer_weight is not None:
            l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss(
                var_grads, p.l1_regularizer_weight, p=1.0)
            self._AddScalarSummary('l1_loss', l1_loss)

        # Mask gradients only if the mask is set.
        if gradient_mask:
            var_grads = py_utils.MaskGradients(var_grads, gradient_mask)

        # Apply gradient clipping.
        scaled_vars = self.ScaleGradients(var_grads, gradient_adjuster)
        has_nan_or_inf = scaled_vars.has_nan_or_inf
        var_grads = scaled_vars.final_var_grads

        # Histogram summary.
        summary_utils.CollectVarHistogram(var_grads)
        self._var_grads = var_grads

        assert self.theta.global_step is not None, self.theta
        lrs = self.lr_schedule.Value(self.theta.global_step)
        self._AddScalarSummary('lr_schedule', lrs)
        lr = p.learning_rate * lrs

        var_update_op = self.optimizer.Apply(lr, var_grads)

        stats = py_utils.NestedMap(has_nan_or_inf=has_nan_or_inf,
                                   eval_metrics=self._eval_metrics)
        return var_update_op, stats
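
Per the docstring, a caller hands Apply a scalar loss and the NestedMap of variables and then runs the returned op; the sketch below shows that call pattern, where learner, task_loss and task_vars are hypothetical stand-ins, not names from the library:

def TrainStep(learner, task_loss, task_vars):
  """Sketch only: drives the Apply() API documented above."""
  update_op, stats = learner.Apply(task_loss, task_vars)
  # stats.has_nan_or_inf can gate whether the step should be skipped.
  return update_op, stats.has_nan_or_inf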
Example #16
 def ComputeGradients(self, loss, vmap, *args, **kwargs):
     """Allows subclasses control computation of gradients."""
     kwargs['use_bf16_gradients_ar'] = self.params.use_bf16_gradients_ar
     return py_utils.ComputeGradients(loss, vmap, *args, **kwargs)
Example #17
  def _BPropForVariables(self, vmap):
    """Constructs the backward graph for the given variables.

    Args:
      vmap: a `.NestedMap` of variables.
    """
    p = self.params
    tp = p.train

    # Compute gradients.
    self._var_grads = py_utils.ComputeGradients(self.loss, vmap)

    # L2 regularizer.
    if tp.l2_regularizer_weight is not None:
      l2_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
          self._var_grads, tp.l2_regularizer_weight, p=2.0)
      summary_utils.scalar(p, 'l2_loss', l2_loss)

    # L1 regularizer.
    if tp.l1_regularizer_weight is not None:
      l1_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
          self._var_grads, tp.l1_regularizer_weight, p=1.0)
      summary_utils.scalar(p, 'l1_loss', l1_loss)

    # Mask gradients only if the mask is set.
    if self._per_input_gradient_mask:
      bprop_onehot = self.input_generator.GetInputSourceOneHot()
      self._var_grads = py_utils.MaskGradients(
          self._var_grads, self._per_input_gradient_mask, bprop_onehot)

    # Apply gradient clipping.
    has_nan_or_inf, _, self._var_grads = self.ScaleGradients(self._var_grads)

    # Histogram summary.
    summary_utils.CollectVarHistogram(p, self._var_grads)

    lrs = self.lr_schedule.Value(self._global_step)
    summary_utils.scalar(p, 'lr_schedule', lrs)
    lr = tp.learning_rate * lrs

    var_update_op = self.optimizer.Apply(lr, self._var_grads)

    increment_global_step_ops = []
    with tf.colocate_with(self._shared_global_step):
      increment_global_step_ops.append(
          tf.assign_add(self._shared_global_step, 1))
    if self._task_global_step:
      with tf.colocate_with(self._task_global_step):
        increment_global_step_ops.append(
            tf.assign_add(self._task_global_step, 1))
    increment_global_steps = tf.group(*increment_global_step_ops)

    relevant_bn_updates, _ = py_utils.FindRelevantBatchNormUpdates(
        self.loss, tf.get_collection(py_utils.BATCH_NORM_UPDATES))
    batch_norm_updates = tf.group(*relevant_bn_updates)

    # Update stats.
    stats_updates = tf.group(
        self.IncrementTotalSamples(),
        self.IncrementTotalNans(tf.to_int32(has_nan_or_inf)))

    # Post training step update.
    post_training_step_updates = self.PostTrainingStepUpdate(self._global_step)

    # Get the op to update the weight masks and thresholds
    mask_update_op = self._GetMaskUpdateOp()

    # TODO(rpang): try to structure _train_op as:
    #   tf.cond(skip_step, <only update skip stats>, <all updates>)
    # so that we skip all other updates when a step is skipped.
    
    if p.contiguous:
      var_update_op = tf.group(var_update_op, self.last_state_group_op)

    self._train_op = tf.group(
        var_update_op,
        batch_norm_updates,
        stats_updates,
        post_training_step_updates,
        increment_global_steps,
        mask_update_op,
        name='train')
Example #18
 def ComputeGradients(self, loss, vmap, *args, **kwargs):
     """Allows subclasses control computation of gradients."""
     return py_utils.ComputeGradients(loss, vmap, *args, **kwargs)
Example #19
  def testAccumulator(self):
    # testAccumulator compares
    #   - explicit averaging of independently computed var_grads1 and
    #     var_grads2,
    #   - Accumulator(SGD) optimizer effectively doing this over 2 steps.
    np.random.seed(12345)
    np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
    np.random.seed(12346)
    np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

    g1 = tf.Graph()
    with g1.as_default():
      tf.set_random_seed(123456)
      params = layers.ProjectionLayer.Params()
      params.name = 'proj'
      params.dtype = tf.float64
      params.input_dim = 3
      params.output_dim = 2
      params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
      params.is_eval = False
      params.batch_norm = False
      proj_layer = layers.ProjectionLayer(params)
      inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
      loss1 = tf.reduce_sum(output1)
      loss2 = tf.reduce_sum(output2)
      var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
      var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
      op = optimizer.SGD.Params().Set(add_summary=False)
      opt = op.cls(op)
      lr = 1e-1
      with tf.control_dependencies([loss1, loss2]):
        var_update_op1 = opt.Apply(
            lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
        with tf.control_dependencies([var_update_op1]):
          var_update_op2 = opt.Apply(
              lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))
      init_op = tf.global_variables_initializer()

    with self.session(use_gpu=True, graph=g1) as sess:
      sess.run(init_op)
      vars1 = sess.run(proj_layer.vars.Flatten())
      loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
          [loss1, var_grads1, loss2, var_grads2],
          feed_dict={
              inputs1: np_input1,
              inputs2: np_input2,
          })
      sess.run(
          [var_update_op2], feed_dict={
              inputs1: np_input1,
              inputs2: np_input2,
          })
      vars1_1 = sess.run(proj_layer.vars.Flatten())

    g2 = tf.Graph()
    with g2.as_default():
      tf.set_random_seed(123456)
      params = layers.ProjectionLayer.Params()
      params.name = 'proj'
      params.dtype = tf.float64
      params.input_dim = 3
      params.output_dim = 2
      params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
      params.is_eval = False
      params.batch_norm = False
      proj_layer = layers.ProjectionLayer(params)
      in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      loss = tf.reduce_sum(output1)
      var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
      op = optimizer.Accumulator.Params().Set(
          accum_steps=2,
          dtype=tf.float64,
          optimizer_tpl=optimizer.SGD.Params().Set(add_summary=False))
      opt = op.cls(op)
      lr = 1e-1
      var_update_op = opt.Apply(lr, var_grads)
      init_op = tf.global_variables_initializer()
      global_step = py_utils.GetOrCreateGlobalStep()
      increment_global_step_op = tf.assign_add(global_step, 1)
    with self.session(use_gpu=True, graph=g2) as sess:
      sess.run(init_op)
      vars2, global_step = sess.run([proj_layer.vars.Flatten(), global_step])
      loss2_1, grads2_1 = sess.run(
          [loss, var_grads], feed_dict={
              inputs1: np_input1,
          })
      loss2_2, grads2_2 = sess.run(
          [loss, var_grads], feed_dict={
              inputs1: np_input2,
          })
      acc_0 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      sess.run(
          [var_update_op], feed_dict={
              inputs1: np_input1,
          })
      acc_1 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      vars2_intermediate = sess.run(proj_layer.vars.Flatten())
      sess.run(increment_global_step_op)
      sess.run(
          [var_update_op], feed_dict={
              inputs1: np_input2,
          })
      acc_2 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      vars2_1 = sess.run(proj_layer.vars.Flatten())

    self.assertAllClose(vars1, vars2)

    self.assertAllClose(acc_0, np.zeros_like(acc_0))
    self.assertAllClose(acc_1, grads2_1['w'][1])
    self.assertAllClose(acc_2, np.zeros_like(acc_0))

    self.assertAllClose(loss1_1, loss2_1)
    self.assertAllClose(loss1_2, loss2_2)
    self.assertAllClose(grads1_1, grads2_1)
    self.assertAllClose(grads1_2, grads2_2)

    self.assertAllClose(vars1, vars2_intermediate)

    self.assertAllClose(vars2[0], grads2_1['w'][0])
    self.assertAllClose(vars2[0], grads2_2['w'][0])

    self.assertAllClose(
        vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]), vars1_1[0])

    self.assertAllClose(
        vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]), vars2_1[0])

    self.assertAllClose(vars2, vars2_intermediate)
    self.assertAllClose(vars1_1, vars2_1)