def testZeroGradientNoOpAtFirstStep(self):
    """Test that a zero gradient is a no-op, i.e. epsilon handling is unnecessary."""
    with self.cached_session() as sess:
        var = tf.Variable(0.5)
        grad = tf.Variable(0.0)
        opt = sm3.SM3Optimizer(
            learning_rate=self._learning_rate, momentum=self._momentum)
        train_op = opt.apply_gradients([(grad, var)])
        sess.run(tf.global_variables_initializer())

        # Before training: variable at its initializer, momentum slot at zero.
        self.assertAllClose(0.5, sess.run(var))
        self.assertAllClose(0.0, sess.run(opt.get_slot(var, 'momentum')))

        # One step with a zero gradient must leave both the variable and the
        # momentum accumulator untouched (no division-by-zero blowup).
        train_op.run()
        self.assertAllClose(0.5, sess.run(var))
        self.assertAllClose(0.0, sess.run(opt.get_slot(var, 'momentum')))
def testDenseLayerMatrixWithMomentum(self):
    """SM3 update with gbar, and epsilon.

    NOTE(review): renamed from testDenseLayerMatrix — the class defines a
    second method with that exact name, which silently shadowed this one so
    it never ran under the test framework.
    """
    with self.cached_session() as sess:
        var = tf.Variable([[0.5, 0.5], [0.5, 0.5]])
        grad = tf.Variable([[0.1, 0.1], [0.01, 0.01]])
        opt = sm3.SM3Optimizer(learning_rate=0.1, momentum=0.9)
        step = opt.apply_gradients([(grad, var)])
        tf.global_variables_initializer().run()

        # Before training: variable at its initializer, momentum slot at zero.
        pre_var = sess.run(var)
        pre_gbar = sess.run(opt.get_slot(var, 'momentum'))
        self.assertAllClose([[0.5, 0.5], [0.5, 0.5]], pre_var)
        self.assertAllClose([[0.0, 0.0], [0.0, 0.0]], pre_gbar)

        # First step: check against hand-computed values.
        step.run()
        pre_var = sess.run(var)
        pre_gbar = sess.run(opt.get_slot(var, 'momentum'))
        self.assertAllClose([[0.49, 0.49], [0.49, 0.49]], pre_var)
        self.assertAllClose([[0.1, 0.1], [0.1, 0.1]], pre_gbar)

        # Second step: looser tolerances since the expected values are rounded.
        step.run()
        pre_var = sess.run(var)
        pre_gbar = sess.run(opt.get_slot(var, 'momentum'))
        self.assertAllClose(
            [[0.4739, 0.4739], [0.4739, 0.4739]], pre_var, atol=1e-4)
        self.assertAllClose(
            [[0.16, 0.16], [0.16, 0.16]], pre_gbar, atol=1e-2)
def testDenseLayerMatrix(self):
    """Test a single dense matrix layer."""
    with self.cached_session() as sess:
        var = tf.Variable([[0.5, 0.5], [0.5, 0.5]])
        grad_np = [[0.1, 0.05], [0.03, 0.02]]
        grad = tf.Variable(grad_np)
        opt = sm3.SM3Optimizer(
            learning_rate=self._learning_rate, momentum=self._momentum)
        step = opt.apply_gradients([(grad, var)])
        sess.run(tf.global_variables_initializer())

        # Before training: variable at its initializer, momentum slot at zero.
        var_np = sess.run(var)
        gbar_np = sess.run(opt.get_slot(var, 'momentum'))
        self.assertAllClose(var_np, [[0.5, 0.5], [0.5, 0.5]])
        self.assertAllClose([[0.0, 0.0], [0.0, 0.0]], gbar_np)

        # SM3 stores only a row-max and a column-max accumulator for a matrix;
        # the full accumulator is reconstructed as their elementwise minimum.
        row_acc = numpy.zeros([2, 1])
        col_acc = numpy.zeros([1, 2])
        for _ in range(2):
            step.run()

            # Mirror the optimizer's update in numpy.
            acc = numpy.minimum(row_acc, col_acc) + numpy.square(grad_np)
            row_acc = numpy.amax(acc, axis=1, keepdims=True)
            col_acc = numpy.amax(acc, axis=0, keepdims=True)

            # Preconditioned gradient, then momentum and parameter updates.
            preconditioned = grad_np / numpy.sqrt(acc)
            expected_gbar = (self._momentum * gbar_np +
                             (1 - self._momentum) * preconditioned)
            expected_var = var_np - self._learning_rate * expected_gbar

            # Variable and momentum must match the numpy mirror after the step.
            var_np = sess.run(var)
            gbar_np = sess.run(opt.get_slot(var, 'momentum'))
            self.assertAllClose(expected_var, var_np)
            self.assertAllClose(expected_gbar, gbar_np)
def testSparseUpdatesVector(self):
    """SM3 sparse updates.

    NOTE(review): renamed from testSparseUpdates — the class defines a second
    method with that exact name, which silently shadowed this one so it never
    ran under the test framework.
    """
    with self.cached_session() as sess:
        var = tf.Variable([[0.5], [0.5], [0.5], [0.5]])
        # Sparse gradient touching rows 1 and 3 only.
        # NOTE(review): dense_shape is [2, 1] but the variable is [4, 1] —
        # presumably the optimizer ignores dense_shape; confirm intended.
        grad = tf.IndexedSlices(
            tf.constant([0.1, 0.1], shape=[2, 1]),
            tf.constant([1, 3]),
            tf.constant([2, 1]))
        opt = sm3.SM3Optimizer(learning_rate=0.1, momentum=0.9)
        step = opt.apply_gradients([(grad, var)])
        tf.global_variables_initializer().run()

        # Before training: variable at its initializer.
        pre_var = sess.run(var)
        self.assertAllClose([[0.5], [0.5], [0.5], [0.5]], pre_var)

        # Only the touched rows (1 and 3) move.
        step.run()
        pre_var = sess.run(var)
        self.assertAllClose([[0.5], [0.4], [0.5], [0.4]], pre_var)
def testNoEpsilon(self):
    """SM3 update without epsilon."""
    with self.cached_session() as sess:
        var = tf.Variable(0.5)
        grad = tf.Variable(0.0)
        opt = sm3.SM3Optimizer(learning_rate=0.1, momentum=0.9)
        train_op = opt.apply_gradients([(grad, var)])
        tf.global_variables_initializer().run()

        # Before training: variable at its initializer, momentum slot at zero.
        self.assertAllClose(0.5, sess.run(var))
        self.assertAllClose(0.0, sess.run(opt.get_slot(var, 'momentum')))

        # A zero gradient with no epsilon must leave everything unchanged.
        train_op.run()
        self.assertAllClose(0.5, sess.run(var))
        self.assertAllClose(0.0, sess.run(opt.get_slot(var, 'momentum')))
def testDenseVectorLayer(self):
    """Test a single dense vector layer."""
    with self.cached_session() as sess:
        var = tf.Variable([0.5, 0.3])
        grad_np = [0.1, 0.1]
        grad = tf.Variable(grad_np)
        opt = sm3.SM3Optimizer(
            learning_rate=self._learning_rate, momentum=self._momentum)
        step = opt.apply_gradients([(grad, var)])
        sess.run(tf.global_variables_initializer())

        # Before training: variable at its initializer, momentum slot at zero.
        var_np = sess.run(var)
        gbar_np = sess.run(opt.get_slot(var, 'momentum'))
        self.assertAllClose([0.5, 0.3], var_np)
        self.assertAllClose([0.0, 0.0], gbar_np)

        # For a vector, SM3 keeps a full per-coordinate accumulator.
        acc = numpy.zeros_like(gbar_np)
        for _ in range(2):
            step.run()

            # Mirror the optimizer's update in numpy.
            acc += numpy.square(grad_np)
            preconditioned = grad_np / numpy.sqrt(acc)
            expected_gbar = (self._momentum * gbar_np +
                             (1 - self._momentum) * preconditioned)
            expected_var = var_np - self._learning_rate * expected_gbar

            # Variable and momentum must match the numpy mirror after the step.
            var_np = sess.run(var)
            gbar_np = sess.run(opt.get_slot(var, 'momentum'))
            self.assertAllClose(expected_var, var_np)
            self.assertAllClose(expected_gbar, gbar_np)
def testSparseUpdates(self):
    """Test that checks sparse updates."""
    with self.cached_session() as sess:
        var = tf.Variable([[0.5, 0.05], [0.05, 1.0], [0.15, 3.0], [0.35, 2.0]])
        # A sparse gradient that updates index 1, and 3.
        grad_np = [[0.1, 0.05], [0.01, 1.5]]
        indices_np = [1, 3]
        shape = [2, 2]
        grad = tf.IndexedSlices(
            tf.constant(grad_np, shape=shape),
            tf.constant(indices_np),  # indices
            tf.constant(shape))  # shape
        opt = sm3.SM3Optimizer(
            learning_rate=self._learning_rate, momentum=self._momentum)
        step = opt.apply_gradients([(grad, var)])
        sess.run(tf.global_variables_initializer())

        # Check that variable is as expected before starting training.
        var_np = sess.run(var)
        self.assertAllClose(
            [[0.5, 0.05], [0.05, 1.0], [0.15, 3.0], [0.35, 2.0]], var_np)

        # Run one step of training.
        step.run()

        # Mirror the sparse SM3 update in numpy: only the touched rows
        # accumulate squared gradients and move.
        accumulator = numpy.zeros_like(var_np)
        accumulator[indices_np, :] += numpy.square(grad_np)
        # Sparse SM3 keeps only the row-wise max of the accumulator.
        row_accumulator = numpy.amax(accumulator, axis=1, keepdims=True)

        exp_p_grad = grad_np / numpy.sqrt(accumulator[indices_np, :])
        # Copy before the fancy-index assignment: the original aliased var_np
        # here ("exp_var_np = var_np"), mutating it in place — harmless only
        # because var_np was re-fetched right after, but a latent bug.
        exp_var_np = var_np.copy()
        exp_var_np[indices_np, :] = (
            var_np[indices_np, :] - self._learning_rate * exp_p_grad)

        # Updated variable must match the numpy mirror.
        var_np = sess.run(var)
        self.assertAllClose(exp_var_np, var_np)

        # The stored row accumulator must match the numpy mirror.
        row_accumulator_var = numpy.reshape(
            sess.run(opt.get_slot(var, 'accumulator_0')), [4, 1])
        self.assertAllClose(row_accumulator_var, row_accumulator)