def test_load(self): mul_out, b1_out, b2_out, mean_out = self.net() sgd_optimizer = optimizer.SGD(learning_rate=1.0) recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) recompute_optimizer._set_checkpoints([b1_out]) try: stat_dict = {} recompute_optimizer.load(stat_dict) except NotImplementedError as e: self.assertEqual( "load function is not supported by Recompute Optimizer for now", cpt.get_exception_message(e))
def test_adjacent_checkpoint(self): mul_out, b1_out, b2_out, mean_out = self.net() self.assertEqual(len(mean_out.block.ops), 4) self.assertEqual([op.type for op in mean_out.block.ops], ["mul", "elementwise_add", "elementwise_add", "mean"]) sgd_optimizer = optimizer.SGD(learning_rate=1.0) recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) recompute_optimizer._set_checkpoints([mul_out, b1_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) self.assertEqual(len(mean_out.block.ops), 12) self.assertEqual([op.type for op in mean_out.block.ops], [ "mul", "elementwise_add", "elementwise_add", "mean", "fill_constant", "mean_grad", "elementwise_add_grad", "elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd" ])
def test_dropout(self): """ If there are dropout layers in the forward nets, we should add a seed op """ mul_out, b1_out, b2_out, mean_out = self.net(with_dropout=True) self.assertEqual(len(mean_out.block.ops), 5) self.assertEqual( [op.type for op in mean_out.block.ops], ["mul", "dropout", "elementwise_add", "elementwise_add", "mean"]) sgd_optimizer = optimizer.SGD(learning_rate=1.0) recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) recompute_optimizer._set_checkpoints([b1_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) self.assertEqual(len(mean_out.block.ops), 17) self.assertEqual([op.type for op in mean_out.block.ops], [ "mul", "seed", "dropout", "elementwise_add", "elementwise_add", "mean", "fill_constant", "mean_grad", "elementwise_add_grad", "mul", "dropout", "elementwise_add_grad", "dropout_grad", "mul_grad", "sgd", "sgd", "sgd" ])
def test_apply_gradients(self): mul_out, b1_out, b2_out, mean_out = self.net() sgd_optimizer = optimizer.SGD(learning_rate=1.0) recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) recompute_optimizer._set_checkpoints([b1_out]) # apply backward params_grads = recompute_optimizer.backward(mean_out, startup_program=None, parameter_list=None, no_grad_set=None) # apply gradient program = mean_out.block.program with framework.program_guard(program, None): optimize_ops = recompute_optimizer.apply_gradients(params_grads) self.assertEqual(len(mean_out.block.ops), 13) self.assertEqual([op.type for op in mean_out.block.ops], [ "mul", "elementwise_add", "elementwise_add", "mean", "fill_constant", "mean_grad", "elementwise_add_grad", "mul", "elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd" ])
def check_dgc_momentum_optimizer(self, dims=[5, 10, 8], name="momentum", regularization=None, use_recompute=False): init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( dtype="float32", shape=[dims[0], dims[1]], lod_level=0, name="mul.x", optimize_attr={'learning_rate': 1.1}, regularizer=None if regularization is not None else regularizer.L2DecayRegularizer(2e-4)) mul_y = block.create_var(dtype="float32", shape=[dims[1], dims[2]], lod_level=0, name="mul.y") mul_out = block.create_var(dtype="float32", shape=[dims[0], dims[2]], lod_level=0, name="mul.out") block.append_op(type="mul", inputs={ "X": mul_x, "Y": mul_y }, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) learning_rate = 0.01 dgc_momentum_optimizer = self.MockDGCMomentum( learning_rate=learning_rate, momentum=0.2, rampup_begin_step=0, num_trainers=2, regularization=regularization, grad_clip=clip.GradientClipByNorm(1.0)) if use_recompute: dgc_momentum_optimizer = optimizer.RecomputeOptimizer( dgc_momentum_optimizer) dgc_momentum_optimizer._set_checkpoints([]) dgc_momentum_optimizer.get_accumulators = dgc_momentum_optimizer._optimizer.get_accumulators dgc_momentum_optimizer.get_velocity_str = dgc_momentum_optimizer._optimizer.get_velocity_str mean_out = block.create_var(dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op(type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) # params_grads = append_backward(mean_out) params_grads = dgc_momentum_optimizer.backward(mean_out) accumulator_count = 1 if name == "momentum" else 2 self.assertEqual(len(params_grads), 1) self.assertEqual(len(dgc_momentum_optimizer.get_accumulators()), accumulator_count) with framework.program_guard(program, init_program): opts = dgc_momentum_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 2) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], ["scale", name]) self.assertFalse(sgd_op.attr('use_nesterov')) # Check accumulators accumulators = dgc_momentum_optimizer.get_accumulators() self.assertEqual(len(accumulators), accumulator_count) self.assertTrue( dgc_momentum_optimizer.get_velocity_str() in accumulators) velocity_acc = accumulators[dgc_momentum_optimizer.get_velocity_str()] self.assertEqual(len(velocity_acc), 1) self.assertTrue(mul_x.name in velocity_acc) # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 1) self.assertEqual(init_ops[0].type, "fill_constant") self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) # check dgc op regularization coeff train_ops = program.global_block().ops for op in train_ops: if op.type == "dgc": coeff = 2e-4 if regularization is None else 1e-4 self.assertAlmostEqual(op.attr('regular_coeff'), coeff) print("dgc regular_coeff=" + str(coeff))