def test_sgd_momentum_static():
    _, data_shape, _, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)

    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)

    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    for _ in range(3):
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        orig_params = TensorDict()
        grads = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
            grads[param] = np.copy(param.grad.numpy())
        opt.step()
        for param in mlp.parameters():
            slot = slots[param]
            orig_param = orig_params[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)
def test_update_lr():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01)
    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for group in opt.param_groups:
        group["lr"] += 0.02

    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
        orig_params = []
        for param in mlp.parameters():
            orig_params.append(np.copy(param.numpy()))
        opt.step()
        # effective lr is now 0.01 + 0.02 = 0.03
        for param, orig_param in zip(mlp.parameters(), orig_params):
            assertTensorClose(param.numpy(), orig_param - param.grad.numpy() * 0.03)
def test_sgd_simple():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, weight_decay=0.1)
    for idx in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        if idx % 2:
            opt.zero_grad()
        else:
            mlp.zero_grad()
        opt.backward(loss)
        grads = TensorDict()
        orig_params = TensorDict()
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
            grads[param] = np.copy(grad.numpy())
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        # with lr=0.01 and weight_decay=0.1: param <- param * (1 - lr * wd) - lr * grad
        for param in mlp.parameters():
            assertTensorClose(
                param.numpy(), orig_params[param] * 0.999 - grads[param] * 0.01
            )
def _test_optimizer(opt_str, test_case, check_class, update_lr=False):
    iter_num = 3
    data, data_shape, label, label_shape = get_input()
    net = MLP()
    opt = getattr(optimizer, opt_str)(net.parameters(), **test_case)
    check_func = check_class(net, **test_case)
    step = 0

    # eager graph
    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)

    # static graph
    @trace
    def train_func(data, label):
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        opt.zero_grad()
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        train_func(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)
def test_compile_multi_times_static():
    return  # XXX: rewrite or remove this test
    with Graph() as cg:
        cg.set_option("eager_evaluation", False)
        data = Input("data", shape=(2, 28))
        label = Input("label", shape=(2,), dtype=np.int32)
        mlp = MLP()
        opt = SGD(mlp.parameters(requires_grad=True), lr=0.01)
        pred0 = mlp(data)
        pred = F.softmax(pred0)
        loss = F.square_loss(pred, label.reshape(2, 1))
        opt.zero_grad()
        grads = opt.backward(loss)
        opt.step()

        f0 = compile(pred, None)
        f1 = compile([pred, loss], grads, copy=True)

        data = np.random.random((2, 28)).astype(np.float32)
        label = np.random.randint(0, 10, (2,)).astype(np.float32)
        out0 = f0(data=data)
        out1 = f1(data=data, label=label)
        assertTensorClose(out0[0], out1[0])

        _ = compile([pred, loss], grads, copy=False)
        with pytest.raises(mgb.MegBrainError):
            f0(data=data)
def test_multi_step_lr():
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    scheduler = MultiStepLR(opt, [3, 6, 8])
    lr = np.array(0.01, dtype=np.float32)
    for i in range(10):
        for group in opt.param_groups:
            assertTensorClose(
                np.array(group["lr"], dtype=np.float32),
                (lr * 0.1 ** bisect_right([3, 6, 8], i)).astype(np.float32),
                max_err=5e-6,
            )
        scheduler.step()
def test_compile_multi_times_eager():
    return  # XXX: rewrite or remove this test
    data = Input("data", shape=(2, 28))
    label = Input("label", shape=(2,), dtype=np.int32)
    mlp = MLP()
    opt = SGD(mlp.parameters(requires_grad=True), lr=0.01)
    pred0 = mlp(data)
    pred = F.softmax(pred0)
    loss = F.square_loss(pred, label.reshape(2, 1))
    opt.zero_grad()
    grads = opt.backward(loss)
    opt.step()

    f0 = compile(pred, None)
    f1 = compile([pred, loss], grads, copy=False)

    for _ in range(3):
        data = np.random.random((2, 28)).astype(np.float32)
        label = np.random.randint(0, 10, (2,)).astype(np.float32)
        out0 = f0(data=data)
        out1 = f1(data=data, label=label)
        assertTensorClose(out0[0], out1[0])
def test_optimizer_serialization():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for param in mlp.parameters():
        slot = slots[param]
        slot *= 0.9
        slot -= param.grad.numpy() * 0.01

    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
        opt1 = SGD(mlp.parameters(), lr=0.02, momentum=0.8)
        opt1.load_state_dict(state_dict)

        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt1.zero_grad()
        opt1.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt1.step()
        for param in mlp.parameters():
            orig_param = orig_params[param]
            slot = slots[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)
def test_adam():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    beta0 = 0.8
    beta1 = 0.9
    eps = 1e-4
    opt = Adam(mlp.parameters(), lr=0.01, betas=(beta0, beta1), eps=eps)
    m_slots = TensorDict()
    v_slots = TensorDict()
    for param in mlp.parameters():
        m_slots[param] = np.zeros(param.shape).astype(np.float32)
        v_slots[param] = np.zeros(param.shape).astype(np.float32)
    step_size = 0

    def check_value():
        for param in mlp.parameters():
            grad = param.grad.numpy()
            orig_param = orig_params[param]
            m = m_slots[param]
            v = v_slots[param]
            m *= beta0
            m += (1 - beta0) * grad
            v *= beta1
            v += (1 - beta1) * grad * grad
            update = (m / (1 - beta0 ** step_size)) / (
                np.sqrt(v / (1 - beta1 ** step_size)) + eps
            )
            assertTensorClose(param.numpy(), orig_param - 0.01 * update)

    # eager
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        grads = opt.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        step_size += 1
        check_value()

    # static
    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for _ in range(3):
        opt.zero_grad()
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step_size += 1
        check_value()