Example #1
def test_sgd_momentum_static():
    _, data_shape, _, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)

    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)

    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)
    for _ in range(3):
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        orig_params = TensorDict()
        grads = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
            grads[param] = np.copy(param.grad.numpy())
        opt.step()
        for param in mlp.parameters():
            slot = slots[param]
            orig_param = orig_params[param]
            slot *= 0.9
            slot -= grads[param] * 0.01  # use the gradient snapshot taken before step()
            assertTensorClose(param.numpy(), orig_param + slot)
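
The loop above re-derives the SGD-with-momentum update by hand: the velocity slot is decayed by the momentum factor, the scaled gradient is subtracted, and the result is added to the parameter value captured before opt.step(). The same reference rule as a standalone NumPy-style sketch (names are illustrative, not part of the test):

# Reference SGD-with-momentum step matching the check above.
# `param_value`, `grad_value` and `velocity` are plain NumPy arrays.
def sgd_momentum_step(param_value, grad_value, velocity, lr=0.01, momentum=0.9):
    velocity[:] = momentum * velocity - lr * grad_value  # updated in place, like `slot`
    return param_value + velocity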
Example #2
def test_sgd_simple():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, weight_decay=0.1)
    for idx in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        if idx % 2:
            opt.zero_grad()
        else:
            mlp.zero_grad()
        opt.backward(loss)
        grads = TensorDict()
        orig_params = TensorDict()
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
            grads[param] = np.copy(grad.numpy())
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        for param in mlp.parameters():
            assertTensorClose(param.numpy(),
                              orig_params[param] * 0.999 - grads[param] * 0.01)
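
The expected value in the final assertion folds the L2 weight decay into the parameter: with lr=0.01 and weight_decay=0.1, param - lr * (grad + weight_decay * param) simplifies to param * (1 - lr * weight_decay) - lr * grad = param * 0.999 - grad * 0.01. A minimal sketch of that rule (function name is illustrative):

# Reference SGD step with L2 weight decay, matching the 0.999 / 0.01 constants above.
def sgd_weight_decay_step(param_value, grad_value, lr=0.01, weight_decay=0.1):
    effective_grad = grad_value + weight_decay * param_value
    return param_value - lr * effective_grad  # == param_value * 0.999 - grad_value * 0.01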
Example #3
def __init__(self, net, **kwarg):
    self.s_slots = TensorDict()
    self.a_slots = TensorDict()
    for param in net.parameters():
        self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
        self.a_slots[param] = np.zeros(param.shape).astype(np.float32)
    for k, v in kwarg.items():
        setattr(self, k, v)
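
This constructor fragment is the kind of checker the harness in Example #4 expects: it pre-allocates two zero-filled per-parameter state slots and copies every keyword argument onto the instance, so that check_class(net, **test_case) picks up the same hyper-parameters as the optimizer (e.g. check_func.lr). A minimal, self-contained reproduction of the kwargs-to-attributes pattern (class name is illustrative, not the real checker):

# Illustrative only: shows the setattr pattern used above.
class _ExampleCheck:
    def __init__(self, **kwarg):
        for k, v in kwarg.items():
            setattr(self, k, v)

check = _ExampleCheck(lr=0.01, momentum=0.9)
assert check.lr == 0.01 and check.momentum == 0.9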
Example #4
def _test_optimizer(opt_str, test_case, check_class, update_lr=False):
    iter_num = 3
    data, data_shape, label, label_shape = get_input()

    net = MLP()
    opt = getattr(optimizer, opt_str)(net.parameters(), **test_case)
    check_func = check_class(net, **test_case)

    step = 0

    # eager graph
    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)

    # static graph
    @trace
    def train_func(data, label):
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for i in range(iter_num):
        if update_lr and i == 1:  # change learning rate
            for group in opt.param_groups:
                group["lr"] += 0.01
            check_func.lr += 0.01
        opt.zero_grad()
        ori_params = TensorDict()
        for param in net.parameters():
            ori_params[param] = np.copy(param.numpy())
        train_func(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)
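
The harness above drives any optimizer through three eager iterations and three traced iterations, sharing one check callable and optionally bumping the learning rate after the first iteration of each phase. A hedged usage sketch (the checker class names are placeholders; the real suite defines one per optimizer, built like the fragment in Example #3):

# Hypothetical calls; CheckSGD and CheckAdam are placeholder checker classes.
_test_optimizer("SGD", {"lr": 0.01, "momentum": 0.9}, CheckSGD)
_test_optimizer("Adam", {"lr": 0.01, "betas": (0.8, 0.9), "eps": 1e-4}, CheckAdam, update_lr=True)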
Example #5
def test_optimizer_serialization():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for param in mlp.parameters():
        slot = slots[param]
        slot *= 0.9
        slot -= param.grad.numpy() * 0.01

    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
        opt1 = SGD(mlp.parameters(), lr=0.02, momentum=0.8)
        opt1.load_state_dict(state_dict)

        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt1.zero_grad()
        opt1.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt1.step()
        for param in mlp.parameters():
            orig_param = orig_params[param]
            slot = slots[param]
            slot *= 0.9
            slot -= param.grad.numpy() * 0.01
            assertTensorClose(param.numpy(), orig_param + slot)
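
Note that opt1 is deliberately created with different hyper-parameters (lr=0.02, momentum=0.8): load_state_dict is expected to restore lr=0.01, momentum=0.9 and the accumulated momentum slots, which is why the final assertion keeps using the 0.9 and 0.01 constants from before serialization. The round-trip itself is just a save/load through an in-memory buffer, as sketched below (assuming the same save/load helpers and SGD class used above):

# Sketch of the optimizer-state round-trip.
from io import BytesIO

buf = BytesIO()
save(opt.state_dict(), buf)              # persists hyper-parameters and momentum slots
buf.seek(0)
opt_restored = SGD(mlp.parameters(), lr=0.02, momentum=0.8)
opt_restored.load_state_dict(load(buf))  # hyper-parameters revert to lr=0.01, momentum=0.9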
Example #6
def test_adam():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    beta0 = 0.8
    beta1 = 0.9
    eps = 1e-4
    opt = Adam(mlp.parameters(), lr=0.01, betas=(beta0, beta1), eps=eps)
    m_slots = TensorDict()
    v_slots = TensorDict()
    for param in mlp.parameters():
        m_slots[param] = np.zeros(param.shape).astype(np.float32)
        v_slots[param] = np.zeros(param.shape).astype(np.float32)
    step_size = 0

    def check_value():
        for param in mlp.parameters():
            grad = param.grad.numpy()
            orig_param = orig_params[param]
            m = m_slots[param]
            v = v_slots[param]
            m *= beta0
            m += (1 - beta0) * grad
            v *= beta1
            v += (1 - beta1) * grad * grad
            update = (m / (1 - beta0**step_size)) / (
                np.sqrt(v / (1 - beta1**step_size)) + eps)
            assertTensorClose(param.numpy(), orig_param - 0.01 * update)

    # eager
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        grads = opt.backward(loss)
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        step_size += 1
        check_value()

    # static
    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for _ in range(3):
        opt.zero_grad()
        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step_size += 1
        check_value()
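
check_value re-derives the Adam update by hand: both moment estimates are updated with beta0 and beta1, bias-corrected by the current step count, and the parameter is moved by lr times the corrected ratio. The same rule as a standalone NumPy sketch (beta0/beta1 follow the test's naming and correspond to the usual beta1/beta2):

import numpy as np

# Reference Adam step matching check_value above; m and v are updated in place.
def adam_step(param_value, grad_value, m, v, step, lr=0.01, beta0=0.8, beta1=0.9, eps=1e-4):
    m[:] = beta0 * m + (1 - beta0) * grad_value       # first-moment estimate
    v[:] = beta1 * v + (1 - beta1) * grad_value ** 2  # second-moment estimate
    m_hat = m / (1 - beta0 ** step)                   # bias correction
    v_hat = v / (1 - beta1 ** step)
    return param_value - lr * m_hat / (np.sqrt(v_hat) + eps)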