# Example #1
def init_momentum_states():
    """Create zero-initialized momentum buffers for the linear model.

    Returns:
        Tuple (v_w, v_b): velocity for the weight, shaped like
        (features.shape[1], 1), and for the scalar bias.
    """
    num_inputs = features.shape[1]
    v_w = torch.zeros((num_inputs, 1), dtype=torch.float32)
    v_b = torch.zeros(1, dtype=torch.float32)
    return (v_w, v_b)


def sgd_momentum(params, states, hyperparams):
    """Momentum SGD step, updating each parameter in place.

    For each (p, v) pair:  v <- momentum * v + lr * grad;  p <- p - v.

    Args:
        params: iterable of tensors with populated ``.grad``.
        states: matching iterable of velocity tensors.
        hyperparams: dict with keys 'momentum' and 'lr'.
    """
    momentum = hyperparams['momentum']
    lr = hyperparams['lr']
    for param, velocity in zip(params, states):
        # Blend the previous velocity with the scaled current gradient.
        velocity.data = momentum * velocity.data + lr * param.grad.data
        param.data = param.data - velocity.data


# Momentum experiments: keep lr=0.02 while raising momentum from 0.5 to 0.9,
# then try a larger lr=0.04 back at momentum 0.5. Each run gets fresh states.
for config in ({'lr': 0.02, 'momentum': 0.5},
               {'lr': 0.02, 'momentum': 0.9},
               {'lr': 0.04, 'momentum': 0.5}):
    d2l.train_ch7(sgd_momentum, init_momentum_states(), config,
                  features, labels)

# 4 Concise implementation

d2l.train_pytorch_ch7(torch.optim.SGD, {
# Example #2
# Plot the optimization trace of the 2-D RMSProp variant on f_2d.
d2l.show_trace_2d(f_2d, d2l.train_2d(rmsprop_2d))

# Load the chapter's training data (features matrix and label vector).
features, labels = d2l.get_data_ch7()


# 2 Implementation from scratch
def init_rmsprop_states():
    """Create zero-initialized squared-gradient accumulators for RMSProp.

    Returns:
        Tuple (s_w, s_b): accumulator for the weight, shaped like
        (features.shape[1], 1), and for the scalar bias.
    """
    num_inputs = features.shape[1]
    s_w = torch.zeros((num_inputs, 1), dtype=torch.float32)
    s_b = torch.zeros(1, dtype=torch.float32)
    return (s_w, s_b)


def rmsprop(params, states, hyperparams):
    """RMSProp step, updating each parameter in place.

    For each (p, s) pair:
        s <- gamma * s + (1 - gamma) * grad^2
        p <- p - lr * grad / sqrt(s + eps)

    Args:
        params: iterable of tensors with populated ``.grad``.
        states: matching iterable of squared-gradient accumulators.
        hyperparams: dict with keys 'gamma' and 'lr'.
    """
    lr = hyperparams['lr']
    gamma = hyperparams['gamma']
    eps = 1e-6  # avoids division by zero in the rescaling
    for param, sq_avg in zip(params, states):
        grad = param.grad.data
        sq_avg.data = gamma * sq_avg.data + (1 - gamma) * grad * grad
        param.data = param.data - lr * grad / torch.sqrt(sq_avg + eps)


# Train with the from-scratch RMSProp implementation.
d2l.train_ch7(rmsprop, init_rmsprop_states(), {
    'lr': 0.01,
    'gamma': 0.9
}, features, labels)

# 3 Concise implementation: torch.optim.RMSprop ('alpha' is its name for gamma).
d2l.train_pytorch_ch7(torch.optim.RMSprop, {
    'lr': 0.01,
    'alpha': 0.9
}, features, labels)
# Example #3
def f_2d(x1, x2):
    """Elongated quadratic test objective: f(x1, x2) = 0.1*x1^2 + 2*x2^2."""
    return sum(coeff * x ** 2 for coeff, x in ((0.1, x1), (2, x2)))


# Trace the 2-D Adagrad variant on f_2d at learning rate eta = 0.4.
eta = 0.4
d2l.show_trace_2d(f_2d, d2l.train_2d(adagrad_2d))

# Repeat with a much larger learning rate eta = 2 for comparison.
eta = 2
d2l.show_trace_2d(f_2d, d2l.train_2d(adagrad_2d))
# 3 Implementation from scratch
features, labels = d2l.get_data_ch7()


def init_adagrad_states():
    """Create zero-initialized squared-gradient accumulators for Adagrad.

    Returns:
        Tuple (s_w, s_b): accumulator for the weight, shaped like
        (features.shape[1], 1), and for the scalar bias.
    """
    num_inputs = features.shape[1]
    s_w = torch.zeros((num_inputs, 1), dtype=torch.float32)
    s_b = torch.zeros(1, dtype=torch.float32)
    return (s_w, s_b)


def adagrad(params, states, hyperparams):
    """Adagrad step, updating each parameter in place.

    For each (p, s) pair:
        s <- s + grad^2
        p <- p - lr * grad / sqrt(s + eps)

    Args:
        params: iterable of tensors with populated ``.grad``.
        states: matching iterable of squared-gradient accumulators.
        hyperparams: dict with key 'lr'.
    """
    eps = 1e-6  # avoids division by zero in the rescaling
    lr = hyperparams['lr']
    for param, accum in zip(params, states):
        grad = param.grad.data
        # Accumulated squared gradients only ever grow, shrinking the step.
        accum.data += grad * grad
        param.data = param.data - lr * grad / torch.sqrt(accum + eps)


# Train with the from-scratch Adagrad implementation.
d2l.train_ch7(adagrad, init_adagrad_states(), {'lr': 0.1}, features, labels)

# Concise version using PyTorch's built-in Adagrad optimizer.
d2l.train_pytorch_ch7(torch.optim.Adagrad, {'lr': 0.1}, features, labels)
# Example #4
def train_sgd(lr, batch_size, num_epochs=2):
    """Run d2l.train_ch7 with plain SGD on the chapter's data.

    Args:
        lr: learning rate passed to the sgd updater.
        batch_size: minibatch size for training.
        num_epochs: number of passes over the data (default 2).
    """
    hyperparams = {'lr': lr}
    d2l.train_ch7(sgd, None, hyperparams, features, labels,
                  batch_size, num_epochs)
# Example #5
import torch
import d2l_pytorch.d2l as d2l

# 2 Implementation from scratch: load the training data first.
features, labels = d2l.get_data_ch7()


def init_adadelta_states():
    """Create zero-initialized Adadelta state for the linear model.

    Each parameter gets a pair of buffers: the running average of squared
    gradients (s) and the running average of squared updates (delta).

    Returns:
        ((s_w, delta_w), (s_b, delta_b)) with weight buffers shaped
        (features.shape[1], 1) and scalar bias buffers.
    """
    def _zeros_pair():
        # One (weight-shaped, bias-shaped) pair of zero buffers.
        return (torch.zeros((features.shape[1], 1), dtype=torch.float32),
                torch.zeros(1, dtype=torch.float32))

    s_w, s_b = _zeros_pair()
    delta_w, delta_b = _zeros_pair()
    return ((s_w, delta_w), (s_b, delta_b))


def adadelta(params, states, hyperparams):
    """Adadelta step, updating each parameter in place (no learning rate).

    For each parameter p with state (s, delta):
        s     <- rho * s + (1 - rho) * grad^2
        g     <- grad * sqrt((delta + eps) / (s + eps))
        p     <- p - g
        delta <- rho * delta + (1 - rho) * g^2

    Args:
        params: iterable of tensors with populated ``.grad``.
        states: matching iterable of (s, delta) buffer pairs.
        hyperparams: dict with key 'rho'.
    """
    rho = hyperparams['rho']
    eps = 1e-5  # keeps the ratio finite when both averages are near zero
    for param, (sq_avg, delta_avg) in zip(params, states):
        grad = param.grad.data
        sq_avg[:] = rho * sq_avg + (1 - rho) * grad * grad
        # Rescale the gradient by the ratio of the two running averages.
        update = grad * torch.sqrt((delta_avg + eps) / (sq_avg + eps))
        param.data = param.data - update
        delta_avg[:] = rho * delta_avg + (1 - rho) * update * update


# Train with the from-scratch Adadelta implementation (rho only, no lr).
d2l.train_ch7(adadelta, init_adadelta_states(), {'rho': 0.9}, features, labels)

# 3 Concise implementation: PyTorch's built-in Adadelta optimizer.
d2l.train_pytorch_ch7(torch.optim.Adadelta, {'rho': 0.9}, features, labels)