def init_momentum_states():
    """Create zero-initialized velocity buffers matching the weight and bias."""
    v_w = torch.zeros((features.shape[1], 1), dtype=torch.float32)
    v_b = torch.zeros(1, dtype=torch.float32)
    return (v_w, v_b)


def sgd_momentum(params, states, hyperparams):
    """Momentum SGD update, in place.

    v <- momentum * v + lr * grad;  p <- p - v
    (the learning rate is folded into the velocity, matching the d2l convention).

    Args:
        params: iterable of tensors to update.
        states: velocity buffers, one per parameter (from init_momentum_states).
        hyperparams: dict with keys 'lr' and 'momentum'.
    """
    for p, v in zip(params, states):
        v.data = hyperparams['momentum'] * v.data + hyperparams['lr'] * p.grad.data
        p.data -= v.data


# Compare momentum settings on the same data.
d2l.train_ch7(sgd_momentum, init_momentum_states(),
              {'lr': 0.02, 'momentum': 0.5}, features, labels)
d2l.train_ch7(sgd_momentum, init_momentum_states(),
              {'lr': 0.02, 'momentum': 0.9}, features, labels)
d2l.train_ch7(sgd_momentum, init_momentum_states(),
              {'lr': 0.04, 'momentum': 0.5}, features, labels)

# 4. Concise implementation
# NOTE(review): the original source was truncated after "torch.optim.SGD, {";
# completed with the hyperparameters the d2l momentum chapter uses here —
# confirm against the upstream text.
d2l.train_pytorch_ch7(torch.optim.SGD, {'lr': 0.004, 'momentum': 0.9},
                      features, labels)
d2l.show_trace_2d(f_2d, d2l.train_2d(rmsprop_2d))

features, labels = d2l.get_data_ch7()


# 2. Implementation from scratch
def init_rmsprop_states():
    """Zero-initialized squared-gradient accumulators for weight and bias."""
    sq_w = torch.zeros((features.shape[1], 1), dtype=torch.float32)
    sq_b = torch.zeros(1, dtype=torch.float32)
    return (sq_w, sq_b)


def rmsprop(params, states, hyperparams):
    """RMSProp update, in place.

    s <- gamma * s + (1 - gamma) * g^2;  p <- p - lr * g / sqrt(s + eps)
    """
    gamma, eps = hyperparams['gamma'], 1e-6
    lr = hyperparams['lr']
    for param, state in zip(params, states):
        grad = param.grad.data
        state.data = gamma * state.data + (1 - gamma) * grad ** 2
        param.data -= lr * grad / torch.sqrt(state + eps)


d2l.train_ch7(rmsprop, init_rmsprop_states(),
              {'lr': 0.01, 'gamma': 0.9}, features, labels)

# 3. Concise implementation
d2l.train_pytorch_ch7(torch.optim.RMSprop, {'lr': 0.01, 'alpha': 0.9},
                      features, labels)
def f_2d(x1, x2):
    """2-D test objective: f(x1, x2) = 0.1 * x1^2 + 2 * x2^2."""
    return 0.1 * x1 ** 2 + 2 * x2 ** 2


# Trace AdaGrad on the 2-D objective at two learning rates.
eta = 0.4
d2l.show_trace_2d(f_2d, d2l.train_2d(adagrad_2d))
eta = 2
d2l.show_trace_2d(f_2d, d2l.train_2d(adagrad_2d))

# 3. Implementation from scratch
features, labels = d2l.get_data_ch7()


def init_adagrad_states():
    """Zero-initialized squared-gradient accumulators for weight and bias."""
    acc_w = torch.zeros((features.shape[1], 1), dtype=torch.float32)
    acc_b = torch.zeros(1, dtype=torch.float32)
    return (acc_w, acc_b)


def adagrad(params, states, hyperparams):
    """AdaGrad update, in place.

    s <- s + g^2;  p <- p - lr * g / sqrt(s + eps)
    """
    eps = 1e-6
    for param, state in zip(params, states):
        grad = param.grad.data
        state.data += grad ** 2
        param.data -= hyperparams['lr'] * grad / torch.sqrt(state + eps)


d2l.train_ch7(adagrad, init_adagrad_states(), {'lr': 0.1}, features, labels)
d2l.train_pytorch_ch7(torch.optim.Adagrad, {'lr': 0.1}, features, labels)
def train_sgd(lr, batch_size, num_epochs=2):
    """Run plain (stateless) SGD via d2l.train_ch7 with the given settings."""
    hyperparams = {'lr': lr}
    d2l.train_ch7(sgd, None, hyperparams, features, labels,
                  batch_size, num_epochs)
import torch

import d2l_pytorch.d2l as d2l

# 2. Implementation from scratch
features, labels = d2l.get_data_ch7()


def init_adadelta_states():
    """Per-parameter (s, delta) accumulator pairs, zero-initialized.

    Returns a pair of pairs: one (s, delta) for the weight, one for the bias.
    """
    w_shape = (features.shape[1], 1)
    s_w = torch.zeros(w_shape, dtype=torch.float32)
    delta_w = torch.zeros(w_shape, dtype=torch.float32)
    s_b = torch.zeros(1, dtype=torch.float32)
    delta_b = torch.zeros(1, dtype=torch.float32)
    return ((s_w, delta_w), (s_b, delta_b))


def adadelta(params, states, hyperparams):
    """AdaDelta update, in place; no learning-rate hyperparameter.

    s      <- rho * s + (1 - rho) * g^2
    g'     <- g * sqrt((delta + eps) / (s + eps))
    p      <- p - g'
    delta  <- rho * delta + (1 - rho) * g'^2
    """
    rho, eps = hyperparams['rho'], 1e-5
    for param, (s, delta) in zip(params, states):
        grad = param.grad.data
        s[:] = rho * s + (1 - rho) * grad ** 2
        adjusted = grad * torch.sqrt((delta + eps) / (s + eps))
        param.data -= adjusted
        delta[:] = rho * delta + (1 - rho) * adjusted * adjusted


# 3. Concise implementation
d2l.train_ch7(adadelta, init_adadelta_states(), {'rho': 0.9}, features, labels)
d2l.train_pytorch_ch7(torch.optim.Adadelta, {'rho': 0.9}, features, labels)