예제 #1
0
def inin_adam_states():
    """Build zero-initialised Adam state for a weight and a bias.

    The weight state has shape ``(features.shape[1], 1)`` and the bias state
    has shape ``(1,)``; ``features`` and ``nd`` are read from module scope.
    The function name keeps its original spelling because the training call
    in this file refers to it by that name.

    Returns:
        ``((v_w, s_w), (v_b, s_b))`` -- one ``(v, s)`` pair per parameter, in
        the order (weight, bias).
    """
    weight_shape = (features.shape[1], 1)
    weight_state = (nd.zeros(weight_shape), nd.zeros(weight_shape))
    bias_state = (nd.zeros(1), nd.zeros(1))
    return (weight_state, bias_state)


def adam(params, states, hyperparams):
    """Apply one Adam update to every parameter, in place.

    Args:
        params: iterable of arrays that expose ``.grad`` and support slice
            assignment (presumably MXNet NDArrays -- confirm with caller).
        states: iterable of ``(v, s)`` pairs aligned with ``params``; ``v``
            tracks the moving average of the gradient and ``s`` the moving
            average of the squared gradient. Both are updated in place.
        hyperparams: dict with ``'lr'`` (learning rate) and ``'t'`` (step
            counter used for bias correction); ``'t'`` is incremented by one
            after all parameters have been updated.
    """
    beta1, beta2, eps = 0.9, 0.999, 1e-6
    step = hyperparams['t']
    # The bias-correction denominators depend only on the step counter, so
    # they are computed once for all parameters.
    v_denominator = 1 - beta1**step
    s_denominator = 1 - beta2**step
    for param, (first_moment, second_moment) in zip(params, states):
        grad = param.grad
        first_moment[:] = beta1 * first_moment + (1 - beta1) * grad
        second_moment[:] = beta2 * second_moment + (1 - beta2) * grad.square()
        unbiased_first = first_moment / v_denominator
        unbiased_second = second_moment / s_denominator
        update = (hyperparams['lr'] * unbiased_first) / (
            unbiased_second.sqrt() + eps)
        param[:] -= update
    hyperparams['t'] += 1


d2l.plt.figure(figsize=(15, 5))  # set the figure size
# Note the parentheses on the state initialiser: it is called here and its
# return value (the state tuples) is passed in, not the function object.
d2l.train_ch7(adam, inin_adam_states(), {'lr': 0.01, 't': 1}, features, labels)

# TODO: concise implementation
d2l.plt.figure(figsize=(15, 5))  # set the figure size
# NOTE(review): the original comment here read "the parameter is rho, there is
# no learning_rate!!", which does not match this call (it passes
# 'learning_rate'); it looks carried over from an AdaDelta example -- confirm.
d2l.train_gluon_ch7('adam', {'learning_rate': 0.01}, features, labels)

d2l.plt.show()
예제 #2
0
# NOTE(review): eta and gamma are not referenced in the visible code;
# presumably rmsprop_2d (defined outside this excerpt) reads them as
# module-level globals -- confirm.
eta, gamma = 0.4, 0.9
d2l.show_trace_2d(f_2d, d2l.train_2d(rmsprop_2d))

# 7.6.2 - Implementation from scratch
# Load the features/labels that are passed to the training calls below.
features, labels = d2l.get_data_ch7()


def init_rmsprop_states():
    """Build zero-initialised RMSProp state for a weight and a bias.

    Reads ``features`` and ``nd`` from module scope.

    Returns:
        ``(s_w, s_b)`` where ``s_w`` has shape ``(features.shape[1], 1)`` and
        ``s_b`` has shape ``(1,)``.
    """
    n_inputs = features.shape[1]
    return (nd.zeros((n_inputs, 1)), nd.zeros(1))


def rmsprop(params, states, hyperparams):
    """Apply one RMSProp update to every parameter, in place.

    Args:
        params: iterable of arrays that expose ``.grad`` and support slice
            assignment (presumably MXNet NDArrays -- confirm with caller).
        states: iterable of arrays aligned with ``params``; each holds the
            moving average of the squared gradient and is updated in place.
        hyperparams: dict with ``'lr'`` (learning rate) and ``'gamma'``
            (decay factor of the moving average).
    """
    eps = 1e-6
    decay = hyperparams['gamma']
    for param, sq_grad_avg in zip(params, states):
        grad = param.grad
        sq_grad_avg[:] = decay * sq_grad_avg + (1 - decay) * grad.square()
        # eps is added inside the square root to keep the divisor non-zero.
        update = hyperparams['lr'] * grad / (sq_grad_avg + eps).sqrt()
        param[:] -= update


# Train with the from-scratch rmsprop update; the state initialiser is called
# here so that its returned state tuple is what gets passed in.
d2l.train_ch7(rmsprop, init_rmsprop_states(), {
    'lr': 0.01,
    'gamma': 0.9
}, features, labels)
# 7.6.3 - Concise implementation
# The optimizer is selected by the name 'rmsprop'; note the decay factor is
# passed under the key 'gamma1' here rather than 'gamma'.
d2l.train_gluon_ch7('rmsprop', {
    'learning_rate': 0.01,
    'gamma1': 0.9
}, features, labels)
예제 #3
0
from mxnet import nd

# Load the features/labels that are passed to the training calls below.
features, labels = d2l.get_data_ch7()
def init_adadelta_states():
    """Build zero-initialised AdaDelta state for a weight and a bias.

    Reads ``features`` and ``nd`` from module scope.

    Returns:
        ``((s_w, delta_w), (s_b, delta_b))`` -- one ``(s, delta)`` pair per
        parameter, in the order (weight, bias). Weight entries have shape
        ``(features.shape[1], 1)`` and bias entries have shape ``(1,)``.
    """
    weight_shape = (features.shape[1], 1)
    weight_state = (nd.zeros(weight_shape), nd.zeros(weight_shape))
    bias_state = (nd.zeros(1), nd.zeros(1))
    return (weight_state, bias_state)
def adadelta(params, states, hyperparams):
    """Apply one AdaDelta update to every parameter, in place.

    Args:
        params: iterable of arrays that expose ``.grad`` and support slice
            assignment (presumably MXNet NDArrays -- confirm with caller).
        states: iterable of ``(s, delta)`` pairs aligned with ``params``;
            ``s`` is the moving average of the squared gradient and ``delta``
            the moving average of the squared update. Both are updated in
            place.
        hyperparams: dict with ``'rho'`` (decay factor of both averages).
            No learning rate is read.
    """
    eps = 1e-5
    rho = hyperparams['rho']
    for param, (sq_grad_avg, sq_update_avg) in zip(params, states):
        grad = param.grad
        sq_grad_avg[:] = rho * sq_grad_avg + (1 - rho) * grad.square()
        # The ratio of the two root-mean-square terms scales the gradient.
        scale = (sq_update_avg + eps).sqrt() / (sq_grad_avg + eps).sqrt()
        update = scale * grad
        param[:] -= update
        sq_update_avg[:] = rho * sq_update_avg + (1 - rho) * update * update

# Train with the from-scratch adadelta update; the only hyperparameter
# passed is 'rho'.
d2l.train_ch7(adadelta, init_adadelta_states(), {'rho':0.9}, features, labels)

# 7.7.3 - Concise implementation
d2l.train_gluon_ch7('adadelta', {'rho': 0.9}, features, labels)

#########################################################################################
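#7.8 - The Adam algorithm
#Building on RMSProp, Adam additionally applies an exponentially weighted moving average to the mini-batch stochastic gradient
#7.8.1 - Algorithm
#7.8.2 - Implementation from scratch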
#%matplotlib inline
import d2lzh as d2l
from mxnet import nd

# Load the features and labels via d2l.get_data_ch7().
features, labels = d2l.get_data_ch7()
def init_adam_states():
예제 #4
0
# Hand-written implementation
# Load the features/labels that are passed to the training calls below.
features, labels = d2l.get_data_ch7()
def inin_adagrad_states():
    """Build zero-initialised AdaGrad state for a weight and a bias.

    Reads ``features`` and ``nd`` from module scope. The function name keeps
    its original spelling because the training call in this file refers to it
    by that name.

    Returns:
        ``(s_w, s_b)`` where ``s_w`` has shape ``(features.shape[1], 1)`` and
        ``s_b`` has shape ``(1,)``.
    """
    n_inputs = features.shape[1]
    return (nd.zeros((n_inputs, 1)), nd.zeros(1))

def adagrad(params, states, hyperparams):
    """Apply one AdaGrad update to every parameter, in place.

    Args:
        params: iterable of arrays that expose ``.grad`` and support slice
            assignment (presumably MXNet NDArrays -- confirm with caller).
        states: iterable of arrays aligned with ``params``; each accumulates
            the running sum of squared gradients and is updated in place.
        hyperparams: dict with ``'lr'`` (learning rate).
    """
    eps = 1e-6
    for param, sq_grad_sum in zip(params, states):
        grad = param.grad
        sq_grad_sum[:] += grad.square()
        # eps is added inside the square root to keep the divisor non-zero.
        update = hyperparams['lr'] * grad / (sq_grad_sum + eps).sqrt()
        param[:] -= update


d2l.plt.figure(figsize=(15,5))                 # set the figure size
# Note the parentheses on the state initialiser: it is called here and its
# return value (the state tuple) is passed in, not the function object.
# NOTE(review): author's rule of thumb, apparently carried over from a momentum
# example: scale the learning rate down by however much the effective batch
# grows -- e.g. momentum 0.5 -> 0.9 is like batch 2 -> 10 (5x), so use 1/5 of
# the learning rate, otherwise the curve is not smooth. Confirm relevance here.
d2l.train_ch7(adagrad, inin_adagrad_states(), {'lr':0.1}, features, labels)



# TODO: concise implementation
d2l.plt.figure(figsize=(15,5))                 # set the figure size
d2l.train_gluon_ch7('adagrad', {'learning_rate':0.1}, features, labels)


d2l.plt.show()


예제 #5
0
    return x1-v1, x2-v2, v1, v2

# NOTE(review): eta and gamma are not referenced in the visible code;
# presumably momentum_2d (defined outside this excerpt) reads them as
# module-level globals -- confirm.
eta, gamma = 0.4, 0.5
d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d))

# Re-run the same trace after rebinding eta to 0.6.
eta =0.6
d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d))
# Exponentially weighted moving average - derivation
# Understanding momentum through the exponentially weighted moving average
# 7.4.3 - Implementation from scratch
features, labels = d2l.get_data_ch7()
def init_momentum_states():
    """Build zero-initialised velocity state for a weight and a bias.

    Reads ``features`` and ``nd`` from module scope.

    Returns:
        ``(v_w, v_b)`` where ``v_w`` has shape ``(features.shape[1], 1)`` and
        ``v_b`` has shape ``(1,)``.
    """
    n_inputs = features.shape[1]
    return (nd.zeros((n_inputs, 1)), nd.zeros(1))

def sgd_momentum(params, states, hyperparams):
    """Apply one SGD-with-momentum update to every parameter, in place.

    Args:
        params: iterable of arrays that expose ``.grad`` and support slice
            assignment (presumably MXNet NDArrays -- confirm with caller).
        states: iterable of velocity arrays aligned with ``params``; each is
            updated in place.
        hyperparams: dict with ``'momentum'`` (velocity decay factor) and
            ``'lr'`` (learning rate).
    """
    for param, velocity in zip(params, states):
        carried_over = hyperparams['momentum'] * velocity
        gradient_step = hyperparams['lr'] * param.grad
        velocity[:] = carried_over + gradient_step
        param[:] -= velocity

# Three from-scratch runs that vary the learning rate and momentum; a fresh
# state tuple is created for each run by calling init_momentum_states().
d2l.train_ch7(sgd_momentum, init_momentum_states(),
              {'lr':0.02, 'momentum':0.5}, features, labels)
d2l.train_ch7(sgd_momentum, init_momentum_states(),
              {'lr':0.02, 'momentum':0.9}, features, labels)
d2l.train_ch7(sgd_momentum, init_momentum_states(),
              {'lr':0.004, 'momentum':0.5}, features, labels)
# 7.4.4 - Concise implementation
d2l.train_gluon_ch7('sgd', {'learning_rate':0.004, 'momentum':0.9}, features, labels)

예제 #6
0
from mxnet import nd

# Load the features/labels that are passed to the training calls below.
features, labels = d2l.get_data_ch7()


def init_adadelta_states():
    """Create the zero-filled AdaDelta state for a weight and a bias.

    Reads ``features`` and ``nd`` from module scope.

    Returns:
        ``((s_w, delta_w), (s_b, delta_b))`` -- one ``(s, delta)`` pair per
        parameter, in the order (weight, bias). Weight entries have shape
        ``(features.shape[1], 1)`` and bias entries have shape ``(1,)``.
    """
    def zero_pair(shape):
        # Two independent zero arrays of the same shape: (s, delta).
        return (nd.zeros(shape), nd.zeros(shape))

    return (zero_pair((features.shape[1], 1)), zero_pair(1))


def adadelta(params, states, hyperparams):
    """Run a single in-place AdaDelta step over all parameters.

    Args:
        params: iterable of arrays that expose ``.grad`` and support slice
            assignment (presumably MXNet NDArrays -- confirm with caller).
        states: iterable of ``(s, delta)`` pairs aligned with ``params``;
            ``s`` is the moving average of the squared gradient and ``delta``
            the moving average of the squared update. Both are updated in
            place.
        hyperparams: dict with ``'rho'`` (decay factor of both averages).
            No learning rate is read.
    """
    eps = 1e-5
    rho = hyperparams['rho']
    keep = 1 - rho
    for weight, (grad_sq_avg, step_sq_avg) in zip(params, states):
        gradient = weight.grad
        grad_sq_avg[:] = rho * grad_sq_avg + keep * gradient.square()
        # Scale the gradient by the ratio of the two root-mean-square terms.
        rms_step = (step_sq_avg + eps).sqrt()
        rms_grad = (grad_sq_avg + eps).sqrt()
        step = (rms_step / rms_grad) * gradient
        weight[:] -= step
        step_sq_avg[:] = rho * step_sq_avg + keep * step * step


# In[4]:

# Train with the from-scratch adadelta update; the only hyperparameter
# passed is 'rho'.
d2l.train_ch7(adadelta, init_adadelta_states(), {'rho': 0.9}, features, labels)

# In[6]:

# Concise implementation
d2l.train_gluon_ch7('adadelta', {'rho': 0.9}, features, labels)

# In[ ]: