Example #1
 def __init__(self, input_width, state_width, learning_rate):
     self.input_width = input_width
     self.state_width = state_width
     self.learning_rate = learning_rate
     # activation function for the gates
     self.gate_activator = SigmoidActivator()
     # activation function for the output
     self.output_activator = TanhActivator()
     # initialize the current time step to t0
     self.times = 0
     # cell state vectors c at each time step
     self.c_list = self.init_state_vec()
     # output vectors h at each time step
     self.h_list = self.init_state_vec()
     # forget gate f at each time step
     self.f_list = self.init_state_vec()
     # input gate i at each time step
     self.i_list = self.init_state_vec()
     # output gate o at each time step
     self.o_list = self.init_state_vec()
     # candidate state c~ at each time step
     self.ct_list = self.init_state_vec()
     # forget gate weight matrices Wfh, Wfx and bias bf
     self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
     # input gate weight matrices Wih, Wix and bias bi
     self.Wih, self.Wix, self.bi = self.init_weight_mat()
     # output gate weight matrices Woh, Wox and bias bo
     self.Woh, self.Wox, self.bo = self.init_weight_mat()
     # cell state weight matrices Wch, Wcx and bias bc
     self.Wch, self.Wcx, self.bc = self.init_weight_mat()
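The snippets reference SigmoidActivator and TanhActivator without defining them. A minimal sketch of what these helpers presumably look like, inferred from how the examples call them (forward on the net input, backward on the already-activated output), is:

import numpy as np

class SigmoidActivator(object):
    def forward(self, weighted_input):
        return 1.0 / (1.0 + np.exp(-weighted_input))

    def backward(self, output):
        # derivative expressed in terms of the sigmoid output
        return output * (1 - output)

class TanhActivator(object):
    def forward(self, weighted_input):
        return np.tanh(weighted_input)

    def backward(self, output):
        # derivative expressed in terms of the tanh output
        return 1 - output * output
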
Example #2
File: lstm.py Project: ccgcyber/learn_dl
 def __init__(self, input_width, state_width, 
              learning_rate):
     self.input_width = input_width
     self.state_width = state_width
     self.learning_rate = learning_rate
     # activation function for the gates
     self.gate_activator = SigmoidActivator()
     # activation function for the output
     self.output_activator = TanhActivator()
     # initialize the current time step to t0
     self.times = 0
     # cell state vectors c at each time step
     self.c_list = self.init_state_vec()
     # output vectors h at each time step
     self.h_list = self.init_state_vec()
     # forget gate f at each time step
     self.f_list = self.init_state_vec()
     # input gate i at each time step
     self.i_list = self.init_state_vec()
     # output gate o at each time step
     self.o_list = self.init_state_vec()
     # candidate state c~ at each time step
     self.ct_list = self.init_state_vec()
     # forget gate weight matrices Wfh, Wfx and bias bf
     self.Wfh, self.Wfx, self.bf = (
         self.init_weight_mat())
     # input gate weight matrices Wih, Wix and bias bi
     self.Wih, self.Wix, self.bi = (
         self.init_weight_mat())
     # output gate weight matrices Woh, Wox and bias bo
     self.Woh, self.Wox, self.bo = (
         self.init_weight_mat())
     # cell state weight matrices Wch, Wcx and bias bc
     self.Wch, self.Wcx, self.bc = (
         self.init_weight_mat())
Example #3
 def __init__(self, input_width, state_width, learning_rate):
     self.input_width = input_width
     self.state_width = state_width
     self.learning_rate = learning_rate
     self.gate_activator = SigmoidActivator()
     self.output_activator = TanhActivator()
     self.times = 0
     self.c_list = self.init_state_vec()
     self.h_list = self.init_state_vec()
     self.f_list = self.init_state_vec()
     self.i_list = self.init_state_vec()
     self.o_list = self.init_state_vec()
     self.ct_list = self.init_state_vec()
     self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
     self.Wih, self.Wix, self.bi = self.init_weight_mat()
     self.Woh, self.Wox, self.bo = self.init_weight_mat()
     self.Wch, self.Wcx, self.bc = self.init_weight_mat()
Example #4
 def __init__(self, input_width, state_width, output_width,
              learning_rate, penaltyL2, momentum):
     self.input_width = input_width
     self.state_width = state_width
     self.output_width = output_width
     self.learning_rate = learning_rate
     self.penaltyL2 = penaltyL2
     self.momentum = momentum
     # activation function for the gates
     self.gate_activator = SigmoidActivator()
     # activation function for the output
     self.output_activator = TanhActivator()
     self.class_activator = SoftmaxActivator()
     # forget gate weight matrices Wfh, Wfx, bias bf, and their velocities
     self.Wfh, self.Wfx, self.bf, self.vWfh, self.vWfx, self.vbf = self.init_weight_mat(0)
     # input gate weight matrices Wih, Wix, bias bi, and their velocities
     self.Wih, self.Wix, self.bi, self.vWih, self.vWix, self.vbi = self.init_weight_mat(0)
     # output gate weight matrices Woh, Wox, bias bo, and their velocities
     self.Woh, self.Wox, self.bo, self.vWoh, self.vWox, self.vbo = self.init_weight_mat(0)
     # cell state weight matrices Wch, Wcx, bias bc, and their velocities
     self.Wch, self.Wcx, self.bc, self.vWch, self.vWcx, self.vbc = self.init_weight_mat(0)
     # output layer weights Wy and bias by
     self.Wy, self.by, self.vWy, self.vby = self.init_weight_mat(1)
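This variant adds momentum and an L2 penalty (penaltyL2) to plain gradient descent. As a rough sketch of the rule its update() method (shown in full in Example #9) applies to every weight matrix, with the helper name momentum_step being purely illustrative:

import numpy as np

def momentum_step(W, vW, W_grad, learning_rate, momentum, penaltyL2):
    # the velocity accumulates the L2-regularized gradient,
    # then the weight moves along the velocity
    vW = momentum * vW - learning_rate * (W_grad + penaltyL2 * W)
    return W + vW, vW
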
Example #5
class LstmLayer(object):
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # activation function for the gates
        self.gate_activator = SigmoidActivator()
        # activation function for the output
        self.output_activator = TanhActivator()
        # initialize the current time step to t0
        self.times = 0
        # cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # output vectors h at each time step
        self.h_list = self.init_state_vec()
        # forget gate f at each time step
        self.f_list = self.init_state_vec()
        # input gate i at each time step
        self.i_list = self.init_state_vec()
        # output gate o at each time step
        self.o_list = self.init_state_vec()
        # candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
        # forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        # input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        # output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        # cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    def init_state_vec(self):
        '''
        Initialize the list that stores state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        '''
        Initialize weight matrices
        '''
        Wh = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        '''
        Forward pass following Eqs. 1 through 6
        '''
        self.times += 1
        # forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh, self.bf,
                            self.gate_activator)
        self.f_list.append(fg)
        # input gate
        ig = self.calc_gate(x, self.Wix, self.Wih, self.bi,
                            self.gate_activator)
        self.i_list.append(ig)
        # output gate
        og = self.calc_gate(x, self.Wox, self.Woh, self.bo,
                            self.gate_activator)
        self.o_list.append(og)
        # candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch, self.bc,
                            self.output_activator)
        self.ct_list.append(ct)
        # cell state
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # output
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute a gate activation
        '''
        h = self.h_list[self.times - 1]  # previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    def backward(self, x, delta_h, activator):
        '''
        Implements the LSTM training algorithm
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def update(self):
        '''
        Update the weights with gradient descent
        '''
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    def calc_delta(self, delta_h, activator):
        # initialize the error terms at each time step
        self.delta_h_list = self.init_delta()  # output error terms
        self.delta_o_list = self.init_delta()  # output gate error terms
        self.delta_i_list = self.init_delta()  # input gate error terms
        self.delta_f_list = self.init_delta()  # forget gate error terms
        self.delta_ct_list = self.init_delta()  # candidate state error terms

        # save the error term for the current time step passed down from the layer above
        self.delta_h_list[-1] = delta_h

        # iterate backwards to compute the error term at each time step
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        '''
        Initialize the error terms
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    def calc_delta_k(self, k):
        '''
        Given delta_h at time k, compute delta_f, delta_i,
        delta_o and delta_ct at time k, and delta_h at time k-1
        '''
        # fetch the values computed in the forward pass at time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]

        # compute delta_o following Eq. 9
        delta_o = (delta_k * tanh_c * self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) * c_prev *
                   self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) * ct *
                   self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) * ig *
                    self.output_activator.backward(ct))
        delta_h_prev = (np.dot(delta_o.transpose(), self.Woh) +
                        np.dot(delta_i.transpose(), self.Wih) +
                        np.dot(delta_f.transpose(), self.Wfh) +
                        np.dot(delta_ct.transpose(), self.Wch)).transpose()

        # store all the delta values
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self, x):
        # initialize the forget gate weight gradient matrices and bias gradient
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = (
            self.init_weight_gradient_mat())
        # initialize the input gate weight gradient matrices and bias gradient
        self.Wih_grad, self.Wix_grad, self.bi_grad = (
            self.init_weight_gradient_mat())
        # initialize the output gate weight gradient matrices and bias gradient
        self.Woh_grad, self.Wox_grad, self.bo_grad = (
            self.init_weight_gradient_mat())
        # initialize the cell state weight gradient matrices and bias gradient
        self.Wch_grad, self.Wcx_grad, self.bc_grad = (
            self.init_weight_gradient_mat())

        # gradients of the weights applied to the previous output h
        for t in range(self.times, 0, -1):
            # gradient contribution at each time step
            (Wfh_grad, bf_grad, Wih_grad, bi_grad, Woh_grad, bo_grad, Wch_grad,
             bc_grad) = (self.calc_gradient_t(t))
            # the actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad

        # gradients of the weights applied to the current input x
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    def init_weight_gradient_mat(self):
        '''
        Initialize weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients at time step t
        '''
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
               Woh_grad, bo_grad, Wch_grad, bc_grad

    def reset_state(self):
        # reset the current time step to t0
        self.times = 0
        # cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # output vectors h at each time step
        self.h_list = self.init_state_vec()
        # forget gate f at each time step
        self.f_list = self.init_state_vec()
        # input gate i at each time step
        self.i_list = self.init_state_vec()
        # output gate o at each time step
        self.o_list = self.init_state_vec()
        # candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
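
For orientation, a minimal usage sketch of the class above; the dimensions and inputs are made up, and the delta_h fed to backward would normally come from the layer above:

import numpy as np

lstm = LstmLayer(3, 2, 0.01)      # 3-dimensional input, 2-dimensional state

x1 = np.random.randn(3, 1)
x2 = np.random.randn(3, 1)
lstm.forward(x1)
lstm.forward(x2)                  # lstm.h_list[-1] now holds the latest output

delta_h = np.ones((2, 1))         # stand-in for the error term from above
lstm.backward(x2, delta_h, lstm.gate_activator)
lstm.update()                     # one gradient-descent step
lstm.reset_state()                # clear stored state before the next sequence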
Example #6
class LstmLayer(object):
    '''
    Compared with an RNN (which has only a hidden state), the hidden layer adds a cell state.
    How is the long-term cell state controlled? Through three switches (each switch is a gate,
    implemented as an input-to-output activation):
        Switch 1 controls whether the long-term state c keeps being preserved.
            Forget gate: decides how much of the previous cell state Ct-1 is kept in the current Ct
                1. Forget gate: Ft = f(Wf*[Ht-1, Xt] + Bf)
        Switch 2 controls feeding the candidate state into the long-term state c.
            Input gate: decides how much of the current input Xt is stored in the cell state Ct
                1. Input gate: It = f(Wi*[Ht-1, Xt] + Bi)
                2. Candidate cell state from the current input: Ct' = tanh(Wc*[Ht-1, Xt] + Bc)
                3. Current cell state (fusing the current memory Ct' with the long-term memory Ct-1):
                   Ct = Ft ∘ Ct-1 + It ∘ Ct'
        Switch 3 controls whether the long-term state c is emitted as the current LSTM output.
            Output gate: controls how much of the cell state Ct flows into the current output ht
                1. Output gate: Ot = f(Wo*[Ht-1, Xt] + Bo)
                2. Final LSTM output: Ht = Ot ∘ tanh(Ct)
    '''
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # activation function for the gates
        self.gate_activator = SigmoidActivator()
        # activation function for the output
        self.output_activator = TanhActivator()
        # initialize the current time step to t0
        self.times = 0
        # cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # output vectors h at each time step
        self.h_list = self.init_state_vec()
        # forget gate f at each time step
        self.f_list = self.init_state_vec()
        # input gate i at each time step
        self.i_list = self.init_state_vec()
        # output gate o at each time step
        self.o_list = self.init_state_vec()
        # candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
        # forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        # input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        # output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        # cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    def init_state_vec(self):
        '''
        Initialize the list that stores state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        '''
        Initialize weight matrices
        '''
        Wh = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        '''
        Forward pass
        '''
        self.times += 1
        # forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh, self.bf,
                            self.gate_activator)
        self.f_list.append(fg)
        # input gate
        ig = self.calc_gate(x, self.Wix, self.Wih, self.bi,
                            self.gate_activator)
        self.i_list.append(ig)
        # output gate
        og = self.calc_gate(x, self.Wox, self.Woh, self.bo,
                            self.gate_activator)
        self.o_list.append(og)
        # candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch, self.bc,
                            self.output_activator)
        self.ct_list.append(ct)
        # cell state
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # output (must call .forward on the activator, not the activator itself)
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute a gate activation
        '''
        h = self.h_list[self.times - 1]  # previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    def backward(self, x, delta_h, activator):
        '''
        Implements the LSTM training algorithm
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def calc_delta(self, delta_h, activator):
        # initialize the error terms at each time step
        self.delta_h_list = self.init_delta()  # output error terms
        self.delta_o_list = self.init_delta()  # output gate error terms
        self.delta_i_list = self.init_delta()  # input gate error terms
        self.delta_f_list = self.init_delta()  # forget gate error terms
        self.delta_ct_list = self.init_delta()  # candidate state error terms
        # save the error term for the current time step passed down from the layer above
        self.delta_h_list[-1] = delta_h
        # iterate backwards to compute the error term at each time step
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def calc_delta_k(self, k):
        '''
        Backpropagation-through-time formula for the error at time t:
        given delta_h at time k, compute delta_f, delta_i, delta_o and delta_ct
        at time k, and delta_h at time k-1
        '''
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]
        delta_o = (delta_k * tanh_c * self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) * c_prev *
                   self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) * ct *
                   self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) * ig *
                    self.output_activator.backward(ct))
        delta_h_prev = (np.dot(delta_o.transpose(), self.Woh) +
                        np.dot(delta_i.transpose(), self.Wih) +
                        np.dot(delta_f.transpose(), self.Wfh) +
                        np.dot(delta_ct.transpose(), self.Wch)).transpose()
        # store all the delta values
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def init_delta(self):
        '''
        Initialize the error terms
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    def calc_gradient(self, x):
        # initialize the forget gate weight gradient matrices and bias gradient
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = (
            self.init_weight_gradient_mat())
        # initialize the input gate weight gradient matrices and bias gradient
        self.Wih_grad, self.Wix_grad, self.bi_grad = (
            self.init_weight_gradient_mat())
        # initialize the output gate weight gradient matrices and bias gradient
        self.Woh_grad, self.Wox_grad, self.bo_grad = (
            self.init_weight_gradient_mat())
        # initialize the cell state weight gradient matrices and bias gradient
        self.Wch_grad, self.Wcx_grad, self.bc_grad = (
            self.init_weight_gradient_mat())
        # gradients of the weights applied to the previous output h
        for t in range(self.times, 0, -1):
            # gradient contribution at each time step
            (Wfh_grad, bf_grad, Wih_grad, bi_grad, Woh_grad, bo_grad, Wch_grad,
             bc_grad) = (self.calc_gradient_t(t))
            # the actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad

    def init_weight_gradient_mat(self):
        '''
        Initialize weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients at time step t
        '''
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
               Woh_grad, bo_grad, Wch_grad, bc_grad
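
To make the gate arithmetic from the class docstring concrete, a tiny numeric check of the cell-state update Ct = Ft ∘ Ct-1 + It ∘ Ct' and the output Ht = Ot ∘ tanh(Ct), with all values made up:

import numpy as np

Ft = np.array([[0.9], [0.1]])        # forget gate: keep dim 0, mostly drop dim 1
Ct_prev = np.array([[1.0], [2.0]])   # previous cell state
It = np.array([[0.5], [0.5]])        # input gate
Ct_cand = np.array([[0.2], [-0.4]])  # candidate state c~
Ot = np.array([[1.0], [0.0]])        # output gate

Ct = Ft * Ct_prev + It * Ct_cand     # elementwise -> [[1.0], [0.0]]
Ht = Ot * np.tanh(Ct)                # -> [[0.7616], [0.0]]
print(Ct.ravel(), Ht.ravel())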
Example #7
File: lstm.py Project: ccgcyber/learn_dl
class LstmLayer(object):
    def __init__(self, input_width, state_width, 
                 learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # activation function for the gates
        self.gate_activator = SigmoidActivator()
        # activation function for the output
        self.output_activator = TanhActivator()
        # initialize the current time step to t0
        self.times = 0
        # cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # output vectors h at each time step
        self.h_list = self.init_state_vec()
        # forget gate f at each time step
        self.f_list = self.init_state_vec()
        # input gate i at each time step
        self.i_list = self.init_state_vec()
        # output gate o at each time step
        self.o_list = self.init_state_vec()
        # candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
        # forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = (
            self.init_weight_mat())
        # input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = (
            self.init_weight_mat())
        # output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = (
            self.init_weight_mat())
        # cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = (
            self.init_weight_mat())

    def init_state_vec(self):
        '''
        Initialize the list that stores state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros(
            (self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        '''
        Initialize weight matrices
        '''
        Wh = np.random.uniform(-1e-4, 1e-4,
            (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
            (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        '''
        Forward pass following Eqs. 1 through 6
        '''
        self.times += 1
        # forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh,
            self.bf, self.gate_activator)
        self.f_list.append(fg)
        # input gate
        ig = self.calc_gate(x, self.Wix, self.Wih,
            self.bi, self.gate_activator)
        self.i_list.append(ig)
        # output gate
        og = self.calc_gate(x, self.Wox, self.Woh,
            self.bo, self.gate_activator)
        self.o_list.append(og)
        # candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch,
            self.bc, self.output_activator)
        self.ct_list.append(ct)
        # cell state
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # output
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute a gate activation
        '''
        h = self.h_list[self.times - 1]  # previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate


    def backward(self, x, delta_h, activator):
        '''
        Implements the LSTM training algorithm
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def update(self):
        '''
        Update the weights with gradient descent
        '''
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    def calc_delta(self, delta_h, activator):
        # initialize the error terms at each time step
        self.delta_h_list = self.init_delta()  # output error terms
        self.delta_o_list = self.init_delta()  # output gate error terms
        self.delta_i_list = self.init_delta()  # input gate error terms
        self.delta_f_list = self.init_delta()  # forget gate error terms
        self.delta_ct_list = self.init_delta() # candidate state error terms

        # save the error term for the current time step passed down from the layer above
        self.delta_h_list[-1] = delta_h

        # iterate backwards to compute the error term at each time step
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        '''
        Initialize the error terms
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros(
                (self.state_width, 1)))
        return delta_list

    def calc_delta_k(self, k):
        '''
        Given delta_h at time k, compute delta_f, delta_i,
        delta_o and delta_ct at time k, and delta_h at time k-1
        '''
        # fetch the values computed in the forward pass at time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k-1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]

        # compute delta_o following Eq. 9
        delta_o = (delta_k * tanh_c * 
            self.gate_activator.backward(og))
        delta_f = (delta_k * og * 
            (1 - tanh_c * tanh_c) * c_prev *
            self.gate_activator.backward(fg))
        delta_i = (delta_k * og * 
            (1 - tanh_c * tanh_c) * ct *
            self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * 
            (1 - tanh_c * tanh_c) * ig *
            self.output_activator.backward(ct))
        delta_h_prev = (
                np.dot(delta_o.transpose(), self.Woh) +
                np.dot(delta_i.transpose(), self.Wih) +
                np.dot(delta_f.transpose(), self.Wfh) +
                np.dot(delta_ct.transpose(), self.Wch)
            ).transpose()

        # store all the delta values
        self.delta_h_list[k-1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self, x):
        # initialize the forget gate weight gradient matrices and bias gradient
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = (
            self.init_weight_gradient_mat())
        # initialize the input gate weight gradient matrices and bias gradient
        self.Wih_grad, self.Wix_grad, self.bi_grad = (
            self.init_weight_gradient_mat())
        # initialize the output gate weight gradient matrices and bias gradient
        self.Woh_grad, self.Wox_grad, self.bo_grad = (
            self.init_weight_gradient_mat())
        # initialize the cell state weight gradient matrices and bias gradient
        self.Wch_grad, self.Wcx_grad, self.bc_grad = (
            self.init_weight_gradient_mat())

        # gradients of the weights applied to the previous output h
        for t in range(self.times, 0, -1):
            # gradient contribution at each time step
            (Wfh_grad, bf_grad,
            Wih_grad, bi_grad,
            Woh_grad, bo_grad,
            Wch_grad, bc_grad) = (
                self.calc_gradient_t(t))
            # the actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad

        # gradients of the weights applied to the current input x
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    def init_weight_gradient_mat(self):
        '''
        Initialize weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width,
            self.state_width))
        Wx_grad = np.zeros((self.state_width,
            self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients at time step t
        '''
        h_prev = self.h_list[t-1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
               Woh_grad, bo_grad, Wch_grad, bc_grad

    def reset_state(self):
        # reset the current time step to t0
        self.times = 0
        # cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # output vectors h at each time step
        self.h_list = self.init_state_vec()
        # forget gate f at each time step
        self.f_list = self.init_state_vec()
        # input gate i at each time step
        self.i_list = self.init_state_vec()
        # output gate o at each time step
        self.o_list = self.init_state_vec()
        # candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
Example #8
class LstmLayer(object):
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        self.gate_activator = SigmoidActivator()
        self.output_activator = TanhActivator()
        self.times = 0
        self.c_list = self.init_state_vec()
        self.h_list = self.init_state_vec()
        self.f_list = self.init_state_vec()
        self.i_list = self.init_state_vec()
        self.o_list = self.init_state_vec()
        self.ct_list = self.init_state_vec()
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    def init_state_vec(self):
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        Wh = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        self.times += 1
        fg = self.calc_gate(x, self.Wfx, self.Wfh, self.bf,
                            self.gate_activator)
        self.f_list.append(fg)
        ig = self.calc_gate(x, self.Wix, self.Wih, self.bi,
                            self.gate_activator)
        self.i_list.append(ig)
        og = self.calc_gate(x, self.Wox, self.Woh, self.bo,
                            self.gate_activator)
        self.o_list.append(og)
        ct = self.calc_gate(x, self.Wcx, self.Wch, self.bc,
                            self.output_activator)
        self.ct_list.append(ct)
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        h = self.h_list[self.times - 1]
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    def backward(self, x, delta_h, activator):
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def update(self):
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    def calc_delta(self, delta_h, activator):
        self.delta_h_list = self.init_delta()
        self.delta_o_list = self.init_delta()
        self.delta_i_list = self.init_delta()
        self.delta_f_list = self.init_delta()
        self.delta_ct_list = self.init_delta()

        self.delta_h_list[-1] = delta_h

        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    def calc_delta_k(self, k):
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]

        delta_o = (delta_k * tanh_c * self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) * c_prev *
                   self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) * ct *
                   self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) * ig *
                    self.output_activator.backward(ct))
        delta_h_prev = (np.dot(delta_o.transpose(), self.Woh) +
                        np.dot(delta_i.transpose(), self.Wih) +
                        np.dot(delta_f.transpose(), self.Wfh) +
                        np.dot(delta_ct.transpose(), self.Wch)).transpose()
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self, x):
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = (
            self.init_weight_gradient_mat())
        self.Wih_grad, self.Wix_grad, self.bi_grad = (
            self.init_weight_gradient_mat())
        self.Woh_grad, self.Wox_grad, self.bo_grad = (
            self.init_weight_gradient_mat())
        self.Wch_grad, self.Wcx_grad, self.bc_grad = (
            self.init_weight_gradient_mat())

        for t in range(self.times, 0, -1):
            (Wfh_grad, bf_grad, Wih_grad, bi_grad, Woh_grad, bo_grad, Wch_grad,
             bc_grad) = (self.calc_gradient_t(t))
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad

        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    def init_weight_gradient_mat(self):
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
               Woh_grad, bo_grad, Wch_grad, bc_grad

    def reset_state(self):
        self.times = 0
        self.c_list = self.init_state_vec()
        self.h_list = self.init_state_vec()
        self.f_list = self.init_state_vec()
        self.i_list = self.init_state_vec()
        self.o_list = self.init_state_vec()
        self.ct_list = self.init_state_vec()

    def __str__(self):
        result = 'Wfh:\n%s\nWfx:\n%s\nbf:\n%s\n' % (self.Wfh, self.Wfx,
                                                    self.bf)
        result += 'Wih:\n%s\nWix:\n%s\nbi:\n%s\n' % (self.Wih, self.Wix,
                                                     self.bi)
        result += 'Woh:\n%s\nWox:\n%s\nbo:\n%s\n' % (self.Woh, self.Wox,
                                                     self.bo)
        result += 'Wch:\n%s\nWcx:\n%s\nbc:\n%s\n' % (self.Wch, self.Wcx,
                                                     self.bc)
        result += 'Wfh_grad:\n%s\nWfx_grad:\n%s\nbf_grad:\n%s\n' % (
            self.Wfh_grad, self.Wfx_grad, self.bf_grad)
        result += 'Wih_grad:\n%s\nWix_grad:\n%s\nbi_grad:\n%s\n' % (
            self.Wih_grad, self.Wix_grad, self.bi_grad)
        result += 'Woh_grad:\n%s\nWox_grad:\n%s\nbo_grad:\n%s\n' % (
            self.Woh_grad, self.Wox_grad, self.bo_grad)
        result += 'Wch_grad:\n%s\nWcx_grad:\n%s\nbc_grad:\n%s\n' % (
            self.Wch_grad, self.Wcx_grad, self.bc_grad)
        return result
Example #9
class LstmLayer(object):
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # activation function for the gates
        self.gate_activator = SigmoidActivator()
        # activation function for the output
        self.output_activator = TanhActivator()
        # initialize the current time step to t0
        self.times = 0
        # cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # output vectors h at each time step
        self.h_list = self.init_state_vec()
        # forget gate f at each time step
        self.f_list = self.init_state_vec()
        # input gate i at each time step
        self.i_list = self.init_state_vec()
        # output gate o at each time step
        self.o_list = self.init_state_vec()
        # candidate state c~ at each time step
        self.ct_list = self.init_state_vec()

        # forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        # input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        # output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        # cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    ## initialize the lists that store the various intermediate state vectors
    def init_state_vec(self):
        '''
        Initialize the list that stores state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    ## initialize the various weight matrices
    def init_weight_mat(self):
        '''
        Initialize weight matrices
        '''
        Wh = np.random.uniform(-1e-4, 1e-4, (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4, (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    ## the forward method implements the LSTM forward pass
    def forward(self, x):
        '''
        Forward pass following Eqs. 1 through 6
        '''
        self.times += 1
        # forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh, self.bf, self.gate_activator)
        self.f_list.append(fg)
        # input gate
        ig = self.calc_gate(x, self.Wix, self.Wih, self.bi, self.gate_activator)
        self.i_list.append(ig)
        # output gate
        og = self.calc_gate(x, self.Wox, self.Woh, self.bo, self.gate_activator)
        self.o_list.append(og)
        # candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch, self.bc, self.output_activator)
        self.ct_list.append(ct)
        # cell state: Ct = Ft * Ct-1 + It * c~t
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # output: Ht = Ot * tanh(Ct)
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    ## All gates share the same computation, and the gates and c~t differ only in their
    ## activation function, so calc_gate factors this out and removes a lot of duplicated code.
    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute a gate activation
        '''
        h = self.h_list[self.times - 1]  # previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    ## The backward method implements the LSTM backpropagation algorithm. Note that the internal
    ## state variables related to backward are only initialized once backward is called.
    ## One benefit of this lazy initialization is that if the LSTM is only used for inference,
    ## these variables never need to be allocated, which saves a lot of memory.
    def backward(self, x, delta_h, activator):
        '''
        Implements the LSTM training algorithm in two steps:
        STEP 1: compute the error terms
        STEP 2: compute the gradients
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    ## update the weights by gradient descent
    def update(self):
        '''
        Update the weights with gradient descent
        '''
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    ## STEP 1: compute the error terms
    def calc_delta(self, delta_h, activator):
        # initialize the error terms at each time step
        self.delta_h_list = self.init_delta()  # output error terms
        self.delta_o_list = self.init_delta()  # output gate error terms
        self.delta_i_list = self.init_delta()  # input gate error terms
        self.delta_f_list = self.init_delta()  # forget gate error terms
        self.delta_ct_list = self.init_delta()  # candidate state error terms
        # save the error term for the current time step passed down from the layer above
        # by reassigning the last entry to delta_h
        self.delta_h_list[-1] = delta_h
        # iterate backwards to compute the error term at each time step
        for k in range(self.times, 0, -1):  # [times, times-1, ..., 1]
            self.calc_delta_k(k)

    ## initialize the various error terms
    def init_delta(self):
        '''
        Initialize the error terms, all set to zero
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    ## compute the error terms at time k
    def calc_delta_k(self, k):
        '''
        Given delta_h at time k, compute delta_f, delta_i, delta_o and delta_ct
        at time k, and delta_h at time k-1
        '''
        # fetch the values computed in the forward pass at time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]
        # compute delta_o, delta_f, delta_i and delta_ct following Eqs. 9-12
        delta_o = (delta_k * tanh_c * self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) * c_prev * self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) * ct * self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) * ig * self.output_activator.backward(ct))
        # compute delta_h[k-1] following Eq. 8
        delta_h_prev = (
            np.dot(delta_o.transpose(), self.Woh) +
            np.dot(delta_i.transpose(), self.Wih) +
            np.dot(delta_f.transpose(), self.Wfh) +
            np.dot(delta_ct.transpose(), self.Wch)
        ).transpose()

        # store all the delta values
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    ## STEP 2: compute the gradients
    def calc_gradient(self, x):
        # initialize the forget gate weight gradient matrices and bias gradient
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = (self.init_weight_gradient_mat())
        # initialize the input gate weight gradient matrices and bias gradient
        self.Wih_grad, self.Wix_grad, self.bi_grad = (self.init_weight_gradient_mat())
        # initialize the output gate weight gradient matrices and bias gradient
        self.Woh_grad, self.Wox_grad, self.bo_grad = (self.init_weight_gradient_mat())
        # initialize the cell state weight gradient matrices and bias gradient
        self.Wch_grad, self.Wcx_grad, self.bc_grad = (self.init_weight_gradient_mat())
        # gradients of the weights applied to the previous output h
        for t in range(self.times, 0, -1):
            # gradient contribution at each time step
            (Wfh_grad, bf_grad,
             Wih_grad, bi_grad,
             Woh_grad, bo_grad,
             Wch_grad, bc_grad) = (self.calc_gradient_t(t))
            # the actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad
        # gradients of the weights applied to the current input x
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    ## initialize the weight gradient matrices
    def init_weight_gradient_mat(self):
        '''
        Initialize weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    ## compute the weight gradients at time step t
    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients at time step t
        '''
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, Woh_grad, bo_grad, Wch_grad, bc_grad

    ## Like RecurrentLayer, we need to support resetting the internal state for gradient checking:
    def reset_state(self):
        # reset the current time step to t0
        self.times = 0
        # cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # output vectors h at each time step
        self.h_list = self.init_state_vec()
        # forget gate f at each time step
        self.f_list = self.init_state_vec()
        # input gate i at each time step
        self.i_list = self.init_state_vec()
        # output gate o at each time step
        self.o_list = self.init_state_vec()
        # candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
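
The last comment mentions gradient checking, which is what reset_state exists for. A minimal sketch of how it might be used, with the toy error function E = sum(h), the perturbed index, and the epsilon all chosen purely for illustration:

import numpy as np

def gradient_check(lstm, x, epsilon=1e-4):
    # toy error E = sum(h), so dE/dh is a vector of ones
    sensitivity = np.ones((lstm.state_width, 1))
    lstm.forward(x)
    lstm.backward(x, sensitivity, lstm.gate_activator)
    analytic = lstm.Wfh_grad[0, 0]
    # numeric estimate by central difference on one entry of Wfh
    lstm.Wfh[0, 0] += epsilon
    lstm.reset_state()
    lstm.forward(x)
    err1 = lstm.h_list[-1].sum()
    lstm.Wfh[0, 0] -= 2 * epsilon
    lstm.reset_state()
    lstm.forward(x)
    err2 = lstm.h_list[-1].sum()
    lstm.Wfh[0, 0] += epsilon  # restore the weight
    numeric = (err1 - err2) / (2 * epsilon)
    print('expected %f, actual %f' % (numeric, analytic))
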
class LstmLayer():
    def __init__(self, input_width, state_width, output_width,
                 learning_rate, penaltyL2, momentum):
        self.input_width = input_width
        self.state_width = state_width
        self.output_width = output_width
        self.learning_rate = learning_rate
        self.penaltyL2 = penaltyL2
        self.momentum = momentum
        # activation function for the gates
        self.gate_activator = SigmoidActivator()
        # activation function for the output
        self.output_activator = TanhActivator()
        self.class_activator = SoftmaxActivator()
        # forget gate weight matrices Wfh, Wfx, bias bf, and their velocities
        self.Wfh, self.Wfx, self.bf, self.vWfh, self.vWfx, self.vbf = self.init_weight_mat(0)
        # input gate weight matrices Wih, Wix, bias bi, and their velocities
        self.Wih, self.Wix, self.bi, self.vWih, self.vWix, self.vbi = self.init_weight_mat(0)
        # output gate weight matrices Woh, Wox, bias bo, and their velocities
        self.Woh, self.Wox, self.bo, self.vWoh, self.vWox, self.vbo = self.init_weight_mat(0)
        # cell state weight matrices Wch, Wcx, bias bc, and their velocities
        self.Wch, self.Wcx, self.bc, self.vWch, self.vWcx, self.vbc = self.init_weight_mat(0)
        # output layer weights Wy and bias by
        self.Wy, self.by, self.vWy, self.vby = self.init_weight_mat(1)


    def init_weight_mat(self, i):
        '''
        Initialize weight matrices (i < 1: gate/cell weights; otherwise: output layer weights)
        '''
        if i < 1:
            Wh = np.mat(np.random.uniform(-0.5, 0.5,
                (self.state_width, self.state_width))) / self.state_width
            vWh = np.mat(np.zeros(Wh.shape))
            Wx = np.mat(np.random.uniform(-0.5, 0.5,
                (self.state_width, self.input_width))) / self.input_width
            vWx = np.mat(np.zeros(Wx.shape))
            b = np.mat(np.random.uniform(-0.5, 0.5, (self.state_width, 1))) / self.state_width
            vb = np.mat(np.zeros(b.shape))
            return Wh, Wx, b, vWh, vWx, vb
        else:
            Wy = np.mat(np.random.uniform(-0.5, 0.5,
                (self.output_width, self.state_width))) / self.output_width
            vWy = np.mat(np.zeros(Wy.shape))
            by = np.mat(np.random.uniform(-0.5, 0.5, (self.output_width, 1))) / self.output_width
            vby = np.mat(np.zeros(by.shape))
            return Wy, by, vWy, vby

    def forward(self, x):
        '''
        Forward pass following Eqs. 1 through 6
        '''
        self.x = x
        self.reset_state()
        n = x.shape[0]
        for i in range(n):
            # forget gate
            fg = self.calc_gate(x[i], self.Wfx, self.Wfh,
                self.bf, self.gate_activator, i)
            self.f_list[i] = fg
            # input gate
            ig = self.calc_gate(x[i], self.Wix, self.Wih,
                self.bi, self.gate_activator, i)
            self.i_list[i] = ig
            # output gate
            og = self.calc_gate(x[i], self.Wox, self.Woh,
                self.bo, self.gate_activator, i)
            self.o_list[i] = og
            # candidate state
            ct = self.calc_gate(x[i], self.Wcx, self.Wch,
                self.bc, self.output_activator, i)
            self.ct_list[i] = ct
            # cell state
            if i == 0:
                c = np.multiply(ig, ct)
            else:
                c = np.multiply(fg, self.c_list[i - 1]) + np.multiply(ig, ct)
            self.c_list[i] = c
            # output
            h = np.multiply(og, self.output_activator.forward(c))
            self.h_list[i] = h
            y = self.class_activator.forward(self.Wy * h.T + self.by)
            self.y_list[i] = y.T

    def reset_state(self):
        # cell state vectors c at each time step
        self.c_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # output vectors h at each time step
        self.h_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # forget gate f at each time step
        self.f_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # input gate i at each time step
        self.i_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # output gate o at each time step
        self.o_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # candidate state c~ at each time step
        self.ct_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        self.y_list = np.mat(np.zeros((self.x.shape[0], self.output_width)))

    def calc_gate(self, x, Wx, Wh, b, activator, i):
        '''
        Compute a gate activation
        '''
        if i == 0:
            h = np.mat(np.zeros((1, self.state_width)))
        else:
            h = self.h_list[i-1]  # previous LSTM output
        net = (Wh * h.T + Wx * x.T + b).T
        gate = activator.forward(net)
        return gate


    def backward(self, e):
        '''
        Implements the LSTM training algorithm
        '''
        self.e = e
        self.calc_delta()
        self.calc_gradient()
        self.update()

    def update(self):
        '''
        Update the weights with gradient descent (momentum plus L2 penalty)
        '''
        self.vWfh = self.momentum * self.vWfh - self.learning_rate * \
                (self.Wfh_grad + self.penaltyL2 * self.Wfh)
        self.vWfx = self.momentum * self.vWfx - self.learning_rate * \
                        (self.Wfx_grad + self.penaltyL2 * \
                         np.concatenate((np.mat(np.zeros((self.Wfx.shape[0],1))),self.Wfx[:,1:]),axis=1))
        self.vbf = self.momentum * self.vbf - self.learning_rate * self.bf_grad

        self.vWih = self.momentum * self.vWih - self.learning_rate * \
                        (self.Wih_grad + self.penaltyL2 * self.Wih)
        self.vWix = self.momentum * self.vWix - self.learning_rate * \
                        (self.Wix_grad + self.penaltyL2 * \
                         np.concatenate((np.mat(np.zeros((self.Wix.shape[0],1))),self.Wix[:,1:]),axis=1))
        self.vbi = self.momentum * self.vbi - self.learning_rate * self.bi_grad

        self.vWoh = self.momentum * self.vWoh - self.learning_rate * \
                        (self.Woh_grad + self.penaltyL2 * self.Woh)
        self.vWox = self.momentum * self.vWox - self.learning_rate * \
                        (self.Wox_grad + self.penaltyL2 * \
                         np.concatenate((np.mat(np.zeros((self.Wox.shape[0],1))),self.Wox[:,1:]),axis=1))
        self.vbo = self.momentum * self.vbo - self.learning_rate * self.bo_grad

        self.vWch = self.momentum * self.vWch - self.learning_rate * \
                        (self.Wch_grad + self.penaltyL2 * self.Wch)
        self.vWcx = self.momentum * self.vWcx - self.learning_rate * \
                        (self.Wcx_grad + self.penaltyL2 * \
                         np.concatenate((np.mat(np.zeros((self.Wcx.shape[0],1))),self.Wcx[:,1:]),axis=1))
        self.vbc = self.momentum * self.vbc - self.learning_rate * self.bc_grad

        self.vWy = self.momentum * self.vWy - self.learning_rate * \
                       (self.Wy_grad + self.penaltyL2 * self.Wy)
        self.vby = self.momentum * self.vby - self.learning_rate * self.by_grad


        self.Wfh += self.vWfh
        self.Wfx += self.vWfx
        self.bf  += self.vbf
        self.Wih += self.vWih
        self.Wix += self.vWix
        self.bi  += self.vbi
        self.Woh += self.vWoh
        self.Wox += self.vWox
        self.bo  += self.vbo
        self.Wch += self.vWch
        self.Wcx += self.vWcx
        self.bc  += self.vbc
        self.Wy  += self.vWy
        self.by  += self.vby

    def calc_delta(self):
        # initialize the error terms at each time step
        self.delta_h_list = self.init_delta()  # output error terms
        self.delta_o_list = self.init_delta()  # output gate error terms
        self.delta_i_list = self.init_delta()  # input gate error terms
        self.delta_f_list = self.init_delta()  # forget gate error terms
        self.delta_ct_list = self.init_delta() # candidate state error terms
        self.delta_c_list = self.init_delta()  # cell state c error terms
        self.delta_h_list[-1] = self.e[-1] * self.Wy
        a = self.output_activator.backward(self.output_activator.forward(self.c_list[-1]))
        # elementwise product of all three factors (np.multiply only takes two inputs;
        # a third positional argument would be treated as the `out` array)
        self.delta_c_list[-1] = np.multiply(np.multiply(self.delta_h_list[-1], self.o_list[-1]), a)
        m = self.e.shape[0]
        for k in range(m-1, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        '''
        初始化误差项
        '''
        delta_list = np.mat(np.zeros((self.e.shape[0],self.state_width)))
        return delta_list

    def calc_delta_k(self, k):
        '''
        Given delta_h at time k, compute delta_f, delta_i,
        delta_o and delta_ct at time k, and delta_h at time k-1
        '''
        # fetch the values computed in the forward pass at time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k-1]
        tan_c = self.output_activator.forward(c)
        delta_h = self.delta_h_list[k]
        delta_c = self.delta_c_list[k]
        delta_y = self.e[k-1]
        # compute delta_o following Eq. 9
        gate_o = np.multiply(tan_c, self.gate_activator.backward(og))
        delta_o = np.multiply(delta_h, gate_o)
        gate_f = np.multiply(c_prev, self.gate_activator.backward(fg))
        delta_f = np.multiply(delta_c, gate_f)
        gate_i = np.multiply(ct, self.gate_activator.backward(ig))
        delta_i = np.multiply(delta_c, gate_i)
        gate_c = np.multiply(ig, self.output_activator.backward(ct))
        delta_ct = np.multiply(delta_c, gate_c)

        delc = np.multiply(og, self.output_activator.backward(tan_c))
        delta_h_prev = np.multiply(delta_h, (gate_o * self.Woh + \
                np.multiply(gate_i, delc) * self.Wih + \
                np.multiply(gate_f, delc) * self.Wfh + \
                np.multiply(gate_c, delc) * self.Wch)) + delta_y * self.Wy
        delc1 = np.multiply(self.o_list[k-1],self.output_activator.backward(self.output_activator.forward(c_prev)))
        delta_c_prev = np.multiply(delta_c, fg) + np.multiply(delta_h_prev, delc1)
        # store all the delta values
        self.delta_h_list[k-1] = delta_h_prev
        self.delta_c_list[k-1] = delta_c_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self):
        # initialize the forget gate weight gradient matrices and bias gradient
        Wfh_grad, Wfx_grad, bf_grad = (
            self.init_weight_gradient_mat(0))
        # initialize the input gate weight gradient matrices and bias gradient
        Wih_grad, Wix_grad, bi_grad = (
            self.init_weight_gradient_mat(0))
        # initialize the output gate weight gradient matrices and bias gradient
        Woh_grad, Wox_grad, bo_grad = (
            self.init_weight_gradient_mat(0))
        # initialize the cell state weight gradient matrices and bias gradient
        Wch_grad, Wcx_grad, bc_grad = (
            self.init_weight_gradient_mat(0))
        Wy_grad, by_grad = (self.init_weight_gradient_mat(1))
        m = self.e.shape[0]
        # gradients of the weights applied to the previous output h
        for t in range(m-1, 0, -1):
            h = self.h_list[t]
            h_prev = self.h_list[t - 1]
            x = self.x[t]
            Wfh_grad += self.delta_f_list[t].T * h_prev
            Wfx_grad += self.delta_f_list[t].T * x
            bf_grad += self.delta_f_list[t].T
            Wih_grad += self.delta_i_list[t].T * h_prev
            Wix_grad += self.delta_i_list[t].T * x
            bi_grad += self.delta_i_list[t].T
            Woh_grad += self.delta_o_list[t].T * h_prev
            Wox_grad += self.delta_o_list[t].T * x
            bo_grad += self.delta_o_list[t].T
            Wch_grad += self.delta_ct_list[t].T * h_prev
            Wcx_grad += self.delta_ct_list[t].T * x
            bc_grad += self.delta_ct_list[t].T
            Wy_grad += self.e[t].T * h
            by_grad += self.e[t].T

        self.Wfh_grad = Wfh_grad/(m-1)
        self.Wfx_grad = Wfx_grad/m
        self.bf_grad = bf_grad/m
        self.Wih_grad = Wih_grad/(m-1)
        self.Wix_grad = Wix_grad/m
        self.bi_grad = bi_grad/m
        self.Woh_grad = Woh_grad/(m-1)
        self.Wox_grad = Wox_grad/m
        self.bo_grad = bo_grad/m
        self.Wch_grad = Wch_grad/(m-1)
        self.Wcx_grad = Wcx_grad/m
        self.bc_grad = bc_grad/m
        self.Wy_grad = Wy_grad/m
        self.by_grad = by_grad/m

    def init_weight_gradient_mat(self,i):
        '''
        Initialize weight gradient matrices
        '''
        if i<1:
            Wh_grad = np.mat(np.zeros((self.state_width, self.state_width)))
            Wx_grad = np.mat(np.zeros((self.state_width, self.input_width)))
            b_grad = np.mat(np.zeros((self.state_width, 1)))
            return Wh_grad, Wx_grad, b_grad
        else:
            Wy_grad = np.mat(np.zeros((self.output_width, self.state_width)))
            by_grad = np.mat(np.zeros((self.output_width, 1)))
            return Wy_grad, by_grad
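
The momentum variant also references a SoftmaxActivator that none of the snippets define. A plausible minimal sketch, assuming the column-vector input that forward passes in (the body is guessed, not from the source):

import numpy as np

class SoftmaxActivator(object):
    def forward(self, weighted_input):
        # shift by the max for numerical stability, then normalize
        shifted = weighted_input - np.max(weighted_input)
        exps = np.exp(shifted)
        return exps / np.sum(exps)

No backward is sketched because the snippets never call one: the e handed to backward apparently already plays the role of the output-layer error (e.g. prediction minus target under softmax cross-entropy), so the softmax derivative never appears explicitly.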