def step(self, x, states):
    """
    All of the LSTM/decoder equations for this cell live here.
    :param x: input at the current timestep
    :param states: the previous timestep's output and hidden state s_t
    :return: the output y_t and the new states [y_t, s_t]
    """
    ytm, stm = states

    # repeat the hidden state to the length of the sequence,
    # i.e. tile it n times along the step dimension: (sample, step, dim)
    _stm = K.repeat(stm, self.timesteps)

    # now multiply the repeated hidden state by the weight matrix
    _Wxstm = K.dot(_stm, self.W_a)

    # calculate the attention probabilities;
    # these measure how much the other timesteps contribute to this one
    et = K.dot(activations.tanh(_Wxstm + self._uxpb),
               K.expand_dims(self.V_a))

    # softmax over the timestep axis
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps)
    at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    # ~~~> calculate the new hidden state
    # first calculate the "r" (reset) gate:
    rt = activations.sigmoid(
        K.dot(ytm, self.W_r)
        + K.dot(stm, self.U_r)
        + K.dot(context, self.C_r)
        + self.b_r)

    # now calculate the "z" (update) gate:
    zt = activations.sigmoid(
        K.dot(ytm, self.W_z)
        + K.dot(stm, self.U_z)
        + K.dot(context, self.C_z)
        + self.b_z)

    # calculate the proposal (candidate) hidden state:
    s_tp = activations.tanh(
        K.dot(ytm, self.W_p)
        + K.dot((rt * stm), self.U_p)
        + K.dot(context, self.C_p)
        + self.b_p)

    # new hidden state:
    st = (1 - zt) * stm + zt * s_tp

    # output distribution for this timestep:
    yt = activations.softmax(
        K.dot(ytm, self.W_o)
        + K.dot(stm, self.U_o)
        + K.dot(context, self.C_o)
        + self.b_o)

    if self.return_probabilities:
        return at, [yt, st]
    else:
        return yt, [yt, st]
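# A minimal NumPy sketch (separate from the layer above) of the attention
# computation inside step(): energies e_t, normalized weights a_t, and the
# context vector. Weight names (W_a, U_a, V_a) mirror the layer, but all
# tensors here are random placeholders purely for illustration.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

batch, timesteps, enc_dim, dec_dim = 2, 5, 8, 4
x_seq = np.random.randn(batch, timesteps, enc_dim)   # encoder outputs
stm   = np.random.randn(batch, dec_dim)              # previous decoder state

W_a = np.random.randn(dec_dim, dec_dim)
U_a = np.random.randn(enc_dim, dec_dim)
V_a = np.random.randn(dec_dim)

# e_t[i] = V_a . tanh(W_a s_{t-1} + U_a x_i), the same form as in step()
_stm  = np.repeat(stm[:, None, :], timesteps, axis=1)   # (batch, timesteps, dec_dim)
_uxpb = x_seq @ U_a                                     # precomputable term
et    = np.tanh(_stm @ W_a + _uxpb) @ V_a               # (batch, timesteps)

at = softmax(et, axis=1)                                # attention weights
context = (at[:, :, None] * x_seq).sum(axis=1)          # (batch, enc_dim)
print(at.shape, context.shape)                          # (2, 5) (2, 8)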
def step(self, x, states, training=None):
    h_tm1 = states[0]
    c_tm1 = states[1]
    x_seq = states[2]

    # repeat the hidden state to the length of the sequence
    _htm = K.repeat(h_tm1, self.time_step_e)  # (batch, time_step, units)

    # concatenate the repeated hidden state with the encoder output sequence
    concatenate = K.concatenate(
        [_htm, x_seq], axis=-1)  # (batch, time_step, h_units + x_seq_units)

    # now multiply the concatenation by the weight matrix:
    # apply a dense layer over the time dimension of the sequence;
    # doing it here in one shot (rather than per encoder timestep)
    # saves computation time:
    dot_dense = time_distributed_dense(
        concatenate,
        self.W_cx,
        b=self.b_cx,
        input_dim=self.input_dim_e + self.units,
        timesteps=self.time_step_e,
        output_dim=self.atten_units)  # (samples, time_step, atten_units)

    # we need the full sequence of encoder inputs at every step (as the attention vector).
    # calculate the attention probabilities;
    # these measure how much the other timesteps contribute to this one
    et = K.dot(
        K.relu(dot_dense),           # (batch, time_step, atten_units)
        K.expand_dims(self.C_cx))

    at = K.exp(et)  # (batch, time_step, 1)
    at_sum = K.cast(K.sum(at, axis=1) + K.epsilon(), K.floatx())  # (batch, 1)
    at_sum_repeated = K.repeat(at_sum, self.time_step_e)          # (batch, time_step, 1)
    at /= at_sum_repeated  # vector of size (batchsize, time_steps, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, x_seq, axes=1), axis=1)   # (batchsize, input_dim)

    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            K.ones_like(context),
            self.dropout,
            training=training,
            count=4)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            K.ones_like(states[0]),
            self.recurrent_dropout,
            training=training,
            count=4)

    # dropout matrices for input units
    B_W = self._dropout_mask
    # dropout matrices for recurrent units
    B_U = self._recurrent_dropout_mask

    # ~~~> calculate the new hidden state
    yhat_i = K.dot(x, self.V_i)  # (batchsize, units)
    yhat_f = K.dot(x, self.V_f)  # (batchsize, units)
    yhat_c = K.dot(x, self.V_c)  # (batchsize, units)
    yhat_o = K.dot(x, self.V_o)  # (batchsize, units)

    if 0 < self.dropout < 1.:
        x_i = K.dot(context * B_W[0], self.W_i) + self.b_i  # (batchsize, units)
        x_f = K.dot(context * B_W[1], self.W_f) + self.b_f  # (batchsize, units)
        x_c = K.dot(context * B_W[2], self.W_c) + self.b_c  # (batchsize, units)
        x_o = K.dot(context * B_W[3], self.W_o) + self.b_o  # (batchsize, units)
    else:
        x_i = K.dot(context, self.W_i) + self.b_i  # (batchsize, units)
        x_f = K.dot(context, self.W_f) + self.b_f  # (batchsize, units)
        x_c = K.dot(context, self.W_c) + self.b_c  # (batchsize, units)
        x_o = K.dot(context, self.W_o) + self.b_o  # (batchsize, units)

    if 0 < self.recurrent_dropout < 1.:
        h_tm1_i = K.dot(h_tm1 * B_U[0], self.U_i)  # (batchsize, units)
        h_tm1_f = K.dot(h_tm1 * B_U[1], self.U_f)  # (batchsize, units)
        h_tm1_c = K.dot(h_tm1 * B_U[2], self.U_c)  # (batchsize, units)
        h_tm1_o = K.dot(h_tm1 * B_U[3], self.U_o)  # (batchsize, units)
    else:
        h_tm1_i = K.dot(h_tm1, self.U_i)  # (batchsize, units)
        h_tm1_f = K.dot(h_tm1, self.U_f)  # (batchsize, units)
        h_tm1_c = K.dot(h_tm1, self.U_c)  # (batchsize, units)
        h_tm1_o = K.dot(h_tm1, self.U_o)  # (batchsize, units)

    # standard LSTM gate equations, with the attention context as the input stream
    i = self.recurrent_activation(x_i + h_tm1_i + yhat_i)  # (batchsize, units)
    f = self.recurrent_activation(x_f + h_tm1_f + yhat_f)  # (batchsize, units)
    o = self.recurrent_activation(x_o + h_tm1_o + yhat_o)  # (batchsize, units)
    c_ = self.activation(x_c + h_tm1_c + yhat_c)           # (batchsize, units)
    c = f * c_tm1 + i * c_
    h = o * self.activation(c)                             # (batchsize, units)

    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True

    # apply a maxout layer with dropout
    maxout = self.max_out(inputs=K.dot(h, self.U_p), num_units=self.gmax)
    drop = Dropout(0.3)(maxout)

    # apply softmax to obtain the output distribution
    _y_hat = activations.softmax(K.dot(drop, self.M_p) + self.b_p)

    if self.return_probabilities:
        return at, [h, c]
    else:
        return _y_hat, [h, c]
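# A minimal NumPy sketch of the gate arithmetic in the step() above: the cell
# sums three projected input streams (attention context, previous hidden
# state, and the previous output fed back in as x) before each nonlinearity.
# All weights are random placeholders; dropout, the maxout layer and the
# softmax readout are omitted here for brevity.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

batch, units, ctx_dim, out_dim = 2, 6, 8, 5
context = np.random.randn(batch, ctx_dim)   # attention-weighted encoder summary
h_tm1   = np.random.randn(batch, units)     # previous hidden state
c_tm1   = np.random.randn(batch, units)     # previous cell state
x       = np.random.randn(batch, out_dim)   # previous output (embedded)

W = {g: np.random.randn(ctx_dim, units) for g in "ifco"}  # context -> gates
U = {g: np.random.randn(units, units)   for g in "ifco"}  # recurrent -> gates
V = {g: np.random.randn(out_dim, units) for g in "ifco"}  # prev. output -> gates

pre = {g: context @ W[g] + h_tm1 @ U[g] + x @ V[g] for g in "ifco"}
i, f, o = sigmoid(pre["i"]), sigmoid(pre["f"]), sigmoid(pre["o"])
c = f * c_tm1 + i * np.tanh(pre["c"])
h = o * np.tanh(c)
print(h.shape, c.shape)   # (2, 6) (2, 6)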
def call(self, inputs, states, constants):
    '''
    This call function is invoked (and wrapped) by the enclosing RNN layer,
    which is what makes the constants argument available here.
    :param inputs: [wt; v_g], with dimension self.input_dim
    :param states: the previous step's ht and mt
    :param constants: the cnn_encoder outputs
    :return:
    '''
    h_tm = states[0]  # last hidden state
    m_tm = states[1]  # last memory cell
    self.v_seq = constants[0]  # [self.cnn_encoder_k, self.units], self.units = cnn_encoder_d

    """ f-gate (forget gate) """
    ft = activations.sigmoid(
        K.dot(h_tm, self.W_f) + K.dot(inputs, self.U_f) + self.b_f)

    """ i-gate (input gate) """
    it = activations.sigmoid(
        K.dot(h_tm, self.W_i) + K.dot(inputs, self.U_i) + self.b_i)

    """ o-gate (output gate) """
    ot = activations.sigmoid(
        K.dot(h_tm, self.W_o) + K.dot(inputs, self.U_o) + self.b_o)

    """ g-gate (sentinel gate) """
    gt = activations.sigmoid(
        K.dot(h_tm, self.W_g) + K.dot(inputs, self.U_g) + self.b_g)

    """ at - candidate input """
    at = activations.tanh(
        K.dot(h_tm, self.W_a) + K.dot(inputs, self.U_a) + self.b_a)

    """ mt - memory cell """
    mt = m_tm * ft + it * at

    """ ht - hidden state """
    ht = ot * activations.tanh(mt)

    """ st - visual sentinel """
    st = gt * activations.tanh(mt)

    """ ct - visual context """
    st = K.expand_dims(st, axis=1)
    # Merge st in so that the attention weights are computed over everything
    # together: [?, k+1, d], d = self.units. This differs slightly from the
    # treatment in the paper.
    self.v_expand = K.concatenate([self.v_seq, st], axis=1)
    # one_matrix = K.ones((self.cnn_encoder_k + 1, 1))
    vtt = K.dot(self.v_expand, self.W_z)
    dtt = K.repeat(K.dot(ht, self.U_z), self.cnn_encoder_k + 1)  # (?, k + 1, k + 1)
    tantt = K.tanh(vtt + dtt)
    zt = K.dot(tantt, self.W_h)
    alpha_t = activations.softmax(zt)  # (?, k + 1, 1)
    # alpha_t = K.expand_dims(alpha_t)  # (?, k + 1, 1)

    # Take the weighted sum directly over st, v1, ..., vk, again slightly
    # different from the paper: (?, k + 1, units) -> output (?, units)
    ct = K.squeeze(K.batch_dot(alpha_t, self.v_expand, axes=1), axis=1)  # batch_dot over the k + 1 axis
    ht_plus_ct = ht + ct

    return ht_plus_ct, [ht, mt]
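# A minimal NumPy sketch of the adaptive-attention pooling at the end of the
# call() above: the visual sentinel s_t is appended to the k spatial CNN
# features and a single softmax over the k + 1 slots yields the mixing
# weights (the simplification noted in the comments, rather than a separate
# beta_t as in the paper). Shapes and weights are illustrative placeholders.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

batch, k, d, att = 2, 7, 6, 4
v_seq = np.random.randn(batch, k, d)     # spatial CNN features v_1 .. v_k
st    = np.random.randn(batch, d)        # visual sentinel
ht    = np.random.randn(batch, d)        # current hidden state

W_z = np.random.randn(d, att)
U_z = np.random.randn(d, att)
W_h = np.random.randn(att)

v_expand = np.concatenate([v_seq, st[:, None, :]], axis=1)      # (batch, k+1, d)
zt = np.tanh(v_expand @ W_z + (ht @ U_z)[:, None, :]) @ W_h     # (batch, k+1)
alpha_t = softmax(zt, axis=1)                                   # attention over k+1 slots
ct = (alpha_t[:, :, None] * v_expand).sum(axis=1)               # (batch, d)
print(alpha_t.shape, ct.shape)                                  # (2, 8) (2, 6)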
def call(self, inputs):
    return activations.softmax(inputs, axis=self.axis)
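# A tiny NumPy illustration of what the axis argument in the call() above
# changes: softmax over axis=1 normalizes across timesteps rather than across
# the feature dimension, which is what attention weights need.
import numpy as np

x = np.random.randn(2, 5, 3)                # (batch, timesteps, features)
e = np.exp(x - x.max(axis=1, keepdims=True))
w = e / e.sum(axis=1, keepdims=True)        # each timestep column sums to 1
print(np.allclose(w.sum(axis=1), 1.0))      # True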