def call(self, inputs, **kwargs):
    """Run a forward and a backward cell over ``inputs`` and join the results.

    ``self.cell_f`` and ``self.cell_b`` are each driven through ``K.rnn``; the
    second pass uses ``go_backwards=True``.  Extra keyword arguments are
    forwarded unchanged to each cell's ``call``.

    Returns:
        The concatenated per-step outputs when ``self.return_sequences`` is
        truthy, otherwise the concatenation of the two last outputs.
    """
    timesteps = K.int_shape(inputs)[1]

    def make_step(cell):
        # Adapter exposing ``cell.call`` with the (inputs, states) signature
        # that K.rnn expects from a step function.
        def step(step_inputs, step_states):
            step_output, next_states = cell.call(step_inputs, step_states,
                                                 **kwargs)
            return step_output, next_states
        return step

    forward_init = self.cell_f.get_initial_state(inputs)
    backward_init = self.cell_b.get_initial_state(inputs)

    last_f, seq_f, _ = K.rnn(make_step(self.cell_f), inputs,
                             initial_states=forward_init,
                             go_backwards=False,
                             input_length=timesteps)
    last_b, seq_b, _ = K.rnn(make_step(self.cell_b), inputs,
                             initial_states=backward_init,
                             go_backwards=True,
                             input_length=timesteps)

    last_output = K.concatenate([last_f, last_b])
    # NOTE(review): seq_b is concatenated exactly as K.rnn returned it; it is
    # not reversed along the time axis first - confirm this is intended.
    outputs = K.concatenate([seq_f, seq_b])

    return outputs if self.return_sequences else last_output
def call(self, inputs):
    """Compute one attention-weighted encoder context per decoder step.

    inputs: [encoder_output_sequence, decoder_output_sequence, encoder_last_state]

    The decoder sequence is scanned with ``K.rnn``.  At each step the state
    (initially ``encoder_last_state``, afterwards the previous decoder input)
    is scored against the projected encoder outputs, the scores are
    normalised, and the encoder outputs are summed with those weights.

    Returns:
        The sequence of weighted contexts produced by ``K.rnn``.
    """
    assert type(inputs) == list
    encoder_out_seq, decoder_out_seq, init_state = inputs

    projected_context = (K.dot(encoder_out_seq, self.W_projected)
                         + self.B_projected)
    dec_hidden = decoder_out_seq.shape[2]  # not used below
    timesteps = encoder_out_seq.shape[1]

    def step(step_input, states):
        prev_state = states[0]
        # Add a length-1 axis so the state broadcasts over encoder positions.
        projected_state = K.expand_dims(K.dot(prev_state, self.W_a), 1)
        energy = K.tanh(projected_context + projected_state)

        # Exponentiate and normalise; epsilon guards the division.
        weights = K.reshape(K.exp(K.dot(energy, self.V_a)), (-1, timesteps))
        weights = weights / (K.sum(weights, axis=1, keepdims=True)
                             + K.epsilon())

        context = K.sum(encoder_out_seq * weights[:, :, None], axis=1)
        # The current input becomes the state for the next step.
        return context, [step_input]

    _, weighted_contexts, _ = K.rnn(step, decoder_out_seq, [init_state])
    return weighted_contexts
def call(self, inputs, mask=None, training=None, initial_state=None,
         constants=None):
    """Run ``self.cell`` over ``inputs`` with ``K.rnn``.

    Args:
        inputs: Either the input sequence, or a list whose first element is
            the sequence, followed by initial states and - when
            ``self._num_constants`` is set - that many trailing constants.
        mask: Optional mask forwarded to ``K.rnn``.
        training: Accepted for interface compatibility; not used here.
        initial_state: Initial states; overridden when ``inputs`` is a list.
        constants: Constants forwarded to ``K.rnn``.  When omitted and
            ``inputs`` is a list carrying constants, they are taken from the
            tail of that list.

    Returns:
        The full output sequence when ``self.return_sequences`` is truthy,
        otherwise the last output.
    """
    num_constants = self._num_constants

    if isinstance(inputs, list):
        if num_constants:
            initial_state = inputs[1:-num_constants]
            # Previously the trailing constants were dropped, yet ``step``
            # still tried to split them off the states.
            if constants is None:
                constants = inputs[-num_constants:]
        else:
            # Covers both ``None`` and ``0``: ``inputs[1:-0]`` would be empty.
            initial_state = inputs[1:]
        inputs = inputs[0]

    timesteps = K.int_shape(inputs)[1]
    kwargs = {}

    def step(step_inputs, states):
        if not num_constants:
            # No constants are appended to ``states``; ``-None`` would raise
            # and ``-0`` would treat every state as a constant.
            return self.cell.call(step_inputs, states, **kwargs)
        # K.rnn appends the constants after the recurrent states.
        step_constants = states[-num_constants:]
        cell_states = states[:-num_constants]
        return self.cell.call(step_inputs, cell_states,
                              constants=step_constants, **kwargs)

    last_output, outputs, states = K.rnn(step,
                                         inputs,
                                         initial_state,
                                         constants=constants,
                                         go_backwards=self.go_backwards,
                                         mask=mask,
                                         unroll=False,
                                         input_length=timesteps)

    return outputs if self.return_sequences else last_output
def call(self, x, mask=None):
    """Scan ``x`` with ``self.step`` and return only the last output.

    The shapes quoted in the comments are the examples recorded by the
    original author, not values enforced by this method.

    Args:
        x: Input tensor (author's example: (1, 4, 512, 30, 40)).
        mask: Optional mask forwarded to ``K.rnn``.

    Returns:
        The last output of the recurrence, with a leading axis added when it
        has exactly three dimensions.
    """
    # Author's example: input_shape = (None, 4, 512, 30, 40)
    input_shape = self.input_spec[0].shape

    # Author's example: initial state shape = (1, 512, 30, 40)
    start_states = self.get_initial_states(x)
    step_constants = self.get_constants(x)
    # Author's example: preprocessed shape = (1, 4, 512, 30, 40)
    prepared = self.preprocess_input(x)

    last_output, _, _ = K.rnn(self.step,
                              prepared,
                              start_states,
                              go_backwards=False,
                              mask=mask,
                              constants=step_constants,
                              unroll=False,
                              input_length=input_shape[1])

    # Newer check via the static shape: add a leading axis to a 3-D result.
    if last_output.get_shape().ndims == 3:
        last_output = K.expand_dims(last_output, dim=0)

    print("Red attentive convLSTM cargada")
    return last_output
def dense_loss(self, y_true, y_pred):
    """Return ``log Z - target_score`` for one-hot ``y_true``.

    A timestep is treated as valid only when all of its scores in ``y_pred``
    are greater than -1e6; other timesteps are zeroed out.  The author's
    notes give the mask shape as [B, T, 1] and describe it as marking padded
    positions.
    """
    # Derive the mask from the scores and cast it to the float type.
    valid = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
    mask = K.cast(valid, K.floatx())

    # Score of the target path.
    y_true = y_true * mask
    y_pred = y_pred * mask
    target_score = self.target_score(y_true, y_pred)

    # Compute log Z recursively, starting from the first timestep.
    init_states = [y_pred[:, 0]]
    # Append the mask as an extra last-axis channel so that it travels into
    # the step function together with the scores.
    y_pred = K.concatenate([y_pred, mask], axis=2)
    remaining_steps = y_pred[:, 1:]
    input_length = K.int_shape(remaining_steps)[1]
    # K.rnn walks the remaining timesteps one at a time.
    log_norm, _, _ = K.rnn(self.log_norm_step,
                           remaining_steps,
                           init_states,
                           input_length=input_length)
    # ``log_norm`` is the log Z vector of the final step; reduce over axis 1.
    log_norm = tf.reduce_logsumexp(log_norm, 1)

    # Loss: -log p
    return log_norm - target_score
def call(self, x, mask=None):
    """Run the recurrence, optionally from caller-supplied initial states.

    Args:
        x: The input tensor, or a tuple/list whose first element is the
            input and whose remaining elements are custom initial states.
            Per the original comment the input is
            (nb_samples, time (padded with zeros), input_dim).
        mask: Optional mask; when a list is given only its first entry (the
            mask of the main input) is used.

    Returns:
        A list: the output sequence (or the last output when
        ``self.return_sequences`` is falsy) followed by the final states.

    Raises:
        Exception: On the TensorFlow backend when the number of timesteps is
            undefined, or when custom initial states are given to a stateful
            layer.
    """
    input_shape = self.input_spec[0].shape

    custom_initial = None
    if isinstance(x, (tuple, list)):
        custom_initial = x[1:]
        x = x[0]

    if K._BACKEND == 'tensorflow' and not input_shape[1]:
        raise Exception('When using TensorFlow, you should define '
                        'explicitly the number of timesteps of '
                        'your sequences.\n'
                        'If your first layer is an Embedding, '
                        'make sure to pass it an "input_length" '
                        'argument. Otherwise, make sure '
                        'the first layer has '
                        'an "input_shape" or "batch_input_shape" '
                        'argument, including the time axis. '
                        'Found input shape at layer ' + self.name +
                        ': ' + str(input_shape))

    # Precedence: custom states (rejected when stateful), then memorised
    # states, then freshly built ones.
    if custom_initial:
        if self.stateful:
            raise Exception(('Initial states should not be specified '
                             'for stateful LSTMs, since they would overwrite '
                             'the memorized states.'))
        initial_states = custom_initial
    elif self.stateful:
        initial_states = self.states
    else:
        initial_states = self.get_initial_states(x)

    constants = self.get_constants(x)
    preprocessed_input = self.preprocess_input(x)

    # Only the mask of the main input is used.
    if isinstance(mask, list):
        mask = mask[0]

    last_output, outputs, states = K.rnn(self.step,
                                         preprocessed_input,
                                         initial_states,
                                         go_backwards=self.go_backwards,
                                         mask=mask,
                                         constants=constants,
                                         unroll=self.unroll,
                                         input_length=input_shape[1])

    if self.stateful:
        # Record (old state, new state) pairs as updates.
        self.updates = []
        for index, new_state in enumerate(states):
            self.updates.append((self.states[index], new_state))

    head = outputs if self.return_sequences else last_output
    return [head] + states
def loss(self, y_true, y_pred):
    """Return ``log Z - path_score``; the target must be in one-hot form.

    When ``self.ignore_last_label`` is truthy the last label channel of
    ``y_true`` is used to build the mask (``1 - channel``); otherwise the
    mask is all ones.  Only the first ``self.num_labels`` channels of both
    tensors take part in the scores.
    """
    mask = (1 - y_true[:, :, -1:] if self.ignore_last_label
            else K.ones_like(y_pred[:, :, :1]))

    y_true = y_true[:, :, :self.num_labels]
    y_pred = y_pred[:, :, :self.num_labels]

    # Numerator (in log space).
    path_score = self.path_score(y_pred, y_true)

    # Initial state: the scores of the first timestep.
    init_states = [y_pred[:, 0]]
    # The mask rides along as an extra last-axis channel.
    y_pred = K.concatenate([y_pred, mask])
    # Z vector (in log space).
    log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states)
    # Z (in log space).
    log_norm = logsumexp(log_norm, 1, keepdims=True)

    # log Z minus the log numerator.
    return log_norm - path_score
def loss(self, y_true, y_pred):
    """Return ``log Z - path_score``; the target must be in one-hot form.

    When ``self.ignore_last_label`` is truthy, ``1 - y_true[:, 1:, -1]`` is
    handed to ``K.rnn`` as its mask; otherwise no mask is used.
    """
    if self.ignore_last_label:
        mask = 1 - y_true[:, 1:, -1]
    else:
        mask = None

    y_true = y_true[:, :, :self.num_labels]
    y_pred = y_pred[:, :, :self.num_labels]

    # Initial state: the scores of the first timestep.
    init_states = [y_pred[:, 0]]
    # Z vector (in log space).
    log_norm, _, _ = K.rnn(self.log_norm_step,
                           y_pred[:, 1:],
                           init_states,
                           mask=mask)
    # Z (in log space).
    log_norm = tf.math.reduce_logsumexp(log_norm, 1, keepdims=True)
    # Numerator (in log space).
    path_score = self.path_score(y_pred, y_true)

    # log Z minus the log numerator.
    return tf.math.subtract(log_norm, path_score)
def recursion(self, input_energy, mask=None, go_backwards=False,
              return_sequences=True, return_logZ=True, input_length=None):
    """Forward (alpha) or backward (beta) recursion.

    If `return_logZ = True`, compute the logZ, the normalization constant:

        Z = sum_{y1, y2, y3} exp(-E)    # energy
          = sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
          = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
                          sum_{y1} exp(-(u1' y1 + y1' W y2)))

    Denote:

        S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)),

        Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3))

        logS(y2) = log S(y2) = log_sum_exp(-(u1' y1 + y1' W y2))

    Note that:
        yi's are one-hot vectors
        u1, u3: boundary energies have been merged

    If `return_logZ = False`, compute the Viterbi's best path lookup table.

    Returns the per-step values when `return_sequences` is truthy (restored
    to the original time order for a backward pass), otherwise the value of
    the last step.
    """
    # Author's note: shape=(1, F, F), F = number of output features; the
    # first F is for t-1, the second F for t.
    chain_energy = K.expand_dims(self.chain_kernel, 0)
    # Author's note: shape=(B, F), dtype=float32.
    prev_target_val = K.zeros_like(input_energy[:, 0, :])

    if go_backwards:
        input_energy = K.reverse(input_energy, 1)
        if mask is not None:
            mask = K.reverse(mask, 1)

    initial_states = [prev_target_val,
                      K.zeros_like(prev_target_val[:, :1])]

    constants = [chain_energy]
    if mask is not None:
        # Pad the mask with one trailing zero column along the time axis.
        padded_mask = K.concatenate([mask, K.zeros_like(mask[:, :1])],
                                    axis=1)
        constants.append(K.cast(padded_mask, K.floatx()))

    def _step(input_energy_i, states):
        return self.step(input_energy_i, states, return_logZ)

    target_val_last, target_val_seq, _ = K.rnn(_step,
                                               input_energy,
                                               initial_states,
                                               constants=constants,
                                               input_length=input_length,
                                               unroll=self.unroll)

    if not return_sequences:
        return target_val_last
    if go_backwards:
        target_val_seq = K.reverse(target_val_seq, 1)
    return target_val_seq
def call(self, inputs):
    """Scan ``inputs`` with ``self.one_step`` starting from all-zero states.

    The step output packs several slices on its last axis.  The first
    ``self.units`` entries are returned; the next ``self.levels`` entries and
    the remainder are averaged and stored on ``self.distance`` and
    ``self.distance_in`` as a side effect.

    Returns:
        The first ``self.units`` features of the output sequence when
        ``self.return_sequences`` is truthy, otherwise of the last output.
    """
    batch_size = K.shape(inputs)[0]
    # Two all-zero initial states.
    initial_states = [K.zeros((batch_size, self.units)),
                      K.zeros((batch_size, self.units))]

    outputs = K.rnn(self.one_step, inputs, initial_states)
    last_output, sequence = outputs[0], outputs[1]

    level_slice = sequence[..., self.units:self.units + self.levels]
    self.distance = 1 - K.mean(level_slice, -1)
    self.distance_in = K.mean(sequence[..., self.units + self.levels:], -1)

    selected = sequence if self.return_sequences else last_output
    return selected[..., :self.units]
def _forward(x, reduce_step, initial_states, U):
    """Forward recurrence of the linear chain crf.

    Builds one energy matrix per transition from ``x[:, 1:]`` plus ``U``,
    appends one all-zero matrix at the end of the time axis, and scans the
    result with ``K.rnn``.  ``reduce_step`` turns the combined energies into
    the new list of states; its first entry is the step output.

    Returns:
        ``(last, values)`` as produced by ``K.rnn``.
    """
    def _forward_step(energy_matrix_t, states):
        alpha_prev = states[-1]
        reduced = reduce_step(K.expand_dims(alpha_prev, 2) + energy_matrix_t)
        return reduced[0], reduced

    # Add two leading axes to U so that it broadcasts against the energies.
    shared_transitions = K.expand_dims(K.expand_dims(U, 0), 0)
    energies = K.expand_dims(x[:, 1:, :], 2) + shared_transitions
    zero_tail = K.zeros_like(energies[:, -1:, :, :])
    energies = K.concatenate([energies, zero_tail], axis=1)

    last, values, _ = K.rnn(_forward_step, energies, initial_states)
    return last, values
def viterbi_decoding(self, X, mask=None):
    """Decode the best label path for ``X`` and return it one-hot encoded.

    The argmin tables come from ``self.recursion(..., return_logZ=False)``;
    they are then walked backwards with ``K.rnn`` to recover the path.

    Raises:
        NotImplementedError: From the row-gather helper on backends other
            than Theano and TensorFlow.
    """
    backend_name = K.backend()

    input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
    if self.use_boundary:
        input_energy = self.add_boundary_energy(input_energy, mask,
                                                self.left_boundary,
                                                self.right_boundary)

    argmin_tables = K.cast(
        self.recursion(input_energy, mask, return_logZ=False), 'int32')

    # backward to find best path, `initial_best_idx` can be any,
    # as all elements in the last argmin_table are the same
    argmin_tables = K.reverse(argmin_tables, 1)
    # matrix instead of vector is required by tf `K.rnn`
    initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])]
    if backend_name == 'theano':
        from theano import tensor as T
        initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)]

    def gather_each_row(params, indices):
        # Select params[i, indices[i]] for every row i.
        n_rows = K.shape(indices)[0]
        if backend_name == 'theano':
            from theano import tensor as T
            return params[T.arange(n_rows), indices]
        if backend_name == 'tensorflow':
            import tensorflow as tf
            row_col = K.transpose(K.stack([tf.range(n_rows), indices]))
            return tf.gather_nd(params, row_col)
        raise NotImplementedError

    def find_path(argmin_table, best_idx):
        picked = gather_each_row(argmin_table, best_idx[0][:, 0])
        picked = K.expand_dims(picked)
        if backend_name == 'theano':
            from theano import tensor as T
            picked = T.unbroadcast(picked, 1)
        return picked, [picked]

    _, best_paths, _ = K.rnn(find_path,
                             argmin_tables,
                             initial_best_idx,
                             input_length=K.int_shape(X)[1],
                             unroll=self.unroll)
    # Undo the earlier reversal and drop the helper axis added for K.rnn.
    best_paths = K.squeeze(K.reverse(best_paths, 1), 2)

    return K.one_hot(best_paths, self.units)
def _backward(gamma):
    """Backward recurrence of the linear chain crf.

    ``gamma`` is cast to int32 and scanned in reverse time order; at each
    step the previous result is used as the index for ``batch_gather``.  The
    collected sequence is reversed again before being returned.
    """
    pointers = K.cast(gamma, "int32")

    def _backward_step(gamma_t, states):
        # States carry an extra leading axis; strip it before gathering.
        prev_label = K.squeeze(states[0], 0)
        label = batch_gather(gamma_t, prev_label)
        return label, [K.expand_dims(label, 0)]

    # Start from all-zero indices, with the extra leading axis.
    start = K.expand_dims(K.zeros_like(pointers[:, 0, 0]), 0)
    _, reversed_labels, _ = K.rnn(_backward_step,
                                  pointers,
                                  [start],
                                  go_backwards=True)
    return K.reverse(reversed_labels, 1)
def call(self, inputs, **kwargs):
    """Run ``self.cell`` over ``inputs`` with ``K.rnn``.

    The scan direction follows ``self.reversed``.  Extra keyword arguments
    are forwarded to the cell's ``call``.

    Returns:
        The per-step output sequence when ``self.return_sequences`` is
        truthy, otherwise the last output.
    """
    input_shape = K.int_shape(inputs)

    def step_fn(step_inputs, states):
        output, new_states = self.cell.call(step_inputs, states, **kwargs)
        return output, new_states

    initial_states = self.cell.get_initial_state(inputs)
    last_output, outputs, _ = K.rnn(step_fn,
                                    inputs,
                                    initial_states=initial_states,
                                    go_backwards=self.reversed,
                                    input_length=input_shape[1])

    if self.return_sequences:
        # Fix: this branch used to return the final ``states`` list rather
        # than the output sequence.
        return outputs
    return last_output
def call(self, x, training=None, mask=None, states=None):
    """Produce pointer probabilities by scanning with ``self.step``.

    The original notes give ``x.shape = (batch_size, time_step, dim)``, for
    example (3, 10, 128), and describe ``x`` both as the encoder output and
    as the decoder output - TODO confirm which one callers pass.

    :param Tensor x: Input sequence (see the note above)
    :param Tensor states: last state of the decoder
    :param Tensor mask: The mask to apply
    :return: Pointers probabilities
    """
    input_shape = self.input_spec[0].shape
    n_steps = input_shape[1]

    # The whole sequence is kept as a constant for the step function.
    en_seq = x
    # Keep only the last timestep, then tile it back along a new time axis
    # with K.repeat (the author notes (samples, dim) -> (samples, n, dim)).
    x_input = K.repeat(x[:, n_steps - 1, :], n_steps)

    if states:
        initial_states = states
    else:
        initial_states = self.decoder.get_initial_state(x_input)

    constants = []
    # Author's example: preprocessed_input.shape = (64, 10, 128)
    preprocessed_input, _, constants = self.decoder.process_inputs(
        x_input, initial_states, constants)
    constants.append(en_seq)

    # K.rnn calls ``self.step`` once per timestep of ``preprocessed_input``
    # and returns (last_output, outputs, new_states):
    #   last_output - the latest output, shaped (samples, ...)
    #   outputs     - shaped (samples, time, ...); outputs[s, t] is the step
    #                 output for sample s at time t
    #   new_states  - the latest states returned by the step function
    _, outputs, _ = K.rnn(self.step,
                          preprocessed_input,
                          initial_states,
                          go_backwards=self.decoder.lstm.go_backwards,
                          constants=constants,
                          input_length=n_steps)

    # Author's example: outputs has shape (batch, 10, 10)
    return outputs
def call(self, input, initial_state=None):
    """Run a GRU-style recurrence over ``text`` with attention over ``img``.

    Args:
        input: A pair ``(img, text)``.  ``text`` is the scanned sequence;
            ``img`` is handed to ``self.attention`` at every step together
            with the previous hidden state.
        initial_state: Optional initial states; defaults to a single
            all-zero state of width ``self.units``.

    Returns:
        Depending on ``self.return_sequences`` and ``self.return_state``:
        the sequence or the last output, optionally paired with the final
        states.
    """
    img, text = input

    def step(cell_inputs, cell_states):
        """Step function that will be used by Keras RNN backend."""
        h_prev = cell_states[0]

        # Append the attended features to this step's input.
        attended = self.attention(img, h_prev)
        gate_input = K.concatenate([cell_inputs, attended], axis=-1)

        # inputs projected by all gate matrices at once
        projected_x = K.bias_add(K.dot(gate_input, self.kernel),
                                 self.input_bias)
        x_z, x_r, x_h = array_ops.split(projected_x, 3, axis=1)

        # hidden state projected by all gate matrices at once
        projected_h = K.bias_add(K.dot(h_prev, self.recurrent_kernel),
                                 self.recurrent_bias)
        rec_z, rec_r, rec_h = array_ops.split(projected_h, 3, axis=1)

        update_gate = K.sigmoid(x_z + rec_z)
        reset_gate = K.sigmoid(x_r + rec_r)
        candidate = K.tanh(x_h + reset_gate * rec_h)

        # previous and candidate state mixed by update gate
        h_new = update_gate * h_prev + (1 - update_gate) * candidate
        return h_new, [h_new]

    if initial_state is None:
        zero_state = array_ops.zeros((array_ops.shape(text)[0], self.units))
        initial_state = (zero_state, )

    last, sequence, hidden = K.rnn(step,
                                   text,
                                   initial_state,
                                   zero_output_for_mask=self.mask_zeros)

    result = sequence if self.return_sequences else last
    if self.return_state:
        return result, hidden
    return result
def _forward(x, reduce_step, initial_states, U, mask=None):
    """Forward recurrence of the linear chain crf.

    Builds one energy matrix per transition from ``x[:, 1:]`` plus ``U``,
    appends one all-zero matrix at the end of the time axis, and scans the
    result with ``K.rnn``.  When ``mask`` is given, ``U`` is multiplied by
    the product of the mask values at the two ends of each transition.

    Returns:
        ``(last, values)`` as produced by ``K.rnn``.
    """
    def _forward_step(energy_matrix_t, states):
        alpha_prev = states[-1]
        reduced = reduce_step(K.expand_dims(alpha_prev, 2) + energy_matrix_t)
        return reduced[0], reduced

    # Add two leading axes to U so that it broadcasts against the energies.
    shared_transitions = K.expand_dims(K.expand_dims(U, 0), 0)

    if mask is not None:
        mask = K.cast(mask, K.floatx())
        # Mask value of step t times mask value of step t+1.
        pair_mask = mask[:, :-1] * mask[:, 1:]
        pair_mask = K.expand_dims(K.expand_dims(pair_mask, 2), 3)
        shared_transitions = shared_transitions * pair_mask

    energies = K.expand_dims(x[:, 1:, :], 2) + shared_transitions
    zero_tail = K.zeros_like(energies[:, -1:, :, :])
    energies = K.concatenate([energies, zero_tail], axis=1)

    last, values, _ = K.rnn(_forward_step, energies, initial_states)
    return last, values
def dense_loss(self, y_true, y_pred):
    """Return ``log Z - target_score`` for one-hot ``y_true``.

    A timestep is treated as valid only when all of its scores in ``y_pred``
    are greater than -1e6; other timesteps are zeroed out.
    """
    # Derive the mask from the scores and cast it to the float type.
    valid = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
    mask = K.cast(valid, K.floatx())

    # Score of the target path.
    y_true = y_true * mask
    y_pred = y_pred * mask
    target_score = self.target_score(y_true, y_pred)

    # Compute log Z recursively, starting from the first timestep.
    init_states = [y_pred[:, 0]]
    # The mask rides along as an extra last-axis channel.
    y_pred = K.concatenate([y_pred, mask], axis=2)
    remaining_steps = y_pred[:, 1:]
    input_length = K.int_shape(remaining_steps)[1]
    log_norm, _, _ = K.rnn(self.log_norm_step,
                           remaining_steps,
                           init_states,
                           input_length=input_length)
    # ``log_norm`` is the log Z vector of the final step; reduce over axis 1.
    log_norm = tf.reduce_logsumexp(log_norm, 1)

    # Loss: -log p
    return log_norm - target_score
def decide_placement(candidate, mask_template, decay_grad):
    """Decide feasible placements from the character candidates.

    ``candidate`` is scanned along its second axis (moved into the time
    position for ``K.rnn``).  The state acts as a sliding window of width
    ``char_width``: each step reads its first column, adds the projected
    inputs, then shifts the window left by one and zero-fills the end.

    Returns:
        The per-step placements, transposed back to the axis order of
        ``candidate``.
    """
    _, char_width = mask_template.shape[:2]
    # Author's note: shape=(char_dim, char_width)
    template_t = K.transpose(mask_template[0, :, 0, :])

    def step_fn(inputs, states):
        # Author's note on shapes: inputs (batch_size, height, char_dim),
        # state (batch_size, height, char_width).
        window = states[0]
        free = 1.0 - window[:, :, :1]
        # Workaround: training is too hard unless the gradient is decayed.
        placement_t = combine_value_gradient(free, decay_grad * free)
        window = window + placement_t * K.dot(inputs, template_t)
        # Shift left by one column and pad with zeros; the author notes the
        # shape stays (batch_size, height, char_width).
        shifted = K.concatenate([window[:, :, 1:],
                                 K.zeros_like(window[:, :, :1])])
        return placement_t, [shifted]

    initial_state = K.zeros_like(candidate[:, :, :char_width, 0])
    candidate_t = tf.transpose(candidate, perm=[0, 2, 1, 3])
    _, placement_seq, _ = K.rnn(step_fn, candidate_t, [initial_state])
    return tf.transpose(placement_seq, perm=[0, 2, 1, 3])
def _backward(gamma, mask):
    """Backward recurrence of the linear chain crf.

    ``gamma`` is cast to int32 and scanned in reverse time order; at each
    step the previous result is used as the index for ``batch_gather``.  The
    collected sequence is reversed again.  When ``mask`` is given, positions
    where it is 0 are set to -1 in the result.
    """
    pointers = K.cast(gamma, 'int32')

    def _backward_step(gamma_t, states):
        # States carry an extra leading axis; strip it before gathering.
        prev_label = K.squeeze(states[0], 0)
        label = batch_gather(gamma_t, prev_label)
        return label, [K.expand_dims(label, 0)]

    # Start from all-zero indices, with the extra leading axis.
    start = K.expand_dims(K.zeros_like(pointers[:, 0, 0]), 0)
    _, reversed_labels, _ = K.rnn(_backward_step,
                                  pointers,
                                  [start],
                                  go_backwards=True)
    y = K.reverse(reversed_labels, 1)

    if mask is None:
        return y

    mask = K.cast(mask, dtype='int32')
    # mask output
    y = y * mask
    # set masked values to -1
    y = y - (1 - mask)
    return y
def call(self, x, constants=None, mask=None, initial_state=None):
    """Run the wrapped layer's recurrence and split off the attention part.

    Args:
        x: A sequence ``[inputs, static_inputs, *initial_states]``.  Per the
            original comment the main input is
            (n_samples, time (padded with zeros), input_dim).
        constants: Optional list of constants.  It is copied, never mutated.
        mask: Optional mask forwarded to ``K.rnn``.
        initial_state: Optional initial state(s); overridden by any states
            found after the first two entries of ``x``.

    Returns:
        The output sequence or the last output (per
        ``self.layer.return_sequences``).  When ``self.layer.return_state``
        is truthy the final states are appended, and when
        ``self.return_attention`` is truthy the attention sequence is
        appended too; in both cases the result is a list.
    """
    input_shape = self.input_spec[0].shape

    if len(x) > 2:
        initial_state = x[2:]
        x = x[:2]
    static_x = x[1]
    x = x[0]

    if self.layer.stateful:
        initial_states = self.layer.states
    elif initial_state is not None:
        initial_states = initial_state
        if not isinstance(initial_states, (list, tuple)):
            initial_states = [initial_states]
    else:
        initial_states = self.layer.get_initial_state(x)

    # Fix: work on a copy.  The previous in-place ``+=`` appended to the
    # list object owned by the caller.
    constants = list(constants) if constants else []
    constants.extend(self.get_constants(static_x))

    last_output, outputs, states = K.rnn(
        self.step,
        x,
        initial_states,
        go_backwards=self.layer.go_backwards,
        mask=mask,
        constants=constants,
        unroll=self.layer.unroll,
        input_length=input_shape[1])

    # The step output packs (real_output, attention) on the last axis;
    # split it up again.
    output_dim = self.layer.compute_output_shape(input_shape)[0][-1]
    # Fix: slice the feature axis.  ``last_output[:output_dim]`` sliced the
    # first (sample) axis and left the attention part attached.
    last_output = last_output[:, :output_dim]
    attentions = outputs[:, :, output_dim:]
    outputs = outputs[:, :, :output_dim]

    if self.layer.stateful:
        self.updates = []
        for index, new_state in enumerate(states):
            self.updates.append((self.layer.states[index], new_state))

    output = outputs if self.layer.return_sequences else last_output

    # Properly set learning phase
    # NOTE(review): this inspects the sliced ``last_output``; confirm the
    # flag set by K.rnn survives slicing on the backend in use.
    if getattr(last_output, '_uses_learning_phase', False):
        output._uses_learning_phase = True
        for state in states:
            state._uses_learning_phase = True

    if self.layer.return_state:
        if not isinstance(states, (list, tuple)):
            states = [states]
        else:
            states = list(states)
        output = [output] + states

    if self.return_attention:
        if not isinstance(output, list):
            output = [output]
        output = output + [attentions]

    return output
#encoding=utf8
import os

import numpy as np

# Select the GPU before TensorFlow is imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from tensorflow.keras import backend as K
import tensorflow as tf

batch_size = 1
time_step = 3
dim = 2

# Random input of shape [batch_size, time_step, dim] = [1, 3, 2].
x = np.random.rand(batch_size, time_step, dim)
# Initial state set to 0, one row per sample.  The shape now follows
# ``batch_size`` instead of being hard-coded to [1, 1].
init_state = np.zeros((batch_size, 1))


def step_func(inputs, states):
    """Add the sum of this step's features to the running state."""
    o = K.sum(inputs, axis=1, keepdims=True) + states[0]
    return o, [o]


a, _, _ = K.rnn(step_func, inputs=x, initial_states=[init_state])
print("x", x)
import tensorflow.keras.backend as K

# Pull in the helpers that match the active Keras backend.
if K.backend() == 'theano':
    from .theano_backend import *
elif K.backend() == 'tensorflow':
    from .tensorflow_backend import *

    def rnn(*args, **kwargs):
        """Call ``K.rnn`` and append an empty list to its result tuple."""
        return K.rnn(*args, **kwargs) + ([], )
else:
    raise Exception(K.backend() + ' backend is not supported.')
def call(self, x, constants=None, mask=None, initial_state=None):
    """Run the wrapped layer's recurrence over ``x``.

    Args:
        x: Input sequence; per the original comment
            (n_samples, time (padded with zeros), input_dim).
        constants: Optional list of constants.  It is copied, never mutated.
        mask: Optional mask forwarded to ``K.rnn``.
        initial_state: Optional initial state(s).  Ignored when the wrapped
            layer is stateful; otherwise validated against the layer's
            default initial state.

    Returns:
        The output sequence or the last output (per
        ``self.layer.return_sequences``); a list with the final states
        appended when ``self.layer.return_state`` is truthy.

    Raises:
        ValueError: If ``initial_state`` has the wrong number of entries or
            an entry whose shape is incompatible with the default state.
    """
    input_shape = self.input_spec.shape

    if self.layer.stateful:
        initial_states = self.layer.states
    elif initial_state is not None:
        initial_states = initial_state
        if not isinstance(initial_states, (list, tuple)):
            initial_states = [initial_states]

        base_initial_state = self.layer.get_initial_state(x)
        if len(base_initial_state) != len(initial_states):
            raise ValueError(
                "initial_state does not have the correct length. Received length {0} "
                "but expected {1}".format(len(initial_states),
                                          len(base_initial_state)))
        # check the state' shape
        for given, expected in zip(initial_states, base_initial_state):
            if not given.shape.is_compatible_with(expected.shape):
                raise ValueError(
                    "initial_state does not match the default base state of the layer. "
                    "Received {0} but expected {1}".format(
                        [s.shape for s in initial_states],
                        [s.shape for s in base_initial_state]))
    else:
        initial_states = self.layer.get_initial_state(x)

    # Fix: work on a copy.  The previous in-place ``+=`` appended to the
    # list object owned by the caller.
    constants = list(constants) if constants else []
    constants.extend(self.get_constants(x))

    last_output, outputs, states = K.rnn(
        self.step,
        x,
        initial_states,
        go_backwards=self.layer.go_backwards,
        mask=mask,
        constants=constants,
        unroll=self.layer.unroll,
        input_length=input_shape[1])

    if self.layer.stateful:
        self.updates = []
        for index, new_state in enumerate(states):
            self.updates.append((self.layer.states[index], new_state))

    output = outputs if self.layer.return_sequences else last_output

    # Properly set learning phase
    if getattr(last_output, '_uses_learning_phase', False):
        output._uses_learning_phase = True
        for state in states:
            state._uses_learning_phase = True

    if self.layer.return_state:
        if not isinstance(states, (list, tuple)):
            states = [states]
        else:
            states = list(states)
        return [output] + states
    return output
def call(self, inputs, verbose=False):
    """Location-aware attention over the encoder outputs.

    Args:
        inputs: A pair ``(encoder_out_seq, decoder_out_seq)``.
        verbose: Accepted for interface compatibility; not used here.

    The decoder sequence is scanned with ``K.rnn``.  Each step scores the
    encoder keys against the (optionally RNN-processed) query and against
    features convolved from the previous alignments, then applies a softmax.

    Returns:
        ``[c_outputs, e_outputs]``: the context vectors (alignments
        matrix-multiplied with the encoder values) and the alignments.
    """
    encoder_out_seq, decoder_out_seq = inputs
    values = encoder_out_seq
    keys = self.memory_layer(values) if self.memory_layer else values

    def context_step(alignments, states):
        # Weighted sum of the values: (batch, 1, T) x (batch, T, D).
        weights = K.expand_dims(alignments, 1)
        context = K.squeeze(math_ops.matmul(weights, values), 1)
        return context, [context]

    def energy_step(query, states):
        previous_alignments = states[0]

        if self.rnn_cell:
            # States are [alignments, context, *cell states].
            c_i = states[1]
            cell_state = states[2:]
            lstm_input = K.expand_dims(K.concatenate([query, c_i]), 1)
            lstm_out = self.rnn_cell(lstm_input, initial_state=cell_state)
            query, new_cell_state = lstm_out[0], lstm_out[1:]

        processed_query = self.query_layer(query) if self.query_layer else query

        # Location features derived from the previous alignments.
        expanded_alignments = K.expand_dims(previous_alignments, axis=2)
        location = self.location_convolution(expanded_alignments)
        processed_location_features = self.location_layer(location)

        e_i = K.sum(
            self.v_a * K.tanh(keys + processed_query +
                              processed_location_features + self.b_a), [2])
        e_i = K.softmax(e_i)

        # Optionally accumulate alignments across steps.
        next_state = e_i + previous_alignments if self._cumulate else e_i

        if self.rnn_cell:
            new_c_i, _ = context_step(e_i, [c_i])
            return e_i, [next_state, new_c_i, *new_cell_state]
        return e_i, [next_state]

    def create_initial_state(reference, hidden_size):
        # All-zero state built from ``reference`` so that the batch
        # dimension follows the data: zeros -> sum -> expand -> tile.
        zeros = K.sum(K.zeros_like(reference), axis=[1, 2])
        zeros = K.expand_dims(zeros)
        return K.tile(zeros, [1, hidden_size])

    def get_fake_cell_input(fake_state_c):
        # Dummy single-step input used only to obtain the cell's initial
        # state.
        first_step = K.zeros_like(decoder_out_seq)[:, 0, :]
        joined = K.concatenate([fake_state_c, first_step])
        return K.expand_dims(joined, 1)

    fake_state_c = create_initial_state(values, values.shape[-1])
    fake_state_e = create_initial_state(values, K.shape(values)[1])

    if self.rnn_cell:
        cell_initial_state = self.rnn_cell.get_initial_state(
            get_fake_cell_input(fake_state_c))
        initial_states_e = [fake_state_e, fake_state_c, *cell_initial_state]
    else:
        initial_states_e = [fake_state_e]

    _, e_outputs, _ = K.rnn(energy_step, decoder_out_seq, initial_states_e)
    c_outputs = math_ops.matmul(e_outputs, values)

    return [c_outputs, e_outputs]
def call(self, inputs, verbose=False, mask=None):
    """Compute attention weights and context vectors with two K.rnn passes.

    inputs: [encoder_output_sequence, decoder_output_sequence]

    The first pass walks the decoder sequence and produces, per decoder
    step, a softmax distribution over the encoder positions.  The second
    pass walks those distributions and turns each into a weighted sum of the
    encoder outputs.  ``verbose`` and ``mask`` are accepted but not used.

    Shapes quoted in the comments are the examples recorded by the original
    author.

    Returns:
        ``(c_outputs, e_outputs)``: the context vectors and the attention
        weights.
    """
    assert type(inputs) == list

    # encoder_out_seq is a whole sequence; the step functions below only
    # ever see one decoder timestep at a time.
    encoder_out_seq, decoder_out_seq = inputs
    encoder_out_seq = _p_shape(encoder_out_seq, "注意力调用:入参编码器输出序列:encoder_out_seq")
    decoder_out_seq = _p_shape(decoder_out_seq, "注意力调用:入参解码器输出序列:decoder_out_seq")

    # Energy function: e_tj = V * tanh(W * h_j + U * S_t-1 + b).
    # The first argument is the decoder output of a single timestep (one row
    # per batch sample), not the whole sequence; the full encoder sequence
    # takes part in every step.
    def energy_step(decode_outs, states):
        # Author's example: decode_outs is (batch, dim), e.g. [1, 20].
        decode_outs = _p(decode_outs, "energy_step:decode_outs 算能量函数了..........")

        # Author's example: encoder sequence length 30, hidden size 512.
        en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
        de_hidden = decode_outs.shape[-1]  # not used below

        # W * h_j: flatten the batch and time axes, project, restore them.
        flat_encoder = K.reshape(encoder_out_seq, (-1, en_hidden))
        _p(flat_encoder, "reshaped_enc_outputs")
        W_a_dot_s = K.reshape(K.dot(flat_encoder, self.W_a),
                              (-1, en_seq_len, en_hidden))

        # U * S_t-1, with a length-1 time axis: (batch_size, 1, latent_dim).
        U_a_dot_h = K.expand_dims(K.dot(decode_outs, self.U_a), axis=1)

        # tanh(W * h_j + U * S_t-1); broadcasting repeats the decoder term
        # over every encoder position.
        flat_energy = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))

        # V * tanh(...), reshaped to (batch, en_seq_len).
        e_i = K.reshape(K.dot(flat_energy, self.V_a), (-1, en_seq_len))

        # softmax(e_tj)
        e_i = K.softmax(e_i)
        e_i = _p(e_i, "energy_step:e_i")

        return e_i, [e_i]

    # Context step: ``e`` is the attention distribution of one decoder
    # timestep (one probability per encoder position); ``states`` is the
    # state list handed over by K.rnn.
    def context_step(e, states):
        e = _p(e, "context_step:e")
        states = _p(states, "context_step:states")
        # Element-wise product (not a matrix product) followed by a sum over
        # the encoder axis: the attention-weighted sum of encoder outputs.
        c_i = K.sum(encoder_out_seq * K.expand_dims(e, -1), axis=1)
        c_i = _p(c_i, "context_step:c_i,算h的期望,也就是注意力了---------------------\n")
        return c_i, [c_i]

    # Builds a (batch_size, hidden_size) all-zero tensor from
    # (batch_size, enc_seq_len, latent_dim) inputs.
    def create_inital_state(inputs, hidden_size):
        # We are not using initial states, but need to pass something to
        # the K.rnn function.
        fake_state = K.zeros_like(inputs)  # (batch_size, enc_seq_len, latent_dim)
        fake_state = K.sum(fake_state, axis=[1, 2])  # (batch_size)
        fake_state = K.expand_dims(fake_state)  # (batch_size, 1)
        return tile(fake_state, [1, hidden_size])  # (batch_size, hidden_size)

    # K.rnn takes the step function, the input sequence and the initial
    # states.  The first scan evaluates the energy function once per decoder
    # timestep.
    shape = encoder_out_seq.shape.as_list()
    # shape[1] is the encoder sequence length, so fake_state_e is
    # (batch, enc_seq_len).
    fake_state_e = create_inital_state(encoder_out_seq, shape[1])
    fake_state_e = _p_shape(fake_state_e, "fake_state_e")

    # Author's example: 30 decoder steps, each giving a 64-way distribution.
    last_out, e_outputs, _ = K.rnn(
        step_function=energy_step,
        inputs=decoder_out_seq,
        initial_states=[fake_state_e],
    )
    e_outputs = _p_shape(e_outputs, "能量函数e输出::::")

    # The last encoder dimension is its hidden size, so fake_state_c is
    # (batch, latent_dim).
    fake_state_c = create_inital_state(encoder_out_seq,
                                       encoder_out_seq.shape[-1])

    # The second scan turns every distribution into a context vector.
    last_out, c_outputs, _ = K.rnn(
        step_function=context_step,
        inputs=e_outputs,
        initial_states=[fake_state_c],
    )
    c_outputs = _p_shape(c_outputs, "注意力c输出::::")

    # Outputs: the context vectors and the attention weights.
    return c_outputs, e_outputs
def call(self, inputs, verbose=False):
    """Compute attention weights and context vectors with two K.rnn passes.

    inputs: [encoder_output_sequence, decoder_output_sequence]

    The first pass produces, for every decoder step, a softmax distribution
    over the encoder positions; the second pass turns each distribution into
    a weighted sum of the encoder outputs.  With ``verbose`` the
    intermediate shapes are printed.

    Returns:
        ``(c_outputs, e_outputs)``: the context vectors and the attention
        weights.
    """
    assert type(inputs) == list
    encoder_out_seq, decoder_out_seq = inputs
    if verbose:
        print('encoder_out_seq>', encoder_out_seq.shape)
        print('decoder_out_seq>', decoder_out_seq.shape)

    def energy_step(inputs, states):
        """ Step function for computing energy for a single decoder state """
        assert_msg = "States must be a list. However states {} is of type {}".format(
            states, type(states))
        assert isinstance(states, (list, tuple)), assert_msg

        # Some parameters required for shaping tensors.
        en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
        de_hidden = inputs.shape[-1]  # not used below

        # S.Wa where S=[s0, s1, ..., si]: flatten to
        # (batch_size*en_seq_len, latent_dim), project, then restore.
        flat_encoder = K.reshape(encoder_out_seq, (-1, en_hidden))
        W_a_dot_s = K.reshape(K.dot(flat_encoder, self.W_a),
                              (-1, en_seq_len, en_hidden))
        if verbose:
            print('wa.s>', W_a_dot_s.shape)

        # hj.Ua with a length-1 time axis: (batch_size, 1, latent_dim).
        U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)
        if verbose:
            print('Ua.h>', U_a_dot_h.shape)

        # tanh(S.Wa + hj.Ua), flattened to (batch_size*en_seq_len, latent_dim).
        reshaped_Ws_plus_Uh = K.tanh(
            K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
        if verbose:
            print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

        # softmax(va.tanh(S.Wa + hj.Ua)): (batch_size, en_seq_len).
        e_i = K.softmax(
            K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len)))
        if verbose:
            print('ei>', e_i.shape)

        return e_i, [e_i]

    def context_step(inputs, states):
        """ Step function for computing ci using ei """
        # Weighted sum over the encoder axis: (batch_size, hidden_size).
        c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
        if verbose:
            print('ci>', c_i.shape)
        return c_i, [c_i]

    def create_inital_state(inputs, hidden_size):
        # We are not using initial states, but need to pass something to
        # the K.rnn function.
        fake_state = K.zeros_like(inputs)  # (batch_size, enc_seq_len, latent_dim)
        fake_state = K.sum(fake_state, axis=[1, 2])  # (batch_size)
        fake_state = K.expand_dims(fake_state)  # (batch_size, 1)
        return K.tile(fake_state, [1, hidden_size])  # (batch_size, hidden_size)

    fake_state_c = create_inital_state(encoder_out_seq,
                                       encoder_out_seq.shape[-1])
    fake_state_e = create_inital_state(encoder_out_seq,
                                       encoder_out_seq.shape[1])

    # Energy outputs: e_outputs => (batch_size, de_seq_len, en_seq_len).
    last_out, e_outputs, _ = K.rnn(energy_step, decoder_out_seq,
                                   [fake_state_e])

    # Context vectors.
    last_out, c_outputs, _ = K.rnn(context_step, e_outputs, [fake_state_c])

    return c_outputs, e_outputs
def call(self, inputs, verbose=False):
    """Compute attention weights and context vectors with two K.rnn passes.

    inputs: [encoder_output_sequence, decoder_output_sequence]

    The first pass produces, for every decoder step, a softmax distribution
    over the encoder positions; the second pass turns each distribution into
    a weighted sum of the encoder outputs.  With ``verbose`` some
    intermediate shapes are printed.

    Returns:
        ``(c_outputs, e_outputs)``: the context vectors and the attention
        weights.
    """
    assert type(inputs) == list
    encoder_out_seq, decoder_out_seq = inputs
    if verbose:
        print('encoder_out_seq>', encoder_out_seq.shape)
        print('decoder_out_seq>', decoder_out_seq.shape)

    def energy_step(inputs, states):
        """ Step function for computing energy for a single decoder state

        Author's shape notes - inputs: (batchsize * 1 * de_in_dim),
        states: (batchsize * 1 * de_latent_dim)
        """
        assert_msg = "States must be an iterable. Got {} of type {}".format(
            states, type(states))
        assert isinstance(states, (list, tuple)), assert_msg

        # Some parameters required for shaping tensors (not used below).
        en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
        de_hidden = inputs.shape[-1]

        # S.Wa where S=[s0, s1, ..., si]: batch size * en_seq_len * latent_dim.
        W_a_dot_s = K.dot(encoder_out_seq, self.W_a)

        # hj.Ua with a length-1 time axis: (batch_size, 1, latent_dim).
        U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)
        if verbose:
            print('Ua.h>', U_a_dot_h.shape)

        # tanh(S.Wa + hj.Ua); the decoder term broadcasts over positions.
        Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)
        if verbose:
            print('Ws+Uh>', Ws_plus_Uh.shape)

        # softmax(va.tanh(S.Wa + hj.Ua)): (batch_size, en_seq_len).
        e_i = K.softmax(K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1))
        if verbose:
            print('ei>', e_i.shape)

        return e_i, [e_i]

    def context_step(inputs, states):
        """ Step function for computing ci using ei """
        assert_msg = "States must be an iterable. Got {} of type {}".format(
            states, type(states))
        assert isinstance(states, (list, tuple)), assert_msg

        # Weighted sum over the encoder axis: (batch_size, hidden_size).
        c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
        if verbose:
            print('ci>', c_i.shape)
        return c_i, [c_i]

    # Placeholder initial states with the right shapes, obtained by reducing
    # the encoder outputs over the time axis and over the feature axis.
    fake_state_c = K.sum(encoder_out_seq, axis=1)
    fake_state_e = K.sum(encoder_out_seq, axis=2)

    # Energy outputs: e_outputs => (batch_size, de_seq_len, en_seq_len).
    last_out, e_outputs, _ = K.rnn(energy_step, decoder_out_seq,
                                   [fake_state_e])

    # Context vectors.
    last_out, c_outputs, _ = K.rnn(context_step, e_outputs, [fake_state_c])

    return c_outputs, e_outputs