def _step(self, x_tm1, h_tm1, c_tm1, H, u_i, u_f, u_o, u_c, w_i, w_f, w_c, w_o,
          w_x, w_a, v_i, v_f, v_c, v_o, b_i, b_f, b_c, b_o, b_x, b_a):
    s_tm1 = K.repeat(c_tm1, self.input_length)
    e = H + s_tm1

    def a(x, states):
        output = K.dot(x, w_a) + b_a
        return output, []

    _, energy, _ = K.rnn(a, e, [], mask=None)
    energy = activations.get('linear')(energy)
    energy = K.permute_dimensions(energy, (2, 0, 1))
    energy = energy[0]
    alpha = K.softmax(energy)
    alpha = K.repeat(alpha, self.hidden_dim)
    alpha = K.permute_dimensions(alpha, (0, 2, 1))
    weighted_H = H * alpha
    v = K.sum(weighted_H, axis=1)
    xi_t = K.dot(x_tm1, w_i) + K.dot(v, v_i) + b_i
    xf_t = K.dot(x_tm1, w_f) + K.dot(v, v_f) + b_f
    xc_t = K.dot(x_tm1, w_c) + K.dot(v, v_c) + b_c
    xo_t = K.dot(x_tm1, w_o) + K.dot(v, v_o) + b_o
    i_t = self.inner_activation(xi_t + K.dot(h_tm1, u_i))
    f_t = self.inner_activation(xf_t + K.dot(h_tm1, u_f))
    c_t = f_t * c_tm1 + i_t * self.activation(xc_t + K.dot(h_tm1, u_c))
    o_t = self.inner_activation(xo_t + K.dot(h_tm1, u_o))
    h_t = o_t * self.activation(c_t)
    x_t = K.dot(h_t, w_x) + b_x
    return x_t, h_t, c_t
def step(self, x, states):
    M = states[0]  # (nb_samples, nb_slots, memory_size)
    h = states[1]  # (nb_samples, memory_size)
    w = states[2]  # (nb_samples, nb_slots)
    # ------ Memory read ------ #
    k = self.W_k(h)  # (nb_samples, memory_size)
    w_hat = T.batched_tensordot(M, k, axes=[(2), (1)])  # (nb_samples, nb_slots)
    beta = K.sigmoid(self.W_b(h))  # (nb_samples, 1)
    beta = K.repeat(beta, self.nb_slots)  # (nb_samples, nb_slots, 1)
    beta = K.squeeze(beta, 2)  # (nb_samples, nb_slots)
    w_hat = softmax(w_hat * beta)  # (nb_samples, nb_slots)
    g = sigmoid(self.W_hg(h))  # (nb_samples, 1)
    g = K.repeat(g, self.nb_slots)  # (nb_samples, nb_slots, 1)
    g = K.squeeze(g, 2)  # (nb_samples, nb_slots)
    w = (1 - g) * w + g * w_hat  # (nb_samples, nb_slots)
    c = T.batched_tensordot(w, M, axes=[(1), (1)])
    h = tanh(self.W_ih(x) + self.W_c(c))
    y = self.W_ho(h)
    # ------ Memory write ------ #
    v = self.W_v(h)  # (nb_samples, memory_size)
    v = K.repeat(v, 1)
    e = sigmoid(self.W_he(h))  # (nb_samples, nb_slots)
    f = 1 - w * e  # (nb_samples, nb_slots)
    f = K.repeat(f, self.memory_size)  # (nb_samples, memory_size, nb_slots)
    f = K.permute_dimensions(f, (0, 2, 1))  # (nb_samples, nb_slots, memory_size)
    u = w  # (nb_samples, nb_slots)
    u = K.repeat(u, 1)
    uv = T.batched_tensordot(u, v, axes=[(1), (1)])
    M = M * f + uv
    return y, [M, h, w]
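# Hedged sketch of the rank-1 memory write above (toy shapes; K.batch_dot
# stands in for Theano's T.batched_tensordot, which is an assumption):
# expanding the length-1 axes of w (nb_samples, nb_slots) and
# v (nb_samples, memory_size) and batch-multiplying yields the per-sample
# outer product of w and v that is added into the memory M.
import numpy as np
from keras import backend as K

w = K.constant(np.random.rand(4, 3))   # (nb_samples=4, nb_slots=3)
v = K.constant(np.random.rand(4, 5))   # (nb_samples=4, memory_size=5)
uv = K.batch_dot(K.expand_dims(w, 2), K.expand_dims(v, 1))
print(K.int_shape(uv))                 # (4, 3, 5): one outer product per sample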
def set_batch_function(self, model, input_shape, batch_size, nb_actions, gamma):
    input_dim = np.prod(input_shape)
    samples = K.placeholder(shape=(batch_size, input_dim * 2 + 3))
    S = samples[:, 0:input_dim]
    a = samples[:, input_dim]
    a = K.cast(a, 'int32')  # action indices must be integers for one_hot
    r = samples[:, input_dim + 1]
    S_prime = samples[:, input_dim + 2:2 * input_dim + 2]
    game_over = samples[:, 2 * input_dim + 2:2 * input_dim + 3]
    r = K.reshape(r, (batch_size, 1))
    r = K.repeat(r, nb_actions)
    r = K.reshape(r, (batch_size, nb_actions))
    game_over = K.repeat(game_over, nb_actions)
    game_over = K.reshape(game_over, (batch_size, nb_actions))
    S = K.reshape(S, (batch_size,) + input_shape)
    S_prime = K.reshape(S_prime, (batch_size,) + input_shape)
    X = K.concatenate([S, S_prime], axis=0)
    Y = model(X)
    Qsa = K.max(Y[batch_size:], axis=1)
    Qsa = K.reshape(Qsa, (batch_size, 1))
    Qsa = K.repeat(Qsa, nb_actions)
    Qsa = K.reshape(Qsa, (batch_size, nb_actions))
    delta = K.reshape(self.one_hot(a, nb_actions), (batch_size, nb_actions))
    targets = (1 - delta) * Y[:batch_size] + delta * (r + gamma * (1 - game_over) * Qsa)
    self.batch_function = K.function(inputs=[samples], outputs=[S, targets])
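# Numeric sketch of the target-masking trick above (toy values, not the
# author's API): only the taken action's Q-value is replaced by the Bellman
# target r + gamma * max_a' Q(s', a'); the other actions keep the network's
# own predictions, so their error contributes nothing.
import numpy as np

Y = np.array([[1.0, 2.0, 3.0]])        # Q(s, .) for nb_actions = 3
delta = np.array([[0.0, 1.0, 0.0]])    # one-hot of the taken action a = 1
bellman = 0.5 + 0.9 * 4.0              # r + gamma * Qsa, with game_over = 0
targets = (1 - delta) * Y + delta * bellman
print(targets)                         # [[1.  4.1 3. ]]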
def step(self, x, states):
    ytm, stm = states

    # repeat the hidden state to the length of the sequence
    _stm = K.repeat(stm, self.timesteps)

    # now multiply the weight matrix with the repeated hidden state
    _Wxstm = K.dot(_stm, self.W_a)

    # calculate the attention probabilities
    # this relates how much other timesteps contributed to this one.
    et = K.dot(activations.tanh(_Wxstm + self._uxpb),
               K.expand_dims(self.V_a))
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps)
    at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    # ~~~> calculate new hidden state
    # first calculate the "r" gate:
    rt = activations.sigmoid(
        K.dot(ytm, self.W_r) + K.dot(stm, self.U_r) + K.dot(context, self.C_r) + self.b_r)

    # now calculate the "z" gate
    zt = activations.sigmoid(
        K.dot(ytm, self.W_z) + K.dot(stm, self.U_z) + K.dot(context, self.C_z) + self.b_z)

    # calculate the proposal hidden state:
    s_tp = activations.tanh(
        K.dot(ytm, self.W_p) + K.dot((rt * stm), self.U_p) + K.dot(context, self.C_p) + self.b_p)

    # new hidden state:
    st = (1 - zt) * stm + zt * s_tp

    yt = activations.softmax(
        K.dot(ytm, self.W_o) + K.dot(stm, self.U_o) + K.dot(context, self.C_o) + self.b_o)

    if self.return_probabilities:
        return at, [yt, st]
    else:
        return yt, [yt, st]
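# A minimal sketch (toy shapes assumed) of the exp / sum / repeat / divide
# pattern above: it is just a softmax over the timestep axis, written out
# so the (batch, timesteps, 1) weights can be renormalized by a
# broadcast-shaped denominator built with K.repeat.
import numpy as np
from keras import backend as K

et = K.constant(np.random.rand(2, 4, 1))   # (batch, timesteps, 1) energies
at = K.exp(et)
at_sum = K.sum(at, axis=1)                 # (batch, 1)
at = at / K.repeat(at_sum, 4)              # (batch, 4, 1); each row sums to 1
print(K.eval(K.sum(at, axis=1)))           # approximately [[1.], [1.]]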
def step(self, x, states):
    """
    All of the LSTM-style expressions live here.
    :param x:
    :param states: the previous timestep's output and hidden state st
    :return:
    """
    ytm, stm = states

    # repeat the hidden state to the length of the sequence,
    # i.e. repeat n times along the timestep dimension: (sample, step, dim)
    _stm = K.repeat(stm, self.timesteps)

    # now multiply the weight matrix with the repeated hidden state
    _Wxstm = K.dot(_stm, self.W_a)

    # calculate the attention probabilities
    # this relates how much other timesteps contributed to this one.
    et = K.dot(activations.tanh(_Wxstm + self._uxpb),
               K.expand_dims(self.V_a))
    # softmax
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps)
    at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    # ~~~> calculate new hidden state
    # first calculate the "r" gate:
    rt = activations.sigmoid(
        K.dot(ytm, self.W_r) + K.dot(stm, self.U_r) + K.dot(context, self.C_r) + self.b_r)

    # now calculate the "z" gate
    zt = activations.sigmoid(
        K.dot(ytm, self.W_z) + K.dot(stm, self.U_z) + K.dot(context, self.C_z) + self.b_z)

    # calculate the proposal hidden state:
    s_tp = activations.tanh(
        K.dot(ytm, self.W_p) + K.dot((rt * stm), self.U_p) + K.dot(context, self.C_p) + self.b_p)

    # new hidden state:
    st = (1 - zt) * stm + zt * s_tp

    yt = activations.softmax(
        K.dot(ytm, self.W_o) + K.dot(stm, self.U_o) + K.dot(context, self.C_o) + self.b_o)

    if self.return_probabilities:
        return at, [yt, st]
    else:
        return yt, [yt, st]
def sample_z(args):
    k = 5
    local_mu, local_sigma = args
    local_mu = K.repeat(local_mu, k)
    local_sigma = K.repeat(local_sigma, k)
    eps = K.random_normal(shape=(K.shape(local_mu)[0], k, K.shape(local_mu)[2]),
                          mean=0., stddev=1.)
    return local_mu + local_sigma * eps
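# Usage sketch (toy shapes): K.repeat turns a (batch, dim) matrix into a
# (batch, n, dim) tensor, which is what lets sample_z above draw k = 5
# latent samples per input in one reparameterized shot.
import numpy as np
from keras import backend as K

mu = K.constant(np.zeros((2, 3)))      # (batch=2, latent_dim=3)
print(K.int_shape(K.repeat(mu, 5)))    # (2, 5, 3): one row per drawn sample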
def call(self, x):
    w = x[:, 0]
    m0 = x[:, 9]
    distance = k_b.square(w - m0)
    distance = k_b.sqrt(distance[:, 0] + distance[:, 1] + distance[:, 2])
    distance = k_b.reshape(distance, (-1, 1))
    distance = k_b.repeat(distance, 21)
    m0 = k_b.repeat(m0, 21)
    result = (x - m0) / distance
    return result
def correlation_loss(y_true, y_pred):
    # want to maximize correlation
    y_true, y_pred = K.reshape(y_true, (-1, WRAP, 20)), K.reshape(y_pred, (-1, WRAP, 20))
    mx = K.repeat(K.mean(y_true, axis=1), WRAP)
    my = K.repeat(K.mean(y_pred, axis=1), WRAP)
    xm, ym = y_true - mx, y_pred - my
    r_num = K.sum(xm * ym, axis=1)
    # Pearson denominator: the square root of the product of summed squares,
    # kept per-feature so r stays in [-1, 1]
    r_den = K.sqrt(K.sum(K.square(xm), axis=1) * K.sum(K.square(ym), axis=1))
    r = r_num / r_den
    return 1 - r
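# Sanity-check sketch for the Pearson denominator used above: with
# r_den = sqrt(sum(xm^2) * sum(ym^2)), the statistic matches numpy's
# corrcoef (1-D toy data; WRAP and the 20 channels are not modeled here).
import numpy as np

x = np.random.rand(8)
y = np.random.rand(8)
xm, ym = x - x.mean(), y - y.mean()
r = (xm * ym).sum() / np.sqrt((xm ** 2).sum() * (ym ** 2).sum())
assert np.isclose(r, np.corrcoef(x, y)[0, 1])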
def step(self, x, states):
    ytm, stm = states

    # repeat the hidden state to the length of the sequence
    _stm = K.repeat(stm, self.timesteps_e)

    # now multiply the weight matrix with the repeated hidden state
    _Wxstm = K.dot(_stm, self.W_a)

    # calculate the attention probabilities
    # this relates how much other timesteps contributed to this one.
    et = K.dot(
        activations.tanh(_Wxstm + self._uxpb),  # e_ij = a(s_(i-1), h_j) where h_j = self._uxpb
        K.expand_dims(self.V_a))
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps_e)
    at /= at_sum_repeated  # Eq (6): vector of size (batchsize, timesteps, 1); softmax over the timesteps of stm

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1),
                        axis=1)  # Eq (5): length batchsize * input_dim

    # ~~~> calculate new hidden state
    # first calculate the "r" gate:
    rt = activations.sigmoid(
        K.dot(ytm, self.W_r) + K.dot(stm, self.U_r) + K.dot(context, self.C_r) + self.b_r)
    # f_t: corresponds to the LSTM forget gate; there is no x_t input here

    # now calculate the "z" gate
    zt = activations.sigmoid(
        K.dot(ytm, self.W_z) + K.dot(stm, self.U_z) + K.dot(context, self.C_z) + self.b_z)

    # calculate the proposal hidden state:
    s_tp = activations.tanh(
        K.dot(ytm, self.W_p) + K.dot((rt * stm), self.U_p) + K.dot(context, self.C_p) + self.b_p)

    # new hidden state:
    st = (1 - zt) * stm + zt * s_tp

    yt = activations.softmax(
        K.dot(ytm, self.W_o) + K.dot(stm, self.U_o) + K.dot(context, self.C_o) + self.b_o)  # h_t

    if self.return_probabilities:
        return at, [yt, st]
    else:
        return yt, [yt, st]
def log_prob(self, x):
    """Given batch of x of shape (batch, samples, dim), returns
    (batch, samples) values of the log probability per sample.
    """
    # log gaussian probability = -1/2 * [sum((x - mean)^2 / variance) + log det]
    variance = K.repeat(K.exp(self.log_var), self.k_samples)
    # shape is (batch, samples, dim)
    log_det = K.tile(K.sum(self.log_var, axis=-1, keepdims=True), (1, self.k_samples))
    # shape is (batch, samples)
    x_diff = x - K.repeat(self.mean, self.k_samples)
    # shape is (batch, samples, dim)
    return -(K.sum((x_diff / variance) * x_diff, axis=-1) + log_det) / 2
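# Hedged numeric check (toy dims): up to the constant -d/2 * log(2*pi),
# which log_prob above drops, the expression equals the diagonal-Gaussian
# log density -0.5 * (sum((x - mu)^2 / var) + sum(log var)).
import numpy as np

x = np.random.rand(3)
mu = np.zeros(3)
log_var = np.log(np.full(3, 0.5))
lp = -0.5 * (np.sum((x - mu) ** 2 / np.exp(log_var)) + np.sum(log_var))
full = lp - 0.5 * 3 * np.log(2 * np.pi)  # the dropped constant restored
print(lp, full)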
def call(self, x, mask=None):
    en_seq = x[0]
    de_seq = x[1]
    input_de_times = K.int_shape(de_seq)[-2]
    if len(x) == 3:
        mask = x[2]
        m_en = K.cast(mask, K.floatx())
        en_seq = en_seq * K.expand_dims(m_en, -1)
    if len(x) == 2 and mask is not None:
        # remove padding values
        m_en = K.cast(mask[0], K.floatx())
        en_seq = en_seq * K.expand_dims(m_en, -1)
    # compute alphas
    att_en = K.dot(K.reshape(en_seq, (-1, self.input_dim_en)), self.w_en)
    att_en = K.reshape(att_en, shape=(-1, self.input_en_times * self.units))
    att_en = K.repeat(att_en, input_de_times)
    att_en = K.reshape(att_en, shape=(-1, self.input_en_times * input_de_times, self.units))
    att_de = K.dot(K.reshape(de_seq, (-1, self.input_dim_de)), self.w_de)
    att_de = K.reshape(att_de, shape=(-1, input_de_times, self.units))
    att_de = K.repeat_elements(att_de, self.input_en_times, 1)
    co_m = att_en + att_de
    co_m = K.reshape(co_m, (-1, self.units))
    mu = K.dot(K.tanh(co_m), self.nu)
    if len(x) == 3 or (len(x) == 2 and mask is not None):
        m_en = K.repeat(m_en, input_de_times)
        m_en = K.reshape(m_en, shape=(-1, 1))
        m_en = m_en - 1
        m_en = m_en * REMOVE_FACTOR
        mu = mu + m_en
    mu = K.reshape(mu, shape=(-1, input_de_times, self.input_en_times))
    alphas = K.softmax(mu)
    en_seq = K.reshape(en_seq, shape=(-1, self.input_en_times * self.input_dim_en))
    en_seq = K.repeat(en_seq, input_de_times)
    en_seq = K.reshape(en_seq, shape=(-1, input_de_times, self.input_en_times, self.input_dim_en))
    sum_en = K.sum(en_seq * K.expand_dims(alphas, -1), 2)
    output = K.concatenate([de_seq, sum_en], -1)
    if self.return_alphas:
        return [output, alphas]
    else:
        return output
def estimated(self, state, batch_size):
    # print(state.shape)
    # batch_size = state.shape[0]
    # generator_mag = K.ones((batch_size, 3))
    # ang_ref = K.zeros((batch_size, 1))
    # ref_ang = tf.Variable(tf.zeros((batch_size, 1)))
    # state = K.concatenate([generator_mag, state[:, :6], ang_ref, state[:, 6:]], axis=-1)
    # print(state.shape)
    state_restore = (state + 1) * (max_state - min_state) / 2 + min_state
    V = state_restore[:, :self.num_bus] * 10  # [k, 9]
    A = state_restore[:, self.num_bus:]       # [k, 9]
    # print(V.shape, A.shape)
    # P_bus = K.zeros((A.shape[0], 9))  # [k, 9]
    # Q_bus = K.zeros((A.shape[0], 9))  # [k, 9]
    # A_
    # print(K.permute_dimensions(K.repeat(A, 9), [0, 2, 1]).shape)
    # print(K.repeat(A, 9).shape)
    A_ = K.permute_dimensions(K.repeat(A, self.num_bus), [0, 2, 1]) - K.repeat(A, self.num_bus)
    G = K.constant(self.G, dtype=tf.float32)
    B = K.constant(self.B, dtype=tf.float32)
    cos_ = K.cos(A_ * pi / 180)
    sin_ = K.sin(A_ * pi / 180)
    term_1_P = G * cos_ + B * sin_
    term_1_Q = G * sin_ - B * cos_
    P_bus = (V * K.batch_dot(V, term_1_P, axes=[1, 2]))
    Q_bus = (V * K.batch_dot(V, term_1_Q, axes=[1, 2]))
    P_idx = [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13]
    Q_idx = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
    batch_estimated_measurement = K.concatenate(
        [self.gather_cols(P_bus, P_idx), self.gather_cols(Q_bus, Q_idx)], axis=1)
    # print(batch_estimated_measurement.shape)
    # batch_estimated_measurement = K.concatenate([P_bus, Q_bus], axis=1)
    ans = (batch_estimated_measurement - min_meas) / (max_meas - min_meas) * 2 - 1
    # print(K.eval(ans))
    return ans
def cnn_loss(x, x_decoded_mean):
    # N = tf.convert_to_tensor(DPParam, dtype=tf.float32)
    gamma = tf.convert_to_tensor(DPParam['LPMtx'], dtype=tf.float32)
    N = tf.convert_to_tensor(DPParam['Nvec'], dtype=tf.float32)
    m = tf.convert_to_tensor(DPParam['m'], dtype=tf.float32)
    W = tf.convert_to_tensor(DPParam['B'], dtype=tf.float32)
    v = tf.convert_to_tensor(DPParam['nu'], dtype=tf.float32)
    num_cluster = N.shape[0]
    z_mean_1_last = tf.expand_dims(z_mean, -1)  # bs, latent_dim, 1
    z_mean_1_mid = tf.expand_dims(z_mean, 1)    # bs, 1, latent_dim
    for k in range(num_cluster):
        gamma_k_rep = tf.squeeze(K.repeat(tf.expand_dims(gamma[:, k], -1), latent_dim))
        z_k_bar = 1 / N[k] * K.sum(tf.multiply(gamma_k_rep, z_mean), axis=0)  # (latent_dim,)
        z_k_bar_batch = tf.squeeze(K.repeat(tf.expand_dims(z_k_bar, 0), batch_size))
        # tf.transpose(z_k_bar_batch, perm=[1, 0])
        z_k_bar_batch_1_last = tf.expand_dims(z_k_bar_batch, -1)  # bs, latent_dim, 1
        z_k_bar_batch_1_mid = tf.expand_dims(z_k_bar_batch, 1)    # bs, 1, latent_dim
        # TODO:!
        S_k = 1 / N[k] * K.sum(K.batch_dot(
            tf.multiply(tf.expand_dims(gamma_k_rep, -1),
                        (z_mean_1_last - z_k_bar_batch_1_last)),
            z_mean_1_mid - z_k_bar_batch_1_mid), axis=0)  # (latent_dim, latent_dim)
        temp = tf.linalg.trace(tf.linalg.solve(W[k], S_k))
        temp2 = tf.matmul(tf.expand_dims((z_k_bar - m[k]), 0), tf.linalg.inv(W[k]))
        temp3 = tf.squeeze(tf.matmul(temp2, tf.expand_dims((z_k_bar - m[k]), -1)))
        if k == 0:
            e = 0.5 * N[k] * (v[k] * (temp + temp3))
        else:
            e += 0.5 * N[k] * (v[k] * (temp + temp3))
    loss_ = alpha * original_dim * objectives.mean_squared_error(
        K.flatten(x), K.flatten(x_decoded_mean)) - scale * K.sum((z_log_var + 1), axis=-1)
    loss_ = K.sum(loss_, axis=0) + e
    # loss = K.sum(loss_, axis=0)
    # for i in range(5):
    #     loss_ += N
    # return loss_
    return loss_
def myDist(y_pred):
    y_pred1, y_pred2, sth = y_pred
    norm1 = K.sqrt(K.sum(y_pred1 ** 2, axis=1))
    norm1 = K.reshape(norm1, (norm1.shape[0], 1))
    norm1 = K.reshape(K.repeat(norm1, y_pred1.shape[1]), y_pred1.shape)
    y_pred1 = y_pred1 / norm1
    norm2 = K.sqrt(K.sum(y_pred2 ** 2, axis=1))
    norm2 = K.reshape(norm2, (norm2.shape[0], 1))
    norm2 = K.reshape(K.repeat(norm2, y_pred2.shape[1]), y_pred2.shape)
    y_pred2 = y_pred2 / norm2
    # K.transpose is the backend-agnostic transpose; a bare .T attribute
    # only exists on Theano tensors
    return K.switch(K.dot(y_pred1, K.transpose(y_pred2)) > sth, 1, 0)
def step(self, x, states):
    h_tm1, c_tm1, y_tm1, B, U, H = states
    s = K.dot(c_tm1, self.W_h) + self.b_h
    s = K.repeat(s, self.input_length)
    energy = time_distributed_dense(s + H, self.W_a, self.b_a)
    energy = K.squeeze(energy, 2)
    alpha = K.softmax(energy)
    alpha = K.repeat(alpha, self.input_dim)
    alpha = K.permute_dimensions(alpha, (0, 2, 1))
    weighted_H = H * alpha
    v = K.sum(weighted_H, axis=1)
    y, new_states = super(AttentionDecoder, self).step(v, states[:-1])
    return y, new_states
def call(self, x, mask=None):
    en_seq = x[0]
    de_seq = x[1]
    topics = x[2]
    input_de_times = K.shape(de_seq)[-2]
    # compute alphas
    att_en = K.dot(K.reshape(en_seq, (-1, self.input_dim_en)), self.w_en)
    att_en = K.reshape(att_en, shape=(-1, self.input_en_times * self.units))
    att_en = K.repeat(att_en, input_de_times)
    att_en = K.reshape(att_en, shape=(-1, self.input_en_times * input_de_times, self.units))
    att_de = K.dot(K.reshape(de_seq, (-1, self.input_dim_de)), self.w_de)
    att_de = K.reshape(att_de, shape=(-1, input_de_times, self.units))
    att_de = K.repeat_elements(att_de, self.input_en_times, 1)
    topics_w = K.dot(topics, K.transpose(self.wt))
    topics_w = K.repeat(topics_w, self.input_en_times * input_de_times)
    # print("Here:", att_de, att_en, topics_w)
    co_m = att_en + att_de + topics_w
    co_m = K.reshape(co_m, (-1, self.units))
    mu = K.dot(K.tanh(co_m), self.nu)
    mu = K.reshape(mu, shape=(-1, input_de_times, self.input_en_times))
    alphas = K.softmax(mu)
    p_gen = K.sigmoid(mu)
    en_seq = K.reshape(en_seq, shape=(-1, self.input_en_times * self.input_dim_en))
    en_seq = K.repeat(en_seq, input_de_times)
    en_seq = K.reshape(en_seq, shape=(-1, input_de_times, self.input_en_times, self.input_dim_en))
    sum_en = K.sum(en_seq * K.expand_dims(alphas, -1), 2)
    # output = K.concatenate([de_seq, sum_en], -1)
    output = de_seq + sum_en
    if self.return_alphas:
        alphas = K.reshape(alphas, shape=(-1, input_de_times, self.input_en_times))
        p_gen = K.reshape(p_gen, shape=(-1, input_de_times, self.input_en_times))
        return [output] + [alphas] + [p_gen]
    else:
        return output
def call(self, inputs, mask=None):
    if isinstance(inputs, list):
        memory, aspect = inputs
        mask = mask[0]
    else:
        memory = inputs
    attend_weights = []
    batch_size = K.shape(memory)[0]
    time_steps = K.shape(memory)[1]
    e = K.zeros(shape=(batch_size, self.units))
    for h in range(self.n_hop):
        # compute attention weight
        repeat_e = K.repeat(e, time_steps)
        if isinstance(inputs, list):
            repeat_asp = K.repeat(aspect, time_steps)
            inputs_concat = K.concatenate([memory, repeat_asp, repeat_e], axis=-1)
        else:
            inputs_concat = K.concatenate([memory, repeat_e], axis=-1)
        g = K.squeeze(K.dot(inputs_concat, self.al_w[h]), axis=-1) + self.al_b[h]  # [batch_size, time_steps]
        a = K.exp(g)
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=-1, keepdims=True) + K.epsilon(), K.floatx())
        attend_weights.append(a)
        # apply attention
        a_expand = K.expand_dims(a)  # [batch_size, time_steps, 1]
        i_AL = K.sum(memory * a_expand, axis=1)  # [batch_size, hidden]; i_AL is the input of the gru at time `h`
        # gru implementation
        r = K.sigmoid(K.dot(i_AL, self.gru_wr) + K.dot(e, self.gru_ur))  # reset gate
        z = K.sigmoid(K.dot(i_AL, self.gru_wz) + K.dot(e, self.gru_uz))  # update gate
        _e = K.tanh(K.dot(i_AL, self.gru_wx) + K.dot(r * e, self.gru_wg))
        e = (1 - z) * e + z * _e  # update e
    if self.return_attend_weight:
        return [e, K.concatenate(attend_weights, axis=0)]
    else:
        return e
def iwae_loss(y_true, y_pred):
    local_mu = K.repeat(mu, k)
    local_sigma = K.repeat(sigma, k)
    log_posterior = -(n_z / 2) * log2pi - K.sum(
        K.log(1e-8 + local_sigma) +
        0.5 * K.square(z - local_mu) / K.square(1e-8 + local_sigma), axis=-1)
    log_prior = -(n_z / 2) * log2pi - K.sum(0.5 * K.square(z), axis=-1)
    log_bernoulli = K.sum(y_true * K.log(y_pred + 1e-8) +
                          (1 - y_true) * K.log(1 - y_pred + 1e-8), axis=-1)
    log_weights = log_bernoulli + log_prior - log_posterior
    importance_weight = K.softmax(log_weights, axis=1)
    return -K.sum(importance_weight * log_weights, axis=-1)
def step(self, x, states):
    """
    Get the previous hidden state of the decoder from states = [z, s_p].
    Alignment model:
        waStm1 = W_a . s_{t-1}
        uaHt = U_a . h_t
        tmp = tanh(waStm1 + uaHt)
        e_ij = V_a^T * tmp
    The vector of length = timesteps is:
        u_t = softmax(e_tj)
    """
    atm1 = x
    ztm1, s_tpm1 = states

    # old hidden state: shape (batchsize, units)
    stm1 = (1 - ztm1) * self.stm2 + ztm1 * s_tpm1

    # shape (batchsize, timesteps, units)
    _stm = K.repeat(stm1, self.timesteps)

    # shape (batchsize, timesteps, output_dim)
    _Wxstm = K.dot(_stm, self.W_a)

    # calculate the attention probabilities:
    # self._uxpb has shape (batchsize, timesteps, output_dim)
    # V_a has shape (output_dim,); after K.expand_dims it is (output_dim, 1),
    # therefore et has shape (batchsize, timesteps, 1)
    et = K.dot(activations.tanh(_Wxstm + self._uxpb),
               K.expand_dims(self.V_a))
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps)
    at /= at_sum_repeated  # vector of shape (batchsize, timesteps, 1)

    # reset gate:
    rt = activations.sigmoid(
        K.dot(atm1, self.W_r) + K.dot(stm1, self.U_r) + self.b_r)
    # update gate:
    zt = activations.sigmoid(
        K.dot(atm1, self.W_z) + K.dot(stm1, self.U_z) + self.b_z)
    # proposal hidden state:
    s_tp = activations.tanh(
        K.dot(atm1, self.W_p) + K.dot((rt * stm1), self.U_p) + self.b_p)

    yt = activations.softmax(at)

    if self.return_probabilities:
        return at, [zt, s_tp]
    else:
        return yt, [zt, s_tp]
def step(self, x, states):
    # obtain elements of the previous time step.
    zt, htm = states
    if self.idx < self.timesteps:
        self.idx += 1

    ## ## ## equation 1 ## ## ## ## ##
    # repeat the hidden state to the length of the sequence
    _htm = K.repeat(htm, self.timesteps)

    # now multiply the weight matrix with the repeated hidden state
    _Wxhtm = K.dot(_htm, self.W_a)

    # calculate the attention probabilities
    # this relates how much other timesteps contributed to this one.
    et = K.dot(activations.tanh(_Wxhtm + self._uxpb),
               K.expand_dims(self.V_a))

    ## ## ## equation 2 ## ## ## ## ##
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps)
    at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

    ## ## ## equation 3 ## ## ## ## ##
    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    # ~~~> calculate new hidden state
    # equation 4 (zt)
    zt = K.concatenate([context, htm], axis=-1)
    zt = activations.tanh(K.dot(zt, self.W_A_combine))
    # print('int_shape: ', K.int_shape(zt))

    # a switch so that we can return the attention for visualizations
    htm = activations.tanh(K.dot(self.x_seq[:, self.idx], self.W_s))
    if self.return_probabilities:
        return at, [zt, htm]
    else:
        return zt, [zt, htm]
def call(self, x):
    assert isinstance(x, list)
    h1, h2 = x
    c = list()
    for i in range(self.seq_len):
        h2_i = K.repeat(h2[:, i, :], self.seq_len)
        x = K.concatenate([h1, h2_i])
        p = K.tanh(K.dot(x, self.w) + self.b1)
        p = K.softmax(K.dot(p, self.v) + self.b2)
        p = K.squeeze(p, axis=-1)
        p = K.repeat(p, self.embed_len)
        p = K.permute_dimensions(p, (0, 2, 1))
        c_i = K.sum(p * h1, axis=1, keepdims=True)
        c.append(c_i)
    return K.concatenate(c, axis=1)
def vae_loss(x, x_decoded_mean, z, z_mean, z_log_var, u_p, theta_p, lambda_p,
             alpha=1, datatype='sigmoid'):
    Z = tf.transpose(K.repeat(z, n_centroid), [0, 2, 1])
    z_mean_t = tf.transpose(K.repeat(z_mean, n_centroid), [0, 2, 1])
    z_log_var_t = tf.transpose(K.repeat(z_log_var, n_centroid), [0, 2, 1])
    u_tensor3 = tf.tile(tf.expand_dims(u_p, [0]), [batch_size, 1, 1])
    # u_tensor3 = T.repeat(tf.expand_dims(u_p, [0]), batch_size, axis=0)
    # lambda_tensor3 = T.repeat(tf.expand_dims(lambda_p, [0]), batch_size, axis=0)
    lambda_tensor3 = tf.tile(tf.expand_dims(lambda_p, [0]), [batch_size, 1, 1])
    temp_theta_p = tf.expand_dims(theta_p, [0])
    temp_theta_p = tf.expand_dims(temp_theta_p, [0])
    # theta_tensor3 = temp_theta_p * T.ones((batch_size, z_dim, n_centroid))
    theta_tensor3 = tf.tile(temp_theta_p, [batch_size, z_dim, 1])

    # @TODO
    # PROBLEM HERE? add theta z_dim times for each cluster?
    p_c_z = K.exp(K.sum((K.log(theta_tensor3) - 0.5 * K.log(2 * math.pi * lambda_tensor3) -
                         K.square(Z - u_tensor3) / (2 * lambda_tensor3)), axis=1)) + 1e-10
    gamma = p_c_z / K.sum(p_c_z, axis=-1, keepdims=True)
    gamma_t = K.repeat(gamma, z_dim)

    if datatype == 'sigmoid':
        loss = alpha * original_dim * objectives.binary_crossentropy(x, x_decoded_mean) \
            + K.sum(0.5 * gamma_t * (z_dim * K.log(math.pi * 2) + K.log(lambda_tensor3)
                                     + K.exp(z_log_var_t) / lambda_tensor3
                                     + K.square(z_mean_t - u_tensor3) / lambda_tensor3), axis=(1, 2)) \
            - 0.5 * K.sum(z_log_var + 1, axis=-1) \
            - K.sum(K.log(K.repeat_elements(tf.expand_dims(theta_p, [0]), batch_size, 0)) * gamma, axis=-1) \
            + K.sum(K.log(gamma) * gamma, axis=-1)
    else:
        loss = alpha * original_dim * objectives.mean_squared_error(x, x_decoded_mean) \
            + K.sum(0.5 * gamma_t * (z_dim * K.log(math.pi * 2) + K.log(lambda_tensor3)
                                     + K.exp(z_log_var_t) / lambda_tensor3
                                     + K.square(z_mean_t - u_tensor3) / lambda_tensor3), axis=(1, 2)) \
            - 0.5 * K.sum(z_log_var + 1, axis=-1) \
            - K.sum(K.log(K.repeat_elements(tf.expand_dims(theta_p, [0]), batch_size, 0)) * gamma, axis=-1) \
            + K.sum(K.log(gamma) * gamma, axis=-1)
    return tf.reduce_mean(loss)
def call(self, inputs, mask=None):
    X, v = inputs
    mask_X, _ = mask
    if self.attend_mode == 'concat':
        concatenated = K.concatenate([X, K.repeat(v, X.shape[1])], axis=-1)
        e = dot_product(concatenated, self.W)
    if self.attend_mode == 'sum':
        e = dot_product(X, self.W) + dot_product(K.expand_dims(v, axis=1), self.M)
    if self.bias:
        e += self.b
    e = K.tanh(e)
    e = dot_product(e, self.u)
    a = K.exp(e)
    if mask_X is not None:
        a *= K.cast(mask_X, K.floatx())
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    a = K.expand_dims(a)
    weighted_sum = K.sum(X * a, axis=1)
    if self.return_coefficients:
        return weighted_sum, a
    else:
        return weighted_sum
def call(self, x, mask=None):
    # print(mask.shape)
    mask = K.cast(mask, 'float32')
    mask = K.repeat(mask, self.repeat_dim)
    # print(mask.shape)
    mask = K.permute_dimensions(mask, (0, 2, 1))
    return x * mask
def call(self, x, mask=None):
    X = K.repeat(x, self.output_length)
    input_shape = list(self.input_spec[0].shape)
    input_shape = input_shape[:1] + [self.output_length] + input_shape[1:]
    self.input_spec = [InputSpec(shape=tuple(input_shape))]
    if self.stateful or self.state_input or len(self.state_outputs) > 0:
        initial_states = self.states[:]
    else:
        initial_states = self.get_initial_states(X)
    constants = self.get_constants(X)
    y_0 = K.permute_dimensions(X, (1, 0, 2))[0, :, :]
    initial_states += [y_0]
    last_output, outputs, states = K.rnn(self.step, X, initial_states,
                                         go_backwards=self.go_backwards,
                                         mask=mask, constants=constants,
                                         unroll=self.unroll,
                                         input_length=self.output_length)
    if self.stateful and not self.state_input:
        self.updates = []
        for i in range(2):
            self.updates.append((self.states[i], states[i]))
    self.states_to_transfer = states
    input_shape.pop(1)
    self.input_spec = [InputSpec(shape=input_shape)]
    return outputs
def time_distributed_dense(x, w, b=None, dropout=None,
                           input_dim=None, output_dim=None, timesteps=None):
    # Apply y.w + b for every temporal slice y of x.
    print(x.shape)
    print(w.shape)
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.shape(w)[1]
    print(output_dim)
    print(timesteps)
    print(input_dim)
    if dropout:
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x *= expanded_dropout_matrix
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b:
        x = x + b
    x = K.reshape(x, (-1, timesteps, output_dim))
    return x
def call(self, H, mask=None):
    # energy = self.activation(K.dot(x, self.W0) + self.b0)
    # energy = K.dot(energy, self.W) + self.b
    # energy = K.reshape(energy, (-1, self.input_length))
    # energy = K.softmax(energy)
    # xx = K.batch_dot(energy, x, axes=(1, 1))
    # all = K.concatenate([xx, energy])
    # return all
    # H_t = K.permute_dimensions(H, (0, 2, 1))  # H is [none, n, hidden]; H_t is [none, hidden, n]
    # temp = self.activation(K.permute_dimensions(K.dot(self.W1, H_t), (1, 0, 2)))  # tanh(W1 . Ht) was [da, none, n], transpose to [none, da, n]
    # temp = K.permute_dimensions(K.dot(self.W2, temp), (1, 0, 2))  # W2 . tanh(W1 . Ht) was [r, none, n], transpose to [none, r, n]
    H1 = H[:, :, :-1]
    attention_mask = H[:, :, -1]
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
    H_t = self.activation(K.dot(H1, self.W1))
    temp = K.permute_dimensions(K.dot(H_t, self.W2), (0, 2, 1))  # [?, r, n]
    # temp = K.square(temp)  # make the distance larger
    temp += K.repeat(adder, self.r)
    A = K.softmax(temp)  # A is [none, r, n]
    M = K.batch_dot(A, H1, axes=(2, 1))  # [none, r, hidden]
    if self.attention_regularizer_weight > 0.0:
        self.add_loss(self._attention_regularizer(A))
    if self.return_attention:
        return [M, A]
    # all = K.concatenate([M, A])  # [none, r, hidden+n]
    return M
def TD(x, w, b=None, dropout=None, input_dim=None, output_dim=None,
       timesteps=None, training=None):
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.shape(w)[1]
    if dropout is not None and 0. < dropout < 1.:
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = K.bias_add(x, b)
    if K.backend() == 'tensorflow':
        x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
        x.set_shape([None, None, output_dim])
    else:
        x = K.reshape(x, (-1, timesteps, output_dim))
    return x
def call(self, x):
    assert isinstance(x, list)
    a, b = x
    # print("input shape")
    # print(a.shape)
    # print(b.shape)
    # print("weight shape")
    temp_kernel = self.kernel
    # print(temp_kernel.shape)
    temp_kernel = K.reshape(temp_kernel, (temp_kernel.shape[1], temp_kernel.shape[0]))
    # print(temp_kernel.shape)
    temp_kernel = K.repeat(temp_kernel, max_length)
    # print(temp_kernel.shape)
    ext_kernel = temp_kernel
    # multiplying each time step of the first input with the weight
    res1 = Multiply()([a, ext_kernel])
    # multiplying each time step of the second input with the weight
    res2 = Multiply()([b, ext_kernel])
    # print(res1.shape)
    # print(res2.shape)
    # computing cosine similarity between each time step of the first input
    # and each time step of the second input
    out = Dot(axes=2, normalize=True)([res1, res2])
    # print(out.shape)
    return out
def call(self, x, mask=None):
    print("AttentionDecoder.call")
    H = x
    x = K.permute_dimensions(H, (1, 0, 2))[-1, :, :]
    if self.stateful or self.state_input or len(self.state_outputs) > 0:
        initial_states = self.states[:]
    else:
        initial_states = self.get_initial_states(H)
    constants = self.get_constants(H) + [H]
    y_0 = x
    x = K.repeat(x, self.output_length)
    initial_states += [y_0]
    last_output, outputs, states = K.rnn(
        self.step, x, initial_states,
        go_backwards=self.go_backwards,
        mask=mask, constants=constants,
        unroll=self.unroll, input_length=self.output_length)
    if self.stateful and not self.state_input:
        self.updates = list(zip(self.states, states))  # list() so this also works on Python 3
    self.states_to_transfer = states
    return outputs
def call(self, inputs, mask=None):
    rep_input1 = K.repeat(
        K.squeeze(inputs[1], axis=1),
        inputs[0].shape[1]) if inputs[1].shape[1] == 1 else inputs[1]
    conca_input = K.concatenate([inputs[0], rep_input1])
    e = K.dot(conca_input, self.wt_mid)
    if self.use_bias:
        e += self.b_mid
    e = K.tanh(e)
    e = dot_product(e, self.wt_out)
    if self.use_bias:
        e += self.b_out
    e = isr(e, self.alpha)
    wt = K.exp(e)
    # apply mask after the exp. will be re-normalized next
    if mask is not None and mask[0] is not None:
        mask = mask[0] if mask[1] is None else mask[0] & mask[1]
        wt *= K.cast(mask, K.floatx())
        # in some cases, especially in the early stages of training, the sum may be
        # almost zero and this results in NaN's. A workaround is to add a very small
        # positive number ε to the sum.
        wt /= K.sum(wt, axis=1, keepdims=True) + EPSILON
    else:
        wt /= K.sum(wt, axis=1, keepdims=True)
        # a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    weighted_input = inputs[0] * K.expand_dims(wt)
    return K.sum(weighted_input, axis=1, keepdims=self.keepdims)
def _loss_tensor(y_true, y_pred):
    max_val = K.max(y_pred, axis=-2)  # temporal axis!
    max_val = K.repeat(max_val, K.shape(y_pred)[-2])
    # print(K.eval(max_val))  # debugging only: K.eval fails on a symbolic tensor at graph time
    mask = K.cast(K.equal(max_val, y_pred), K.floatx())
    y_pred = mask * y_pred + (1 - mask) * y_true
    return squared_hinge(y_true, y_pred)
def time_distributed_dense(x, w, b=None, dropout=None,
                           input_dim=None, output_dim=None, timesteps=None):
    '''Apply y.w + b for every temporal slice y of x.
    '''
    if not input_dim:
        # won't work with TensorFlow
        input_dim = K.shape(x)[2]
    if not timesteps:
        # won't work with TensorFlow
        timesteps = K.shape(x)[1]
    if not output_dim:
        # won't work with TensorFlow
        output_dim = K.shape(w)[1]
    if dropout:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x *= expanded_dropout_matrix
    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b:
        x = x + b
    # reshape to 3D tensor
    x = K.reshape(x, (-1, timesteps, output_dim))
    return x
def call(self, x, mask=None):
    H = x
    x = K.permute_dimensions(H, (1, 0, 2))[-1, :, :]
    if self.stateful or self.state_input or len(self.state_outputs) > 0:
        initial_states = self.states[:]
    else:
        initial_states = self.get_initial_states(H)
    constants = self.get_constants(H) + [H]
    y_0 = x
    x = K.repeat(x, self.output_length)
    initial_states += [y_0]
    last_output, outputs, states = K.rnn(self.step, x, initial_states,
                                         go_backwards=self.go_backwards,
                                         mask=mask, constants=constants,
                                         unroll=self.unroll,
                                         input_length=self.output_length)
    if self.stateful and not self.state_input:
        self.updates = []
        for i in range(2):
            self.updates.append((self.states[i], states[i]))
    self.states_to_transfer = states
    return outputs
def call(self, x, mask=None):
    y = K.dot(x, self.att_W)
    if not self.activation:
        if K.backend() == 'theano':
            weights = K.theano.tensor.tensordot(self.att_v, y, axes=[0, 2])
        elif K.backend() == 'tensorflow':
            weights = K.tensorflow.python.ops.math_ops.tensordot(self.att_v, y, axes=[0, 2])
    elif self.activation == 'tanh':
        if K.backend() == 'theano':
            weights = K.theano.tensor.tensordot(self.att_v, K.tanh(y), axes=[0, 2])
        elif K.backend() == 'tensorflow':
            weights = tf.tensordot(self.att_v, K.tanh(y), axes=[[0], [2]])
            # weights = K.tensorflow.python.ops.math_ops.tensordot(self.att_v, K.tanh(y), axes=[0, 2])
    weights = K.softmax(weights)
    out = x * K.permute_dimensions(K.repeat(weights, x.shape[2]), [0, 2, 1])
    if self.op == 'attsum':
        # out = out.sum(axis=1)
        out = K.sum(out, axis=1)
    elif self.op == 'attmean':
        out = out.sum(axis=1) / mask.sum(axis=1, keepdims=True)
    return K.cast(out, K.floatx())
def time_distributed_dense(x, w, b=None, dropout=None,
                           input_dim=None, output_dim=None,
                           timesteps=None, activation='linear'):
    '''Apply y.w + b for every temporal slice y of x.
    '''
    activation = activations.get(activation)
    if not input_dim:
        # won't work with TensorFlow
        input_dim = K.shape(x)[2]
    if not timesteps:
        # won't work with TensorFlow
        timesteps = K.shape(x)[1]
    if not output_dim:
        # won't work with TensorFlow
        output_dim = K.shape(w)[1]
    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x)
    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b:
        x = x + b
    # reshape to 3D tensor
    x = K.reshape(activation(x), (-1, timesteps, output_dim))
    return x
def r2_keras(y_true, y_pred):
    y_true, y_pred = K.reshape(y_true, (-1, WRAP, 20)), K.reshape(y_pred, (-1, WRAP, 20))
    SS_res = K.sum(K.square(y_true - y_pred), axis=1)
    SS_tot = K.sum(K.square(y_true - K.repeat(K.mean(y_true, axis=1), WRAP)), axis=1)
    return K.mean(1 - SS_res / (SS_tot + K.epsilon()))
def attention_call(self, inputs, cell_states, attended,
                   attention_states, attended_mask, training=None):
    # only one attended sequence (verified in build)
    assert len(attended) == 1
    attended = attended[0]
    attended_mask = attended_mask[0]
    h_cell_tm1 = cell_states[0]

    # compute attention weights
    w = K.repeat(K.dot(h_cell_tm1, self.W_a) + self.b_UW,
                 K.shape(attended)[1])
    u = K.dot(attended, self.U_a)  # TODO should be done externally of cell
    e = K.exp(K.dot(K.tanh(w + u), self.v_a) + self.b_v)

    if attended_mask is not None:
        e = e * K.cast(K.expand_dims(attended_mask, -1), K.dtype(e))

    # weighted average of attended
    a = e / K.sum(e, axis=1, keepdims=True)
    c = K.sum(a * attended, axis=1, keepdims=False)

    return c, [c]
def loss(y_true, y_pred):
    from plasma.conf import conf
    fac = MaxHingeTarget.fac
    # overall_fac = np.prod(np.array(K.shape(y_pred)[1:]).astype(np.float32))
    overall_fac = K.prod(K.cast(K.shape(y_pred)[1:], K.floatx()))
    max_val = K.max(y_pred, axis=-2)  # temporal axis!
    max_val1 = K.repeat(max_val, K.shape(y_pred)[-2])
    mask = K.cast(K.equal(max_val1, y_pred), K.floatx())
    y_pred1 = mask * y_pred + (1 - mask) * y_true
    weight_mask = K.mean(y_true, axis=-1)
    weight_mask = K.cast(K.greater(weight_mask, 0.0), K.floatx())  # positive label!
    weight_mask = fac * weight_mask + (1 - weight_mask)
    # return weight_mask * squared_hinge(y_true, y_pred1)
    return conf['model']['loss_scale_factor'] * overall_fac * weight_mask * hinge(y_true, y_pred1)
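# Numeric sketch of the max-mask trick used in the two hinge losses above
# (toy values, NumPy only): timesteps that attain the temporal max keep
# their prediction; every other timestep has its prediction replaced by the
# label, so only the max competes against the hinge target.
import numpy as np

y_pred = np.array([[0.2], [0.9], [0.4]])   # (timesteps, 1)
y_true = np.array([[1.0], [1.0], [1.0]])
mask = (y_pred == y_pred.max(axis=0)).astype(float)
print(mask * y_pred + (1 - mask) * y_true)  # [[1. ], [0.9], [1. ]]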
def time_distributed_dense(x, w, b=None, dropout=None,
                           input_dim=None, output_dim=None,
                           timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.

    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: whether to apply dropout (same dropout mask
            for every temporal slice of the input).
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.

    # Returns
        Output tensor.
    """
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.shape(w)[1]
    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)
    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = K.bias_add(x, b)
    # reshape to 3D tensor
    if K.backend() == 'tensorflow':
        x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
        x.set_shape([None, None, output_dim])
    else:
        x = K.reshape(x, (-1, timesteps, output_dim))
    return x
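# Possible usage sketch, assuming the time_distributed_dense definition
# directly above and a TensorFlow backend (toy shapes): one shared
# (input_dim, output_dim) weight matrix is applied to every timestep of a
# (batch, timesteps, input_dim) tensor via the reshape-dot-reshape path.
import numpy as np
from keras import backend as K

x = K.constant(np.random.rand(2, 5, 3))   # (batch, timesteps, input_dim)
w = K.constant(np.random.rand(3, 4))      # (input_dim, output_dim)
y = time_distributed_dense(x, w, input_dim=3, output_dim=4, timesteps=5)
print(K.eval(y).shape)                    # (2, 5, 4)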
def call(self, x, mask=None):
    input_shape = self.input_spec[0].shape
    en_seq = x
    x_input = x[:, input_shape[1] - 1, :]
    x_input = K.repeat(x_input, input_shape[1])
    initial_states = self.get_initial_states(x_input)
    constants = super(PointerLSTM, self).get_constants(x_input)
    constants.append(en_seq)
    preprocessed_input = self.preprocess_input(x_input)
    last_output, outputs, states = K.rnn(self.step, preprocessed_input,
                                         initial_states,
                                         go_backwards=self.go_backwards,
                                         constants=constants,
                                         input_length=input_shape[1])
    print('outputs')
    print(outputs)
    return outputs
def step(self, x_input, states):
    input_shape = self.input_spec[0].shape
    en_seq = states[-1]
    _, [h, c] = super(PointerLSTM, self).step(x_input, states[:-1])

    # vt * tanh(W1 * e + W2 * d)
    dec_seq = K.repeat(h, input_shape[1])
    # dec_seq = K.repeat(h, 2)
    print('dec_seq')
    print(dec_seq)
    Eij = time_distributed_dense(en_seq, self.W1, output_dim=1)
    Dij = time_distributed_dense(dec_seq, self.W2, output_dim=1)
    U = self.vt * tanh(Eij + Dij)
    print('U')
    print(U)
    U = K.squeeze(U, 2)
    print('U squeezed')
    print(U)

    # make probability tensor
    pointer = softmax(U)
    return pointer, [h, c]
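# NumPy sketch of the pointer scoring above (toy dims; the learned vt
# scaling is omitted): each encoder step and the repeated decoder state are
# projected to scalars, summed, squashed by tanh, and softmaxed over the
# input positions, giving a distribution that "points" at one input step.
import numpy as np

en = np.random.rand(6, 1)    # Eij for input_length = 6
de = np.random.rand(6, 1)    # Dij (decoder state repeated 6 times)
u = np.tanh(en + de).squeeze(-1)
p = np.exp(u) / np.exp(u).sum()
print(p.shape, p.sum())      # (6,) 1.0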
def step(x, states, weights):
    H = x
    h_tm1, c_tm1 = states
    W1, W2, W3, U, b1, b2, b3 = weights
    input_length = K.shape(x)[1]
    C = K.repeat(c_tm1, input_length)
    _HC = K.concatenate([H, C])
    _HC = K.reshape(_HC, (-1, input_dim + self.hidden_dim))
    energy = K.dot(_HC, W3) + b3
    energy = K.reshape(energy, (-1, input_length))
    energy = K.softmax(energy)
    x = K.batch_dot(energy, H, axes=(1, 1))
    z = K.dot(x, W1) + K.dot(h_tm1, U) + b1
    z0 = z[:, :self.hidden_dim]
    z1 = z[:, self.hidden_dim:2 * self.hidden_dim]
    z2 = z[:, 2 * self.hidden_dim:3 * self.hidden_dim]
    z3 = z[:, 3 * self.hidden_dim:]
    i = self.inner_activation(z0)
    f = self.inner_activation(z1)
    c = f * c_tm1 + i * self.activation(z2)
    o = self.inner_activation(z3)
    h = o * self.activation(c)
    y = self.activation(K.dot(h, W2) + b2)
    return y, [h, c]
def call(self, x, mask=None):
    H = x
    x = K.permute_dimensions(H, (1, 0, 2))[-1]
    if self.stateful or self.state_input or len(self.state_outputs) > 0:
        initial_states = self.states[:]
    else:
        initial_states = self.get_initial_states(H)
    constants = self.get_constants(H) + [H]
    y_0 = x
    x = K.repeat(x, self.output_length)
    initial_states += [y_0]
    last_output, outputs, states = K.rnn(self.step, x, initial_states,
                                         go_backwards=self.go_backwards,
                                         mask=mask, constants=constants,
                                         unroll=self.unroll,
                                         input_length=self.output_length)
    if self.stateful and not self.state_input:
        self.updates = []
        for i in range(2):
            self.updates.append((self.states[i], states[i]))
    self.states_to_transfer = states
    return outputs
def get_output(self, train=False):
    X = self.get_input(train)
    # .dimshuffle is Theano-specific; K.permute_dimensions is the
    # backend-agnostic equivalent (see the sketch below)
    return K.repeat(X, self.n).dimshuffle(0, 2, 1)
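# Backend-agnostic sketch of the repeat-then-transpose above (toy shapes):
import numpy as np
from keras import backend as K

X = K.constant(np.random.rand(2, 3))                 # (batch, dim)
Y = K.permute_dimensions(K.repeat(X, 4), (0, 2, 1))  # (batch, dim, n=4)
print(K.int_shape(Y))                                # (2, 3, 4)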
(1,latent_dim), mean=0, scale=1)  # Gaussian distribution
(input_seq_lenth,1))
print("vt")
print(vt)
print("decoder_hidden")
print(decoder_hidden)
# en_seq = Reshape((-1, 1, latent_dim))(encoder_outputs)  # ?, latent_dim
# en_seq = K.squeeze(en_seq, 0)
en_seq = encoder_outputs
# en_seq = K.repeat(en_seq, max_encoder_seq_length)
print("en_seq")
print(en_seq)
# dec_seq = Reshape((-1, 1, latent_dim))(decoder_hidden)
dec_seq = K.repeat(decoder_hidden, max_encoder_seq_length)
# dec_seq = Reshape((-1, 1, latent_dim))(dec_seq)
# dec_seq = K.squeeze(dec_seq, 0)
print("dec_seq")
print(dec_seq)
blendW1 = TimeDistributed(Dense(latent_dim))(en_seq)
# blendW1 = TimeDistributed(Dense(latent_dim)(en_seq)  # ?, input_seq_length, latent_dim
print('blendW1')
print(blendW1)
# blendW2 = TimeDistributed(Dense(latent_dim), ouput_dim=1)(dec_seq)
blendW2 = TimeDistributed(Dense(latent_dim))(dec_seq)
print('blendW2')
print(blendW2)
def call(self, x, mask=None):
    mask = K.cast(mask, 'float32')
    mask = K.repeat(mask, self.repeat_dim)
    mask = K.permute_dimensions(mask, (0, 2, 1))
    return x * mask
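# Sketch of what the mask plumbing above computes (toy shapes assumed):
# a (batch, timesteps) 0/1 mask is lifted to (batch, timesteps, repeat_dim)
# so it can zero out masked timesteps of a 3D feature tensor.
import numpy as np
from keras import backend as K

mask = K.constant(np.array([[1., 1., 0.]]))             # (batch=1, timesteps=3)
m = K.permute_dimensions(K.repeat(mask, 2), (0, 2, 1))  # (1, 3, 2)
x = K.constant(np.ones((1, 3, 2)))
print(K.eval(x * m))   # last timestep zeroed in both feature columns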
def build_model(self, input_shape):
    # input shape is (None, input_len, hidden_dimension)
    input_dim = input_shape[-1]
    output_dim = self.output_dim
    input_length = input_shape[1]
    hidden_dim = self.hidden_dim
    x = Input(batch_shape=input_shape)
    h_tm1 = Input(batch_shape=(input_shape[0], hidden_dim))
    c_tm1 = Input(batch_shape=(input_shape[0], hidden_dim))
    W1 = Dense(hidden_dim * 4,
               kernel_initializer=self.kernel_initializer,
               kernel_regularizer=self.kernel_regularizer)
    W2 = Dense(output_dim,
               kernel_initializer=self.kernel_initializer,
               kernel_regularizer=self.kernel_regularizer)
    W3 = Dense(1,
               kernel_initializer=self.kernel_initializer,
               kernel_regularizer=self.kernel_regularizer)
    U = Dense(hidden_dim * 4,
              kernel_initializer=self.kernel_initializer,
              kernel_regularizer=self.kernel_regularizer)
    '''
    1. Lambda() returns a function.
    2. It is a Keras thing: it wraps an arbitrary expression as a layer.
    **Parameters**
    >> output_shape: how you want the output shaped.
    >> masks...
    lambda x: K.repeat(x, input_length)
        lambda: declaration; x: y -> f(x) = y
        input_length: number of encoder unfoldings
        x: one (maybe the last) encoder output.
    '''
    C = Lambda(lambda x: K.repeat(x, input_length),
               output_shape=(input_length, input_dim))(c_tm1)
    _xC = concatenate([x, C])
    _xC = Lambda(lambda x: K.reshape(x, (-1, input_dim + hidden_dim)),
                 output_shape=(input_dim + hidden_dim,))(_xC)  # essentially transpose
    '''
    alpha is a softmax over the input length
    '''
    alpha = W3(_xC)
    alpha = Lambda(lambda x: K.reshape(x, (-1, input_length)),
                   output_shape=(input_length,))(alpha)
    alpha = Activation('softmax')(alpha)
    _x = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=(1, 1)),
                output_shape=(input_dim,))([alpha, x])
    z = add([W1(_x), U(h_tm1)])
    z0, z1, z2, z3 = get_slices(z, 4)
    i = Activation(self.recurrent_activation)(z0)
    f = Activation(self.recurrent_activation)(z1)  # forget gate reads slice z1 (z0 is the input gate's slice)
    c = add([multiply([f, c_tm1]),
             multiply([i, Activation(self.activation)(z2)])])
    o = Activation(self.recurrent_activation)(z3)
    h = multiply([o, Activation(self.activation)(c)])
    y = Activation(self.activation)(W2(h))
    return Model([x, h_tm1, c_tm1], [y, h, c])
def build_model(self, input_shape):
    input_dim = input_shape[-1]
    output_dim = self.output_dim
    input_length = input_shape[1]
    hidden_dim = self.hidden_dim
    print("the input shape is ", input_shape, "hidden shape ", hidden_dim)
    # print(input_shape)
    # print(hidden_dim)
    # raw_input("Verify Shapes")
    # x = K.variable(np.random.rand(1, input_shape[1], input_shape[2]))
    x = Input(batch_shape=input_shape)
    # Slicing doesn't work
    # slice_layer = Lambda(self.slice, output_shape=(1, hidden_dim))
    # x_tm1 = slice_layer(x)
    # Transposing, forget it.
    # x_tm1 = K.transpose(x_tm1)  # Does not work!
    # Let's try flattening inputs instead
    x_tm1 = Lambda(self.custom_flatten,
                   output_shape=(input_shape[0], input_length * hidden_dim))(x)
    # x_tm1 = K.batch_flatten(x)
    h_tm1 = Input(batch_shape=(input_shape[0], hidden_dim))
    c_tm1 = Input(batch_shape=(input_shape[0], hidden_dim))
    # h_tm1 = K.variable(np.random.rand(1, hidden_dim))
    # c_tm1 = K.variable(np.random.rand(1, hidden_dim))
    W1 = Dense(hidden_dim * 4,
               kernel_initializer=self.kernel_initializer,
               kernel_regularizer=self.kernel_regularizer,
               use_bias=False,
               input_shape=(hidden_dim * input_length,),
               name="W1")
    W2 = Dense(output_dim,
               kernel_initializer=self.kernel_initializer,
               kernel_regularizer=self.kernel_regularizer)
    W3 = Dense(1,
               kernel_initializer=self.kernel_initializer,
               kernel_regularizer=self.kernel_regularizer,
               use_bias=False,
               name="W3")
    U = Dense(hidden_dim * 4,
              kernel_initializer=self.kernel_initializer,
              kernel_regularizer=self.kernel_regularizer,
              use_bias=False,
              name="U")
    # print(K.eval(x).shape)
    # print(K.eval(x_tm1).shape)
    # print(K.eval(h_tm1).shape)
    # raw_input('check the dimension for x and h')
    # print("x_tm1")
    # print(K.eval(x_tm1))
    # print(K.eval(x_tm1).shape)
    # raw_input("Berry Berry Berrifyxxxx")
    # print("W1 dot x_tm1")
    # print(K.eval(W1(x_tm1)))
    # print(K.eval(W1(x_tm1)).shape)
    # raw_input("Berry Berry Berrify")
    z = add([W1(x_tm1), U(h_tm1)])
    z0, z1, z2, z3 = get_slices_custom(z, 4, 4 * hidden_dim)
    i = Activation(self.recurrent_activation)(z0)
    f = Activation(self.recurrent_activation)(z1)
    temp1 = multiply([f, c_tm1])
    temp2 = multiply([i, Activation(self.activation)(z2)])
    c = add([temp1, temp2])
    # c = add([multiply([f, c_tm1]), multiply([i, Activation(self.activation)(z2)])])
    o = Activation(self.recurrent_activation)(z3)
    h = multiply([o, Activation(self.activation)(c)])
    # Treating h as d_i (wrt Pointer Network nomenclature, https://arxiv.org/pdf/1506.03134.pdf)
    H = Lambda(lambda x: K.repeat(x, input_length),
               output_shape=(input_length, input_dim))(h)
    _xH = concatenate([x, H])
    _xH = Lambda(lambda x: K.reshape(x, (-1, input_dim + hidden_dim)),
                 output_shape=(input_dim + hidden_dim,))(_xH)
    # print(K.eval(_xH))
    # print(K.eval(_xH).shape)
    # raw_input("Verify Shapes _xH")
    alpha = W3(_xH)
    alpha = Lambda(lambda x: K.reshape(x, (-1, input_length)),
                   output_shape=(input_length,))(alpha)  # transpose
    alpha = W2(alpha)
    alpha = Activation('softmax')(alpha)
    # softer = Lambda(self.custom_soft_max, output_shape=(input_length,))
    # alphas = softer(alpha)
    return Model([x, h_tm1, c_tm1], [alpha, h, c])