def apply(self, h1, c, p_from_c):
    h_shape = K.shape(h1)
    # time_steps, 1, nb_samples, dim
    p_from_h = K.expand_dims(K.dot(h1, self.B_hp) + self.b_tt, axis=1)
    # 1, time_steps, nb_samples, dim
    p_from_c = K.expand_dims(p_from_c, axis=0).repeat(h_shape[0], axis=0)
    p = p_from_h + p_from_c
    # energy = exp(dot(tanh(p), self.D_pe) + self.c_tt).reshape((source_len, target_num))
    # self.c_tt has no effect on the probabilities: it contributes a factor of
    # e^{c_tt} to both the numerator and the denominator, so it cancels out.
    # note: self.D_pe has a shape of (hidden_output_dim, 1)
    # time_steps, nb_samples, 1
    energy = K.exp(K.dot(K.tanh(p), self.D_pe))
    # normalizer = K.sum(energy, axis=1, keepdims=True)
    print "--sum--attention:normalizer"
    normalizer = sum_op.Sum_op(keepdim=True, dimension=1)(energy)
    probs = energy / normalizer
    probs = K.squeeze(probs, axis=3)
    c = K.expand_dims(c, axis=0).repeat(h_shape[0], axis=0)
    # ctx = K.sum(c * K.expand_dims(probs), axis=1)
    print "--sum--attention:ctx"
    ctx = sum_op.Sum_op(keepdim=True, dimension=1)(c * K.expand_dims(probs))
    ctx = K.squeeze(ctx, axis=1)
    return [ctx, probs]
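# The following is an illustrative NumPy reference (a sketch, not part of the
# model) for the batched attention computed by apply() above: it mirrors the
# Sum_op(dimension=1) normalization and the Sum_op(dimension=1) context
# reduction with plain numpy.sum. Function and argument names are hypothetical.
def _np_attention_reference(p_from_h, p_from_c, c, D_pe):
    """p_from_h: (T_trg, 1, N, dim), p_from_c: (T_src, N, dim),
    c: (T_src, N, ctx_dim), D_pe: (dim, 1)."""
    import numpy as np
    T_trg = p_from_h.shape[0]
    # broadcast to (T_trg, T_src, N, dim), as expand_dims + repeat do above
    p = p_from_h + p_from_c[None, :, :, :]
    energy = np.exp(np.tanh(p).dot(D_pe))           # (T_trg, T_src, N, 1)
    normalizer = energy.sum(axis=1, keepdims=True)  # what Sum_op(dimension=1) computes
    probs = (energy / normalizer)[:, :, :, 0]       # (T_trg, T_src, N)
    c_rep = np.broadcast_to(c[None], (T_trg,) + c.shape)
    ctx = (c_rep * probs[:, :, :, None]).sum(axis=1)  # (T_trg, N, ctx_dim)
    return ctx, probs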
def apply(self, state_below, mask_below=None, init_state=None, context=None):
    if K.ndim(state_below) == 2:
        state_below = K.expand_dims(state_below, 1)

    if mask_below is None:
        # mask_below = K.ones_like(K.sum(state_below, axis=2, keepdims=True))
        print "--sum--MKL_GRU"
        mask_below = K.ones_like(
            sum_op.Sum_op(keepdim=True, dimension=2)(state_below))

    if K.ndim(mask_below) == 2:
        mask_below = K.expand_dims(mask_below)

    # if init_state is None:
    # nb_samples, n_hids
    # init_state = K.repeat_elements(K.expand_dims(K.zeros_like(K.sum(state_below, axis=[0, 2]))), self.n_hids, axis=1)
    print "--sum--MKL_GRU"
    tmp1 = sum_op.Sum_op(keepdim=True, dimension=0)(state_below)
    tmp2 = K.squeeze(tmp1, 0)
    print "--sum--MKL_GRU"
    tmp2 = sum_op.Sum_op(keepdim=True, dimension=1)(tmp2)
    tmp = K.squeeze(tmp2, 1)
    init_state = K.repeat_elements(K.expand_dims(K.zeros_like(tmp)), self.n_hids, axis=1)
    '''
    state_below_xh = K.dot(state_below, self.W_xh)
    state_below_xz = K.dot(state_below, self.W_xz)
    state_below_xr = K.dot(state_below, self.W_xr)
    sequences = [state_below_xh, state_below_xz, state_below_xr, mask_below]
    fn = lambda x_h, x_z, x_r, x_m, h_tm1: self._step(x_h, x_z, x_r, x_m, h_tm1)
    rval = K.scan(fn,
                  sequences=sequences,
                  outputs_initials=init_state,
                  name=_p(self.pname, 'layers'))
    self.output = rval
    '''
    # print('1,', K.ndim(init_state))
    # print(init_state.shape)
    # exit()
    self.output = self.GRU_op(state_below, self.W_x, self.W_h, init_state, self.b)[0]
    # print('2,', K.ndim(self.output))
    return self.output
def Sum_op(dimension):
    t = T.tensor3("t", dtype='float64')
    r = T.tensor3("r", dtype='float64')
    r = sum_op.Sum_op(dimension=dimension, keepdim=True)(t)
    loss = r.sum()
    gt = theano.grad(loss, t)
    f = theano.function([t], [r, gt])
    # theano.printing.pydotprint(f, outfile='sum_bw.png', var_with_name_simple=True)
    return f
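# Example usage of the builder above (a sketch, assuming sum_op.Sum_op behaves
# like a keepdim sum along the given dimension). The helper name and shapes are
# hypothetical; it compares the compiled op against numpy.sum and checks the
# gradient of a plain sum, which is all ones.
def _check_sum_op(time_steps=5, batch=3, dim=7, dimension=1):
    import numpy as np
    f = Sum_op(dimension)
    x = np.random.rand(time_steps, batch, dim).astype('float64')
    r, gt = f(x)
    assert np.allclose(r, x.sum(axis=dimension, keepdims=True))
    assert np.allclose(gt, np.ones_like(x))
    return r, gt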
def get_category_cross_entropy_from_flat_logits(logits_flat, targets, mask=None):
    assert K.ndim(targets) == 2  # time_steps * nb_samples
    nb_samples = K.cast(K.shape(targets)[1], K.dtype(logits_flat))
    targets_flat = K.flatten(targets)
    if K._BACKEND == 'tensorflow':
        ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits_flat, K.cast(targets_flat, 'int64'))
    else:
        # Theano will internally call the one-hot version if the two dims do not match
        ce = K.categorical_crossentropy(output=logits_flat, target=targets_flat, from_logits=True)
    if mask is not None:
        mask_flat = K.flatten(mask)
        ce *= mask_flat
    print "--sum--get_category_cross_entropy_from_flat_logits"
    tmp = sum_op.Sum_op(keepdim=True, dimension=0)(ce)
    tmp = K.squeeze(tmp, 0)
    # return K.sum(ce) / nb_samples
    return tmp / nb_samples
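# Illustrative NumPy reference (a sketch; the helper name is hypothetical) for
# the loss above: softmax cross-entropy over the flattened
# (time_steps * nb_samples) logits, masked and then divided by the number of
# sentences in the batch, which is what the Sum_op(dimension=0) + squeeze pair
# computes symbolically.
def _np_masked_ce_reference(logits_flat, targets, mask=None):
    """logits_flat: (T*N, vocab), targets: (T, N) int, mask: (T, N) or None."""
    import numpy as np
    T, N = targets.shape
    logits = logits_flat - logits_flat.max(axis=1, keepdims=True)
    log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    ce = -log_probs[np.arange(T * N), targets.reshape(-1)]
    if mask is not None:
        ce *= mask.reshape(-1)
    return ce.sum() / float(N)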
def build_trainer(self, src, src_mask, trg, trg_mask, ite,
                  l1_reg_weight=1e-6, l2_reg_weight=1e-6,
                  softmax_output_num_sampled=100000,
                  mlsl_obj=None, dist=None):
    src_mask_3d = K.expand_dims(src_mask)
    trg_mask_3d = K.expand_dims(trg_mask)

    annotations = self.encoder.apply(src, src_mask_3d)
    # init_context = annotations[0, :, -self.n_hids_src:]
    # modification #1
    # mean pooling
    print "--sum--build_trainer"
    # init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)
    tmp = sum_op.Sum_op(keepdim=True, dimension=0)(annotations * src_mask_3d)
    tmp1 = K.squeeze(tmp, 0)
    tmp = sum_op.Sum_op(keepdim=True, dimension=0)(src_mask_3d)
    tmp2 = K.squeeze(tmp, 0)
    init_context = tmp1 / tmp2

    trg_emb = self.table_trg.apply(trg)
    # shift_right assumes a 3D tensor, and time steps is dimension one
    trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])), [1, 0, 2])
    hiddens, readout, _ = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                    mask_below=trg_mask_3d,
                                                    init_context=init_context,
                                                    c=annotations,
                                                    c_mask=src_mask_3d)

    # apply dropout
    if self.dropout > 0.:
        logger.info('Apply dropout with p = {}'.format(self.dropout))
        readout = Dropout(readout, self.dropout)

    self.cost = calc_loss_from_readout(readout=readout,
                                       targets=trg,
                                       targets_mask=trg_mask_3d,
                                       logisticRegressionLayer=self.logistic_layer,
                                       softmax_output_num_sampled=softmax_output_num_sampled)
    # for reconstruction
    # self.L1 = sum([K.sum(K.abs(param)) for param in self.params])
    # self.L2 = sum([K.sum(K.square(param)) for param in self.params])
    print "--sum--build_trainerL1L2"
    self.L1 = sum([sum_op.Sum_op(keepdim=True)(K.abs(param)) for param in self.params])
    self.L2 = sum([sum_op.Sum_op(keepdim=True)(K.square(param)) for param in self.params])

    params_regular = self.L1 * l1_reg_weight + self.L2 * l2_reg_weight

    # train cost
    train_cost = self.cost + params_regular

    # gradients
    grads = K.gradients(train_cost, self.params)

    # apply gradient clipping here
    grads = grad_clip(grads, self.clip_c)

    # updates
    updates = adadelta(mlsl_obj=mlsl_obj, dist=dist, parameters=self.params, gradients=grads)

    # train function
    inps = [src, src_mask, trg, trg_mask]

    self.train_fn = K.function(inps, [train_cost], updates=updates, name='train_func')
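# A minimal NumPy sketch (helper name hypothetical) of the masked mean pooling
# used above to build init_context: sum the encoder annotations over source
# time steps where the mask is 1, then divide by each sentence's length.
def _np_masked_mean_pool(annotations, src_mask_3d):
    """annotations: (T_src, N, hid), src_mask_3d: (T_src, N, 1), both NumPy arrays."""
    summed = (annotations * src_mask_3d).sum(axis=0)  # (N, hid)
    lengths = src_mask_3d.sum(axis=0)                 # (N, 1), broadcasts in the division
    return summed / lengths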
def sampled_softmax_loss(weights,
                         biases,
                         num_sampled,
                         num_classes,
                         labels,
                         inputs,
                         mask=None,
                         num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True):
    """Computes and returns the sampled softmax training loss.

    This is a faster way to train a softmax classifier over a huge number of
    classes.

    This operation is for training only. It is generally an underestimate of
    the full softmax loss.

    At inference time, you can compute full softmax probabilities with the
    expression `tf.nn.softmax(tf.matmul(inputs, tf.transpose(weights)) + biases)`.

    See our [Candidate Sampling Algorithms Reference]
    (../../extras/candidate_sampling.pdf)

    Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
    ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.

    Args:
      weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`. The (possibly-sharded) class embeddings.
      biases: A `Tensor` of shape `[num_classes]`. The class biases.
      inputs: A `Tensor` of shape `[time_steps, batch_size, dim]`. The forward
        activations of the input network.
      mask: A `Tensor` of shape `[time_steps, batch_size, 1]`.
      labels: A `Tensor` of type `int64` and shape `[time_steps, batch_size,
        num_true]`. The target classes. Note that this format differs from the
        `labels` argument of `nn.softmax_cross_entropy_with_logits`.
      num_sampled: An `int`. The number of classes to randomly sample per batch.
      num_classes: An `int`. The number of possible classes.
      num_true: An `int`. The number of target classes per training example.
      sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
      remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
        where a sampled class equals one of the target classes. Default is True.

    Returns:
      The masked sampled softmax loss summed over all time steps and divided by
      the batch size.
    """
    assert K.ndim(inputs) == 3  # time_steps, number_samples, input_dim
    nb_samples = K.cast(K.shape(inputs)[1], K.dtype(weights))
    inputs = K.reshape(inputs, (-1, K.shape(inputs)[2]))
    labels = K.reshape(labels, (-1, 1))
    labels = K.cast(labels, 'int64')
    ce = tf.nn.sampled_softmax_loss(weights=weights,
                                    biases=biases,
                                    inputs=inputs,
                                    labels=labels,
                                    num_sampled=num_sampled,
                                    num_classes=num_classes,
                                    num_true=num_true,
                                    sampled_values=sampled_values,
                                    remove_accidental_hits=remove_accidental_hits)
    if mask is not None:
        mask_flat = K.flatten(mask)  # time_steps * nb_samples
        ce *= mask_flat
    print "--sum--sampled_softmax_loss"
    tmp = sum_op.Sum_op(keepdim=True, dimension=0)(ce)
    tmp = K.squeeze(tmp, 0)
    # return K.sum(ce) / nb_samples
    return tmp / nb_samples
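# For reference, a NumPy sketch (hypothetical helper, not used by the model) of
# the full-softmax inference expression mentioned in the docstring,
# softmax(inputs . weights^T + biases); the sampled loss above is a cheaper
# training-time approximation of the cross-entropy under this distribution.
def _np_full_softmax_reference(inputs_flat, weights, biases):
    """inputs_flat: (T*N, dim), weights: (num_classes, dim), biases: (num_classes,)."""
    import numpy as np
    logits = inputs_flat.dot(weights.T) + biases
    logits -= logits.max(axis=1, keepdims=True)
    probs = np.exp(logits)
    probs /= probs.sum(axis=1, keepdims=True)
    return probs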
def calc_loss_with_model_parallel(self, src, src_mask_3d, trg, trg_mask_3d,
                                  ps_device, devices,
                                  l1_reg_weight=1e-6, l2_reg_weight=1e-6):
    assert K._BACKEND == 'tensorflow'
    with tf.device(devices[0]):
        annotations = self.encoder.apply(src, src_mask_3d)
        # init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)
        print "--sum--calc_loss_with_model_parallel"
        tmp = sum_op.Sum_op(keepdim=True, dimension=0)(annotations * src_mask_3d)
        tmp1 = K.squeeze(tmp, 0)
        tmp = sum_op.Sum_op(keepdim=True, dimension=0)(src_mask_3d)
        tmp2 = K.squeeze(tmp, 0)
        init_context = tmp1 / tmp2

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])), [1, 0, 2])
        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask_3d,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask_3d)

        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

    logits = self.logistic_layer.get_logits_with_multiple_devices(readout, ps_device, devices)

    with tf.device(devices[0]):
        logits_flat = K.reshape(logits, shape=(-1, self.logistic_layer.n_out))
        cost = get_category_cross_entropy_from_flat_logits(logits_flat, trg, trg_mask_3d)

    if self.with_reconstruction:
        with tf.device(devices[0]):
            inverse_init_context = K.sum(hiddens * trg_mask_3d, axis=0) / K.sum(trg_mask_3d, axis=0)
            src_emb = self.table_src.apply(src)
            src_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(src_emb, [1, 0, 2])), [1, 0, 2])
            inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask_3d,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask_3d)

        with tf.device(devices[0]):
            if self.dropout > 0.:
                inverse_readout = Dropout(inverse_readout, self.dropout)

        inverse_logits = self.inverse_logistic_layer.get_logits_with_multiple_devices(inverse_readout, ps_device, devices)

        with tf.device(devices[0]):
            inverse_logits_flat = K.reshape(inverse_logits, shape=(-1, self.inverse_logistic_layer.n_out))
            reconstruction_cost = get_category_cross_entropy_from_flat_logits(inverse_logits_flat, src, src_mask_3d)

        with tf.device(devices[0]):
            cost += reconstruction_cost * self.reconstruction_weight

    # L1 = sum([K.sum(K.abs(param)) for param in self.params])
    # L2 = sum([K.sum(K.square(param)) for param in self.params])
    print "--sum--calc_loss_with_model_parallelL1L2"
    L1 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.abs(param)), 0) for param in self.params])
    L2 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.square(param)), 0) for param in self.params])
    params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight
    cost += params_regular

    return cost
def calc_loss(self, src, src_mask_3d, trg, trg_mask_3d,
              l1_reg_weight=1e-6, l2_reg_weight=1e-6,
              softmax_output_num_sampled=100000):
    annotations = self.encoder.apply(src, src_mask_3d)
    # init_context = annotations[0, :, -self.n_hids_src:]
    # modification #1
    # mean pooling
    # init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)
    print "--sum--calc_loss"
    tmp = sum_op.Sum_op(keepdim=True, dimension=0)(annotations * src_mask_3d)
    tmp1 = K.squeeze(tmp, 0)
    tmp = sum_op.Sum_op(keepdim=True, dimension=0)(src_mask_3d)
    tmp2 = K.squeeze(tmp, 0)
    init_context = tmp1 / tmp2

    trg_emb = self.table_trg.apply(trg)
    # shift_right assumes a 3D tensor, and time steps is dimension one
    trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])), [1, 0, 2])
    hiddens, readout, alignment = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                            mask_below=trg_mask_3d,
                                                            init_context=init_context,
                                                            c=annotations,
                                                            c_mask=src_mask_3d)

    # apply dropout
    if self.dropout > 0.:
        logger.info('Apply dropout with p = {}'.format(self.dropout))
        readout = Dropout(readout, self.dropout)

    cost = calc_loss_from_readout(readout=readout,
                                  targets=trg,
                                  targets_mask=trg_mask_3d,
                                  logisticRegressionLayer=self.logistic_layer,
                                  softmax_output_num_sampled=softmax_output_num_sampled)

    if self.with_reconstruction:
        inverse_init_context = K.sum(hiddens * trg_mask_3d, axis=0) / K.sum(trg_mask_3d, axis=0)
        src_emb = self.table_src.apply(src)
        src_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(src_emb, [1, 0, 2])), [1, 0, 2])
        inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
            state_below=src_emb_shifted,
            mask_below=src_mask_3d,
            init_context=inverse_init_context,
            c=hiddens,
            c_mask=trg_mask_3d)
        if self.dropout > 0.:
            inverse_readout = Dropout(inverse_readout, self.dropout)

        inverse_logits = self.inverse_logistic_layer.get_logits(inverse_readout)
        inverse_logits_flat = K.reshape(inverse_logits, shape=(-1, self.inverse_logistic_layer.n_out))
        reconstruction_cost = get_category_cross_entropy_from_flat_logits(inverse_logits_flat, src, src_mask_3d)
        cost += reconstruction_cost * self.reconstruction_weight

    # L1 = sum([K.sum(K.abs(param)) for param in self.params])
    # L2 = sum([K.sum(K.square(param)) for param in self.params])
    print "--sum--calc_lossL1L2"
    L1 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.abs(param)), 0) for param in self.params])
    L2 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.square(param)), 0) for param in self.params])
    params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight
    cost += params_regular

    return cost
def apply(self, state_below, mask_below=None, init_state=None,
          init_context=None, c=None, c_mask=None, one_step=False,
          cov_before=None, fertility=None):
    # assert c, 'Context must be provided'
    # assert c.ndim == 3, 'Context must be 3-d: n_seq * batch_size * dim'
    # state_below: n_steps * batch_size/1 * embedding

    # mask
    if mask_below is None:  # sampling or beam search
        # mask_below = K.ones_like(K.sum(state_below, axis=-1, keepdims=True))  # nb_samples
        print "--sum--decoder"
        mask_below = K.ones_like(
            sum_op.Sum_op(keepdim=True, dimension=-1)(state_below))

    if K.ndim(mask_below) != K.ndim(state_below):
        mask_below = K.expand_dims(mask_below)
    assert K.ndim(mask_below) == K.ndim(state_below)

    if one_step:
        assert init_state is not None, 'previous state must be provided'

    if init_state is None:
        init_state = self.create_init_state(init_context)

    state_below_xh = K.dot(state_below, self.W_xh)
    state_below_xz = K.dot(state_below, self.W_xz)
    state_below_xr = K.dot(state_below, self.W_xr)

    if self.with_attention:
        # time_steps, nb_samples, n_hids
        p_from_c = K.reshape(K.dot(c, self.A_cp),
                             shape=(K.shape(c)[0], K.shape(c)[1], self.n_hids))
    else:
        c_z = K.dot(init_context, self.W_cz)
        c_r = K.dot(init_context, self.W_cr)
        c_h = K.dot(init_context, self.W_ch)

    if one_step:
        if self.with_attention:
            return self._step_attention(state_below_xh,
                                        state_below_xz,
                                        state_below_xr,
                                        mask_below,
                                        init_state,
                                        c,
                                        c_mask,
                                        p_from_c,
                                        cov_tm1=cov_before,
                                        fertility=fertility)
        else:
            return self._step_context(state_below_xh,
                                      state_below_xz,
                                      state_below_xr,
                                      mask_below,
                                      init_state,
                                      c_z, c_r, c_h,
                                      init_context)
    else:
        sequences = [state_below_xh, state_below_xz, state_below_xr, mask_below]
        # decoder hidden state
        outputs_info = [init_state]
        if self.with_attention:
            # ctx, probs
            if K._BACKEND == 'theano':
                outputs_info += [None, None]
            else:
                outputs_info += [
                    K.zeros_like(K.sum(c, axis=0)),
                    K.zeros_like(K.sum(c, axis=-1))
                ]
            if self.with_coverage:
                # initialization for coverage
                # TODO: check c is 3D
                init_cov = K.repeat_elements(
                    K.expand_dims(K.zeros_like(K.sum(c, axis=2))),
                    self.coverage_dim, axis=2)
                outputs_info.append(init_cov)

                # fertility is not constructed outside when training
                if self.coverage_type == 'linguistic':
                    fertility = self._get_fertility(c)
                else:
                    fertility = K.zeros_like(K.sum(c, axis=2))
                if K._BACKEND == 'theano':
                    fn = lambda x_h, x_z, x_r, x_m, h_tm1, cov_tm1: self._step_attention(
                        x_h, x_z, x_r, x_m, h_tm1, c, c_mask, p_from_c,
                        cov_tm1=cov_tm1, fertility=fertility)
                else:
                    fn = lambda (h_tm1, ctx_tm1, probs_tm1, cov_tm1), (x_h, x_z, x_r, x_m): self._step_attention(
                        x_h, x_z, x_r, x_m, h_tm1, c, c_mask, p_from_c,
                        cov_tm1=cov_tm1, fertility=fertility)
            else:
                if K._BACKEND == 'theano':
                    if self.mkl == True:
                        print('with mkl')
                        # alignment GRU
                        W_x_a = K.concatenate([self.W_xh, self.W_xz, self.W_xr], axis=0)
                        W_h_a = K.concatenate([self.W_n1_z, self.W_n1_r, self.W_n1_h], axis=0)
                        b_a = K.concatenate([self.b_n1_z, self.b_n1_r, self.b_n1_h], axis=0)
                        hidden_alignment = self.GRU_op(state_below, W_x_a, W_h_a, init_state, b_a)[0]

                        # attention
                        ctx, probs = self.attention_.apply(hidden_alignment, c, p_from_c)

                        # decoder GRU
                        W_x_c = K.concatenate([self.W_cz, self.W_cr, self.W_ch], axis=0)
                        W_h_c = K.concatenate([self.W_hz, self.W_hr, self.W_hh], axis=0)
                        b_c = K.concatenate([self.b_z, self.b_r, self.b_h], axis=0)
                        init = hidden_alignment[K.shape(hidden_alignment)[0] - 1, :, :]
                        hidden_decoder = self.GRU_op(ctx, W_x_c, W_h_c, init, b_c)[0]

                        self.output = [hidden_decoder, ctx, probs]
                    else:
                        fn = lambda x_h, x_z, x_r, x_m, h_tm1: self._step_attention(
                            x_h, x_z, x_r, x_m, h_tm1, c, c_mask, p_from_c)
                else:
                    fn = lambda (h_tm1, ctx_tm1, probs_tm1), (x_h, x_z, x_r, x_m): self._step_attention(
                        x_h, x_z, x_r, x_m, h_tm1, c, c_mask, p_from_c)
        else:
            if K._BACKEND == 'theano':
                fn = lambda x_h, x_z, x_r, x_m, h_tm1: self._step_context(
                    x_h, x_z, x_r, x_m, h_tm1, c_z, c_r, c_h, init_context)
            else:
                fn = lambda (h_tm1, ), (x_h, x_z, x_r, x_m): self._step_context(
                    x_h, x_z, x_r, x_m, h_tm1, c_z, c_r, c_h, init_context)

        if self.mkl == False:
            self.output = K.scan(fn,
                                 sequences=sequences,
                                 outputs_initials=outputs_info,
                                 name=_p(self.pname, 'layers'))

        return self.output
def _step_attention(self, x_h, x_z, x_r, x_m, h_tm1, c, c_m, p_from_c,
                    cov_tm1=None, fertility=None):
    '''
    x_h: input at time t
    x_z: update-gate input at time t
    x_r: reset-gate input at time t
    x_m: mask of x_t
    h_tm1: previous state
    cov_tm1: coverage at time (t-1)
    fertility: fertility of each source word
    '''
    # here h1 combines the previous hidden state and the last generated word with a GRU;
    # note that this is different from the paper
    z1 = K.sigmoid(K.dot(h_tm1, self.W_n1_z) + x_z + self.b_n1_z)
    r1 = K.sigmoid(K.dot(h_tm1, self.W_n1_r) + x_r + self.b_n1_r)
    h1 = K.tanh(r1 * K.dot(h_tm1, self.W_n1_h) + x_h + self.b_n1_h)
    # nb_samples, n_hids
    h1 = z1 * h_tm1 + (1. - z1) * h1
    h1 = x_m * h1 + (1. - x_m) * h_tm1

    # 1, nb_samples, dim
    p_from_h = K.expand_dims(K.dot(h1, self.B_hp) + self.b_tt, axis=0)
    # time_steps, nb_samples, dim
    p = p_from_h + p_from_c

    if self.with_coverage:
        p_from_cov = K.dot(cov_tm1, self.C_covp)
        p += p_from_cov

    # energy = exp(dot(tanh(p), self.D_pe) + self.c_tt).reshape((source_len, target_num))
    # self.c_tt has no effect on the probabilities: it contributes a factor of
    # e^{c_tt} to both the numerator and the denominator, so it cancels out.
    # note: self.D_pe has a shape of (hidden_output_dim, 1)
    # time_steps, nb_samples, 1
    energy = K.exp(K.dot(K.tanh(p), self.D_pe))

    # c_m: time_steps, nb_samples
    if c_m is not None:
        energy *= c_m

    print "--sum--attention ori:normalizer"
    # normalizer = K.sum(energy, axis=0, keepdims=True)
    normalizer = sum_op.Sum_op(keepdim=True, dimension=0)(energy)
    probs = energy / normalizer
    probs = K.squeeze(probs, axis=2)

    print "--sum--attention ori:ctx"
    # ctx = K.sum(c * K.expand_dims(probs), axis=0)
    ctx = sum_op.Sum_op(keepdim=True, dimension=0)(c * K.expand_dims(probs))
    ctx = K.squeeze(ctx, axis=0)

    # update coverage after producing attention probabilities at time t
    if self.with_coverage:
        cov = self._update_coverage(cov_tm1, probs, c, h_tm1, fertility)

    # this is even more consistent with our context gate:
    # h1 corresponds to the target context, while ctx corresponds to the source context
    if self.with_context_gate:
        gate = K.sigmoid(K.dot(h1, self.W_ctx_h) +
                         K.dot(ctx, self.W_ctx_c) + self.b_ctx)
        # we directly scale h1, since it is used in computing both can_h_t and h_t
        h1 = h1 * (1. - gate)
    else:
        gate = 1.

    z_t = K.sigmoid(K.dot(h1, self.W_hz) + gate * K.dot(ctx, self.W_cz) + self.b_z)
    r_t = K.sigmoid(K.dot(h1, self.W_hr) + gate * K.dot(ctx, self.W_cr) + self.b_r)
    h_t = K.tanh(r_t * K.dot(h1, self.W_hh) + gate * K.dot(ctx, self.W_ch) + self.b_h)

    h_t = z_t * h1 + (1. - z_t) * h_t
    h_t = x_m * h_t + (1. - x_m) * h1

    if self.with_coverage:
        return [h_t, ctx, probs, cov]
    else:
        return [h_t, ctx, probs]
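# A compact NumPy sketch (all names hypothetical) of the first GRU half-step
# above: merge the previous decoder state h_tm1 with the precomputed input
# projections x_h, x_z, x_r, honouring the target-side mask x_m exactly as the
# symbolic code does.
def _np_gru_half_step(x_h, x_z, x_r, x_m, h_tm1, W_z, W_r, W_h, b_z, b_r, b_h):
    import numpy as np
    sigmoid = lambda v: 1. / (1. + np.exp(-v))
    z1 = sigmoid(h_tm1.dot(W_z) + x_z + b_z)
    r1 = sigmoid(h_tm1.dot(W_r) + x_r + b_r)
    h1 = np.tanh(r1 * h_tm1.dot(W_h) + x_h + b_h)
    h1 = z1 * h_tm1 + (1. - z1) * h1
    # masked-out positions keep the previous state
    return x_m * h1 + (1. - x_m) * h_tm1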