 def _reshape_to_batches(self, x):
     """Fold the attention heads of ``x`` into its leading axis.

     ``x`` is reshaped to ``(batch, seq_len, self.num_head, self.units_head)``,
     the head axis is moved in front of the sequence axis, and the result is
     flattened to ``(batch * self.num_head, seq_len, self.units_head)``.
     """
     dims = K.shape(x)
     # The third dynamic dimension is read but not used afterwards.
     n_batch, n_steps, _ = dims[0], dims[1], dims[2]
     per_head = K.reshape(
         x, (n_batch, n_steps, self.num_head, self.units_head))
     heads_first = K.permute_dimensions(per_head, [0, 2, 1, 3])
     folded_shape = (n_batch * self.num_head, n_steps, self.units_head)
     return K.reshape(heads_first, folded_shape)
 def _reshape_to_batches(x, head_num):
     """Fold ``head_num`` attention heads of ``x`` into its leading axis.

     The last axis of ``x`` is split into ``head_num`` chunks of size
     ``feature_dim // head_num``; the head axis is then moved next to the
     batch axis and merged with it, giving a tensor of shape
     ``(batch * head_num, seq_len, feature_dim // head_num)``.
     """
     dims = K.shape(x)
     n_batch, n_steps, n_features = dims[0], dims[1], dims[2]
     per_head = n_features // head_num
     split = K.reshape(x, (n_batch, n_steps, head_num, per_head))
     heads_first = K.permute_dimensions(split, [0, 2, 1, 3])
     folded_shape = (n_batch * head_num, n_steps, per_head)
     return K.reshape(heads_first, folded_shape)
 def _relative_shift(x, key_len_expected=-1):
     """Shift a 3-D score tensor by dropping one row after a size-swapping reshape.

     The last two axes of ``x`` are reinterpreted with their sizes swapped
     (a reshape, not a transpose), the first row along axis 1 is discarded,
     the tensor is reshaped back with one fewer key position, and finally the
     last axis is sliced to ``key_len_expected`` entries (``-1`` keeps all).
     """
     dims = K.shape(x)
     n_batch, n_query, n_key = dims[0], dims[1], dims[2]
     # (batch * n_head, prev_len + seq_len + 1, seq_len)
     swapped = K.reshape(x, (n_batch, n_key, n_query))
     # (batch * n_head, prev_len + seq_len, seq_len)
     trimmed = swapped[:, 1:, :]
     # (batch * n_head, seq_len, prev_len + seq_len)
     restored = K.reshape(trimmed, (n_batch, n_query, n_key - 1))
     # (batch * n_head, seq_len, key_len_expected)
     slice_begin = (0, 0, 0)
     slice_size = (-1, -1, key_len_expected)
     return tf.slice(restored, slice_begin, slice_size)
 def _reshape_mask(mask, head_num):
     if mask is None:
         return mask
     seq_len = K.shape(mask)[1]
     mask = K.expand_dims(mask, axis=1)
     mask = K.tile(mask, [1, head_num, 1])
     return K.reshape(mask, (-1, seq_len))
    def _call_additive_emission(self, inputs):
        """Compute additive attention emission scores for every pair of steps.

        ``inputs`` is projected with ``self.Wt`` and ``self.Wx``; the two
        projections are broadcast against each other so that every
        (t, t') pair gets a hidden vector, which is passed through ``tanh``
        and projected with ``self.Wa``.

        ``self.bh`` is added inside the ``tanh`` only when
        ``self.use_additive_bias`` is true, and ``self.ba`` is added to the
        projected score only when ``self.use_attention_bias`` is true.

        Returns a tensor reshaped to ``(batch_size, input_len, input_len)``.
        """
        input_shape = K.shape(inputs)
        batch_size, input_len = input_shape[0], input_shape[1]

        # h_{t, t'} = \tanh(x_t^T W_t + x_{t'}^T W_x + b_h)
        q = K.expand_dims(K.dot(inputs, self.Wt), 2)
        k = K.expand_dims(K.dot(inputs, self.Wx), 1)
        if self.use_additive_bias:
            h = K.tanh(q + k + self.bh)
        else:
            # Without this branch the biased value was always overwritten and
            # `h` was undefined when the flag was False.
            h = K.tanh(q + k)

        # e_{t, t'} = W_a h_{t, t'} + b_a
        if self.use_attention_bias:
            e = K.reshape(K.dot(h, self.Wa) + self.ba, (batch_size, input_len, input_len))
        else:
            e = K.reshape(K.dot(h, self.Wa), (batch_size, input_len, input_len))
        return e
 def call(self, inputs, mask=None):
     """Apply multi-head attention to ``inputs``.

     Args:
         inputs: Either a list ``[q, k, v]`` or a single tensor that is used
             as query, key and value at once.
         mask: Either a list ``[q_mask, k_mask, v_mask]`` or a single mask
             (possibly ``None``) shared by query, key and value.

     Returns:
         The attention output after the ``self.Wo`` projection, with the
         optional bias ``self.bo`` and ``self.activation`` applied.
     """
     # A single tensor / mask is shared by query, key and value.
     if isinstance(inputs, list):
         q, k, v = inputs
     else:
         q = k = v = inputs
     if isinstance(mask, list):
         q_mask, k_mask, v_mask = mask
     else:
         q_mask = k_mask = v_mask = mask
     q = K.dot(q, self.Wq)
     k = K.dot(k, self.Wk)
     v = K.dot(v, self.Wv)
     if self.use_bias:
         q += self.bq
         k += self.bk
         v += self.bv
     if self.activation is not None:
         q = self.activation(q)
         k = self.activation(k)
         v = self.activation(v)
     # NOTE(review): the original call was truncated; only the `name`
     # constructor argument was visible. Confirm whether the attention layer
     # takes further constructor arguments.
     y = ScaledDotProductAttention(
         name='%s-Attention' % self.name,
     )(
         inputs=[
             self._reshape_to_batches(q, self.head_num),
             self._reshape_to_batches(k, self.head_num),
             self._reshape_to_batches(v, self.head_num),
         ],
         mask=[
             self._reshape_mask(q_mask, self.head_num),
             self._reshape_mask(k_mask, self.head_num),
             self._reshape_mask(v_mask, self.head_num),
         ],
     )
     y = self._reshape_from_batches(y, self.head_num)
     y = K.dot(y, self.Wo)
     if self.use_bias:
         y += self.bo
     if self.activation is not None:
         y = self.activation(y)
     if TF_KERAS:
         # Add shape information to tensor when using `tf.keras`
         input_shape = [K.int_shape(q), K.int_shape(k), K.int_shape(v)]
         output_shape = self.compute_output_shape(input_shape)
         if output_shape[1] is not None:
             output_shape = (-1, ) + output_shape[1:]
             y = K.reshape(y, output_shape)
     return y
 def call(self, x, mask=None):
     """Pool ``x`` over axis 1 using learned attention weights.

     A score per step is computed with ``self.W`` (plus ``self.b`` when
     ``self.use_bias`` is set), exponentiated after subtracting the row
     maximum, multiplied by ``mask`` when one is given, and normalised over
     axis 1. The weighted sum of ``x`` over axis 1 is returned; when
     ``self.return_attention`` is true the weights are returned as well, as
     ``[result, att_weights]``.
     """
     scores = K.dot(x, self.W)
     if self.use_bias:
         scores += self.b
     dims = K.shape(x)
     scores = K.reshape(scores, (dims[0], dims[1]))
     # Subtract the per-row maximum before exponentiating.
     row_max = K.max(scores, axis=-1, keepdims=True)
     unnormalized = K.exp(scores - row_max)
     if mask is not None:
         unnormalized = unnormalized * K.cast(mask, K.floatx())
     # Epsilon keeps the division defined when every weight is zero.
     denominator = K.sum(unnormalized, axis=1, keepdims=True) + K.epsilon()
     att_weights = unnormalized / denominator
     result = K.sum(x * K.expand_dims(att_weights), axis=1)
     return [result, att_weights] if self.return_attention else result
 def call(self, inputs, mask=None):
     """Build sine/cosine position embeddings according to ``self.mode``.

     * ``MODE_ADD``: positions are ``0..seq_len-1``, the embedding width is
       the last dimension of ``inputs``, and the embedding is added to
       ``inputs``.
     * ``MODE_CONCAT``: positions are ``0..seq_len-1``, the embedding width
       is ``self.output_dim``, and the embedding is concatenated to
       ``inputs`` along the last axis.
     * any other mode: ``inputs`` itself is used as the positions and the
       embedding of width ``self.output_dim`` is returned on its own.

     ``mask`` is accepted for interface compatibility and is not used here.
     """
     input_shape = K.shape(inputs)
     if self.mode == self.MODE_ADD:
         batch_size, seq_len, output_dim = input_shape[0], input_shape[
             1], input_shape[2]
         pos_input = K.tile(K.expand_dims(K.arange(0, seq_len), axis=0),
                            [batch_size, 1])
     elif self.mode == self.MODE_CONCAT:
         batch_size, seq_len, output_dim = input_shape[0], input_shape[
             1], self.output_dim
         pos_input = K.tile(K.expand_dims(K.arange(0, seq_len), axis=0),
                            [batch_size, 1])
     else:
         # Positions are supplied directly by the caller.
         output_dim = self.output_dim
         pos_input = inputs
     if K.dtype(pos_input) != K.floatx():
         pos_input = K.cast(pos_input, K.floatx())
     evens = K.arange(0, output_dim // 2) * 2
     odds = K.arange(0, output_dim // 2) * 2 + 1
     # NOTE(review): the original sin/cos expressions were truncated and the
     # base of the power was not visible. 10000.0 is the conventional base of
     # the sinusoidal position encoding -- confirm against the upstream code.
     even_embd = K.sin(
         K.dot(
             K.expand_dims(pos_input, -1),
             K.expand_dims(
                 1.0 / K.pow(
                     10000.0,
                     K.cast(evens, K.floatx()) /
                     K.cast(output_dim, K.floatx())), 0)))
     odd_embd = K.cos(
         K.dot(
             K.expand_dims(pos_input, -1),
             K.expand_dims(
                 1.0 / K.pow(
                     10000.0,
                     K.cast((odds - 1), K.floatx()) /
                     K.cast(output_dim, K.floatx())), 0)))
     # Interleave sine and cosine values along the feature axis.
     embd = K.stack([even_embd, odd_embd], axis=-1)
     output = K.reshape(embd, [-1, K.shape(inputs)[1], output_dim])
     if self.mode == self.MODE_CONCAT:
         output = K.concatenate([inputs, output], axis=-1)
     if self.mode == self.MODE_ADD:
         output += inputs
     return output
    def call(self, inputs, mask=None, training=None):
        """Compute multi-head self-attention with relative and segment terms.

        Args:
            inputs: A sequence of ten tensors, unpacked in this order: query
                input, content, memories, segment matrix, segment embedding,
                relative encodings, context bias, relative bias, segment
                bias and permutation mask.
            mask: Optional sequence whose first element, when not ``None``,
                masks the content positions. It is extended with ones for
                the memory positions before being applied.
            training: Forwarded to ``self.att_drop_layer`` when that layer
                is set.

        Returns:
            The attended values after the output projection (optional bias
            and activation applied).

        The attention score is the sum of a content term, a relative-position
        term and a segment term, scaled by ``sqrt(self.units_head)``. The
        normalisation is computed by hand so that the permutation mask and
        the padding mask can be multiplied in before the division.
        """
        (inputs, content, memories, segment_mat, segment_embed, relatives,
         bias_context, bias_relative, bias_segment, permutation) = inputs
        full = K.concatenate([memories, content],
                             axis=1)  # (batch, prev_len + seq_len, units)

        # `self.kernel` packs the query, key/value, relative and output
        # projections side by side along its last axis.
        kernel_q = self.kernel[:, :self.units]
        kernel_kv = self.kernel[:, self.units:self.units * 3]
        kernel_r = self.kernel[:, self.units * 3:self.units * 4]
        kernel_o = self.kernel[:, self.units * 4:self.units * 5]

        bias_q, bias_kv, bias_r, bias_o = (None, ) * 4
        if self.use_bias:
            bias_q = self.bias[:self.units]
            bias_kv = self.bias[self.units:self.units * 3]
            bias_r = self.bias[self.units * 3:self.units * 4]
            bias_o = self.bias[self.units * 4:self.units * 5]

        w_q = K.dot(inputs, kernel_q)  # (batch, seq_len, units)
        w_kv = K.dot(full, kernel_kv)  # (batch, prev_len + seq_len, units * 2)
        w_r = K.dot(relatives, kernel_r)  # (batch, prev_len + seq_len, units)
        if self.use_bias:
            w_q = K.bias_add(w_q, bias_q)
            w_kv = K.bias_add(w_kv, bias_kv)
            w_r = K.bias_add(w_r, bias_r)
        if self.activation is not None:
            w_q = self.activation(w_q)
            w_kv = self.activation(w_kv)
            w_r = self.activation(w_r)

        w_k = w_kv[:, :, :self.units]  # (batch, prev_len + seq_len, units)
        w_v = w_kv[:, :, self.units:]  # (batch, prev_len + seq_len, units)
        batch_size, q_len, k_len = K.shape(inputs)[0], K.shape(
            w_q)[1], K.shape(w_k)[1]

        w_qc = K.bias_add(w_q, bias_context)
        w_qc = self._reshape_to_batches(
            w_qc)  # (batch * n_head, seq_len, units_head)
        w_k = self._reshape_to_batches(
            w_k)  # (batch * n_head, prev_len + seq_len, units_head)
        a_context = K.batch_dot(
            w_qc, w_k, axes=2)  # (batch * n_head, seq_len, prev_len + seq_len)

        w_qr = K.bias_add(w_q, bias_relative)
        w_qr = self._reshape_to_batches(
            w_qr)  # (batch * n_head, seq_len, units_head)
        w_r = self._reshape_to_batches(
            w_r)  # (batch * n_head, prev_len + seq_len, units_head)
        a_relative = K.batch_dot(
            w_qr, w_r, axes=2)  # (batch * n_head, seq_len, prev_len + seq_len)
        # The shifted scores are cut to `k_len` keys so they can be added to
        # `a_context` below.
        a_relative = self._relative_shift(  # (batch * n_head, seq_len, prev_len + seq_len)
            a_relative,
            key_len_expected=k_len,
        )

        w_qs = K.bias_add(w_q, bias_segment)
        w_qs = K.reshape(w_qs, (-1, q_len, self.num_head, self.units_head))
        w_qs = K.permute_dimensions(
            w_qs, (2, 0, 1, 3))  # (n_head, batch, seq_len, units_head)
        segment_embed = K.reshape(K.transpose(segment_embed),
                                  (self.num_head, 1, self.units_head, 2))
        segment_embed = K.tile(segment_embed, (1, batch_size, 1, 1))
        a_segment = K.batch_dot(w_qs, segment_embed,
                                axes=(3, 2))  # (n_head, batch, seq_len, 2)
        a_segment = K.permute_dimensions(
            a_segment, (1, 2, 3, 0))  # (batch, seq_len, 2, n_head)
        a_segment = K.batch_dot(
            segment_mat, a_segment,
            axes=(3, 2))  # (batch, seq_len, prev_len + seq_len, n_head)
        a_segment = K.reshape(K.permute_dimensions(a_segment, (0, 3, 1, 2)),
                              (-1, q_len, k_len))

        att = (a_context + a_relative + a_segment) / K.sqrt(
            K.constant(self.units_head, dtype=K.floatx()))
        # Subtract the row maximum before exponentiating.
        exp = K.exp(att - K.max(att, axis=-1, keepdims=True))

        permutation = K.tile(K.expand_dims(permutation, axis=1),
                             [1, self.num_head, 1, 1])
        permutation = K.reshape(permutation, (-1, q_len, k_len))
        exp *= permutation
        if mask is not None and mask[0] is not None:
            mask = K.cast(mask[0], K.floatx())
            # Memory positions get ones in the mask: (batch, prev_len + seq_len)
            mask = K.concatenate([K.ones_like(memories[:, :, 0]), mask],
                                 axis=1)
            exp *= K.expand_dims(self._reshape_mask(mask), axis=1)

        att = exp / (K.sum(exp, axis=-1, keepdims=True) + K.epsilon())
        if self.att_drop_layer is not None:
            att = self.att_drop_layer(att, training=training)
        w_v = self._reshape_to_batches(
            w_v)  # (batch * n_head, prev_len + seq_len, units_head)
        w_o = K.batch_dot(att, w_v)  # (batch * n_head, seq_len, units_head)

        w_o = self._reshape_from_batches(w_o)  # (batch, seq_len, units)
        w_o = K.dot(w_o, kernel_o)  # (batch, seq_len, units)
        if self.use_bias:
            w_o = K.bias_add(w_o, bias_o)
        if self.activation is not None:
            w_o = self.activation(w_o)

        if TF_KERAS:
            # Add shape information to tensor when using `tf.keras`
            input_shape = K.int_shape(inputs)
            if input_shape[1] is not None:
                w_o = K.reshape(w_o, (-1, ) + input_shape[1:])
        return w_o
 def _reshape_mask(self, mask):
     """Repeat ``mask`` once per head and merge the head axis into axis 0.

     A head axis is inserted at position 1, tiled ``self.num_head`` times,
     and the result is reshaped to ``(-1, seq_len)`` where ``seq_len`` is
     the size of the mask's axis 1.
     """
     n_steps = K.shape(mask)[1]
     per_head = K.tile(K.expand_dims(mask, axis=1), [1, self.num_head, 1])
     return K.reshape(per_head, (-1, n_steps))