Example No. 1
    def call(self, inputs, training=None, mask=None):
        batch_size = tf.shape(inputs)[0]
        W_3d = tf.tile(tf.expand_dims(self.W, axis=0),
                       tf.stack([batch_size, 1, 1]))
        # [batch_size, steps, features]
        input_projection = tf.matmul(inputs, W_3d)

        if self.use_bias:
            input_projection += self.b

        input_projection = tf.tanh(input_projection)

        # [batch_size, steps, 1]
        similarities = tf.reduce_sum(tf.multiply(input_projection,
                                                 self.attention_context_vector),
                                     axis=2,
                                     keepdims=True)

        # [batch_size, steps, 1]
        if mask is not None:
            attention_weights = masked_softmax(similarities, mask, axis=1)
        else:
            attention_weights = tf.nn.softmax(similarities, axis=1)

        # [batch_size, features]
        attention_output = tf.reduce_sum(tf.multiply(inputs,
                                                     attention_weights),
                                         axis=1)
        return attention_output
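
This `call` assumes the hosting layer creates `self.W`, `self.b`, and `self.attention_context_vector` elsewhere. Below is a minimal, hypothetical host layer that would make it runnable; the class name, projection width (`attention_size`), and initializers are assumptions, not part of the original snippet:

import tensorflow as tf


class SelfAttention(tf.keras.layers.Layer):
    """Hypothetical host layer for the call() above."""

    def __init__(self, attention_size, use_bias=True, **kwargs):
        super().__init__(**kwargs)
        self.attention_size = attention_size
        self.use_bias = use_bias

    def build(self, input_shape):
        feature_dim = int(input_shape[-1])
        # Projection applied to every time step: [features, attention_size]
        self.W = self.add_weight('W', shape=[feature_dim, self.attention_size])
        if self.use_bias:
            self.b = self.add_weight('b', shape=[self.attention_size],
                                     initializer='zeros')
        # Context vector the projected steps are scored against: [attention_size]
        self.attention_context_vector = self.add_weight(
            'attention_context_vector', shape=[self.attention_size])
        super().build(input_shape)

    # ... the call(self, inputs, training=None, mask=None) shown above goes here ...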
Example No. 2
    def grad_variance(self):
        """Estimate the gradient variance from moving averages of the gradients."""
        grad_var_ops = []
        tensor_to_avg = []
        for t, g in zip(self._tvars, self._grads):
            if isinstance(g, ops.IndexedSlices):
                # Densify sparse (IndexedSlices) gradients before averaging.
                tensor_to_avg.append(
                    tf.reshape(tf.unsorted_segment_sum(g.values, g.indices,
                                                       g.dense_shape[0]),
                               shape=t.get_shape()))
            else:
                tensor_to_avg.append(g)
        avg_op = self._moving_averager.apply(tensor_to_avg)
        grad_var_ops.append(avg_op)
        with tf.control_dependencies([avg_op]):
            self._grad_avg = [
                self._moving_averager.average(val) for val in tensor_to_avg
            ]
            self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
        # Var[g] ~= E[||g||^2] - ||E[g]||^2, floored at EPS for numerical safety.
        self._grad_var = tf.maximum(
            tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
            self._grad_norm_squared_avg -
            tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
        if self._sparsity_debias:
            self._grad_var *= self._sparsity_avg
        return grad_var_ops
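
The quantity being estimated is the identity Var[g] ~= E[||g||^2] - ||E[g]||^2, with the moving averages standing in for the expectations. A small NumPy sketch of the same identity on sampled toy gradients (all names below are illustrative):

import numpy as np

# Toy "gradients" sampled over 1000 steps (rows) for a 3-parameter model.
grads = np.random.randn(1000, 3) * 0.5 + 1.0

mean_sq_norm = np.mean(np.sum(grads ** 2, axis=1))     # E[||g||^2]
sq_norm_of_mean = np.sum(np.mean(grads, axis=0) ** 2)  # ||E[g]||^2
grad_var = mean_sq_norm - sq_norm_of_mean              # total variance across coordinates

print(grad_var)  # close to 3 * 0.5**2 = 0.75 for this toy distribution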
Example No. 3
def focal_loss(logits, labels, alpha, gamma=2, name='focal_loss'):
  """
    Focal loss for multi classification
    :param logits: A float32 tensor of shape [batch_size num_class].
    :param labels: A int32 tensor of shape [batch_size, num_class] or [batch_size].
    :param alpha: A 1D float32 tensor for focal loss alpha hyper-parameter
    :param gamma: A scalar for focal loss gamma hyper-parameter.
    Returns: A tensor of the same shape as `lables`
    """
  if len(labels.shape) == 1:
    labels = tf.one_hot(labels, logits.shape[-1])
  else:
    labels = labels
  labels = tf.to_float(labels)

  y_pred = tf.nn.softmax(logits, dim=-1)
  L = -labels * tf.log(y_pred)
  L *= alpha * ((1 - y_pred)**gamma)
  loss = tf.reduce_sum(L)

  if tf.executing_eagerly():
    tf.contrib.summary.scalar(name, loss)
  else:
    tf.summary.scalar(name, loss)

  return loss
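
A minimal graph-mode usage sketch; the batch size, class count, and uniform `alpha` below are illustrative only:

import tensorflow as tf

num_class = 4
logits = tf.random_normal([8, num_class])
labels = tf.constant([0, 1, 3, 2, 0, 1, 2, 3], dtype=tf.int32)
# Per-class weights for the alpha term; uniform here purely for illustration.
alpha = tf.ones([num_class], dtype=tf.float32)

loss = focal_loss(logits, labels, alpha, gamma=2)

with tf.Session() as sess:
    print(sess.run(loss))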
Example No. 4
def split_one_doc_to_true_len_sens(doc_t, split_token, padding_token,
                                   max_doc_len, max_sen_len):
    """
  Split a document to sentences with true sentence lengths.
  doc_t: [doc_word_len]
  out_t: [max_doc_len, max_sen_len]
  """
    if len(doc_t.get_shape()) == 1:
        split_token_index = tf.squeeze(tf.where(tf.equal(doc_t, split_token)),
                                       axis=1)
        split_token_index.set_shape([None])
        split_len_part_1 = split_token_index[:1] + 1
        split_len_part_2 = split_token_index[1:] - split_token_index[:-1]
        split_lens = tf.concat([split_len_part_1, split_len_part_2], axis=0)
        split_lens = cut_or_padding(split_lens,
                                    max_doc_len,
                                    padding_token=padding_token)
        new_doc_len = tf.reduce_sum(split_lens)
        splited_sentences = tf.split(doc_t[:new_doc_len], split_lens)
        splited_sentences = [
            cut_or_padding(s, max_sen_len) for s in splited_sentences
        ]
        out_t = tf.stack(splited_sentences)
        padding_tokens = tf.multiply(tf.ones_like(out_t, dtype=tf.int32),
                                     padding_token)
        out_t = tf.where(tf.equal(out_t, split_token), padding_tokens, out_t)
        return out_t

    raise ValueError("doc_t should be a tensor with rank 1.")
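
The function relies on a `cut_or_padding` helper that is not shown here. The sketch below supplies an illustrative stand-in for it (its real implementation may differ) and runs the splitter on a toy document, with `1` as the sentence-split token and `0` as padding:

import tensorflow as tf

def cut_or_padding(t, fixed_len, padding_token=0):
    """Illustrative stand-in: truncate or right-pad a 1-D tensor to fixed_len."""
    cur_len = tf.shape(t)[0]
    pad = tf.fill([tf.maximum(fixed_len - cur_len, 0)],
                  tf.cast(padding_token, t.dtype))
    out = tf.concat([t[:fixed_len], pad], axis=0)
    out.set_shape([fixed_len])
    return out

# Word ids "5 6 7 | 8 9 |" with split_token=1 marking sentence ends.
doc = tf.constant([5, 6, 7, 1, 8, 9, 1], dtype=tf.int32)
out = split_one_doc_to_true_len_sens(doc, split_token=1, padding_token=0,
                                     max_doc_len=3, max_sen_len=4)

with tf.Session() as sess:
    print(sess.run(out))
    # [[5 6 7 0]
    #  [8 9 0 0]
    #  [0 0 0 0]]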
Example No. 5
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention layer."""
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    time_size = inputs.shape[1].value    # T value - number of time steps in the RNN layer
    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.get_variable(name='W_omega',
                              initializer=tf.random_normal(
                                  [hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable(name='b_omega',
                              initializer=tf.random_normal([attention_size],
                                                           stddev=0.1))
    u_omega = tf.get_variable(name='u_omega',
                              initializer=tf.random_normal([attention_size, 1],
                                                           stddev=0.1))

    # Apply a fully connected layer with a non-linear activation to each of the B*T time steps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # (B, T, D) dot (D, Atten)

    logging.info('attention inputs: {}'.format(inputs.shape))
    inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
    dot = tf.matmul(inputs_reshaped, W_omega)
    dot = tf.reshape(dot, [-1, time_size, attention_size])
    v = tf.sigmoid(dot + b_omega)
    logging.info(f'attention vector: {v.shape}')
    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    # (B, T, Atten) dot (Atten)
    #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
    v = tf.reshape(v, [-1, attention_size])  # (B*T, A)
    vu = tf.matmul(v, u_omega)  # (B*T, 1)
    vu = tf.squeeze(vu, axis=-1)  # (B*T,)
    vu = tf.reshape(vu, [-1, time_size])  # (B, T)
    logging.info(f'attention energy: {vu.shape}')
    alphas = tf.nn.softmax(vu)  # (B,T) shape also

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    # [batch, time] -> [batch, time, 1]
    alphas = tf.expand_dims(alphas, -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if not return_alphas:
        return output

    return output, alphas
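
A graph-mode usage sketch, assuming the function and its module-level imports (`logging`) are available; the shapes and the variable-scope name are illustrative:

import tensorflow as tf

batch_size, time_steps, hidden = 4, 10, 32
rnn_outputs = tf.random_normal([batch_size, time_steps, hidden])

with tf.variable_scope('attn_demo'):
    pooled, alphas = attention(rnn_outputs, attention_size=16, return_alphas=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out, a = sess.run([pooled, alphas])
    print(out.shape)  # (4, 32)
    print(a.shape)    # (4, 10, 1)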
Example No. 6
def compute_doc_lens(sen_lens):
    """
  Count how many sentences in a document.
  inputs: [..., time_steps]
  doc_lens: [...]
  """
    x_binary = tf.cast(tf.cast(sen_lens, tf.bool), tf.int32)
    doc_lens = tf.reduce_sum(x_binary, axis=-1)
    return doc_lens
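
A quick graph-mode check with an illustrative sentence-length matrix:

import tensorflow as tf

# Per-sentence word counts for 2 documents padded to 3 sentences each
# (0 marks a padded, non-existent sentence).
sen_lens = tf.constant([[3, 2, 0],
                        [4, 0, 0]], dtype=tf.int32)
doc_lens = compute_doc_lens(sen_lens)

with tf.Session() as sess:
    print(sess.run(doc_lens))  # [2 1]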
Example No. 7
def masked_softmax(logits, mask, axis):
    """Compute softmax with input mask."""
    e_logits = tf.exp(logits)
    masked_e = tf.multiply(e_logits, mask)
    sum_masked_e = tf.reduce_sum(masked_e, axis, keepdims=True)
    ones = tf.ones_like(sum_masked_e)
    # If every mask entry along `axis` is zero the denominator would be zero,
    # so fall back to 1 to avoid dividing by zero.
    sum_masked_e_safe = tf.where(tf.equal(sum_masked_e, 0), ones, sum_masked_e)
    return masked_e / sum_masked_e_safe
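
A small graph-mode sanity check with illustrative logits and a mask that hides the last two positions of the second row:

import tensorflow as tf

logits = tf.constant([[1.0, 2.0, 3.0, 4.0],
                      [1.0, 2.0, 3.0, 4.0]])
mask = tf.constant([[1.0, 1.0, 1.0, 1.0],
                    [1.0, 1.0, 0.0, 0.0]])

probs = masked_softmax(logits, mask, axis=1)

with tf.Session() as sess:
    print(sess.run(probs))
    # Masked positions get probability 0; the remaining entries of each row sum to 1.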
Example No. 8
def compute_sen_lens(inputs, padding_token=0):
    """
  Count how many words in a sentence.
  inputs: [..., time_steps]
  sen_lens: [...]
  """
    x_binary = tf.cast(tf.not_equal(inputs, padding_token), tf.int32)
    sen_lens = tf.reduce_sum(x_binary, axis=-1)
    ones = tf.ones_like(sen_lens)
    sen_lens = tf.where(tf.equal(sen_lens, utils.PAD_IDX), x=ones, y=sen_lens)
    return sen_lens
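
The word-level counterpart, assuming `utils.PAD_IDX` is 0 in this codebase; the toy ids are illustrative:

import tensorflow as tf

# Word ids for 2 sentences padded to length 5 (0 = padding id).
sents = tf.constant([[7, 8, 9, 0, 0],
                     [3, 4, 0, 0, 0]], dtype=tf.int32)
sen_lens = compute_sen_lens(sents, padding_token=0)

with tf.Session() as sess:
    print(sess.run(sen_lens))  # [3 2]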
Example No. 9
  def compute_lens(inputs, max_len):
    """Count sequence lengths.
    inputs: [batch_size, max_len]
    lens: [batch_size]
    """

    # Reverse each sequence and find the first non-padding position; its index
    # equals the number of trailing padding tokens.
    x_binary = tf.cast(tf.cast(tf.reverse(inputs, axis=[1]), tf.bool), tf.int32)
    lens = max_len - tf.argmax(x_binary, axis=1, output_type=tf.int32)

    # All-padding sequences get length 0 (argmax above would report max_len).
    zeros = tf.zeros_like(lens, dtype=tf.int32)
    x_sum = tf.reduce_sum(inputs, axis=1)
    sen_lens = tf.where(tf.equal(x_sum, 0), zeros, lens)
    return sen_lens
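
A graph-mode check, assuming the helper is importable as a plain function; note it relies on the padding id being 0 (it zeroes out rows whose sum is 0):

import tensorflow as tf

max_len = 5
ids = tf.constant([[4, 7, 2, 0, 0],
                   [9, 0, 0, 0, 0],
                   [0, 0, 0, 0, 0]], dtype=tf.int32)

lens = compute_lens(ids, max_len)

with tf.Session() as sess:
    print(sess.run(lens))  # [3 1 0]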
Example No. 10
    def call(self, tensors):
        """Attention layer."""
        left, right = tensors

        len_left = left.shape[1]
        len_right = right.shape[1]
        # Pair every left position with every right position:
        # [batch, len_left, len_right, dim_left + dim_right]
        tensor_left = tf.expand_dims(left, axis=2)
        tensor_right = tf.expand_dims(right, axis=1)
        tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
        tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
        tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
        # Score each pair and drop the trailing singleton dim: [batch, len_left, len_right]
        middle_output = self.middle_layer(tensor_merged)
        attn_scores = self.attn(middle_output)
        attn_scores = tf.squeeze(attn_scores, axis=3)
        # Numerically stable softmax over the `right` positions.
        exp_attn_scores = tf.exp(
            attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
        exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
        attention_weights = exp_attn_scores / exp_sum
        return tf.matmul(attention_weights, right)
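
The snippet assumes the layer defines `self.middle_layer` and `self.attn` elsewhere. A minimal, hypothetical host class that would complete it; the class name, hidden width, and Dense sub-layers are assumptions, not the original code:

import tensorflow as tf


class MatchAttention(tf.keras.layers.Layer):
    """Hypothetical host class for the call() above."""

    def __init__(self, hidden_dim=64, **kwargs):
        super().__init__(**kwargs)
        # MLP over each concatenated (left, right) pair, plus a scalar scorer.
        self.middle_layer = tf.keras.layers.Dense(hidden_dim, activation='tanh')
        self.attn = tf.keras.layers.Dense(1)

    # ... the call(self, tensors) method shown above goes here ...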
Example No. 11
    def before_apply(self):
        self._moving_averager = tf.train.ExponentialMovingAverage(
            decay=self._beta, zero_debias=self._zero_debias)
        assert self._grads is not None and len(self._grads) > 0
        before_apply_ops = []

        # get per var g**2 and norm**2
        self._grad_squared = []
        self._grad_norm_squared = []
        for v, g in zip(self._tvars, self._grads):
            if g is None:
                continue
            with ops.colocate_with(v):
                self._grad_squared.append(tf.square(g))
        self._grad_norm_squared = [
            tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared
        ]

        if self._sparsity_debias:
            avg_op_sparsity = self.grad_sparsity()
            before_apply_ops.append(avg_op_sparsity)

        # The following running average of the squared gradient norm is shared
        # by `grad_variance` and `dist_to_opt`.
        avg_op = self._moving_averager.apply(self._grad_norm_squared)
        with tf.control_dependencies([avg_op]):
            self._grad_norm_squared_avg = [
                self._moving_averager.average(val)
                for val in self._grad_norm_squared
            ]
            self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
            self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
        before_apply_ops.append(avg_op)

        with tf.control_dependencies([avg_op]):
            curv_range_ops = self.curvature_range()
            before_apply_ops += curv_range_ops
            grad_var_ops = self.grad_variance()
            before_apply_ops += grad_var_ops
            dist_to_opt_ops = self.dist_to_opt()
            before_apply_ops += dist_to_opt_ops
        return tf.group(*before_apply_ops)
Example No. 12
    def exclude_padding(self, batch):
        """Strip trailing padding (utils.PAD_IDX) tokens from a 1-D token-id tensor."""
        x_binary = tf.cast(tf.not_equal(batch, utils.PAD_IDX), tf.int32)
        sen_lens = tf.reduce_sum(x_binary, axis=-1)
        return batch[:sen_lens]