예제 #1
0
    def embedding_lookup(self,
                         input_ids,
                         vocab_size,
                         batch_size,
                         max_seq_length,
                         embedding_size=128,
                         initializer_range=0.02,
                         word_embedding_name='word_embeddings',
                         dtype=tf.float32,
                         trainable=True,
                         tilda_embeddings=None):
        if input_ids.shape.ndims == 2:
            input_ids = tf.expand_dims(input_ids, axis=[-1])

        if tilda_embeddings is not None:
            embedding_table = tilda_embeddings
        else:
            embedding_table = tf.get_variable(
                name=word_embedding_name,
                shape=[vocab_size, embedding_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)

        flat_input_ids = tf.reshape(input_ids, [-1])
        output = tf.gather(embedding_table,
                           flat_input_ids,
                           name='embedding_look_up')
        output = tf.reshape(output,
                            [batch_size, max_seq_length, embedding_size])

        return (output, embedding_table)
예제 #2
0
def einsum_via_matmul(input_tensor, w, num_inner_dims):
    """Implements einsum via matmul and reshape ops.

  Args:
    input_tensor: float Tensor of shape [<batch_dims>, <inner_dims>].
    w: float Tensor of shape [<inner_dims>, <outer_dims>].
    num_inner_dims: int. number of dimensions to use for inner products.

  Returns:
    float Tensor of shape [<batch_dims>, <outer_dims>].
  """
    input_shape = util.get_shape_list(input_tensor)
    w_shape = util.get_shape_list(w)
    batch_dims = input_shape[:-num_inner_dims]
    inner_dims = input_shape[-num_inner_dims:]
    outer_dims = w_shape[num_inner_dims:]
    inner_dim = np.prod(inner_dims)
    outer_dim = np.prod(outer_dims)
    if num_inner_dims > 1:
        input_tensor = tf.reshape(input_tensor, batch_dims + [inner_dim])
    if len(w_shape) > 2:
        w = tf.reshape(w, [inner_dim, outer_dim])
    ret = tf.matmul(input_tensor, w)
    if len(outer_dims) > 1:
        ret = tf.reshape(ret, batch_dims + outer_dims)
    return ret
예제 #3
0
파일: electra.py 프로젝트: wangbq18/unif
def gather_positions(sequence, positions):
    '''Gathers the vectors at the specific positions over a minibatch.

    Args:
      sequence: A [batch_size, seq_length] or
          [batch_size, seq_length, depth] tensor of values
      positions: A [batch_size, n_positions] tensor of indices

    Returns: A [batch_size, n_positions] or
      [batch_size, n_positions, depth] tensor of the values at the indices
    '''
    shape = util.get_shape_list(sequence, expected_rank=[2, 3])
    depth_dimension = (len(shape) == 3)
    if depth_dimension:
        B, L, D = shape
    else:
        B, L = shape
        D = 1
        sequence = tf.expand_dims(sequence, -1)
    position_shift = tf.expand_dims(L * tf.range(B), -1)
    flat_positions = tf.reshape(positions + position_shift, [-1])
    flat_sequence = tf.reshape(sequence, [B * L, D])
    gathered = tf.gather(flat_sequence, flat_positions)
    if depth_dimension:
        return tf.reshape(gathered, [B, -1, D])
    else:
        return tf.reshape(gathered, [B, -1])
예제 #4
0
파일: crf.py 프로젝트: zhongyunuestc/unif
def crf_unary_score(tag_indices, sequence_lengths, inputs):
    ''' Computes the unary scores of tag sequences.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials.

    Returns:
      unary_scores: A [batch_size] vector of unary scores.
    '''
    batch_size = tf.shape(inputs)[0]
    max_seq_len = tf.shape(inputs)[1]
    num_tags = tf.shape(inputs)[2]

    flattened_inputs = tf.reshape(inputs, [-1])

    offsets = tf.expand_dims(tf.range(batch_size) * max_seq_len * num_tags, 1)
    offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0)
    # Use int32 or int64 based on tag_indices' dtype.
    if tag_indices.dtype == tf.int64:
        offsets = tf.cast(offsets, tf.int64)
    flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1])

    unary_scores = tf.reshape(
        tf.gather(flattened_inputs, flattened_tag_indices),
        [batch_size, max_seq_len])

    masks = tf.sequence_mask(sequence_lengths,
                             maxlen=tf.shape(tag_indices)[1],
                             dtype=tf.float32)

    unary_scores = tf.reduce_sum(unary_scores * masks, 1)
    return unary_scores
예제 #5
0
파일: base.py 프로젝트: llgithubll/unif
    def __init__(self,
                 is_training,
                 input_tensor,
                 label_ids,
                 sample_weight=None,
                 scope='mrc',
                 name='',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        seq_length = input_tensor.shape.as_list()[-2]
        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[2],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)

            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [-1, seq_length, 2])
            logits = tf.transpose(logits, [0, 2, 1])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            self.probs[name] = probs

            start_one_hot_labels = tf.one_hot(label_ids[:, 0],
                                              depth=seq_length,
                                              dtype=tf.float32)
            end_one_hot_labels = tf.one_hot(label_ids[:, 1],
                                            depth=seq_length,
                                            dtype=tf.float32)
            start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
            end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
            per_example_loss = (
                -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs,
                                     axis=-1) - 0.5 *
                tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1))
            if sample_weight is not None:
                per_example_loss *= sample_weight

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses[name] = per_example_loss

            start_preds = tf.expand_dims(tf.argmax(logits[:, 0, :], axis=-1),
                                         axis=-1)
            end_preds = tf.expand_dims(tf.argmax(logits[:, 1, :], axis=-1),
                                       axis=-1)
            self.preds[name] = tf.concat([start_preds, end_preds], axis=-1)
예제 #6
0
파일: base.py 프로젝트: llgithubll/unif
    def __init__(self,
                 is_training,
                 input_tensor,
                 input_mask,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/sequence',
                 name='',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        batch_size = tf.shape(input_tensor)[0]
        seq_length = input_tensor.shape.as_list()[-2]
        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)

            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [-1, seq_length, label_size])

            self.preds[name] = tf.argmax(logits, axis=-1)
            self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=label_size,
                                        dtype=tf.float32)
            per_token_losses = -tf.reduce_mean(one_hot_labels * log_probs,
                                               axis=-1)
            input_mask = tf.concat([
                tf.zeros((batch_size, 1), dtype=tf.float32),
                tf.cast(input_mask[:, 2:], dtype=tf.float32),
                tf.zeros((batch_size, 1), dtype=tf.float32)
            ],
                                   axis=-1)
            per_token_losses *= input_mask
            per_example_loss = tf.reduce_mean(per_token_losses, axis=-1)
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.losses[name] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
예제 #7
0
파일: electra.py 프로젝트: wangbq18/unif
def scatter_update(sequence, updates, positions):
    '''Scatter-update a sequence.

    Args:
      sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor
      updates: A tensor of size batch_size*seq_len(*depth)
      positions: A [batch_size, n_positions] tensor

    Returns: A tuple of two tensors. First is a [batch_size, seq_len] or
      [batch_size, seq_len, depth] tensor of 'sequence' with elements at
      'positions' replaced by the values at 'updates.' Updates to index 0 are
      ignored. If there are duplicated positions the update is only applied
      once. Second is a [batch_size, seq_len] mask tensor of which inputs were
      updated.
    '''
    shape = util.get_shape_list(sequence, expected_rank=[2, 3])
    depth_dimension = (len(shape) == 3)
    if depth_dimension:
        B, L, D = shape
    else:
        B, L = shape
        D = 1
        sequence = tf.expand_dims(sequence, -1)
    N = util.get_shape_list(positions)[1]

    shift = tf.expand_dims(L * tf.range(B), -1)
    flat_positions = tf.reshape(positions + shift, [-1, 1])
    flat_updates = tf.reshape(updates, [-1, D])
    updates = tf.scatter_nd(flat_positions, flat_updates, [B * L, D])
    updates = tf.reshape(updates, [B, L, D])

    flat_updates_mask = tf.ones([B * N], tf.int32)
    updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask, [B * L])
    updates_mask = tf.reshape(updates_mask, [B, L])
    not_first_token = tf.concat(
        [tf.zeros((B, 1), tf.int32),
         tf.ones((B, L - 1), tf.int32)], -1)
    updates_mask *= not_first_token
    updates_mask_3d = tf.expand_dims(updates_mask, -1)

    # account for duplicate positions
    if sequence.dtype == tf.float32:
        updates_mask_3d = tf.cast(updates_mask_3d, tf.float32)
        updates /= tf.maximum(1.0, updates_mask_3d)
    else:
        assert sequence.dtype == tf.int32
        updates = tf.divide(updates, tf.maximum(1, updates_mask_3d))
        updates = tf.cast(updates, tf.int32)
    updates_mask = tf.minimum(updates_mask, 1)
    updates_mask_3d = tf.minimum(updates_mask_3d, 1)

    updated_sequence = (((1 - updates_mask_3d) * sequence) +
                        (updates_mask_3d * updates))
    if not depth_dimension:
        updated_sequence = tf.squeeze(updated_sequence, -1)

    return updated_sequence, updates_mask
예제 #8
0
파일: spe.py 프로젝트: zhongyunuestc/unif
    def embedding_postprocessor(self,
                                input_tensor,
                                position_ids,
                                batch_size,
                                max_seq_length,
                                hidden_size,
                                use_token_type=False,
                                segment_ids=None,
                                token_type_vocab_size=16,
                                token_type_embedding_name=\
                                    'token_type_embeddings',
                                use_position_embeddings=True,
                                position_embedding_name='position_embeddings',
                                initializer_range=0.02,
                                max_position_embeddings=512,
                                dropout_prob=0.1,
                                dtype=tf.float32,
                                trainable=True):
        output = input_tensor

        if use_token_type:
            if segment_ids is None:
                raise ValueError(
                    'segment_ids must be specified if use_token_type is True.')
            token_type_table = tf.get_variable(
                name=token_type_embedding_name,
                shape=[token_type_vocab_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)

            # This vocab will be small so we always do one-hot here,
            # since it is always faster for a small vocabulary.
            flat_segment_ids = tf.reshape(segment_ids, [-1])
            one_hot_ids = tf.one_hot(flat_segment_ids,
                                     depth=token_type_vocab_size,
                                     dtype=dtype)
            token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
            token_type_embeddings = tf.reshape(
                token_type_embeddings,
                [batch_size, max_seq_length, hidden_size])
            output += token_type_embeddings

        if use_position_embeddings:
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, hidden_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)
            output += tf.gather(full_position_embeddings, position_ids)

        output = util.layer_norm_and_dropout(output,
                                             dropout_prob,
                                             trainable=trainable)
        return output
예제 #9
0
파일: xlnet.py 프로젝트: zhongyunuestc/unif
def rel_shift(x, klen=-1):
    '''perform relative shift to form the relative attention score.'''
    x_size = tf.shape(x)

    x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]])
    x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
    x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]])
    x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1])

    return x
예제 #10
0
파일: electra.py 프로젝트: wangbq18/unif
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    '''Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
      for TPUs.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  '''
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    original_dims = input_ids.shape.ndims
    if original_dims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=util.create_initializer(initializer_range))

    if original_dims == 3:
        input_shape = util.get_shape_list(input_ids)
        tf.reshape(input_ids, [-1, input_shape[-1]])
        output = tf.matmul(input_ids, embedding_table)
        output = tf.reshape(output,
                            [input_shape[0], input_shape[1], embedding_size])
    else:
        if use_one_hot_embeddings:
            flat_input_ids = tf.reshape(input_ids, [-1])
            one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
            output = tf.matmul(one_hot_input_ids, embedding_table)
        else:
            output = tf.nn.embedding_lookup(embedding_table, input_ids)

        input_shape = util.get_shape_list(input_ids)

        output = tf.reshape(
            output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return output, embedding_table
예제 #11
0
            def _forward(target_ids, target_mask, target_max_seq_length):

                with tf.variable_scope('decoder'):

                    # shared embedding
                    dec = tf.nn.embedding_lookup(embedding_table, target_ids)
                    dec *= hidden_size ** 0.5  # scale
                    dec += positional_encoding(dec, target_max_seq_length)
                    dec = util.dropout(dec, dropout_rate)

                    # blocks
                    for i in range(num_blocks):
                        with tf.variable_scope('block_%s' % i):

                            # masked self-attention
                            dec = multihead_attention(
                                queries=dec,
                                keys=dec,
                                values=dec,
                                key_masks=target_mask,
                                num_heads=num_attention_heads,
                                dropout_rate=dropout_rate,
                                training=is_training,
                                causality=True,
                                scope='masked_self_attention')

                            # vanilla attention
                            dec = multihead_attention(
                                queries=dec,
                                keys=memory,
                                values=memory,
                                key_masks=source_mask,
                                num_heads=num_attention_heads,
                                dropout_rate=dropout_rate,
                                training=is_training,
                                causality=False,
                                scope='vanilla_attention')

                            # feed forward
                            dec = ff(
                                dec, num_units=[4 * hidden_size, hidden_size])

                # final linear projection (embedding weights are shared)
                with tf.variable_scope('cls'):
                    output_bias = tf.get_variable(
                        'output_bias', shape=[vocab_size],
                        initializer=tf.zeros_initializer())
                    dec = tf.reshape(dec, [-1, hidden_size])
                    logits = tf.matmul(dec, embedding_table, transpose_b=True)
                    logits = tf.reshape(
                        logits, [-1, target_max_seq_length, vocab_size])
                    logits = tf.nn.bias_add(logits, output_bias)

                return logits
예제 #12
0
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable(
            'w', [1, nx, nf],
            initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(
            tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
            start + [nf])
        return c
예제 #13
0
파일: crf.py 프로젝트: llgithubll/unif
    def __init__(self,
                 is_training,
                 input_tensor,
                 input_mask,
                 label_ids,
                 label_size=5,
                 sample_weight=None,
                 scope='cls/sequence',
                 name='',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        seq_length = input_tensor.shape.as_list()[-2]
        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)

            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [-1, seq_length, label_size])

            with tf.variable_scope('crf'):
                input_length = tf.reduce_sum(input_mask, axis=-1)
                per_example_loss, transition_matrix = \
                    contrib.crf.crf_log_likelihood(
                        inputs=logits,
                        tag_indices=label_ids,
                        sequence_lengths=input_length)
                per_example_loss = -per_example_loss
                if sample_weight is not None:
                    per_example_loss *= tf.cast(sample_weight,
                                                dtype=tf.float32)
                self.total_loss = tf.reduce_mean(per_example_loss)
                self.losses[name] = per_example_loss
                self.preds[name] = tf.argmax(logits, axis=-1)
                self.probs['logits'] = logits
                self.probs['transition_matrix'] = transition_matrix
예제 #14
0
        def gather_indexes(sequence_tensor, positions):
            sequence_shape = util.get_shape_list(sequence_tensor, 3)
            batch_size = sequence_shape[0]
            seq_length = sequence_shape[1]
            width = sequence_shape[2]

            flat_offsets = tf.reshape(
                tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
            flat_positions = tf.reshape(positions + flat_offsets, [-1])
            flat_sequence_tensor = tf.reshape(sequence_tensor,
                                              [batch_size * seq_length, width])
            output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
            return output_tensor
예제 #15
0
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   head_size,
                   initializer,
                   activation,
                   use_einsum,
                   name=None,
                   trainable=True):
  """A dense layer with 3D kernel.

  Args:
    input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
    num_attention_heads: Number of attention heads.
    head_size: The size per attention head.
    initializer: Kernel initializer.
    activation: Actication function.
    use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """

  input_shape = util.get_shape_list(input_tensor)
  hidden_size = input_shape[2]

  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[hidden_size, num_attention_heads * head_size],
        initializer=initializer,
        trainable=trainable)
    w = tf.reshape(w, [hidden_size, num_attention_heads, head_size])
    b = tf.get_variable(
        name="bias",
        shape=[num_attention_heads * head_size],
        initializer=tf.zeros_initializer,
        trainable=trainable)
    b = tf.reshape(b, [num_attention_heads, head_size])
    if use_einsum:
      ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
    else:
      ret = einsum_via_matmul(input_tensor, w, 1)
    ret += b
  if activation is not None:
    return activation(ret)
  else:
    return ret
예제 #16
0
파일: crf.py 프로젝트: zhongyunuestc/unif
def crf_binary_score(tag_indices, sequence_lengths, transition_params):
    ''' Computes the binary scores of tag sequences.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.

    Returns:
      binary_scores: A [batch_size] vector of binary scores.
    '''
    # Get shape information.
    num_tags = transition_params.get_shape()[0]
    num_transitions = tf.shape(tag_indices)[1] - 1

    # Truncate by one on each side of the sequence to get the start and end
    # indices of each transition.
    start_tag_indices = tf.slice(tag_indices, [0, 0], [-1, num_transitions])
    end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions])

    # Encode the indices in a flattened representation.
    flattened_transition_indices = \
        start_tag_indices * num_tags + end_tag_indices
    flattened_transition_params = tf.reshape(transition_params, [-1])

    # Get the binary scores based on the flattened representation.
    binary_scores = tf.gather(flattened_transition_params,
                              flattened_transition_indices)

    masks = tf.sequence_mask(sequence_lengths,
                             maxlen=tf.shape(tag_indices)[1],
                             dtype=tf.float32)
    truncated_masks = tf.slice(masks, [0, 1], [-1, -1])
    binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1)
    return binary_scores
예제 #17
0
 def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                          max_seq_length, width):
     output_tensor = tf.reshape(
         input_tensor,
         [batch_size, max_seq_length, num_attention_heads, width])
     output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
     return output_tensor
예제 #18
0
파일: electra.py 프로젝트: wangbq18/unif
def create_attention_mask_from_input_mask(from_tensor, to_mask):
    '''Create 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  '''
    from_shape = util.get_shape_list(from_tensor, expected_rank=[2, 3])
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]

    to_shape = util.get_shape_list(to_mask, expected_rank=2)
    to_seq_length = to_shape[1]

    to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
                      tf.float32)

    # We don't assume that `from_tensor` is a mask (although it could be). We
    # don't actually care if we attend *from* padding tokens (only *to* padding)
    # tokens so we create a tensor of all ones.
    #
    # `broadcast_ones` = [batch_size, from_seq_length, 1]
    broadcast_ones = tf.ones(shape=[batch_size, from_seq_length, 1],
                             dtype=tf.float32)

    # Here we broadcast along two dimensions to create the mask.
    mask = broadcast_ones * to_mask

    return mask
예제 #19
0
 def mask_attn_weights(w):
     # w has shape [batch, heads, dst_sequence, src_sequence], where
     # information flows from src to dst.
     _, _, nd, ns = shape_list(w)
     b = attention_mask(nd, ns, dtype=w.dtype)
     b = tf.reshape(b, [1, 1, nd, ns])
     w = w * b - tf.cast(1e10, w.dtype) * (1 - b)
     return w
예제 #20
0
def sample_from_softmax(logits, disallow=None):
    if disallow is not None:
        logits -= 1000.0 * tf.reshape(disallow, [-1, logits.shape[-1]])
    uniform_noise = tf.random_uniform(
        util.get_shape_list(logits), minval=0, maxval=1)
    gumbel_noise = -tf.log(-tf.log(uniform_noise + 1e-9) + 1e-9)
    return tf.one_hot(tf.argmax(logits + gumbel_noise, -1,
                                output_type=tf.int32), logits.shape[-1])
예제 #21
0
 def _get_pred_loss(self, teacher_logits, student_logits, weights):
     teacher_probs = tf.nn.softmax(teacher_logits, axis=-1)
     teacher_probs = tf.stop_gradient(teacher_probs)
     student_log_probs = tf.nn.log_softmax(student_logits, axis=-1)
     pred_loss = (
         - tf.reduce_sum(teacher_probs * student_log_probs, axis=-1) *
         tf.reshape(weights, [-1, 1]))
     pred_loss = tf.reduce_mean(pred_loss)
     return pred_loss
예제 #22
0
                def _forward(dilated_ids, dilated_mask):

                    logits = self._bert_forward(
                        bert_config,
                        dilated_ids,
                        dilated_mask,
                        batch_size,
                        dilated_seq_length,
                        tilda_embeddings=tilda_embeddings)
                    output_ids = tf.argmax(logits, axis=-1)
                    output_ids = tf.cast(output_ids, dtype=tf.int32)

                    equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32)
                    equal_zero = tf.reduce_sum(equal_zero, axis=-1)
                    right_pad = spad_id * tf.sequence_mask(
                        equal_zero, dilated_seq_length, dtype=tf.int32)

                    paded = tf.concat([output_ids, right_pad], axis=-1)
                    flattened_padded = tf.reshape(paded, [-1])
                    is_valid = tf.cast(tf.greater(flattened_padded, 0),
                                       dtype=tf.int32)
                    flattened_valid = tf.boolean_mask(flattened_padded,
                                                      is_valid)
                    valid = tf.reshape(flattened_valid,
                                       [batch_size, dilated_seq_length])
                    cutted_valid = valid[:, :max_seq_length]

                    nonpad_mask = tf.cast(tf.not_equal(cutted_valid, spad_id),
                                          dtype=tf.int32)
                    output_ids = cutted_valid * nonpad_mask

                    reshaped = tf.reshape(output_ids,
                                          [batch_size, max_seq_length, 1])
                    concatenated = tf.concat(
                        [reshaped, tf.zeros_like(reshaped)], axis=-1)
                    dilated_ids = tf.reshape(concatenated,
                                             [batch_size, max_seq_length * 2])

                    input_mask = tf.reduce_sum(nonpad_mask, axis=-1)
                    dilated_mask = tf.sequence_mask(input_mask,
                                                    dilated_seq_length,
                                                    dtype=tf.int32)

                    return dilated_ids, dilated_mask
예제 #23
0
 def create_attention_mask_from_input_mask(to_mask,
                                           batch_size,
                                           max_seq_length,
                                           dtype=tf.float32):
     to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, max_seq_length]),
                       dtype=dtype)
     broadcast_ones = tf.ones(shape=[batch_size, max_seq_length, 1],
                              dtype=dtype)
     mask = broadcast_ones * to_mask
     return mask
예제 #24
0
    def create_attention_mask_from_input_mask(self,
                                              input_mask,
                                              batch_size,
                                              max_seq_length,
                                              dtype=tf.float32):
        to_mask = tf.cast(tf.reshape(input_mask,
                                     [batch_size, 1, max_seq_length]),
                          dtype=dtype)
        broadcast_ones = tf.ones(shape=[batch_size, max_seq_length, 1],
                                 dtype=dtype)
        mask = broadcast_ones * to_mask

        broadcast_eye = tf.tile(
            tf.reshape(tf.eye(max_seq_length),
                       [1, max_seq_length, max_seq_length]),
            [batch_size, 1, 1])
        mask += broadcast_eye
        mask = tf.cast(tf.greater(mask, 0), dtype)
        return mask
예제 #25
0
파일: crf.py 프로젝트: zhongyunuestc/unif
 def _single_seq_fn():
     batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0]
     example_inds = tf.reshape(
         tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
     sequence_scores = tf.gather_nd(
         tf.squeeze(inputs, [1]),
         tf.concat([example_inds, tag_indices], axis=1))
     sequence_scores = tf.where(tf.less_equal(sequence_lengths, 0),
                                tf.zeros_like(sequence_scores),
                                sequence_scores)
     return sequence_scores
예제 #26
0
파일: util.py 프로젝트: llgithubll/unif
def reshape_to_matrix(input_tensor):
    '''Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).'''
    ndims = input_tensor.shape.ndims
    if ndims < 2:
        raise ValueError('Input tensor must have at least rank 2. Shape = %s' %
                         (input_tensor.shape))
    if ndims == 2:
        return input_tensor

    width = input_tensor.shape[-1]
    output_tensor = tf.reshape(input_tensor, [-1, width])
    return output_tensor
예제 #27
0
파일: util.py 프로젝트: llgithubll/unif
def reshape_from_matrix(output_tensor, orig_shape_list, original_shape=None):
    '''Reshapes a rank 2 tensor back to its original rank >= 2 tensor.'''
    if len(orig_shape_list) == 2:
        return output_tensor

    if not original_shape:
        original_shape = get_shape_list(output_tensor)

    orig_dims = orig_shape_list[0:-1]
    width = original_shape[-1]

    return tf.reshape(output_tensor, orig_dims + [width])
예제 #28
0
                def _forward(dilated_ids, dilated_mask):

                    logits = self._bert_forward(
                        bert_config,
                        dilated_ids,
                        dilated_mask,
                        batch_size,
                        dilated_seq_length,
                        tilda_embeddings=tilda_embeddings)
                    output_ids = tf.argmax(logits, axis=-1)
                    output_ids = tf.cast(output_ids, dtype=tf.int32)

                    # special padding (using `spad` token)
                    equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32)
                    equal_zero = tf.reduce_sum(equal_zero, axis=-1)
                    right_pad = spad_id * tf.sequence_mask(
                        equal_zero, dilated_seq_length, dtype=tf.int32)
                    paded = tf.concat([output_ids, right_pad], axis=-1)

                    # extract ids of length `max_seq_length`
                    flattened_padded = tf.reshape(paded, [-1])
                    is_valid = tf.cast(tf.greater(flattened_padded, 0),
                                       dtype=tf.int32)
                    flattened_valid = tf.boolean_mask(flattened_padded,
                                                      is_valid)
                    valid = tf.reshape(flattened_valid,
                                       [batch_size, dilated_seq_length])
                    cutted_valid = valid[:, :max_seq_length]

                    # replace `spad` token with `pad`
                    non_spad_mask = tf.cast(tf.not_equal(
                        cutted_valid, spad_id),
                                            dtype=tf.int32)
                    output_ids = cutted_valid * non_spad_mask
                    output_length = tf.reduce_sum(non_spad_mask, axis=-1)

                    # dilate
                    reshaped_ids = tf.reshape(output_ids,
                                              [batch_size, max_seq_length, 1])
                    reshaped_mask = tf.reshape(
                        tf.sequence_mask(output_length,
                                         max_seq_length,
                                         dtype=tf.int32),
                        [batch_size, max_seq_length, 1])
                    concat_ids = tf.concat(
                        [reshaped_ids,
                         tf.zeros_like(reshaped_ids)], axis=-1)
                    concat_mask = tf.concat([
                        reshaped_mask,
                        tf.zeros_like(reshaped_mask, dtype=tf.int32)
                    ],
                                            axis=-1)
                    dilated_ids = tf.reshape(concat_ids,
                                             [batch_size, max_seq_length * 2])
                    dilated_mask = tf.reshape(concat_mask,
                                              [batch_size, max_seq_length * 2])

                    return dilated_ids, dilated_mask
예제 #29
0
 def _get_embedding_loss(self, teacher, student, bert_config, weights):
     teacher_embedding = teacher.get_embedding_output()
     teacher_embedding = tf.stop_gradient(teacher_embedding)
     student_embedding = student.get_embedding_output()
     with tf.variable_scope('embedding_loss'):
         linear_trans = tf.layers.dense(
             student_embedding,
             bert_config.hidden_size,
             kernel_initializer=util.create_initializer(
                 bert_config.initializer_range))
         embedding_loss = tf.losses.mean_squared_error(
             linear_trans, teacher_embedding,
             weights=tf.reshape(weights, [-1, 1, 1]))
     return embedding_loss
예제 #30
0
파일: unilm.py 프로젝트: zhongyunuestc/unif
    def create_attention_mask_from_input_mask(self,
                                              input_mask,
                                              batch_size,
                                              max_seq_length,
                                              dtype=tf.float32):
        if self._mode == 'bi':
            to_mask = tf.cast(tf.reshape(
                input_mask, [batch_size, 1, max_seq_length]), dtype=dtype)
            broadcast_ones = tf.ones(
                shape=[batch_size, max_seq_length, 1], dtype=dtype)
            mask = broadcast_ones * to_mask

        elif self._mode == 'l2r':
            arange = tf.range(max_seq_length) + 1
            to_mask = tf.cast(tf.sequence_mask(arange, max_seq_length), dtype)
            to_mask = tf.reshape(to_mask, [1, max_seq_length, max_seq_length])
            mask = tf.tile(to_mask, [batch_size, 1, 1])

        elif self._mode == 'r2l':
            to_mask = tf.cast(tf.reshape(
                input_mask, [batch_size, 1, max_seq_length]), dtype=dtype)
            broadcast_ones = tf.ones(
                shape=[batch_size, max_seq_length, 1], dtype=dtype)
            cover_mask = broadcast_ones * to_mask

            arange = tf.range(max_seq_length)
            reverse = tf.cast(tf.sequence_mask(arange, max_seq_length), dtype)
            reverse = tf.reshape(reverse, [1, max_seq_length, max_seq_length])
            reverse_mask = tf.tile(reverse, [batch_size, 1, 1])

            mask = (1 - reverse_mask) * cover_mask

        elif self._mode == 's2s':
            mask = tf.cast(
                tf.sequence_mask(input_mask, max_seq_length), dtype)

        return mask