Example #1
def _bert_embeddings(wordpiece_embedding_size, bert_config, features,
                     is_training, use_one_hot_embeddings, scope,
                     use_segment_ids):
    """Get embeddings from BERT."""
    token_type_ids = None
    if use_segment_ids:
        token_type_ids = features[constants.SEGMENT_ID_KEY]

    max_seq_len = tf.shape(features[constants.SOURCE_WORDPIECES_KEY])[1]
    input_mask = bert_utils.get_input_mask(max_seq_len,
                                           features[constants.SOURCE_LEN_KEY])
    input_ids = features[constants.SOURCE_WORDPIECES_KEY]
    source_embeddings = bert_utils.get_bert_embeddings(
        input_ids,
        bert_config,
        input_mask,
        token_type_ids=token_type_ids,
        is_training=is_training,
        use_one_hot_embeddings=use_one_hot_embeddings,
        scope=scope)
    source_embeddings = common_layers.linear_transform(
        source_embeddings, wordpiece_embedding_size, "bert_transform")

    # Set weights to ignore padding.
    embedded_weights = tf.to_float(
        tf.not_equal(input_ids, constants.PAD_SYMBOL_ID))
    embedded_weights = tf.expand_dims(embedded_weights, -1)
    return source_embeddings * embedded_weights
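A note on the final masking step: multiplying by a 0/1 weight tensor zeroes the embedding vectors at padded positions. A minimal, self-contained sketch of the same trick (the PAD id of 0 here is an assumed placeholder for constants.PAD_SYMBOL_ID):

import tensorflow.compat.v1 as tf

PAD_SYMBOL_ID = 0  # Assumption; the real value comes from constants.

input_ids = tf.constant([[5, 9, PAD_SYMBOL_ID]])   # (batch=1, length=3)
embeddings = tf.ones([1, 3, 4])                    # (batch, length, dims)

# 1.0 for real tokens, 0.0 for padding; broadcast over the embedding dim.
weights = tf.to_float(tf.not_equal(input_ids, PAD_SYMBOL_ID))
masked = embeddings * tf.expand_dims(weights, -1)  # padded rows become zero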
Example #2
def _transformer_body(
    input_embeddings,
    source_len,
    target_decode_steps,
    mode,
    model_config,
    output_vocab_size,
    output_vocab_embeddings_table,
    input_copy_mask=None,
):
    """Build a Transformer.

  Args:
    input_embeddings: The embeddings of the input to the Transformer.
    source_len: The length of the input utterance.
    target_decode_steps: Number of steps to generate.
    mode: Mode of the model.
    model_config: Model configuration.
    output_vocab_size: Size of the output vocabulary.
    output_vocab_embeddings_table: Table containing embeddings of output tokens.
    input_copy_mask: Mask on the input utterance for which tokens can be copied.

  Returns:
    Outputs of the transformer.
  """
    # Just apply a simple linear layer here
    encoder_output = common_layers.linear_transform(
        input_embeddings,
        output_size=model_config.model_parameters.encoder_dims,
        scope="bert_to_transformer",
    )

    target_embeddings = _get_target_embeddings(
        input_embeddings,
        output_vocab_embeddings_table,
        target_decode_steps,
        model_config,
    )
    decoder_output = _build_transformer_decoder(encoder_output, source_len,
                                                target_embeddings, mode,
                                                model_config)
    logits = _get_action_logits(
        encoder_output,
        decoder_output,
        output_vocab_embeddings_table,
        output_vocab_size,
        model_config,
        input_copy_mask=input_copy_mask,
    )
    return logits
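For orientation, these are the tensor shapes flowing through _transformer_body, as implied by the helper docstrings below (dimension names follow the ModelConfig fields; exact values are configuration-dependent):

# input_embeddings:  (batch, input length, input embedding dims)
# encoder_output:    (batch, input length, encoder_dims)      - after "bert_to_transformer"
# target_embeddings: (batch, # steps, target_embedding_dims)  - from _get_target_embeddings
# decoder_output:    (batch, # steps, decoder dims)           - from _build_transformer_decoder
# logits:            (batch, # steps, output_vocab_size + input length)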
Example #3
def _get_target_embeddings(encoder_input, output_vocab_embeddings_table,
                           decode_steps, model_config):
    """Get target embeddings from either output table or copied from input.

  Args:
    encoder_input: Tensor representing encoder output of shape (batch size,
      input length, encoder dims).
    output_vocab_embeddings_table: Embeddings for output vocabulary of shape
      (output_vocab_size, target embedding dims).
    decode_steps: DecodeSteps tuple with tensors of shape (batch size, # steps).
    model_config: ModelConfig proto.

  Returns:
    Tensor of shape (batch_size, # steps, target embedding dims) representing
    unnormalized logits for both copy and generate actions.
  """
    input_length = tf.shape(encoder_input)[1]
    # one_hot has shape (batch size, # steps, input length).
    one_hot = tf.one_hot(decode_steps.action_ids, input_length)
    # encoder_input has shape (batch size, input length, encoder dims), so the
    # matmul result has shape (batch size, # steps, encoder dims).
    copy_embeddings = tf.matmul(one_hot, encoder_input)

    # Need a linear transformation to ensure copy embeddings are right size.
    # Shape will then be (batch size, # steps, target_embedding_dims)
    copy_embeddings = common_layers.linear_transform(
        copy_embeddings,
        model_config.model_parameters.target_embedding_dims,
        "copy_embeddings_transform",
    )

    # Get the generate embeddings from the output vocab table. Action ids for
    # copy steps are zeroed out so the lookup index stays in range.
    generate_steps = tf.equal(
        decode_steps.action_types,
        tf.constant(constants.GENERATE_ACTION, dtype=tf.int64),
    )
    generate_embeddings = common_layers.embedding_lookup(
        output_vocab_embeddings_table,
        decode_steps.action_ids * tf.to_int64(generate_steps),
    )
    # For a given step, only use either copy OR generate embeddings.
    copy_steps = tf.equal(decode_steps.action_types,
                          tf.constant(constants.COPY_ACTION, dtype=tf.int64))

    copy_mask = tf.to_float(tf.expand_dims(copy_steps, axis=-1))
    generate_mask = tf.to_float(tf.expand_dims(generate_steps, axis=-1))
    target_embeddings = (copy_embeddings * copy_mask +
                         generate_embeddings * generate_mask)

    return target_embeddings
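The tf.one_hot followed by tf.matmul above is a differentiable batched gather of encoder states at the copied positions. A small sketch showing the equivalence (the batch_dims argument of tf.gather requires TF 1.14+):

import tensorflow.compat.v1 as tf

encoder_input = tf.random.normal([2, 5, 8])   # (batch, input length, dims)
action_ids = tf.constant([[0, 3], [4, 1]])    # (batch, # steps)

one_hot = tf.one_hot(action_ids, 5)           # (batch, # steps, input length)
copied = tf.matmul(one_hot, encoder_input)    # (batch, # steps, dims)

# Same result via an explicit gather along the length axis.
gathered = tf.gather(encoder_input, action_ids, batch_dims=1)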
Example #4
def _greedy_decode(input_embeddings,
                   output_vocab_size,
                   target_end_id,
                   target_start_id,
                   output_vocab_embeddings_table,
                   source_len,
                   model_config,
                   mode,
                   input_copy_mask=None,
                   clean_output_mask=None):
    """Fast decoding."""
    encoder_output = common_layers.linear_transform(
        input_embeddings,
        output_size=model_config.model_parameters.encoder_dims,
        scope="bert_to_transformer")

    decode_length = model_config.data_options.max_decode_length

    # Greedy decoding operates on a single sequence, so symbols_to_logits_fn
    # adds a batch dimension of 1 rather than tiling to a beam width.
    def symbols_to_logits_fn(logit_indices, current_index):
        """Go from targets to logits."""
        logit_indices = tf.expand_dims(logit_indices, 0)
        decode_steps = decode_utils.get_decode_steps(logit_indices,
                                                     output_vocab_size,
                                                     model_config)
        target_embeddings = _get_target_embeddings(
            input_embeddings, output_vocab_embeddings_table, decode_steps,
            model_config)
        decoder_output = _build_transformer_decoder(
            encoder_output,
            source_len,
            target_embeddings,
            mode,
            model_config,
            single_step_index=current_index)

        logits = _get_action_logits(encoder_output,
                                    decoder_output,
                                    output_vocab_embeddings_table,
                                    output_vocab_size,
                                    model_config,
                                    input_copy_mask=input_copy_mask,
                                    clean_output_mask=clean_output_mask)

        # Squeeze batch dimension and length dimension, as both should be 1.
        logits = tf.squeeze(logits, axis=[0, 1])
        # Shape of logits is now (output_vocab_size + input length).
        return logits

    def loop_cond(i, decoded_ids, unused_logprobs):
        """Loop conditional that returns false to stop loop."""
        return tf.logical_and(
            tf.reduce_all(tf.not_equal(decoded_ids, target_end_id)),
            tf.less(i, decode_length))

    def inner_loop(i, decoded_ids, logprobs):
        """Decoder function invoked on each while loop iteration."""
        logits = symbols_to_logits_fn(decoded_ids, i)
        next_id = tf.argmax(logits, axis=0)
        softmax = tf.nn.softmax(logits)
        extended_vocab_size = tf.shape(softmax)[-1]
        mask = tf.one_hot(next_id, extended_vocab_size)
        prob = tf.reduce_sum(softmax * mask)
        logprob = tf.log(prob)

        # Add one-hot values to output Tensors, since values at index > i+1 should
        # still be zero.
        logprobs += tf.one_hot(i + 1,
                               decode_length + 1,
                               on_value=logprob,
                               dtype=tf.float32)
        decoded_ids += tf.one_hot(i + 1,
                                  decode_length + 1,
                                  on_value=next_id,
                                  dtype=tf.int64)

        return i + 1, decoded_ids, logprobs

    initial_ids = tf.zeros(dtype=tf.int64, shape=[decode_length + 1])
    initial_ids += tf.one_hot(0,
                              decode_length + 1,
                              on_value=tf.cast(target_start_id, tf.int64))
    initial_logprob = tf.zeros(dtype=tf.float32, shape=[decode_length + 1])
    initial_i = tf.constant(0)

    initial_values = [initial_i, initial_ids, initial_logprob]

    _, decoded_ids, logprobs = tf.while_loop(loop_cond, inner_loop,
                                             initial_values)

    # Remove <START> symbol.
    decoded_ids = decoded_ids[1:]
    logprobs = logprobs[1:]
    # Sum logprobs to get scores for overall sequence.
    logprobs = tf.reduce_sum(logprobs, axis=0)

    # Expand decoded_ids and logprobs to reflect beam width dimension of 1.
    decoded_ids = tf.expand_dims(decoded_ids, 0)
    logprobs = tf.expand_dims(logprobs, 0)

    # This is the output dict that the function returns.
    output_decode_steps = decode_utils.get_decode_steps(
        decoded_ids, output_vocab_size, model_config)
    predictions = decode_utils.get_predictions(output_decode_steps)
    predictions[constants.SCORES_KEY] = logprobs

    return predictions
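Note that inner_loop never writes into decoded_ids in place; it adds a one-hot vector instead, which works because position i + 1 is still zero on iteration i. A sketch of that accumulation trick in isolation:

import tensorflow.compat.v1 as tf

decode_length = 4
decoded_ids = tf.zeros([decode_length + 1], dtype=tf.int64)
i = tf.constant(1)
next_id = tf.constant(7, dtype=tf.int64)

# Adds next_id at index i + 1 and zero everywhere else, emulating an
# assignment while keeping decoded_ids an ordinary immutable tensor that
# tf.while_loop can thread through iterations.
decoded_ids += tf.one_hot(i + 1, decode_length + 1, on_value=next_id,
                          dtype=tf.int64)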
Example #5
def _beam_decode(input_embeddings,
                 alpha,
                 output_vocab_size,
                 target_end_id,
                 target_start_id,
                 output_vocab_embeddings_table,
                 source_len,
                 model_config,
                 mode,
                 beam_size,
                 input_copy_mask=None,
                 clean_output_mask=None):
    """Beam search decoding."""
    # Assume batch size is 1.
    batch_size = 1
    encoder_output = common_layers.linear_transform(
        input_embeddings,
        output_size=model_config.model_parameters.encoder_dims,
        scope="bert_to_transformer")

    decode_length = model_config.data_options.max_decode_length

    # Tile the encoder-side tensors across the beam width.
    input_embeddings = tf.tile(input_embeddings, [beam_size, 1, 1])
    encoder_output = tf.tile(encoder_output, [beam_size, 1, 1])

    def symbols_to_logits_fn(current_index, logit_indices):
        """Go from targets to logits.

    Args:
      current_index: Integer corresponding to 0-indexed decoder step.
      logit_indices: Tensor of shape [batch_size * beam_width, decode_length +
        1] to input to decoder.

    Returns:
      Tensor of shape [batch_size * beam_width, output_vocab_size] representing
      logits for the current decoder step.

    Raises:
      ValueError if inputs do not have static length.
    """
        decode_steps = decode_utils.get_decode_steps(logit_indices,
                                                     output_vocab_size,
                                                     model_config)
        target_embeddings = _get_target_embeddings(
            input_embeddings, output_vocab_embeddings_table, decode_steps,
            model_config)
        decoder_output = _build_transformer_decoder(
            encoder_output,
            source_len,
            target_embeddings,
            mode,
            model_config,
            single_step_index=current_index)
        logits = _get_action_logits(encoder_output,
                                    decoder_output,
                                    output_vocab_embeddings_table,
                                    output_vocab_size,
                                    model_config,
                                    input_copy_mask=input_copy_mask,
                                    clean_output_mask=clean_output_mask)
        # Squeeze length dimension, as it should be 1.
        logits = tf.squeeze(logits, axis=[1])
        # Shape of logits should now be:
        # [batch_size * beam_width, output_vocab_size].
        return logits

    initial_ids = tf.ones([batch_size], dtype=tf.int32) * target_start_id
    # ids has shape: [batch_size, beam_size, decode_length]
    # scores has shape: [batch_size, beam_size]
    decode_length = model_config.data_options.max_decode_length
    source_length = input_embeddings.get_shape()[1]

    if source_length.value is None:
        # Fall back on using dynamic shape information.
        source_length = tf.shape(input_embeddings)[1]
    extended_vocab_size = output_vocab_size + source_length
    ids, scores = beam_search.beam_search(symbols_to_logits_fn, initial_ids,
                                          beam_size, decode_length,
                                          extended_vocab_size, alpha,
                                          target_end_id, batch_size)
    # Remove start symbol from returned predicted IDs.
    predicted_ids = ids[:, :, 1:]
    # Since batch size is expected to be 1, squeeze the batch dimension.
    predicted_ids = tf.squeeze(predicted_ids, axis=[0])
    scores = tf.squeeze(scores, axis=[0])
    # This is the output dict that the function returns.
    output_decode_steps = decode_utils.get_decode_steps(
        predicted_ids, output_vocab_size, model_config)
    predictions = decode_utils.get_predictions(output_decode_steps)
    predictions[constants.SCORES_KEY] = scores
    return predictions
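The ids returned by beam_search live in an extended vocabulary of size output_vocab_size + source_length: the concatenation order in _get_action_logits (generate logits first, then copy logits) means ids below output_vocab_size are generate actions and the rest address an input position. A plain-Python sketch of the id layout, which decode_utils.get_decode_steps presumably undoes:

output_vocab_size = 100  # illustrative value

def split_extended_id(predicted_id):
    if predicted_id < output_vocab_size:
        return ("GENERATE", predicted_id)               # output vocab index
    return ("COPY", predicted_id - output_vocab_size)   # input position

print(split_extended_id(42))    # ('GENERATE', 42)
print(split_extended_id(103))   # ('COPY', 3)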
Example #6
def _get_action_logits(encoder_output,
                       decoder_output,
                       output_vocab_embeddings_table,
                       output_vocab_size,
                       model_config,
                       input_copy_mask=None,
                       clean_output_mask=None,
                       use_gating_mechanism=True):
    """Generate output logits given decoder output.

  This effectively combines a Pointer Network (Vinyals et al., 2015) with a
  standard softmax output layer for selecting symbols from an output vocabulary,
  similar to:
      - Jia and Liang, 2016 (https://arxiv.org/abs/1606.03622)
      - Gulcehre et al., 2016 (https://arxiv.org/abs/1603.08148)
      - Gu et al., 2016 (https://arxiv.org/abs/1603.06393)
      - See et al. 2017 (https://arxiv.org/abs/1704.04368)

  Args:
    encoder_output: Tensor representing encoder output of shape (batch size,
      input length, encoder dims).
    decoder_output: Tensor representing decoder output of shape (batch size, #
      decoded steps, decoder dims).
    output_vocab_embeddings_table: Embeddings for output vocabulary of shape
      (output_vocab_size, target embedding dims).
    output_vocab_size: Integer size of output_vocab_embeddings_table outer dim.
    model_config: ModelConfig proto.
    input_copy_mask: Mask of the input sequence for copying.
    clean_output_mask: Mask of the output vocab. For clean
      inference only.
    use_gating_mechanism: Whether to use gating mechanism.

  Returns:
    Tensor of shape (batch_size, output_vocab_size + input length) representing
    unnormalized logits for both copy and generate actions.
  """

    with tf.variable_scope("logits_transforms"):
        decoder_dims = decoder_output.get_shape()[-1]
        target_embedding_dims = model_config.model_parameters.target_embedding_dims

        # Dot product the decoder output with representations of each of the output
        # symbols to get a set of unnormalized logits for each output vocab item.
        # We need to tile the output vocab embeddings across the batch.
        output_vocab_transform = tf.expand_dims(output_vocab_embeddings_table,
                                                0)
        batch_size = tf.shape(decoder_output)[0]
        output_vocab_transform = tf.tile(output_vocab_transform,
                                         [batch_size, 1, 1])
        # Transform representations to the target_embedding_dims.
        if decoder_dims != target_embedding_dims:
            transformed_decoder_output = common_layers.linear_transform(
                decoder_output, target_embedding_dims, "decoder_transform")
        else:
            transformed_decoder_output = decoder_output
        generate_logits = tf.matmul(transformed_decoder_output,
                                    output_vocab_transform,
                                    transpose_b=True)
        generate_logits_bias = tf.get_variable("generate_logits_bias",
                                               shape=[output_vocab_size])
        generate_logits += generate_logits_bias

        # Dot product the decoder output with representations from the encoder
        # output.
        # This is necessary vs. re-using the encoder-decoder attention weights
        # because those use multihead attention.
        # First, need to transform representations to the decoder dimensions.
        transformed_encoder_output = common_layers.linear_transform(
            encoder_output, decoder_dims, "encoder_transform")

        copy_logits = tf.matmul(decoder_output,
                                transformed_encoder_output,
                                transpose_b=True)
        # copy_logits contains unnormalized scores for copying from each input
        # position (3rd dim) to each output step (2nd dim).

        # Optionally apply a soft gating mechanism to determine whether
        # to select from copy or generate logits.
        # TODO(petershaw): Evaluate and improve this gating mechanism.
        # The current implementation is most likely not optimal, since it applies
        # a scalar in the range [0,1] prior to softmax.
        if use_gating_mechanism:
            prob_gen_unnormalized = common_layers.linear_transform(
                decoder_output, 1, "prob_gen")
            prob_gen_bias = tf.get_variable("prob_gen_bias", shape=[1])
            prob_gen_unnormalized += prob_gen_bias
            prob_gen = tf.sigmoid(prob_gen_unnormalized)
            # Squeeze so that prob_gen has shape [batch_size, decode_length]
            prob_gen = tf.squeeze(prob_gen, axis=2)

            # These are the 'generate' logits so are scaled by P_gen.
            generate_logits *= tf.expand_dims(prob_gen, axis=-1)
            # These are the 'copy' logits so are scaled by 1 - P_gen.
            copy_logits *= tf.expand_dims(1 - prob_gen, axis=-1)

        if clean_output_mask is not None:
            clean_mask = (1 - tf.dtypes.cast(
                clean_output_mask, dtype=tf.dtypes.float32)) * LOGIT_MASK_VALUE

            batch_size = common_layers.get_shape_list(generate_logits)[0]
            output_vocab_size = common_layers.get_shape_list(
                generate_logits)[-1]

            clean_mask = tf.reshape(tf.tile(clean_mask, [batch_size]),
                                    [batch_size, output_vocab_size])
            generate_logits += tf.expand_dims(clean_mask, axis=1)

        if input_copy_mask is not None:
            copy_mask = (1 - tf.dtypes.cast(
                input_copy_mask, dtype=tf.dtypes.float32)) * LOGIT_MASK_VALUE
            copy_logits += tf.expand_dims(copy_mask, axis=1)

        # Concatenate logits into a single vector; the first output_vocab_size
        # (fixed) entries are the generate scores, and the remaining entries
        # are the copy scores for each input position (unnormalized scores,
        # not probabilities).
        extended_logits = tf.concat([generate_logits, copy_logits], axis=2)
        return extended_logits
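To make the gating concrete: with use_gating_mechanism enabled, the generate scores are scaled by prob_gen and the copy scores by 1 - prob_gen before the concatenation, so the downstream softmax over extended_logits favors one side or the other. A toy sketch with one decode step (note the TODO above: scaling logits before the softmax is not the same as mixing two normalized distributions):

import tensorflow.compat.v1 as tf

# batch=1, one decode step, 3 output symbols, 2 input positions.
generate_logits = tf.constant([[[2.0, 0.5, -1.0]]])
copy_logits = tf.constant([[[1.0, 3.0]]])
prob_gen = tf.constant([[0.8]])  # sigmoid output, shape (batch, steps)

scaled_generate = generate_logits * tf.expand_dims(prob_gen, axis=-1)
scaled_copy = copy_logits * tf.expand_dims(1 - prob_gen, axis=-1)
extended = tf.concat([scaled_generate, scaled_copy], axis=2)  # (1, 1, 5)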