Example No. 1
def make_is_span_maskable_features(num_blocks_per_example,
                                   block_length, max_num_annotations,
                                   annotation_begins,
                                   annotation_ends,
                                   annotation_labels):
  """Prepares is-token-belongs-to-an-annotation mask."""
  annotation_begins = tf.reshape(annotation_begins,
                                 [num_blocks_per_example, max_num_annotations])
  annotation_ends = tf.reshape(annotation_ends,
                               [num_blocks_per_example, max_num_annotations])
  annotation_labels = tf.reshape(annotation_labels,
                                 [num_blocks_per_example, max_num_annotations])
  annotation_mask = tf.expand_dims(
      tf.cast(tf.not_equal(annotation_labels, 0), tf.int32), -1)

  mask_begin = tf.sequence_mask(annotation_begins, block_length, dtype=tf.int32)
  mask_begin_plus_one = tf.sequence_mask(
      annotation_begins + 1, block_length, dtype=tf.int32)
  mask_end = tf.sequence_mask(annotation_ends + 1, block_length, dtype=tf.int32)

  def make_mask(x):
    x = x * annotation_mask
    x = tf.reduce_sum(x, 1)
    x = tf.minimum(x, 1)
    x = tf.reshape(x, [num_blocks_per_example * block_length])
    return x

  return (make_mask(mask_end - mask_begin),
          make_mask(mask_end - mask_begin_plus_one))
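
A compact way to see why the difference of two sequence masks marks a span: for an annotation covering tokens 2..4 in a block of length 8, tf.sequence_mask(end + 1) minus tf.sequence_mask(begin) is 1 exactly on the span, while subtracting tf.sequence_mask(begin + 1) instead drops the span's first token (the second mask returned above). A minimal sketch, assuming TensorFlow with eager execution; the scalar values are illustrative:

import tensorflow as tf

block_length = 8
begin, end = 2, 4  # annotation covers tokens 2, 3, 4

mask_begin = tf.sequence_mask(begin, block_length, dtype=tf.int32)               # ones at 0..1
mask_begin_plus_one = tf.sequence_mask(begin + 1, block_length, dtype=tf.int32)  # ones at 0..2
mask_end = tf.sequence_mask(end + 1, block_length, dtype=tf.int32)               # ones at 0..4

print((mask_end - mask_begin).numpy())           # [0 0 1 1 1 0 0 0] -> the whole span
print((mask_end - mask_begin_plus_one).numpy())  # [0 0 0 1 1 0 0 0] -> span minus its first token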
Example No. 2
def tfdata_emb_layer(features):
  """Add learnable unk/pad vectors for inputs embedding lookups are in input.

  By convention, the UNK vector will be all 0.0 (see tf_data_pipeline).
  We replace all UNK tokens with the learned UNK vector

  Args:
    features: Input features for qanet.

  Returns:
    context and question with UNK/PAD tokens replaced.
  """
  xw = features['context_vecs']
  qw = features['question_vecs']
  vec_len = xw.get_shape()[-1]
  with tf.variable_scope('glove_layer'):
    # PAD = 0
    # UNK = 1
    unk_pad = tf.get_variable('glove_emb_mat_var', [2, vec_len])
    pad = unk_pad[0, :]
    unk = unk_pad[1, :]
  q_mask = tf.tile(
      tf.sequence_mask(features['question_num_words'],
                       dtype=tf.float32)[:, :, None], [1, 1, vec_len])
  x_mask = tf.tile(
      tf.sequence_mask(features['context_num_words'],
                       dtype=tf.float32)[:, :, None], [1, 1, vec_len])

  xw = _replace_zeros(xw, unk)
  qw = _replace_zeros(qw, unk)

  # Add learned padding token
  xw = x_mask * xw + (1.0 - x_mask) * pad[None, None, :]
  qw = q_mask * qw + (1.0 - q_mask) * pad[None, None, :]
  return xw, qw
Example No. 3
def compute_last_embedding(input_embeddings, input_lengths, hparams):
    """Computes the average of the last K embeddings.

  Args:
    input_embeddings: <tf.float32>[bs, max_seq_len, emb_dim]
    input_lengths: <tf.int64>[bs, 1]
    hparams: model hparams

  Returns:
    last_k_embedding: <tf.float32>[bs, emb_dim]
  """
    max_seq_len = tf.shape(input_embeddings)[1]
    # <tf.float32>[bs, 1, max_seq_len]
    mask = tf.sequence_mask(input_lengths, max_seq_len, dtype=tf.float32)
    del_mask = tf.sequence_mask(input_lengths - hparams.last_k,
                                max_seq_len,
                                dtype=tf.float32)
    final_mask = mask - del_mask
    # <tf.float32>[bs, 1, emb_dim]
    sum_embedding = tf.matmul(final_mask, input_embeddings)
    # <tf.float32>[bs, 1, emb_dim]
    last_k_embedding = sum_embedding / tf.to_float(
        tf.expand_dims(
            tf.ones([tf.shape(input_embeddings)[0], 1]) * hparams.last_k, 2))
    # <tf.float32>[bs, dim]
    return tf.squeeze(last_k_embedding, 1)
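
The mask - del_mask trick above isolates exactly the last last_k valid positions, and the matmul then sums their embeddings per example. A small sketch of that selection step, assuming TensorFlow eager execution and hparams.last_k = 2 (the concrete numbers are illustrative):

import tensorflow as tf

max_seq_len = 6
input_lengths = tf.constant([[5]], dtype=tf.int64)  # one example with 5 valid tokens
last_k = 2

mask = tf.sequence_mask(input_lengths, max_seq_len, dtype=tf.float32)                # ones at 0..4
del_mask = tf.sequence_mask(input_lengths - last_k, max_seq_len, dtype=tf.float32)   # ones at 0..2
final_mask = mask - del_mask
print(final_mask.numpy())  # [[[0. 0. 0. 1. 1. 0.]]] -> only the last 2 valid positions

# Summing the selected embeddings: [bs, 1, max_seq_len] @ [bs, max_seq_len, emb_dim]
input_embeddings = tf.reshape(tf.range(12, dtype=tf.float32), [1, 6, 2])
print(tf.matmul(final_mask, input_embeddings).numpy())  # [[[14. 16.]]] = rows 3 and 4 summed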
Example No. 4
def _oneof_filters_to_int_or_mask(value,
                                  input_filters_or_mask=None,
                                  filters_base=None):
    """Convert a OneOf or int to either an int or a rank-1 float mask."""
    if isinstance(value, schema.OneOf):
        choices = value.choices
        mask = value.mask
    elif isinstance(value, (int, basic_specs.FilterMultiplier)):
        choices = [value]
        mask = None
    else:
        raise ValueError(
            'Must be a OneOf, int, or FilterMultiplier: {}'.format(value))

    # Generate a list of candidate filter sizes. Each filter size can either be
    # an int or a scalar int Tensor.
    scaled_choices = []  # type: List[Union[int, tf.Tensor]]
    for choice in choices:
        scaled_choices.append(
            _compute_filters(choice, input_filters_or_mask, filters_base))

    # Compute the largest possible number of input filters as an integer.
    if input_filters_or_mask is None or isinstance(input_filters_or_mask, int):
        max_input_filters = input_filters_or_mask
    else:
        # input_filters_or_mask must be a tf.Tensor in this case.
        max_input_filters = int(input_filters_or_mask.shape[-1])

    # Compute the largest possible number of output filters as an integer.
    max_output_filters = 0
    for choice in choices:
        # Note: current_filters should always be an integer (rather than a Tensor)
        # because `max_input_filters` is an integer.
        current_filters = _compute_filters(choice, max_input_filters,
                                           filters_base)
        max_output_filters = max(max_output_filters, current_filters)

    # Return an integer (if possible) or a mask (if we can't infer the exact
    # number of filters at graph construction time).
    if len(scaled_choices) == 1:
        selection = scaled_choices[0]  # type: Union[int, tf.Tensor]
        if isinstance(selection, tf.Tensor):
            return tf.sequence_mask(selection,
                                    max_output_filters,
                                    dtype=tf.float32)
        else:
            return selection
    else:
        selection_index = tf.argmax(mask)
        selection = tf.gather(scaled_choices, selection_index)
        return tf.sequence_mask(selection,
                                max_output_filters,
                                dtype=tf.float32)
Example No. 5
def sequence_accuracy(gt_seqs,
                      decode_seqs,
                      gt_seq_lengths,
                      pr_seq_lengths,
                      debug=False,
                      name=""):
    """Computes the complete and the partial sequence accuracy."""
    gt_shape = common_layers.shape_list(gt_seqs)
    pr_shape = common_layers.shape_list(decode_seqs)
    batch_size = gt_shape[0]
    depth = gt_shape[-1]
    gt_len = gt_shape[1]
    pr_len = pr_shape[1]
    max_len = tf.maximum(gt_len, pr_len)
    gt_seqs = tf.pad(gt_seqs, [[0, 0], [0, max_len - gt_len], [0, 0]])
    decode_seqs = tf.pad(decode_seqs, [[0, 0], [0, max_len - pr_len], [0, 0]])
    gt_seqs = tf.where(
        tf.tile(
            tf.expand_dims(tf.sequence_mask(gt_seq_lengths, maxlen=max_len),
                           2), [1, 1, depth]), gt_seqs,
        tf.fill(tf.shape(gt_seqs), -1))
    decode_seqs = tf.where(
        tf.tile(
            tf.expand_dims(tf.sequence_mask(pr_seq_lengths, maxlen=max_len),
                           2), [1, 1, depth]), decode_seqs,
        tf.fill(tf.shape(decode_seqs), -1))
    # [batch_size, decode_length]
    corrects = tf.reduce_all(tf.equal(gt_seqs, decode_seqs), -1)
    correct_mask = tf.reduce_all(corrects, -1)
    # [batch_size]
    if debug:
        incorrect_mask = tf.logical_not(correct_mask)
        incorrect_gt = tf.boolean_mask(gt_seqs, incorrect_mask)
        incorrect_pr = tf.boolean_mask(decode_seqs, incorrect_mask)
        with tf.control_dependencies([
                tf.print(name + "_mismatch",
                         incorrect_gt,
                         incorrect_pr,
                         summarize=1000)
        ]):
            correct_mask = tf.identity(correct_mask)
    correct_seqs = tf.to_float(correct_mask)
    total_correct_seqs = tf.reduce_sum(correct_seqs)
    mean_complete_accuracy = total_correct_seqs / tf.to_float(batch_size)
    # Compute partial accuracy
    errors = tf.logical_not(corrects)
    errors = tf.cast(tf.cumsum(tf.to_float(errors), axis=-1), tf.bool)
    # [batch_size]
    correct_steps = tf.reduce_sum(tf.to_float(tf.logical_not(errors)), axis=-1)
    mean_partial_accuracy = tf.reduce_mean(
        tf.div(tf.minimum(correct_steps, gt_seq_lengths), gt_seq_lengths))
    return mean_complete_accuracy, mean_partial_accuracy
Example No. 6
 def do_process_boundary(start_points, end_points, input_length, t1_id,
                         t2_id, all_tokenized_diag):
     """Contains the majority of the logic to process boundaries."""
     masks_start = tf.sequence_mask(start_points, input_length)
     masks_end = tf.sequence_mask(end_points, input_length)
     xor_masks = tf.logical_xor(masks_start, masks_end)
     mask1 = tf.reduce_any(xor_masks, axis=0)
     mask2 = tf.logical_not(mask1)
     all_turn1 = tf.equal(all_tokenized_diag, t1_id)
     all_turn2 = tf.equal(all_tokenized_diag, t2_id)
     turn_point = tf.logical_or(all_turn1, all_turn2)
     turn_point = tf.cast(turn_point, dtype=tf.float32)
     return mask1, mask2, turn_point
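
The logical_xor of two sequence masks above is True exactly on the half-open interval [start, end), and reduce_any over several (start, end) pairs unions those segments into one boundary mask. A short sketch, assuming TensorFlow eager execution and illustrative boundary points:

import tensorflow as tf

input_length = 8
start_points = tf.constant([1, 5])
end_points = tf.constant([3, 7])

masks_start = tf.sequence_mask(start_points, input_length)  # per row: ones before each start
masks_end = tf.sequence_mask(end_points, input_length)      # per row: ones before each end
xor_masks = tf.logical_xor(masks_start, masks_end)          # True on [start, end) per row
mask1 = tf.reduce_any(xor_masks, axis=0)                    # union of the two segments

print(tf.cast(mask1, tf.int32).numpy())  # [0 1 1 0 0 1 1 0] -> positions 1-2 and 5-6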
Example No. 7
    def compute_attention_vec(input_vec,
                              att_vecs,
                              lengths,
                              allow_zero_attention=False):
        att_vecs_shape = tf.shape(att_vecs)
        if allow_zero_attention:
            zero_vec = tf.zeros([att_vecs_shape[0], 1, att_vecs_shape[2]],
                                dtype=DATA_TYPE)
            att_vecs_with_zero = tf.concat([att_vecs, zero_vec], 1)
            att_scores, reg_terms = get_attention_scores(
                input_vec, att_vecs_with_zero, model.params['num_heads'],
                reuse, model.params['attention_func'])
            # att_vecs = tf.Print(att_vecs, [att_vecs], 'this is att_scores', summarize=25)
            regularization_terms.extend(reg_terms)

            # length mask
            mask = tf.concat([
                tf.sequence_mask(
                    lengths, maxlen=att_vecs_shape[1], dtype=DATA_TYPE),
                tf.ones([att_vecs_shape[0], 1], dtype=DATA_TYPE)
            ], 1)
            masked_att_exp = tf.exp(att_scores) * mask
            div = tf.reduce_sum(masked_att_exp, 1, True)
            att_dis = masked_att_exp / tf.where(tf.less(div, 1e-7), div + 1,
                                                div)
            model.attention_distribution = att_dis
            return tf.reduce_sum(
                att_vecs_with_zero * tf.expand_dims(att_dis, -1), 1)

        else:
            att_scores, reg_terms = get_attention_scores(
                input_vec, att_vecs, model.params['num_heads'], reuse,
                model.params['attention_func'])
            # att_vecs = tf.Print(att_vecs, [att_vecs], 'this is att_scores', summarize=25)
            regularization_terms.extend(reg_terms)

            # length mask
            mask = tf.sequence_mask(lengths,
                                    maxlen=att_vecs_shape[1],
                                    dtype=DATA_TYPE)
            masked_att_exp = tf.exp(att_scores) * mask
            div = tf.reduce_sum(masked_att_exp, 1, True)
            att_dis = masked_att_exp / tf.where(tf.less(div, 1e-7), div + 1,
                                                div)
            '''
            att_exp = tf.exp(att_scores)
            att_dis = att_exp/(tf.reduce_sum(att_exp, 1, True) + tf.expand_dims(lengths, -1) - model.params.max_history_length)
            '''
            # att_dis = tf.Print(att_dis, [att_dis], 'this is att_dis', summarize=25)
            model.attention_distribution = att_dis
            return tf.reduce_sum(att_vecs * tf.expand_dims(att_dis, -1), 1)
Example No. 8
def _compute_head_weights_with_position_prior(weights, masks, paddings,
                                              num_heads, attn_size):
    """Computes head-specific attention weights with position prior.

  This function masks out the weights for items that do not belong to a given
  chunk, using a sliding-window technique: head i only attends to the i-th most
  recent window of `attn_size` items with respect to the query. Chunks are
  non-overlapping, i.e., the sliding-window stride is also attn_size.

  Args:
    weights: A 3d tensor with shape of [h*N, T_q, T_k].
    masks: A 3d tensor with shape of [h*N, T_q, T_k].
    paddings: A 3d tensor with shape of [h*N, T_q, T_k].
    num_heads: An integer denoting number of chunks.
    attn_size: An integer denoting the size of the sliding window.

  Returns:
    A list of h tensors (each shaped [N, T_q, T_k]) where tensors correspond to
    chunk specific weights.
  """
    # Masks is a lower triangular tensor with ones in the bottom and zeros in the
    # upper section. Since chunks are allocated with respect to query position, we
    # first need to count the available items prior to each query. argmin function
    # would work for this, except the last query because it returns the smallest
    # index in the case of ties. To make sure we have the accurate count for the
    # last query, we first append a zero tensor and call the argmin function.
    max_idxs = tf.argmin(tf.concat([masks, tf.zeros_like(masks)], axis=-1),
                         2)  # (h*N, T_q)

    # Split for heads.
    max_idxs_split = tf.split(max_idxs, num_heads, axis=0)  # (h x (N, T_q))
    weights_split = tf.split(weights, num_heads, axis=0)  # (h x (N, T_q, T_k))
    paddings_split = tf.split(paddings, num_heads,
                              axis=0)  # (h x (N, T_q, T_k))

    # Collects output weights per chunk.
    chunk_outputs_list = []
    for i in range(num_heads):
        mask_left = tf.sequence_mask(
            tf.maximum(max_idxs_split[i] - (attn_size * (i + 1)), 0),
            tf.shape(weights_split[i])[2])  # (N, T_q, T_k)
        mask_right = tf.sequence_mask(
            tf.maximum(max_idxs_split[i] - (attn_size * i), 0),
            tf.shape(weights_split[i])[2])  # (N, T_q, T_k)
        mask = tf.logical_and(tf.logical_not(mask_left),
                              mask_right)  # (N, T_q, T_k)
        # Adjust weights for chunk i.
        output = tf.where(mask, weights_split[i],
                          paddings_split[i])  # (N, T_q, T_k)
        chunk_outputs_list.append(output)
    return chunk_outputs_list  # (h x (N, T_q, T_k))
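
The per-head window above is a set difference of two length masks: mask_right covers everything up to the start of chunks newer than chunk i, mask_left covers everything up to the start of chunk i itself, and logical_and(logical_not(mask_left), mask_right) keeps only the i-th most recent window of attn_size items. A scalar sketch of that step, assuming TensorFlow eager execution (max_idx and attn_size here are hypothetical values, not taken from the snippet):

import tensorflow as tf

T_k = 10        # key length
max_idx = 7     # items available before the query
attn_size = 3   # chunk / sliding-window size

for i in range(2):  # heads 0 and 1
    mask_left = tf.sequence_mask(max(max_idx - attn_size * (i + 1), 0), T_k)
    mask_right = tf.sequence_mask(max(max_idx - attn_size * i, 0), T_k)
    window = tf.logical_and(tf.logical_not(mask_left), mask_right)
    print(i, tf.cast(window, tf.int32).numpy())
# 0 [0 0 0 0 1 1 1 0 0 0]  -> the 3 most recent items (indices 4..6)
# 1 [0 1 1 1 0 0 0 0 0 0]  -> the next-older chunk (indices 1..3)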
Example No. 9
def gated_convnet(graph_inputs,
                  batch_size=64,
                  hidden_size=300,
                  depth=3,
                  res_block=2):
    input_atom, input_bond, atom_graph, bond_graph, num_nbs, node_mask = graph_inputs
    layers = [input_atom]
    atom_features = input_atom
    for i in range(depth):
        fatom_nei = tf.gather_nd(atom_features, atom_graph)
        fbond_nei = tf.gather_nd(input_bond, bond_graph)
        f_nei = tf.concat([fatom_nei, fbond_nei], 3)
        h_nei = linearND(f_nei, hidden_size, "nei_hidden_%d" % i)
        g_nei = tf.nn.sigmoid(linearND(f_nei, hidden_size, "nei_gate_%d" % i))
        f_nei = h_nei * g_nei
        mask_nei = tf.reshape(
            tf.sequence_mask(tf.reshape(num_nbs, [-1]),
                             max_nb,
                             dtype=tf.float32), [batch_size, -1, max_nb, 1])
        f_nei = tf.reduce_sum(f_nei * mask_nei, -2)
        h_self = linearND(atom_features, hidden_size, "self_hidden_%d" % i)
        g_self = tf.nn.sigmoid(
            linearND(atom_features, hidden_size, "self_gate_%d" % i))
        f_self = h_self * g_self
        atom_features = (f_nei + f_self) * node_mask
        if res_block is not None and i % res_block == 0 and i > 0:
            atom_features = atom_features + layers[-2]
        layers.append(atom_features)
    output_gate = tf.nn.sigmoid(
        linearND(atom_features, hidden_size, "out_gate"))
    output = node_mask * (output_gate * atom_features)
    fp = tf.reduce_sum(output, 1)
    return atom_features * node_mask, fp
Example No. 10
def mask_logits(vec, mask):
    """Mask `vec` in log-space.

  Elements in `vec` that are not in `mask` are set to a very negative value,
  so that once out of log-space (i.e., after `tf.exp(vec)`) their values are
  very close to zero.

  Args:
    vec: <float32>[...] tensor to mask
    mask: Either None (in which case this is a no-op), a boolean or 0/1 float
      mask that matches all, or all but the last, dimensions of `vec`, or a 1-D
      integer length mask

  Raises:
    ValueError: If `mask` cannot be matched to `vec`
  Returns:
    masked: <float32>[...], `vec` with masked positions pushed to a large
      negative value
  """
    if mask is None:
        return vec

    if mask.dtype == tf.int32:
        # Assume `mask` holds sequence lengths
        if len(vec.shape) not in [2, 3]:
            raise ValueError(
                "Length masks are only supported for tensors of rank 2 or 3")
        mask = tf.sequence_mask(mask, tf.shape(vec)[1], tf.float32)
    else:
        mask = tf.to_float(mask)

    if len(mask.shape) == (len(vec.shape) - 1):
        mask = tf.expand_dims(mask, len(vec.shape) - 1)

    return vec * mask - (1 - mask) * 1E20
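
Masked positions come out around -1e20, so a later softmax assigns them essentially zero probability. A quick sketch reproducing the same arithmetic on a length mask, assuming TensorFlow eager execution (values are illustrative):

import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 0.5, 0.1]])
lengths = tf.constant([2])                    # only the first 2 positions are valid

mask = tf.sequence_mask(lengths, tf.shape(scores)[1], tf.float32)
masked = scores * mask - (1 - mask) * 1e20    # same arithmetic as mask_logits above

print(tf.nn.softmax(masked).numpy())
# ~[[0.731 0.269 0. 0.]] -> padded positions receive (effectively) zero probability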
Example No. 11
    def LSTM_layer(self, embeddings):
        lstm_cell = tf.nn.rnn_cell.LSTMCell(self.lstm_size)
        zero_state = tf.zeros(shape=(self.batch_size, self.lstm_size))
        initial_state = tf.nn.rnn_cell.LSTMStateTuple(zero_state, zero_state)

        lstm_inputs = tf.unstack(tf.transpose(embeddings, perm=[1, 0, 2]))
        lstm_outputs, lstm_state = tf.nn.static_rnn(
            cell=lstm_cell,
            inputs=lstm_inputs,
            initial_state=initial_state,
            sequence_length=self.sentence_lengths
        )  # a length-500 list of [num_docs, lstm_size]

        lstm_outputs = tf.unstack(tf.transpose(lstm_outputs, perm=[1, 0, 2]))
        lstm_outputs = tf.concat(
            lstm_outputs, axis=0)  # [num_docs * MAX_SENT_LENGTH, lstm_size]

        # self.mask: [num_docs * MAX_SENT_LENGTH, ]
        mask = tf.sequence_mask(
            lengths=self.sentence_lengths,
            maxlen=MAX_DOC_LENGTH,
            dtype=tf.float32)  # [num_docs, MAX_SENT_LENGTH]
        mask = tf.concat(tf.unstack(mask, axis=0), axis=0)
        mask = tf.expand_dims(mask, -1)
        lstm_outputs = mask * lstm_outputs
        lstm_outputs_split = tf.split(lstm_outputs,
                                      num_or_size_splits=self.batch_size)
        lstm_outputs_sum = tf.reduce_sum(lstm_outputs_split,
                                         axis=1)  # [num_docs, lstm_size]
        lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(
            tf.cast(
                self.sentence_lengths,
                tf.float32),  # cast to float so the division is well-defined
            -1)  # [num_docs, lstm_size]
        return lstm_outputs_average
Example No. 12
    def call(self,
             hidden_states,
             token_ids=None,
             padding_token_id=None,
             ignore_prefix_length=None,
             training=None):

        if self._intermediate_dense is not None:
            intermediate_outputs = self._intermediate_dense(hidden_states)
            intermediate_outputs = self._intermediate_activation(
                intermediate_outputs)
            outputs = self._output_dense(intermediate_outputs)
            outputs = self._output_dropout(outputs, training=training)
            outputs = self._output_layer_norm(outputs + hidden_states)
        else:
            outputs = hidden_states
        logits = self._logits_dense(outputs)

        if token_ids is not None or padding_token_id is not None:
            if token_ids is None or padding_token_id is None:
                raise ValueError(
                    "Both `token_ids` and `padding_token_id` need to be "
                    "specified in order to compute the mask for logits")
            logits -= tf.expand_dims(
                tf.cast(tf.equal(token_ids, padding_token_id), tf.float32),
                -1) * 1e6

        if ignore_prefix_length is not None:
            seq_length = tf.shape(logits)[-2]
            logits -= tf.expand_dims(
                tf.sequence_mask(ignore_prefix_length,
                                 seq_length,
                                 dtype=tf.float32), -1) * 1e6

        return logits
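
Both masking steps above work by subtracting a large constant from logits at positions that should be ignored: tf.sequence_mask(ignore_prefix_length, seq_length) is 1 on the prefix, so those positions drop by 1e6 and can no longer win an argmax or dominate a softmax. A small sketch of that prefix step, assuming TensorFlow eager execution (the trailing dimension handled by expand_dims above is omitted here):

import tensorflow as tf

logits = tf.constant([[3.0, 2.5, 4.0, 1.0]])   # [batch, seq_length]
ignore_prefix_length = tf.constant([2])        # ignore the first 2 positions

seq_length = tf.shape(logits)[-1]
prefix = tf.sequence_mask(ignore_prefix_length, seq_length, dtype=tf.float32)
print((logits - prefix * 1e6).numpy())
# ~[[-999997. -999997.5 4. 1.]] -> the prefix positions are effectively removed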
Example No. 13
    def _build(self, logits, targets, target_lens, normalize_by_length=False):  # pylint: disable=arguments-differ
        """Builds the cross entropy loss.

    Args:
      logits: <float32> [batch_size, seq_len, vocab_size] for predicted logits.
      targets: <int32> [batch_size, seq_len] if `sparse=True` or
        <float32> [batch_size, seq_len, vocab_size] otherwise.
      target_lens: <int32> [batch_size] for the target sequence lengths.
      normalize_by_length: Boolean indicating whether to normalize the loss by
        the sequence length (i.e., shorter sequences are penalized more).

    Returns:
      loss: <float32> [batch_size] for the loss.
    """
        # Build weights.
        weights = tf.sequence_mask(target_lens, dtype=logits.dtype)

        # Build loss.
        if self._sparse:
            loss = self._build_sparse(logits, targets, weights,
                                      normalize_by_length)
        else:
            loss = self._build_dense(logits, targets, weights,
                                     normalize_by_length)

        return loss
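
tf.sequence_mask(target_lens) yields per-token weights of 1 for real tokens and 0 for padding, which is what the downstream sequence-loss helpers expect. A minimal illustration of the weighting, assuming TensorFlow eager execution (the per-token losses are made up; the _build_sparse/_build_dense helpers are not shown):

import tensorflow as tf

target_lens = tf.constant([3, 1])
per_token_loss = tf.constant([[0.5, 0.2, 0.3, 9.9],   # the 9.9s sit on padded steps
                              [0.7, 9.9, 9.9, 9.9]])

weights = tf.sequence_mask(target_lens, maxlen=4, dtype=per_token_loss.dtype)
per_example_loss = tf.reduce_sum(per_token_loss * weights, axis=-1)
print(per_example_loss.numpy())  # [1.  0.7] -> padded steps contribute nothing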
Example No. 14
    def build_predictions_layer(self):
        # Assign rnn outputs.
        self.q_mu, self.q_sigma, self.p_mu, self.p_sigma, self.out_mu, self.out_sigma = self.outputs

        # TODO: Sampling option.
        self.output_sample = self.out_mu
        self.input_sample = self.inputs
        self.output_dim = self.output_sample.shape.as_list()[-1]

        self.ops_evaluation['output_sample'] = self.output_sample
        self.ops_evaluation['p_mu'] = self.p_mu
        self.ops_evaluation['p_sigma'] = self.p_sigma
        self.ops_evaluation['q_mu'] = self.q_mu
        self.ops_evaluation['q_sigma'] = self.q_sigma
        self.ops_evaluation['state'] = self.output_state

        num_entries = tf.cast(
            self.input_seq_length.shape.as_list()[0] *
            tf.reduce_sum(self.input_seq_length), tf.float32)
        self.ops_scalar_summary["mean_out_sigma"] = tf.reduce_sum(
            self.out_sigma) / num_entries
        self.ops_scalar_summary["mean_p_sigma"] = tf.reduce_sum(
            self.p_sigma) / num_entries
        self.ops_scalar_summary["mean_q_sigma"] = tf.reduce_sum(
            self.q_sigma) / num_entries

        # Mask for precise loss calculation.
        self.seq_loss_mask = tf.expand_dims(
            tf.sequence_mask(lengths=self.input_seq_length,
                             maxlen=tf.reduce_max(self.input_seq_length),
                             dtype=tf.float32), -1)
Example No. 15
  def _get_categorical_slot_goals(self, features):
    """Obtain logits for status and values for categorical slots."""
    # Predict the status of all categorical slots.
    slot_embeddings = features["cat_slot_emb"]
    status_logits = self._get_logits(slot_embeddings, 3,
                                     "categorical_slot_status")

    # Predict the goal value.

    # Shape: (batch_size, max_categorical_slots, max_categorical_values,
    # embedding_dim).
    value_embeddings = features["cat_slot_value_emb"]
    _, max_num_slots, max_num_values, embedding_dim = (
        value_embeddings.get_shape().as_list())
    value_embeddings_reshaped = tf.reshape(
        value_embeddings, [-1, max_num_slots * max_num_values, embedding_dim])
    value_logits = self._get_logits(value_embeddings_reshaped, 1,
                                    "categorical_slot_values")
    # Reshape to obtain the logits for all slots.
    value_logits = tf.reshape(value_logits, [-1, max_num_slots, max_num_values])
    # Mask out logits for padded slots and values because they will be
    # softmaxed.
    mask = tf.sequence_mask(
        features["cat_slot_value_num"], maxlen=max_num_values)
    negative_logits = -0.7 * tf.ones_like(value_logits) * value_logits.dtype.max
    value_logits = tf.where(mask, value_logits, negative_logits)
    return status_logits, value_logits
Example No. 16
  def _build_encoder(self, hparams):
    """Build a GNMT encoder."""
    source = self.features["source"]
    source = tf.transpose(source)
    if self.length > 0:
      source = tf.slice(source, [0, 0], [self.length, -1])

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
      emb = tf.cast(
          self.encoder_emb_lookup_fn(self.embedding_encoder, source),
          self.dtype)
      seq_len = self.features["source_sequence_length"]
      padding = tf.transpose(
          tf.sequence_mask(seq_len, emb.shape[0], self.dtype))
      max_seq_len = tf.reduce_max(seq_len)
      if self.mode == contrib_learn.ModeKeys.TRAIN:
        emb = emb * dropout(emb.shape, emb.dtype, 1.0 - hparams.dropout)
      out = build_bid_rnn({"rnn": emb}, seq_len, hparams.num_units,
                          "bidirectional_rnn")
      out = out * tf.expand_dims(padding, 2)
      for i in range(3):
        orig_out = out
        if self.mode == contrib_learn.ModeKeys.TRAIN:
          out = out * dropout(out.shape, emb.dtype, 1.0 - hparams.dropout)
        inputs = {"rnn": out}
        o = build_uni_rnn(inputs, max_seq_len, hparams.num_units,
                          "rnn/uni_rnn_cell_%d" % i)
        if i > 0:
          o = o + orig_out
        out = o
      out = out * tf.expand_dims(padding, 2)

    return out
Example No. 17
def rcnn_wl_only(graph_inputs, hidden_size, depth, training=True):
    input_atom, input_bond, atom_graph, bond_graph, num_nbs = graph_inputs
    atom_features = tf.nn.relu(
        linearND(input_atom, hidden_size, "atom_embedding", init_bias=None))
    layers = []
    for i in range(depth):
        with tf.variable_scope("WL", reuse=(i > 0)) as scope:
            fatom_nei = tf.gather_nd(atom_features, atom_graph)
            fbond_nei = tf.gather_nd(input_bond, bond_graph)

            mask_nei = tf.sequence_mask(tf.reshape(num_nbs, [-1]),
                                        max_nb,
                                        dtype=tf.float32)
            target_shape = tf.concat([tf.shape(num_nbs), [max_nb, 1]], 0)
            mask_nei = tf.reshape(mask_nei, target_shape)
            mask_nei.set_shape([None, None, max_nb, 1])

            l_nei = tf.concat([fatom_nei, fbond_nei], 3)
            nei_label = tf.nn.relu(linearND(l_nei, hidden_size, "label_U2"))
            nei_label = tf.reduce_sum(nei_label * mask_nei, -2)
            new_label = tf.concat([atom_features, nei_label], 2)
            new_label = linearND(new_label, hidden_size, "label_U1")
            atom_features = tf.nn.relu(new_label)

    return atom_features
Example No. 18
    def LSTM_layer(self, embeddings):
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_size)
        zero_state = tf.zeros(shape=(self._batch_size, self._lstm_size))
        initial_state = tf.nn.rnn_cell.LSTMStateTuple(zero_state, zero_state)

        lstm_inputs = tf.unstack(tf.transpose(embeddings, perm=[1, 0, 2]))

        lstm_outputs, last_state = tf.nn.static_rnn(
            cell=lstm_cell,
            inputs=lstm_inputs,
            initial_state=initial_state,
            sequence_length=self._sentence_lengths)

        lstm_outputs = tf.unstack(tf.transpose(lstm_outputs, perm=[1, 0, 2]))

        lstm_outputs = tf.concat(lstm_outputs, axis=0)

        mask = tf.sequence_mask(lengths=self._sentence_lengths,
                                maxlen=MAX_DOC_LENGTH,
                                dtype=tf.float32)
        mask = tf.concat(tf.unstack(mask, axis=0), axis=0)
        mask = tf.expand_dims(mask, -1)

        lstm_outputs = mask * lstm_outputs
        lstm_outputs_split = tf.split(lstm_outputs,
                                      num_or_size_splits=self._batch_size)
        lstm_outputs_sum = tf.reduce_sum(lstm_outputs_split, axis=1)
        lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(
            tf.cast(self._sentence_lengths, tf.float32), -1)

        return lstm_outputs_average
Example No. 19
def decode_sequence(features, areas, hparams, decode_length,
                    post_processing=True):
  """Decodes the entire sequence in an auto-regressive way."""
  decode_utils.decode_n_step(seq2act_model.compute_logits,
                             features, areas,
                             hparams, n=decode_length, beam_size=1)
  if post_processing:
    features["input_refs"] = decode_utils.unify_input_ref(
        features["verbs"], features["input_refs"])
    pred_lengths = decode_utils.verb_refs_to_lengths(features["task"],
                                                     features["verb_refs"],
                                                     include_eos=False)
  predicted_actions = tf.concat([
      features["verb_refs"],
      features["obj_refs"],
      features["input_refs"],
      tf.to_int32(tf.expand_dims(features["verbs"], 2)),
      tf.to_int32(tf.expand_dims(features["objects"], 2))], axis=-1)
  if post_processing:
    predicted_actions = tf.where(
        tf.tile(tf.expand_dims(
            tf.sequence_mask(pred_lengths,
                             maxlen=tf.shape(predicted_actions)[1]),
            2), [1, 1, tf.shape(predicted_actions)[-1]]), predicted_actions,
        tf.zeros_like(predicted_actions))
  return predicted_actions
Example No. 20
 def _compute_gradients(self,
                        actions,
                        discounted_rewards,
                        weights=None,
                        sequence_length=None,
                        loss_str='train',
                        use_entropy_regularization=True,
                        **kwargs):
   """Implement the policy gradient in TF."""
   if sequence_length is not None:
     seq_mask = tf.sequence_mask(sequence_length, dtype=tf.float32)
   else:
     seq_mask = None
   with tf.GradientTape(watch_accessed_variables=False) as tape:
     tape.watch(self.trainable_variables)
     # Returns 0.0 if critic is not being used
     value_loss = self._compute_value_loss(
         discounted_rewards, seq_mask=seq_mask, **kwargs)
     policy_loss = self._compute_policy_loss(
         discounted_rewards,
         actions,
         seq_mask=seq_mask,
         weights=weights,
         use_entropy_regularization=use_entropy_regularization,
         **kwargs)
     loss = tf.reduce_mean(policy_loss + value_loss)
   if self.log_summaries and (self._counter % self.log_every == 0):
     contrib_summary.scalar('{}_loss'.format(loss_str), loss)
   return tape.gradient(loss, self.trainable_variables)
Example No. 21
def mask_attention(attention, seq_len1, seq_len2):
    """Masks an attention matrix.

  Args:
    attention: <tf.float32>[batch, seq_len1, seq_len2]
    seq_len1: <tf.int32>[batch]
    seq_len2: <tf.int32>[batch]

  Returns:
    the masked scores <tf.float32>[batch, seq_len1, seq_len2]
  """
    dim1 = tensor_utils.shape(attention, 1)
    dim2 = tensor_utils.shape(attention, 2)
    m1 = tf.sequence_mask(seq_len1, dim1)
    m2 = tf.sequence_mask(seq_len2, dim2)
    joint_mask = tf.logical_and(tf.expand_dims(m1, 2), tf.expand_dims(m2, 1))
    return ops.mask_logits(attention, joint_mask)
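
The joint mask is the outer AND of the two length masks, so entry (i, j) survives only when position i of sequence 1 and position j of sequence 2 are both real tokens. A sketch for a single example, assuming TensorFlow eager execution:

import tensorflow as tf

seq_len1 = tf.constant([2])   # 2 valid rows
seq_len2 = tf.constant([3])   # 3 valid columns

m1 = tf.sequence_mask(seq_len1, 3)   # [batch, dim1]
m2 = tf.sequence_mask(seq_len2, 4)   # [batch, dim2]
joint_mask = tf.logical_and(tf.expand_dims(m1, 2), tf.expand_dims(m2, 1))
print(tf.cast(joint_mask, tf.int32).numpy()[0])
# [[1 1 1 0]
#  [1 1 1 0]
#  [0 0 0 0]]  -> only the 2x3 valid block of the attention matrix is kept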
Example No. 22
def get_attention_bias(sequence_length):
  """Create attention bias so attention is not applied at padding position."""
  # attention_bias: [batch, 1, 1, memory_length]
  invert_sequence_mask = tf.to_float(tf.logical_not(tf.sequence_mask(
      sequence_length)))
  attention_bias = common_attention.attention_bias_ignore_padding(
      invert_sequence_mask)
  return attention_bias
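
Inverting the sequence mask marks padded positions with 1.0, and the tensor2tensor helper turns that into a large negative additive bias shaped [batch, 1, 1, memory_length]. A hand-rolled sketch of roughly what that last step does (the real implementation lives in tensor2tensor's common_attention module; this is only an approximation for illustration), assuming TensorFlow eager execution:

import tensorflow as tf

sequence_length = tf.constant([2, 4])
invert_sequence_mask = tf.cast(
    tf.logical_not(tf.sequence_mask(sequence_length, 4)), tf.float32)

# Large negative bias at padded positions, broadcastable over heads and query positions.
attention_bias = invert_sequence_mask * -1e9
attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, 1), 1)
print(attention_bias.shape)  # (2, 1, 1, 4)
# row 0 gets a -1e9 bias at its 2 padded keys; row 1 has no padding, so an all-zero bias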
Example No. 23
    def reduced_by_transformer(self,
                               is_training,
                               num_transformer_layers=2,
                               CLS_ID=102,
                               use_passage_pos_embedding=False):
        bert_config = self.bert_config
        output_layer = self.output_layer
        model = self.model
        embeddings = model.get_embedding_table()
        # clsid_tf = tf.constant([CLS_ID], dtype=tf.int32, name="clsid_tf")
        clsid_tf = tf.Variable([CLS_ID],
                               dtype=tf.int32,
                               trainable=False,
                               name='clsid_tf')
        cls_embedding = tf.nn.embedding_lookup(embeddings, clsid_tf)
        cls_embedding_tiled = tf.tile(cls_embedding,
                                      multiples=[self.batch_size, 1])  # [B, H]
        merged_output = tf.concat(
            (tf.expand_dims(cls_embedding_tiled, axis=1), output_layer),
            axis=1)  # [B, N + 1, H]
        if use_passage_pos_embedding:
            with tf.variable_scope(self.scope):
                full_position_embeddings = tf.get_variable(
                    name="passage_position_embedding",
                    shape=[self.max_num_segments_perdoc + 1, self.hidden_size],
                    initializer=self.modeling.create_initializer(0.02))
            full_position_embeddings = tf.expand_dims(full_position_embeddings,
                                                      axis=0)
            merged_output += full_position_embeddings

        # here comes the Transformer.
        attention_mask = tf.sequence_mask(self.num_segments + 1,
                                          self.max_num_segments_perdoc + 1,
                                          dtype=tf.float32)
        attention_mask = tf.tile(tf.expand_dims(attention_mask, axis=1),
                                 [1, self.max_num_segments_perdoc + 1, 1])
        with tf.variable_scope(self.scope):
            with tf.variable_scope("parade_transformer"):
                if not is_training:
                    bert_config.hidden_dropout_prob = 0.0
                    bert_config.attention_probs_dropout_prob = 0.0
                output_layer, _ = self.modeling.transformer_model(
                    input_tensor=merged_output,
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=num_transformer_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=bert_config.
                    attention_probs_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_all_layers=False)  # [B, N + 1, H]
                output_layer = tf.squeeze(output_layer[:, 0:1, :],
                                          axis=1)  # [B, H]

        return output_layer
Example No. 24
 def get_data(self):
   x = tf.random_normal(
       (BATCH_SIZE, TARGET_LENGTH, N_CHANNELS), dtype=DTYPE)
   x_lengths = np.random.randint(
       low=1, high=TARGET_LENGTH+1, size=BATCH_SIZE)
   x_lengths = np.ceil(x_lengths / 4.0) * 4.0
   x_lengths = x_lengths.astype(int)
   x_mask = tf.sequence_mask(x_lengths, maxlen=TARGET_LENGTH, dtype=DTYPE)
   return x, x_mask, x_lengths
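
The length preprocessing above rounds every sampled length up to the next multiple of 4 (e.g., so that downsampling by 4 divides evenly) before the mask is built. A quick numeric check of that rounding, in plain NumPy with illustrative values:

import numpy as np

x_lengths = np.array([1, 4, 5, 8, 10])
rounded = (np.ceil(x_lengths / 4.0) * 4.0).astype(int)
print(rounded)  # [ 4  4  8  8 12] -> each length becomes the next multiple of 4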
Example No. 25
    def _attention_unit(self, queries, keys, keys_len):
        if self.use_tf_attention:
            query_masks = tf.cast(
                tf.ones_like(tf.reshape(self.user_interacted_len, [-1, 1])),
                dtype=tf.bool
            )
            key_masks = tf.sequence_mask(
                self.user_interacted_len, self.max_seq_len
            )
            queries = tf.expand_dims(queries, axis=1)
            attention = tf.keras.layers.Attention(use_scale=False)
            pooled_outputs = attention(inputs=[queries, keys],
                                       mask=[query_masks, key_masks])
            return pooled_outputs
        else:
            # queries: B * K, keys: B * seq * K
            queries = tf.expand_dims(queries, axis=1)
            # B * seq * K
            queries = tf.tile(queries, [1, self.max_seq_len, 1])
            queries_keys_cross = tf.concat(
                [queries, keys, queries - keys, queries * keys], axis=2)
            mlp_layer = dense_nn(queries_keys_cross, (16,), use_bn=False,
                                 activation=tf.nn.sigmoid, name="attention")
            # B * seq * 1
            mlp_layer = tf.layers.dense(mlp_layer, units=1, activation=None)
            # attention_weights = tf.transpose(mlp_layer, [0, 2, 1])
            attention_weights = tf.layers.flatten(mlp_layer)

            key_masks = tf.sequence_mask(keys_len, self.max_seq_len)
            paddings = tf.ones_like(attention_weights) * (-2**32 + 1)
            attention_scores = tf.where(key_masks, attention_weights, paddings)
            attention_scores = tf.div_no_nan(
                attention_scores,
                tf.sqrt(
                    tf.cast(keys.get_shape().as_list()[-1], tf.float32)
                )
            )
            # B * 1 * seq
            attention_scores = tf.expand_dims(
                tf.nn.softmax(attention_scores), 1)
            # B * 1 * K
            pooled_outputs = attention_scores @ keys
            return pooled_outputs
Example No. 26
def b2a_attention(logits, a, mask_a=None):
  """Context-to-query attention."""
  if len(mask_a.get_shape()) == 1:
    mask_a = tf.sequence_mask(mask_a, tf.shape(a)[1])
  if len(mask_a.get_shape()) == 2:
    mask_a = tf.expand_dims(mask_a, 1)
  logits = exp_mask(logits, mask_a, mask_is_length=False)
  probabilities = tf.nn.softmax(logits)  # [bs,len_b,len_a]
  b2a = tf.matmul(probabilities, a)  # [bs, len_b, d]
  return b2a
Example No. 27
def build_bid_rnn(inputs, seq_len, num_units, name):
  """Build the bi-directional RNN."""
  max_seq_len = tf.reduce_max(seq_len)
  fwd = build_uni_rnn(inputs, max_seq_len, num_units,
                      name + "/fw/cell_fn/basic_lstm_cell", False)
  bwd_inputs = {k: inputs[k] for k in inputs}
  bwd_inputs["padding"] = tf.transpose(
      tf.sequence_mask(seq_len, inputs["rnn"].shape[0], inputs["rnn"].dtype))
  bwd = build_uni_rnn(bwd_inputs, max_seq_len, num_units,
                      name + "/bw/cell_fn/basic_lstm_cell", True)
  return tf.concat([fwd, bwd], -1)
Example No. 28
 def get_data(self):
     x = tf.random_normal((BATCH_SIZE, TARGET_LENGTH, N_CHANNELS),
                          mean=0.0,
                          stddev=1.0)
     x_lengths = np.random.randint(low=1,
                                   high=TARGET_LENGTH + 1,
                                   size=BATCH_SIZE)
     x_mask = tf.sequence_mask(x_lengths,
                               maxlen=TARGET_LENGTH,
                               dtype=tf.float32)
     return x, x_mask
Example No. 29
    def build_predictions_layer(self):
        # Assign rnn outputs.
        if self.use_temporal_latent_space and self.use_variational_pi:
            self.q_mu, self.q_sigma, self.p_mu, self.p_sigma, self.gmm_z, self.q_pi, self.p_pi, self.out_mu, self.out_sigma, self.out_rho, self.out_pen, self.out_eoc = self.outputs
        elif self.use_temporal_latent_space:
            self.q_mu, self.q_sigma, self.p_mu, self.p_sigma, self.gmm_z, self.q_pi, self.out_mu, self.out_sigma, self.out_rho, self.out_pen, self.out_eoc = self.outputs
        elif self.use_variational_pi:
            self.gmm_z, self.q_pi, self.p_pi, self.out_mu, self.out_sigma, self.out_rho, self.out_pen, self.out_eoc = self.outputs

        # TODO: Sampling option.
        self.output_sample = tf.concat(
            [self.out_mu, tf.round(self.out_pen)], axis=2)
        self.input_sample = self.inputs
        self.output_dim = self.output_sample.shape.as_list()[-1]

        # For analysis.
        self.norm_p_mu = tf.norm(self.p_mu, axis=-1)
        self.norm_p_sigma = tf.norm(self.p_sigma, axis=-1)
        self.norm_q_mu = tf.norm(self.q_mu, axis=-1)
        self.norm_q_sigma = tf.norm(self.q_sigma, axis=-1)
        self.norm_out_mu = tf.norm(self.out_mu, axis=-1)
        self.norm_out_sigma = tf.norm(self.out_sigma, axis=-1)

        self.ops_evaluation['output_sample'] = self.output_sample
        if self.use_temporal_latent_space:
            self.ops_evaluation['p_mu'] = self.p_mu
            self.ops_evaluation['p_sigma'] = self.p_sigma
            self.ops_evaluation['q_mu'] = self.q_mu
            self.ops_evaluation['q_sigma'] = self.q_sigma
        if self.use_variational_pi:
            self.ops_evaluation['p_pi'] = tf.nn.softmax(self.p_pi, axis=-1)
        self.ops_evaluation['q_pi'] = tf.nn.softmax(self.q_pi, axis=-1)

        self.ops_evaluation['gmm_z'] = self.gmm_z
        self.ops_evaluation['state'] = self.output_state
        self.ops_evaluation['out_eoc'] = self.out_eoc

        # In case we want to draw samples from output distribution instead of using mean.
        self.ops_evaluation['out_mu'] = self.out_mu
        self.ops_evaluation['out_sigma'] = self.out_sigma
        self.ops_evaluation['out_rho'] = self.out_rho
        self.ops_evaluation['out_pen'] = self.out_pen

        # Visualize average gmm sigma values.
        if self.is_gmm_active:
            self.ops_scalar_summary["mean_gmm_sigma"] = tf.reduce_mean(
                self.gmm_sigma)

        # Sequence mask for precise loss calculation.
        self.seq_loss_mask = tf.expand_dims(
            tf.sequence_mask(lengths=self.input_seq_length,
                             maxlen=tf.reduce_max(self.input_seq_length),
                             dtype=tf.float32), -1)
Example No. 30
 def construct_lmcost(self, input_tensor_fw, input_tensor_bw,
                      sentence_lengths, target_ids, lmcost_type, name):
     with tf.variable_scope(name):
         lmcost_max_vocab_size = min(len(self.word2id),
                                     self.config["lmcost_max_vocab_size"])
         target_ids = tf.where(
             tf.greater_equal(target_ids, lmcost_max_vocab_size - 1),
             x=(lmcost_max_vocab_size - 1) + tf.zeros_like(target_ids),
             y=target_ids)
         cost = 0.0
         if lmcost_type == "separate":
             lmcost_fw_mask = tf.sequence_mask(
                 sentence_lengths, maxlen=tf.shape(target_ids)[1])[:, 1:]
             lmcost_bw_mask = tf.sequence_mask(
                 sentence_lengths, maxlen=tf.shape(target_ids)[1])[:, :-1]
             lmcost_fw = self._construct_lmcost(input_tensor_fw[:, :-1, :],
                                                lmcost_max_vocab_size,
                                                lmcost_fw_mask,
                                                target_ids[:, 1:],
                                                name=name + "_fw")
             lmcost_bw = self._construct_lmcost(input_tensor_bw[:, 1:, :],
                                                lmcost_max_vocab_size,
                                                lmcost_bw_mask,
                                                target_ids[:, :-1],
                                                name=name + "_bw")
             cost += lmcost_fw + lmcost_bw
         elif lmcost_type == "joint":
             joint_input_tensor = tf.concat(
                 [input_tensor_fw[:, :-2, :], input_tensor_bw[:, 2:, :]],
                 axis=-1)
             lmcost_mask = tf.sequence_mask(
                 sentence_lengths, maxlen=tf.shape(target_ids)[1])[:, 1:-1]
             cost += self._construct_lmcost(joint_input_tensor,
                                            lmcost_max_vocab_size,
                                            lmcost_mask,
                                            target_ids[:, 1:-1],
                                            name=name + "_joint")
         else:
             raise ValueError("Unknown lmcost_type: " + str(lmcost_type))
         return cost
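
The slicing above aligns hidden states with shifted targets: the forward cost pairs states at positions 0..T-2 with targets at 1..T-1 (the backward cost does the reverse), and the sliced sequence masks keep only pairs that fall inside the sentence. A small sketch of the forward mask, assuming TensorFlow eager execution and one sentence of 3 tokens padded to length 5:

import tensorflow as tf

sentence_lengths = tf.constant([3])
target_ids = tf.zeros([1, 5], dtype=tf.int32)   # placeholder ids; only the shape matters

lmcost_fw_mask = tf.sequence_mask(
    sentence_lengths, maxlen=tf.shape(target_ids)[1])[:, 1:]
print(tf.cast(lmcost_fw_mask, tf.int32).numpy())
# [[1 1 0 0]] -> only the two real next-token predictions are counted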