def make_is_span_maskable_features(num_blocks_per_example, block_length,
                                   max_num_annotations, annotation_begins,
                                   annotation_ends, annotation_labels):
    """Builds per-token masks marking tokens covered by a labelled annotation.

    Args:
      num_blocks_per_example: Number of blocks in the example.
      block_length: Number of token positions per block.
      max_num_annotations: Number of annotation slots per block.
      annotation_begins: Begin positions; reshaped to
        [num_blocks_per_example, max_num_annotations].
      annotation_ends: Inclusive end positions; reshaped the same way.
      annotation_labels: Annotation labels; reshaped the same way. Slots whose
        label equals 0 are ignored.

    Returns:
      A pair of int32 tensors, each flattened to
      [num_blocks_per_example * block_length]:
        * the first marks positions `begin..end` of the kept annotations,
        * the second marks positions `begin + 1..end` (begin token excluded).
    """
    slot_shape = [num_blocks_per_example, max_num_annotations]
    begins = tf.reshape(annotation_begins, slot_shape)
    ends = tf.reshape(annotation_ends, slot_shape)
    labels = tf.reshape(annotation_labels, slot_shape)

    # 1 for slots that hold an annotation (label != 0); the trailing axis lets
    # it broadcast over the per-position masks below.
    slot_is_used = tf.expand_dims(
        tf.cast(tf.not_equal(labels, 0), tf.int32), -1)

    def prefix_mask(lengths):
        return tf.sequence_mask(lengths, block_length, dtype=tf.int32)

    before_begin = prefix_mask(begins)
    through_begin = prefix_mask(begins + 1)
    through_end = prefix_mask(ends + 1)

    def collapse(span_mask):
        # Union over the annotation axis, capped at 1, then flattened.
        covered = tf.reduce_sum(span_mask * slot_is_used, 1)
        covered = tf.minimum(covered, 1)
        return tf.reshape(covered, [num_blocks_per_example * block_length])

    return (collapse(through_end - before_begin),
            collapse(through_end - through_begin))
def tfdata_emb_layer(features):
    """Swaps UNK and PAD positions of the input embeddings for learned vectors.

    By convention, the UNK vector will be all 0.0 (see tf_data_pipeline), so
    every all-zero token vector is replaced by the learned UNK vector (via
    `_replace_zeros`). Positions past each sequence's word count are then
    overwritten with the learned PAD vector.

    Args:
      features: Input features for qanet. Reads 'context_vecs',
        'question_vecs', 'context_num_words' and 'question_num_words'.

    Returns:
      context and question with UNK/PAD tokens replaced.
    """
    context = features['context_vecs']
    question = features['question_vecs']
    emb_dim = context.get_shape()[-1]

    def tiled_length_mask(num_words):
        # 1.0 on real tokens, 0.0 on padding, repeated along the vector axis.
        valid = tf.sequence_mask(num_words, dtype=tf.float32)
        return tf.tile(valid[:, :, None], [1, 1, emb_dim])

    with tf.variable_scope('glove_layer'):
        # Row 0 is the PAD vector, row 1 is the UNK vector.
        special_vecs = tf.get_variable('glove_emb_mat_var', [2, emb_dim])
        pad_vec = special_vecs[0, :]
        unk_vec = special_vecs[1, :]

        question_mask = tiled_length_mask(features['question_num_words'])
        context_mask = tiled_length_mask(features['context_num_words'])

        context = _replace_zeros(context, unk_vec)
        question = _replace_zeros(question, unk_vec)

        # Add learned padding token.
        pad_row = pad_vec[None, None, :]
        context = context_mask * context + (1.0 - context_mask) * pad_row
        question = question_mask * question + (1.0 - question_mask) * pad_row
    return context, question
def compute_last_embedding(input_embeddings, input_lengths, hparams):
    """Averages the embeddings of the last `hparams.last_k` valid positions.

    Args:
      input_embeddings: <tf.float32>[bs, max_seq_len, emb_dim]
      input_lengths: <tf.int64>[bs, 1]
      hparams: model hparams; only `last_k` is read.

    Returns:
      last_k_embedding: <tf.float32>[bs, emb_dim]
    """
    emb_shape = tf.shape(input_embeddings)
    batch_size = emb_shape[0]
    max_seq_len = emb_shape[1]

    def prefix_mask(lengths):
        # <tf.float32>[bs, 1, max_seq_len]
        return tf.sequence_mask(lengths, max_seq_len, dtype=tf.float32)

    # Ones on positions [length - last_k, length): the full prefix minus the
    # prefix that stops last_k positions early.
    last_k_mask = (prefix_mask(input_lengths) -
                   prefix_mask(input_lengths - hparams.last_k))

    # <tf.float32>[bs, 1, emb_dim]
    summed = tf.matmul(last_k_mask, input_embeddings)

    # NOTE(review): the divisor is always last_k, including for sequences
    # shorter than last_k -- confirm that is intended.
    divisor = tf.to_float(
        tf.expand_dims(tf.ones([batch_size, 1]) * hparams.last_k, 2))

    # <tf.float32>[bs, emb_dim]
    return tf.squeeze(summed / divisor, 1)
def _oneof_filters_to_int_or_mask(value,
                                  input_filters_or_mask=None,
                                  filters_base=None):
    """Convert a OneOf or int to either an int or a rank-1 float mask.

    Args:
      value: A `schema.OneOf`, an int, or a `basic_specs.FilterMultiplier`.
      input_filters_or_mask: None, an int, or a tensor whose last static
        dimension is the largest possible number of input filters.
      filters_base: Forwarded unchanged to `_compute_filters`.

    Returns:
      The single scaled choice itself when it is not a `tf.Tensor`; otherwise
      a float32 `tf.sequence_mask` of the selected filter count, whose length
      is the largest output filter count over all choices.

    Raises:
      ValueError: If `value` is none of the accepted types.
    """
    if isinstance(value, schema.OneOf):
        choices, mask = value.choices, value.mask
    elif isinstance(value, (int, basic_specs.FilterMultiplier)):
        choices, mask = [value], None
    else:
        raise ValueError(
            'Must be a OneOf or FilterMultiplier: {}'.format(value))

    # Candidate filter sizes. Each entry is either an int or a scalar int
    # Tensor.
    scaled_choices = [
        _compute_filters(choice, input_filters_or_mask, filters_base)
        for choice in choices
    ]

    # Largest possible number of input filters, as an integer (or None).
    if input_filters_or_mask is None or isinstance(input_filters_or_mask, int):
        max_input_filters = input_filters_or_mask
    else:
        # input_filters_or_mask must be a tf.Tensor in this case.
        max_input_filters = int(input_filters_or_mask.shape[-1])

    # Largest possible number of output filters. Each candidate should be an
    # integer (rather than a Tensor) because `max_input_filters` is one.
    max_output_filters = max(
        [0] + [_compute_filters(choice, max_input_filters, filters_base)
               for choice in choices])

    # Return an integer (if possible) or a mask (if we can't infer the exact
    # number of filters at graph construction time).
    if len(scaled_choices) != 1:
        chosen = tf.gather(scaled_choices, tf.argmax(mask))
        return tf.sequence_mask(chosen, max_output_filters, dtype=tf.float32)

    only_choice = scaled_choices[0]
    if not isinstance(only_choice, tf.Tensor):
        return only_choice
    return tf.sequence_mask(only_choice, max_output_filters, dtype=tf.float32)
def sequence_accuracy(gt_seqs, decode_seqs, gt_seq_lengths, pr_seq_lengths,
                      debug=False, name=""):
    """Computes the complete and the partial sequence accuracy.

    Both inputs are padded along axis 1 to a common length and every step past
    a sequence's own length is overwritten with -1, so that padding compares
    equal between ground truth and prediction.

    Args:
      gt_seqs: Ground-truth sequences, rank 3 (batch, step, depth).
      decode_seqs: Predicted sequences, rank 3 with the same depth.
      gt_seq_lengths: Per-example ground-truth lengths. Also used as the
        divisor of the partial accuracy.
      pr_seq_lengths: Per-example predicted lengths.
      debug: If True, print the mismatching pairs whenever the result is
        evaluated.
      name: Prefix for the debug print.

    Returns:
      (mean_complete_accuracy, mean_partial_accuracy): the fraction of
      examples matching at every step, and the mean fraction of leading steps
      that match before the first error.
    """
    gt_shape = common_layers.shape_list(gt_seqs)
    pr_shape = common_layers.shape_list(decode_seqs)
    batch_size, gt_len, depth = gt_shape[0], gt_shape[1], gt_shape[-1]
    pr_len = pr_shape[1]
    max_len = tf.maximum(gt_len, pr_len)

    def pad_and_blank(seqs, current_len, lengths):
        padded = tf.pad(seqs, [[0, 0], [0, max_len - current_len], [0, 0]])
        in_range = tf.tile(
            tf.expand_dims(tf.sequence_mask(lengths, maxlen=max_len), 2),
            [1, 1, depth])
        return tf.where(in_range, padded, tf.fill(tf.shape(padded), -1))

    gt_seqs = pad_and_blank(gt_seqs, gt_len, gt_seq_lengths)
    decode_seqs = pad_and_blank(decode_seqs, pr_len, pr_seq_lengths)

    # [batch_size, decode_length]: step matches on every depth entry.
    step_correct = tf.reduce_all(tf.equal(gt_seqs, decode_seqs), -1)
    # [batch_size]: every step matches.
    seq_correct = tf.reduce_all(step_correct, -1)

    if debug:
        seq_wrong = tf.logical_not(seq_correct)
        wrong_gt = tf.boolean_mask(gt_seqs, seq_wrong)
        wrong_pr = tf.boolean_mask(decode_seqs, seq_wrong)
        print_op = tf.print(
            name + "_mismatch", wrong_gt, wrong_pr, summarize=1000)
        with tf.control_dependencies([print_op]):
            seq_correct = tf.identity(seq_correct)

    num_correct_seqs = tf.reduce_sum(tf.to_float(seq_correct))
    mean_complete_accuracy = num_correct_seqs / tf.to_float(batch_size)

    # Partial accuracy: once a step is wrong, every later step counts as
    # wrong too (running sum of errors turned back into a boolean).
    seen_error = tf.cast(
        tf.cumsum(tf.to_float(tf.logical_not(step_correct)), axis=-1),
        tf.bool)
    # [batch_size]
    leading_correct = tf.reduce_sum(
        tf.to_float(tf.logical_not(seen_error)), axis=-1)
    # Padding steps compare equal, so cap the count at the true length.
    mean_partial_accuracy = tf.reduce_mean(
        tf.div(tf.minimum(leading_correct, gt_seq_lengths), gt_seq_lengths))
    return mean_complete_accuracy, mean_partial_accuracy
def do_process_boundary(start_points, end_points, input_length, t1_id, t2_id,
                        all_tokenized_diag):
    """Function that contains the majority of the logic to process boundary.

    Args:
      start_points: Span start offsets.
      end_points: Span end offsets.
      input_length: Length of the position masks to build.
      t1_id: Token id of the first turn marker.
      t2_id: Token id of the second turn marker.
      all_tokenized_diag: Token ids compared against the two turn markers.

    Returns:
      A tuple `(mask1, mask2, turn_point)`:
        * mask1: boolean, True at positions between the start and end offset
          of at least one span (reduced over axis 0),
        * mask2: the logical negation of mask1,
        * turn_point: float32, 1.0 where the token equals `t1_id` or `t2_id`.
    """
    def prefix_mask(points):
        return tf.sequence_mask(points, input_length)

    # XOR of the two prefixes is True exactly between the two offsets.
    inside_span = tf.logical_xor(
        prefix_mask(start_points), prefix_mask(end_points))
    inside_any = tf.reduce_any(inside_span, axis=0)
    outside_all = tf.logical_not(inside_any)

    is_turn_marker = tf.logical_or(
        tf.equal(all_tokenized_diag, t1_id),
        tf.equal(all_tokenized_diag, t2_id))
    return inside_any, outside_all, tf.cast(is_turn_marker, dtype=tf.float32)
def compute_attention_vec(input_vec, att_vecs, lengths,
                          allow_zero_attention=False):
    """Returns the attention-weighted sum of `att_vecs` for `input_vec`.

    Scores come from `get_attention_scores`; positions at or beyond `lengths`
    get zero weight. The resulting distribution is also stored on
    `model.attention_distribution`, and the regularization terms returned by
    the scorer are appended to `regularization_terms`.

    Args:
      input_vec: Query passed to `get_attention_scores`.
      att_vecs: Rank-3 candidates (batch, position, dim).
      lengths: Number of valid positions per example.
      allow_zero_attention: If True, an extra all-zero candidate that is never
        masked out is appended, letting the model attend to "nothing".

    Returns:
      The weighted sum over the position axis.
    """
    att_vecs_shape = tf.shape(att_vecs)

    if allow_zero_attention:
        zero_slot = tf.zeros(
            [att_vecs_shape[0], 1, att_vecs_shape[2]], dtype=DATA_TYPE)
        candidates = tf.concat([att_vecs, zero_slot], 1)
    else:
        candidates = att_vecs

    att_scores, reg_terms = get_attention_scores(
        input_vec, candidates, model.params['num_heads'], reuse,
        model.params['attention_func'])
    regularization_terms.extend(reg_terms)

    # Length mask; the appended zero slot (if any) is always attendable.
    valid = tf.sequence_mask(
        lengths, maxlen=att_vecs_shape[1], dtype=DATA_TYPE)
    if allow_zero_attention:
        always_on = tf.ones([att_vecs_shape[0], 1], dtype=DATA_TYPE)
        valid = tf.concat([valid, always_on], 1)

    unnormalized = tf.exp(att_scores) * valid
    total = tf.reduce_sum(unnormalized, 1, True)
    # Guard against a (near-)zero denominator when everything is masked out.
    safe_total = tf.where(tf.less(total, 1e-7), total + 1, total)
    att_dis = unnormalized / safe_total

    model.attention_distribution = att_dis
    return tf.reduce_sum(candidates * tf.expand_dims(att_dis, -1), 1)
def _compute_head_weights_with_position_prior(weights, masks, paddings,
                                              num_heads, attn_size):
    """Computes head-specific attention weights with position prior.

    Each head only keeps the weights of one chunk of items: head i focuses on
    the i-th most recent window of `attn_size` items with respect to the
    query, and everything outside that window is replaced by `paddings`.
    Chunks do not overlap, i.e. the sliding-window stride equals `attn_size`.

    Args:
      weights: A 3d tensor with shape of [h*N, T_q, T_k].
      masks: A 3d tensor with shape of [h*N, T_q, T_k].
      paddings: A 3d tensor with shape of [h*N, T_q, T_k].
      num_heads: An integer denoting number of chunks.
      attn_size: An integer denoting the size of the sliding window.

    Returns:
      A list of h tensors (each shaped [N, T_q, T_k]) where tensors correspond
      to chunk specific weights.
    """
    # `masks` is lower triangular (ones at the bottom, zeros above), so the
    # index of the first zero in a row equals the number of items available
    # to that query. argmin returns the smallest index on ties, which finds
    # that first zero; a block of zeros is appended so the last query (whose
    # row has no zero) is counted correctly as well.
    available = tf.argmin(
        tf.concat([masks, tf.zeros_like(masks)], axis=-1), 2)  # (h*N, T_q)

    # Split for heads.
    available_by_head = tf.split(available, num_heads, axis=0)
    weights_by_head = tf.split(weights, num_heads, axis=0)
    paddings_by_head = tf.split(paddings, num_heads, axis=0)

    def chunk_weights(head):
        count = available_by_head[head]  # (N, T_q)
        num_keys = tf.shape(weights_by_head[head])[2]
        # Items older than this head's window.
        too_old = tf.sequence_mask(
            tf.maximum(count - (attn_size * (head + 1)), 0), num_keys)
        # Items up to and including this head's window.
        up_to_window = tf.sequence_mask(
            tf.maximum(count - (attn_size * head), 0), num_keys)
        in_window = tf.logical_and(tf.logical_not(too_old), up_to_window)
        # Adjust weights for this chunk: (N, T_q, T_k).
        return tf.where(
            in_window, weights_by_head[head], paddings_by_head[head])

    return [chunk_weights(head) for head in range(num_heads)]
def gated_convnet(graph_inputs, batch_size=64, hidden_size=300, depth=3,
                  res_block=2):
    """Runs `depth` gated message-passing layers over a batch of graphs.

    Args:
      graph_inputs: Tuple `(input_atom, input_bond, atom_graph, bond_graph,
        num_nbs, node_mask)`.
      batch_size: Static batch size used when reshaping the neighbour mask.
      hidden_size: Width of every linear layer.
      depth: Number of message-passing layers.
      res_block: Spacing of the residual connections; `None` disables them.

    Returns:
      A tuple `(atom_features * node_mask, fp)`: the masked per-node features
      and a per-graph vector obtained by summing the gated node features over
      axis 1.
    """
    (input_atom, input_bond, atom_graph, bond_graph, num_nbs,
     node_mask) = graph_inputs

    def gated(inputs, hidden_name, gate_name):
        hidden = linearND(inputs, hidden_size, hidden_name)
        gate = tf.nn.sigmoid(linearND(inputs, hidden_size, gate_name))
        return hidden * gate

    layers = [input_atom]
    atom_features = input_atom
    for i in range(depth):
        nei_atoms = tf.gather_nd(atom_features, atom_graph)
        nei_bonds = tf.gather_nd(input_bond, bond_graph)
        nei_inputs = tf.concat([nei_atoms, nei_bonds], 3)
        nei_msg = gated(nei_inputs, "nei_hidden_%d" % i, "nei_gate_%d" % i)

        # Zero out the padded neighbour slots before summing over them.
        nei_mask = tf.reshape(
            tf.sequence_mask(
                tf.reshape(num_nbs, [-1]), max_nb, dtype=tf.float32),
            [batch_size, -1, max_nb, 1])
        nei_sum = tf.reduce_sum(nei_msg * nei_mask, -2)

        self_msg = gated(
            atom_features, "self_hidden_%d" % i, "self_gate_%d" % i)

        atom_features = (nei_sum + self_msg) * node_mask
        if res_block is not None and i % res_block == 0 and i > 0:
            # Residual connection to the features from two layers back.
            atom_features = atom_features + layers[-2]
        layers.append(atom_features)

    output_gate = tf.nn.sigmoid(
        linearND(atom_features, hidden_size, "out_gate"))
    gated_output = node_mask * (output_gate * atom_features)
    fp = tf.reduce_sum(gated_output, 1)
    return atom_features * node_mask, fp
def mask_logits(vec, mask):
    """Mask `vec` in log-space.

    Elements in `vec` that are not in `mask` will be set to be very negative,
    so that when no longer in log-space (i.e., after `tf.exp(vec)`) their
    values will be very close to zero.

    Args:
      vec: <float32>[...] tensor to mask
      mask: Either a None (in which case this is a no-op), a boolean or 0/1
        float mask that matches all, or all but the last, dimensions of `vec`,
        or 1-D integer length mask. A mask of dtype `tf.int32` is always
        interpreted as sequence lengths along axis 1 of `vec`.

    Raises:
      ValueError: If `mask` holds lengths and `vec` is not of rank 2 or 3.

    Returns:
      masked: vec:<float32>[...]
    """
    if mask is None:
        return vec

    vec_rank = len(vec.shape)
    if mask.dtype == tf.int32:
        # Assume `mask` holds sequence lengths.
        if vec_rank not in (2, 3):
            # The check rejects rank < 2 as well as rank > 3, so the message
            # states the actual constraint.
            raise ValueError(
                "Can't use a length mask on a tensor of rank %d; "
                "expected rank 2 or 3" % vec_rank)
        mask = tf.sequence_mask(mask, tf.shape(vec)[1], tf.float32)
    else:
        mask = tf.to_float(mask)

    if len(mask.shape) == vec_rank - 1:
        # Broadcast the mask over the last dimension of `vec`.
        mask = tf.expand_dims(mask, vec_rank - 1)

    return vec * mask - (1 - mask) * 1E20
def LSTM_layer(self, embeddings):
    """Runs an LSTM over `embeddings` and averages its outputs per example.

    Outputs at steps beyond each example's `self.sentence_lengths` entry are
    zeroed before summing, and the sum is divided by that length.

    Args:
      embeddings: Batch-major input; transposed with perm [1, 0, 2] to
        time-major before being unstacked into per-step inputs.

    Returns:
      One averaged LSTM output vector per example.
    """
    cell = tf.nn.rnn_cell.LSTMCell(self.lstm_size)
    zeros = tf.zeros(shape=(self.batch_size, self.lstm_size))
    start_state = tf.nn.rnn_cell.LSTMStateTuple(zeros, zeros)

    step_inputs = tf.unstack(tf.transpose(embeddings, perm=[1, 0, 2]))
    step_outputs, _ = tf.nn.static_rnn(
        cell=cell,
        inputs=step_inputs,
        initial_state=start_state,
        sequence_length=self.sentence_lengths)

    # Back to batch-major, then lay the examples out one after another along
    # axis 0.
    per_example = tf.unstack(tf.transpose(step_outputs, perm=[1, 0, 2]))
    flat_outputs = tf.concat(per_example, axis=0)

    # Flatten the length mask the same way so it lines up row by row.
    valid = tf.sequence_mask(
        lengths=self.sentence_lengths,
        maxlen=MAX_DOC_LENGTH,
        dtype=tf.float32)
    flat_valid = tf.expand_dims(
        tf.concat(tf.unstack(valid, axis=0), axis=0), -1)

    masked_outputs = flat_valid * flat_outputs
    masked_per_example = tf.split(
        masked_outputs, num_or_size_splits=self.batch_size)
    summed = tf.reduce_sum(masked_per_example, axis=1)

    lengths = tf.expand_dims(tf.cast(self.sentence_lengths, tf.float32), -1)
    return summed / lengths
def call(self,
         hidden_states,
         token_ids=None,
         padding_token_id=None,
         ignore_prefix_length=None,
         training=None):
    """Computes logits, pushing down those at padding / ignored positions.

    Args:
      hidden_states: Input activations.
      token_ids: Optional token ids aligned with the sequence axis of the
        logits. Must be given together with `padding_token_id`.
      padding_token_id: Optional id marking padding in `token_ids`. Positions
        whose token equals this id get 1e6 subtracted from all their logits.
      ignore_prefix_length: Optional per-example number of leading positions
        whose logits also get 1e6 subtracted.
      training: Forwarded to the output dropout layer.

    Returns:
      The (possibly masked) logits.

    Raises:
      ValueError: If only one of `token_ids` / `padding_token_id` is given.
    """
    if self._intermediate_dense is not None:
        intermediate_outputs = self._intermediate_dense(hidden_states)
        intermediate_outputs = self._intermediate_activation(
            intermediate_outputs)
        outputs = self._output_dense(intermediate_outputs)
        outputs = self._output_dropout(outputs, training=training)
        # Residual connection around the feed-forward block.
        outputs = self._output_layer_norm(outputs + hidden_states)
    else:
        outputs = hidden_states

    logits = self._logits_dense(outputs)

    if token_ids is not None or padding_token_id is not None:
        if token_ids is None or padding_token_id is None:
            raise ValueError(
                "Both `token_ids` and `padding_token_id` needs to be "
                "specified in order to compute mask for logits")
        # Compare against the caller-supplied padding id rather than a
        # hard-coded 0, which silently ignored `padding_token_id`.
        is_padding = tf.cast(
            tf.equal(token_ids, padding_token_id), tf.float32)
        logits -= tf.expand_dims(is_padding, -1) * 1e6

    if ignore_prefix_length is not None:
        seq_length = tf.shape(logits)[-2]
        in_prefix = tf.sequence_mask(
            ignore_prefix_length, seq_length, dtype=tf.float32)
        logits -= tf.expand_dims(in_prefix, -1) * 1e6

    return logits
def _build(self, logits, targets, target_lens, normalize_by_length=False):  # pylint: disable=arguments-differ
    """Builds the cross entropy loss.

    Args:
      logits: <float32> [batch_size, seq_len, vocab_size] for predicted
        logits.
      targets: <int32> [batch_size, seq_len] if `sparse=True` or <float32>
        [batch_size, seq_len, vocab_size] otherwise.
      target_lens: <int32> [batch_size] for the target sequence lengths.
      normalize_by_length: Boolean indicating whether to normalize the loss by
        the sequence length (i.e., shorter sequences are penalized more).

    Returns:
      loss: <float32> [batch_size] for the loss.
    """
    # Build weights. `maxlen` is pinned to the time dimension of `logits`:
    # without it the mask is only as long as the longest target in the batch
    # and no longer lines up with `logits` when the batch is padded further.
    weights = tf.sequence_mask(
        target_lens, maxlen=tf.shape(logits)[1], dtype=logits.dtype)

    # Build loss.
    if self._sparse:
        return self._build_sparse(
            logits, targets, weights, normalize_by_length)
    return self._build_dense(logits, targets, weights, normalize_by_length)
def build_predictions_layer(self):
    """Unpacks the RNN outputs and registers evaluation / summary ops.

    Sets the distribution-parameter attributes from `self.outputs`, uses the
    output mean as the output sample, fills `self.ops_evaluation` and
    `self.ops_scalar_summary`, and builds `self.seq_loss_mask`.
    """
    # Assign rnn outputs.
    (self.q_mu, self.q_sigma, self.p_mu, self.p_sigma, self.out_mu,
     self.out_sigma) = self.outputs

    # TODO: Sampling option.
    self.output_sample = self.out_mu
    self.input_sample = self.inputs
    self.output_dim = self.output_sample.shape.as_list()[-1]

    evaluation_ops = (
        ('output_sample', self.output_sample),
        ('p_mu', self.p_mu),
        ('p_sigma', self.p_sigma),
        ('q_mu', self.q_mu),
        ('q_sigma', self.q_sigma),
        ('state', self.output_state),
    )
    for key, tensor in evaluation_ops:
        self.ops_evaluation[key] = tensor

    # Normaliser for the sigma summaries: static batch size times the total
    # number of steps in the batch.
    static_batch_size = self.input_seq_length.shape.as_list()[0]
    num_entries = tf.cast(
        static_batch_size * tf.reduce_sum(self.input_seq_length), tf.float32)

    sigma_summaries = (
        ("mean_out_sigma", self.out_sigma),
        ("mean_p_sigma", self.p_sigma),
        ("mean_q_sigma", self.q_sigma),
    )
    for key, sigma in sigma_summaries:
        self.ops_scalar_summary[key] = tf.reduce_sum(sigma) / num_entries

    # Mask for precise loss calculation; the trailing axis lets it broadcast
    # over the feature dimension.
    step_is_valid = tf.sequence_mask(
        lengths=self.input_seq_length,
        maxlen=tf.reduce_max(self.input_seq_length),
        dtype=tf.float32)
    self.seq_loss_mask = tf.expand_dims(step_is_valid, -1)
def _get_categorical_slot_goals(self, features):
    """Obtain logits for status and values for categorical slots.

    Args:
      features: Dict read at keys "cat_slot_emb", "cat_slot_value_emb" and
        "cat_slot_value_num".

    Returns:
      A tuple `(status_logits, value_logits)`. `value_logits` is shaped
      [-1, max_num_slots, max_num_values], with the entries of padded values
      replaced by a very large negative number.
    """
    # Predict the status of all categorical slots.
    status_logits = self._get_logits(
        features["cat_slot_emb"], 3, "categorical_slot_status")

    # Predict the goal value.
    # Shape: (batch_size, max_categorical_slots, max_categorical_values,
    # embedding_dim).
    value_emb = features["cat_slot_value_emb"]
    _, num_slots, num_values, emb_dim = value_emb.get_shape().as_list()

    # Score all (slot, value) pairs in one call, then restore the slot axis.
    flat_value_emb = tf.reshape(
        value_emb, [-1, num_slots * num_values, emb_dim])
    flat_value_logits = self._get_logits(
        flat_value_emb, 1, "categorical_slot_values")
    value_logits = tf.reshape(flat_value_logits, [-1, num_slots, num_values])

    # Mask out logits for padded slots and values because they will be
    # softmaxed.
    is_real_value = tf.sequence_mask(
        features["cat_slot_value_num"], maxlen=num_values)
    very_negative = (
        -0.7 * tf.ones_like(value_logits) * value_logits.dtype.max)
    return status_logits, tf.where(is_real_value, value_logits, very_negative)
def _build_encoder(self, hparams):
    """Build a GNMT encoder.

    A bidirectional RNN layer is followed by three unidirectional layers; the
    second and third add a residual connection from their own input. Dropout
    is only applied in TRAIN mode, and padded positions are zeroed.

    Args:
      hparams: Hyper-parameters; `num_units` is always read, `dropout` only
        in TRAIN mode.

    Returns:
      The encoder outputs.
    """
    source = tf.transpose(self.features["source"])
    if self.length > 0:
        source = tf.slice(source, [0, 0], [self.length, -1])

    training = self.mode == contrib_learn.ModeKeys.TRAIN

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        emb = tf.cast(
            self.encoder_emb_lookup_fn(self.embedding_encoder, source),
            self.dtype)
        seq_len = self.features["source_sequence_length"]
        # Transposed so that its leading axis matches `emb`'s leading axis.
        padding = tf.transpose(
            tf.sequence_mask(seq_len, emb.shape[0], self.dtype))
        max_seq_len = tf.reduce_max(seq_len)

        if training:
            emb = emb * dropout(emb.shape, emb.dtype, 1.0 - hparams.dropout)
        out = build_bid_rnn({"rnn": emb}, seq_len, hparams.num_units,
                            "bidirectional_rnn")
        out = out * tf.expand_dims(padding, 2)

        for layer in range(3):
            residual = out
            layer_input = out
            if training:
                layer_input = out * dropout(out.shape, emb.dtype,
                                            1.0 - hparams.dropout)
            layer_output = build_uni_rnn({"rnn": layer_input}, max_seq_len,
                                         hparams.num_units,
                                         "rnn/uni_rnn_cell_%d" % layer)
            # The first unidirectional layer has no residual connection.
            out = layer_output + residual if layer > 0 else layer_output

        out = out * tf.expand_dims(padding, 2)
    return out
def rcnn_wl_only(graph_inputs, hidden_size, depth, training=True):
    """Runs `depth` weight-shared label-propagation steps over graph nodes.

    Args:
      graph_inputs: Tuple `(input_atom, input_bond, atom_graph, bond_graph,
        num_nbs)`.
      hidden_size: Width of every linear layer.
      depth: Number of propagation steps; variables in scope "WL" are reused
        after the first step.
      training: Unused in this function.

    Returns:
      The node features after the last step.
    """
    input_atom, input_bond, atom_graph, bond_graph, num_nbs = graph_inputs
    atom_features = tf.nn.relu(
        linearND(input_atom, hidden_size, "atom_embedding", init_bias=None))

    for step in range(depth):
        with tf.variable_scope("WL", reuse=(step > 0)):
            nei_atoms = tf.gather_nd(atom_features, atom_graph)
            nei_bonds = tf.gather_nd(input_bond, bond_graph)

            # 1.0 for real neighbours, 0.0 for padded slots; reshaped to
            # `num_nbs`'s shape plus [max_nb, 1].
            nei_mask = tf.sequence_mask(
                tf.reshape(num_nbs, [-1]), max_nb, dtype=tf.float32)
            nei_mask = tf.reshape(
                nei_mask, tf.concat([tf.shape(num_nbs), [max_nb, 1]], 0))
            nei_mask.set_shape([None, None, max_nb, 1])

            nei_inputs = tf.concat([nei_atoms, nei_bonds], 3)
            nei_label = tf.nn.relu(
                linearND(nei_inputs, hidden_size, "label_U2"))
            nei_sum = tf.reduce_sum(nei_label * nei_mask, -2)

            combined = tf.concat([atom_features, nei_sum], 2)
            atom_features = tf.nn.relu(
                linearND(combined, hidden_size, "label_U1"))
    return atom_features
def LSTM_layer(self, embeddings):
    """Runs a basic LSTM over `embeddings` and averages outputs per example.

    Outputs at steps beyond each example's `self._sentence_lengths` entry are
    zeroed before summing, and the sum is divided by that length.

    Args:
      embeddings: Batch-major input; transposed with perm [1, 0, 2] to
        time-major before being unstacked into per-step inputs.

    Returns:
      One averaged LSTM output vector per example.
    """
    cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_size)
    zeros = tf.zeros(shape=(self._batch_size, self._lstm_size))
    start_state = tf.nn.rnn_cell.LSTMStateTuple(zeros, zeros)

    step_inputs = tf.unstack(tf.transpose(embeddings, perm=[1, 0, 2]))
    step_outputs, _ = tf.nn.static_rnn(
        cell=cell,
        inputs=step_inputs,
        initial_state=start_state,
        sequence_length=self._sentence_lengths)

    # Back to batch-major, then concatenate the examples along axis 0.
    flat_outputs = tf.concat(
        tf.unstack(tf.transpose(step_outputs, perm=[1, 0, 2])), axis=0)

    # Flatten the length mask identically so rows line up.
    valid = tf.sequence_mask(
        lengths=self._sentence_lengths,
        maxlen=MAX_DOC_LENGTH,
        dtype=tf.float32)
    flat_valid = tf.expand_dims(
        tf.concat(tf.unstack(valid, axis=0), axis=0), -1)

    masked_outputs = flat_valid * flat_outputs
    masked_per_example = tf.split(
        masked_outputs, num_or_size_splits=self._batch_size)
    summed = tf.reduce_sum(masked_per_example, axis=1)

    lengths = tf.expand_dims(tf.cast(self._sentence_lengths, tf.float32), -1)
    return summed / lengths
def decode_sequence(features, areas, hparams, decode_length,
                    post_processing=True):
    """Decodes the entire sequence in an auto-regressive way.

    `decode_utils.decode_n_step` is run for its effect on `features`, whose
    "verb_refs", "obj_refs", "input_refs", "verbs" and "objects" entries are
    then concatenated along the last axis.

    Args:
      features: Feature dict. With `post_processing`, its "input_refs" entry
        is overwritten.
      areas: Forwarded to `decode_utils.decode_n_step`.
      hparams: Forwarded to `decode_utils.decode_n_step`.
      decode_length: Number of decoding steps.
      post_processing: If True, unify the input refs and zero out every step
        past the predicted length.

    Returns:
      The predicted actions tensor.
    """
    decode_utils.decode_n_step(
        seq2act_model.compute_logits, features, areas, hparams,
        n=decode_length, beam_size=1)

    if post_processing:
        features["input_refs"] = decode_utils.unify_input_ref(
            features["verbs"], features["input_refs"])
        pred_lengths = decode_utils.verb_refs_to_lengths(
            features["task"], features["verb_refs"], include_eos=False)

    action_parts = [
        features["verb_refs"],
        features["obj_refs"],
        features["input_refs"],
        tf.to_int32(tf.expand_dims(features["verbs"], 2)),
        tf.to_int32(tf.expand_dims(features["objects"], 2)),
    ]
    predicted_actions = tf.concat(action_parts, axis=-1)

    if not post_processing:
        return predicted_actions

    # Keep only the steps within each predicted length; zero out the rest.
    within_length = tf.sequence_mask(
        pred_lengths, maxlen=tf.shape(predicted_actions)[1])
    within_length = tf.tile(
        tf.expand_dims(within_length, 2),
        [1, 1, tf.shape(predicted_actions)[-1]])
    return tf.where(
        within_length, predicted_actions, tf.zeros_like(predicted_actions))
def _compute_gradients(self,
                       actions,
                       discounted_rewards,
                       weights=None,
                       sequence_length=None,
                       loss_str='train',
                       use_entropy_regularization=True,
                       **kwargs):
    """Implement the policy gradient in TF.

    Args:
      actions: Actions passed to the policy loss.
      discounted_rewards: Rewards passed to both the value and policy loss.
      weights: Optional weights passed to the policy loss.
      sequence_length: Optional lengths; when given, a float32 sequence mask
        is built and passed to both losses.
      loss_str: Prefix of the logged scalar summary name.
      use_entropy_regularization: Forwarded to the policy loss.
      **kwargs: Forwarded to both loss functions.

    Returns:
      Gradients of the mean total loss w.r.t. `self.trainable_variables`.
    """
    seq_mask = None
    if sequence_length is not None:
        seq_mask = tf.sequence_mask(sequence_length, dtype=tf.float32)

    # Only the model's trainable variables are watched explicitly.
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(self.trainable_variables)
        # Returns 0.0 if critic is not being used
        value_loss = self._compute_value_loss(
            discounted_rewards, seq_mask=seq_mask, **kwargs)
        policy_loss = self._compute_policy_loss(
            discounted_rewards,
            actions,
            seq_mask=seq_mask,
            weights=weights,
            use_entropy_regularization=use_entropy_regularization,
            **kwargs)
        loss = tf.reduce_mean(policy_loss + value_loss)

    should_log = self.log_summaries and (self._counter % self.log_every == 0)
    if should_log:
        contrib_summary.scalar('{}_loss'.format(loss_str), loss)

    return tape.gradient(loss, self.trainable_variables)
def mask_attention(attention, seq_len1, seq_len2):
    """Masks an attention matrix.

    An entry (i, j) is kept only when i is within `seq_len1` and j is within
    `seq_len2`; the actual masking is delegated to `ops.mask_logits`.

    Args:
      attention: <tf.float32>[batch, seq_len1, seq_len2]
      seq_len1: <tf.int32>[batch]
      seq_len2: <tf.int32>[batch]

    Returns:
      the masked scores <tf.float32>[batch, seq_len1, seq_len2]
    """
    rows_valid = tf.sequence_mask(
        seq_len1, tensor_utils.shape(attention, 1))
    cols_valid = tf.sequence_mask(
        seq_len2, tensor_utils.shape(attention, 2))
    # Outer "and" of the row and column masks.
    both_valid = tf.logical_and(
        tf.expand_dims(rows_valid, 2), tf.expand_dims(cols_valid, 1))
    return ops.mask_logits(attention, both_valid)
def get_attention_bias(sequence_length):
    """Create attention bias so attention is not applied at padding position.

    Args:
      sequence_length: Per-example sequence lengths.

    Returns:
      The bias from `common_attention.attention_bias_ignore_padding`, fed
      with a float tensor that is 1.0 at padding positions and 0.0 elsewhere.
    """
    is_token = tf.sequence_mask(sequence_length)
    is_padding = tf.to_float(tf.logical_not(is_token))
    # attention_bias: [batch, 1, 1, memory_length]
    return common_attention.attention_bias_ignore_padding(is_padding)
def reduced_by_transformer(self,
                           is_training,
                           num_transformer_layers=2,
                           CLS_ID=102,
                           use_passage_pos_embedding=False):
    """Aggregates per-segment representations with a small transformer.

    A CLS embedding is prepended to `self.output_layer`, the sequence is run
    through `self.modeling.transformer_model`, and the output at position 0
    is returned.

    Args:
      is_training: When False, both dropout probabilities on
        `self.bert_config` are set to 0.0.
      num_transformer_layers: Number of transformer layers to stack.
      CLS_ID: Vocabulary id whose embedding is prepended.
      use_passage_pos_embedding: If True, add a learned position embedding
        to the merged sequence.

    Returns:
      The transformer output at the CLS position, [B, H].
    """
    # NOTE(review): this is the shared config object; the dropout overrides
    # below persist on `self.bert_config` after this call -- confirm callers
    # expect that.
    bert_config = self.bert_config
    segment_outputs = self.output_layer
    embedding_table = self.model.get_embedding_table()
    num_positions = self.max_num_segments_perdoc + 1

    cls_id = tf.Variable(
        [CLS_ID], dtype=tf.int32, trainable=False, name='clsid_tf')
    cls_embedding = tf.nn.embedding_lookup(embedding_table, cls_id)
    cls_per_example = tf.tile(
        cls_embedding, multiples=[self.batch_size, 1])  # [B, H]

    merged_output = tf.concat(
        (tf.expand_dims(cls_per_example, axis=1), segment_outputs),
        axis=1)  # [B, N + 1, H]

    if use_passage_pos_embedding:
        with tf.variable_scope(self.scope):
            position_embeddings = tf.get_variable(
                name="passage_position_embedding",
                shape=[num_positions, self.hidden_size],
                initializer=self.modeling.create_initializer(0.02))
            merged_output += tf.expand_dims(position_embeddings, axis=0)

    # Every query position may attend to the CLS slot plus the real segments.
    key_is_valid = tf.sequence_mask(
        self.num_segments + 1, num_positions, dtype=tf.float32)
    attention_mask = tf.tile(
        tf.expand_dims(key_is_valid, axis=1), [1, num_positions, 1])

    with tf.variable_scope(self.scope):
        with tf.variable_scope("parade_transformer"):
            if not is_training:
                bert_config.hidden_dropout_prob = 0.0
                bert_config.attention_probs_dropout_prob = 0.0
            transformed, _ = self.modeling.transformer_model(
                input_tensor=merged_output,
                attention_mask=attention_mask,
                hidden_size=bert_config.hidden_size,
                num_hidden_layers=num_transformer_layers,
                num_attention_heads=bert_config.num_attention_heads,
                intermediate_size=bert_config.intermediate_size,
                hidden_dropout_prob=bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    bert_config.attention_probs_dropout_prob),
                initializer_range=bert_config.initializer_range,
                do_return_all_layers=False)  # [B, N + 1, H]

    # Keep only the CLS position.
    return tf.squeeze(transformed[:, 0:1, :], axis=1)  # [B, H]
def get_data(self):
    """Generates a random batch with lengths rounded up to a multiple of 4.

    Returns:
      A tuple `(x, x_mask, x_lengths)`: random-normal data shaped
      (BATCH_SIZE, TARGET_LENGTH, N_CHANNELS), its sequence mask of width
      TARGET_LENGTH, and the integer lengths as a NumPy array.
    """
    x = tf.random_normal(
        (BATCH_SIZE, TARGET_LENGTH, N_CHANNELS), dtype=DTYPE)

    raw_lengths = np.random.randint(
        low=1, high=TARGET_LENGTH + 1, size=BATCH_SIZE)
    # Round every length up to the next multiple of 4.
    x_lengths = (np.ceil(raw_lengths / 4.0) * 4.0).astype(int)

    x_mask = tf.sequence_mask(x_lengths, maxlen=TARGET_LENGTH, dtype=DTYPE)
    return x, x_mask, x_lengths
def _attention_unit(self, queries, keys, keys_len):
    """Pools `keys` with attention weights computed against `queries`.

    Args:
      queries: B * K query vectors.
      keys: B * seq * K key vectors.
      keys_len: Number of valid keys per example.

    Returns:
      The pooled keys, B * 1 * K.
    """
    if self.use_tf_attention:
        # NOTE(review): this branch masks with self.user_interacted_len and
        # does not read `keys_len` -- confirm they are meant to be the same.
        query_masks = tf.cast(
            tf.ones_like(tf.reshape(self.user_interacted_len, [-1, 1])),
            dtype=tf.bool)
        key_masks = tf.sequence_mask(
            self.user_interacted_len, self.max_seq_len)
        queries = tf.expand_dims(queries, axis=1)
        attention = tf.keras.layers.Attention(use_scale=False)
        return attention(
            inputs=[queries, keys], mask=[query_masks, key_masks])

    # queries: B * K -> B * seq * K, repeated along the sequence axis.
    tiled_queries = tf.tile(
        tf.expand_dims(queries, axis=1), [1, self.max_seq_len, 1])
    interactions = tf.concat(
        [tiled_queries, keys, tiled_queries - keys, tiled_queries * keys],
        axis=2)

    hidden = dense_nn(interactions, (16,), use_bn=False,
                      activation=tf.nn.sigmoid, name="attention")
    # B * seq * 1, flattened to B * seq.
    raw_weights = tf.layers.flatten(
        tf.layers.dense(hidden, units=1, activation=None))

    # Padded keys get a very negative score so softmax ignores them.
    key_masks = tf.sequence_mask(keys_len, self.max_seq_len)
    fill = tf.ones_like(raw_weights) * (-2**32 + 1)
    masked_weights = tf.where(key_masks, raw_weights, fill)

    scale = tf.sqrt(tf.cast(keys.get_shape().as_list()[-1], tf.float32))
    scaled_weights = tf.div_no_nan(masked_weights, scale)

    # B * 1 * seq
    attention_scores = tf.expand_dims(tf.nn.softmax(scaled_weights), 1)
    # B * 1 * K
    return attention_scores @ keys
def b2a_attention(logits, a, mask_a=None):
    """Context-to-query attention.

    Args:
      logits: Attention logits, [bs, len_b, len_a].
      a: Values to attend over, [bs, len_a, d].
      mask_a: Optional mask over `a`. A rank-1 tensor is treated as sequence
        lengths and expanded with `tf.sequence_mask`; a rank-2 mask gets a
        broadcast axis inserted at position 1. `None` (the default) applies
        no masking.

    Returns:
      b2a: The attended values, [bs, len_b, d].
    """
    # The default `mask_a=None` previously crashed on `mask_a.get_shape()`;
    # treat it as "nothing to mask".
    if mask_a is not None:
        if len(mask_a.get_shape()) == 1:
            mask_a = tf.sequence_mask(mask_a, tf.shape(a)[1])
        if len(mask_a.get_shape()) == 2:
            mask_a = tf.expand_dims(mask_a, 1)
        logits = exp_mask(logits, mask_a, mask_is_length=False)

    probabilities = tf.nn.softmax(logits)  # [bs, len_b, len_a]
    b2a = tf.matmul(probabilities, a)  # [bs, len_b, d]
    return b2a
def build_bid_rnn(inputs, seq_len, num_units, name):
    """Build the bi-directional RNN.

    Args:
      inputs: Dict of RNN inputs; must contain "rnn". It is not modified: the
        backward pass works on a shallow copy extended with a "padding" entry.
      seq_len: Per-example sequence lengths.
      num_units: Number of units of each direction.
      name: Name prefix for the two cells.

    Returns:
      Forward and backward outputs concatenated along the last axis.
    """
    max_seq_len = tf.reduce_max(seq_len)
    forward = build_uni_rnn(inputs, max_seq_len, num_units,
                            name + "/fw/cell_fn/basic_lstm_cell", False)

    rnn_input = inputs["rnn"]
    backward_inputs = dict(inputs)
    # Transposed so that its leading axis matches `rnn_input`'s leading axis.
    backward_inputs["padding"] = tf.transpose(
        tf.sequence_mask(seq_len, rnn_input.shape[0], rnn_input.dtype))
    backward = build_uni_rnn(backward_inputs, max_seq_len, num_units,
                             name + "/bw/cell_fn/basic_lstm_cell", True)

    return tf.concat([forward, backward], -1)
def get_data(self):
    """Generates a random batch and the sequence mask for random lengths.

    Returns:
      A tuple `(x, x_mask)`: random-normal data shaped
      (BATCH_SIZE, TARGET_LENGTH, N_CHANNELS) and a float32 mask of width
      TARGET_LENGTH built from lengths drawn from 1..TARGET_LENGTH.
    """
    batch_shape = (BATCH_SIZE, TARGET_LENGTH, N_CHANNELS)
    x = tf.random_normal(batch_shape, mean=0.0, stddev=1.0)
    lengths = np.random.randint(
        low=1, high=TARGET_LENGTH + 1, size=BATCH_SIZE)
    x_mask = tf.sequence_mask(
        lengths, maxlen=TARGET_LENGTH, dtype=tf.float32)
    return x, x_mask
def build_predictions_layer(self):
    """Unpacks the RNN outputs and registers analysis / evaluation ops.

    Which tensors `self.outputs` holds depends on
    `self.use_temporal_latent_space` and `self.use_variational_pi`. The
    output sample is the output mean concatenated with the rounded pen
    signal.
    """
    use_latent = self.use_temporal_latent_space
    use_pi = self.use_variational_pi

    # Assign rnn outputs.
    if use_latent and use_pi:
        (self.q_mu, self.q_sigma, self.p_mu, self.p_sigma, self.gmm_z,
         self.q_pi, self.p_pi, self.out_mu, self.out_sigma, self.out_rho,
         self.out_pen, self.out_eoc) = self.outputs
    elif use_latent:
        (self.q_mu, self.q_sigma, self.p_mu, self.p_sigma, self.gmm_z,
         self.q_pi, self.out_mu, self.out_sigma, self.out_rho, self.out_pen,
         self.out_eoc) = self.outputs
    elif use_pi:
        (self.gmm_z, self.q_pi, self.p_pi, self.out_mu, self.out_sigma,
         self.out_rho, self.out_pen, self.out_eoc) = self.outputs

    # TODO: Sampling option.
    self.output_sample = tf.concat(
        [self.out_mu, tf.round(self.out_pen)], axis=2)
    self.input_sample = self.inputs
    self.output_dim = self.output_sample.shape.as_list()[-1]

    # For analysis.
    # NOTE(review): p_*/q_* are only unpacked above when
    # use_temporal_latent_space is set -- confirm they exist otherwise.
    self.norm_p_mu = tf.norm(self.p_mu, axis=-1)
    self.norm_p_sigma = tf.norm(self.p_sigma, axis=-1)
    self.norm_q_mu = tf.norm(self.q_mu, axis=-1)
    self.norm_q_sigma = tf.norm(self.q_sigma, axis=-1)
    self.norm_out_mu = tf.norm(self.out_mu, axis=-1)
    self.norm_out_sigma = tf.norm(self.out_sigma, axis=-1)

    evaluation = self.ops_evaluation
    evaluation['output_sample'] = self.output_sample
    if use_latent:
        evaluation['p_mu'] = self.p_mu
        evaluation['p_sigma'] = self.p_sigma
        evaluation['q_mu'] = self.q_mu
        evaluation['q_sigma'] = self.q_sigma
    if use_pi:
        evaluation['p_pi'] = tf.nn.softmax(self.p_pi, axis=-1)
    evaluation['q_pi'] = tf.nn.softmax(self.q_pi, axis=-1)
    evaluation['gmm_z'] = self.gmm_z
    evaluation['state'] = self.output_state
    evaluation['out_eoc'] = self.out_eoc

    # In case we want to draw samples from output distribution instead of
    # using mean.
    evaluation['out_mu'] = self.out_mu
    evaluation['out_sigma'] = self.out_sigma
    evaluation['out_rho'] = self.out_rho
    evaluation['out_pen'] = self.out_pen

    # Visualize average gmm sigma values.
    if self.is_gmm_active:
        self.ops_scalar_summary["mean_gmm_sigma"] = tf.reduce_mean(
            self.gmm_sigma)

    # Sequence mask for precise loss calculation; the trailing axis lets it
    # broadcast over the feature dimension.
    step_is_valid = tf.sequence_mask(
        lengths=self.input_seq_length,
        maxlen=tf.reduce_max(self.input_seq_length),
        dtype=tf.float32)
    self.seq_loss_mask = tf.expand_dims(step_is_valid, -1)
def construct_lmcost(self, input_tensor_fw, input_tensor_bw,
                     sentence_lengths, target_ids, lmcost_type, name):
    """Builds the language-modelling cost for the forward/backward states.

    Target ids at or above the capped vocabulary size minus one are all
    mapped to that last id.

    Args:
      input_tensor_fw: Forward states, indexed (batch, position, dim).
      input_tensor_bw: Backward states, indexed the same way.
      sentence_lengths: Number of valid tokens per sentence.
      target_ids: Token ids, (batch, position).
      lmcost_type: "separate" (forward states predict the next token and
        backward states the previous one, costs summed) or "joint" (both
        states, concatenated, predict the token between them).
      name: Variable scope name and prefix for the sub-cost names.

    Returns:
      The language-modelling cost.

    Raises:
      ValueError: If `lmcost_type` is neither "separate" nor "joint".
    """
    with tf.variable_scope(name):
        vocab_size = min(len(self.word2id),
                         self.config["lmcost_max_vocab_size"])
        overflow_id = vocab_size - 1
        target_ids = tf.where(
            tf.greater_equal(target_ids, overflow_id),
            x=overflow_id + tf.zeros_like(target_ids),
            y=target_ids)

        def token_mask():
            return tf.sequence_mask(
                sentence_lengths, maxlen=tf.shape(target_ids)[1])

        cost = 0.0
        if lmcost_type == "separate":
            is_token = token_mask()
            # Forward: state at position t predicts token t + 1.
            forward_cost = self._construct_lmcost(
                input_tensor_fw[:, :-1, :], vocab_size, is_token[:, 1:],
                target_ids[:, 1:], name=name + "_fw")
            # Backward: state at position t + 1 predicts token t.
            backward_cost = self._construct_lmcost(
                input_tensor_bw[:, 1:, :], vocab_size, is_token[:, :-1],
                target_ids[:, :-1], name=name + "_bw")
            cost += forward_cost + backward_cost
        elif lmcost_type == "joint":
            # Forward state at t - 1 and backward state at t + 1 jointly
            # predict token t.
            joint_states = tf.concat(
                [input_tensor_fw[:, :-2, :], input_tensor_bw[:, 2:, :]],
                axis=-1)
            cost += self._construct_lmcost(
                joint_states, vocab_size, token_mask()[:, 1:-1],
                target_ids[:, 1:-1], name=name + "_joint")
        else:
            raise ValueError("Unknown lmcost_type: " + str(lmcost_type))
        return cost