def cudnn_bidirectional_lstm(cells_fw, cells_bw, inputs, length, is_training):
  """Implements stacked bidirectional LSTM for variable-length inputs."""
  inputs_fw = tf.transpose(inputs, [1, 0, 2])
  for lstm_fw, lstm_bw in zip(cells_fw, cells_bw):
    outputs_fw, _ = lstm_fw(inputs_fw, training=is_training)
    inputs_bw = tf.reverse_sequence(
        inputs_fw, length, seq_axis=0, batch_axis=1)
    outputs_bw, _ = lstm_bw(inputs_bw, training=is_training)
    outputs_bw = tf.reverse_sequence(
        outputs_bw, length, seq_axis=0, batch_axis=1)
    inputs_fw = tf.concat([outputs_fw, outputs_bw], axis=2)
  return outputs_fw, outputs_bw
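# A hedged sketch of how the cells_fw / cells_bw arguments above might be
# built; it mirrors the contrib_cudnn_rnn.CudnnLSTM construction used in
# cudnn_lstm_layer further down.  `num_layers`, `hidden_size`, `inputs` and
# `length` are assumed names for illustration, not part of the snippet above.
cells_fw = [contrib_cudnn_rnn.CudnnLSTM(num_layers=1, num_units=hidden_size,
                                        direction='unidirectional')
            for _ in range(num_layers)]
cells_bw = [contrib_cudnn_rnn.CudnnLSTM(num_layers=1, num_units=hidden_size,
                                        direction='unidirectional')
            for _ in range(num_layers)]
outputs_fw, outputs_bw = cudnn_bidirectional_lstm(
    cells_fw, cells_bw, inputs, length, is_training=True)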
def lstm_seq2seq_internal(inputs, targets, hparams, train):
  """The basic LSTM seq2seq model, main step used for training."""
  with tf.variable_scope("lstm_seq2seq"):
    if inputs is not None:
      inputs_length = common_layers.length_from_embedding(inputs)
      # Flatten inputs.
      inputs = common_layers.flatten4d3d(inputs)
      # LSTM encoder.
      inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
      _, final_encoder_state = lstm(inputs, inputs_length, hparams, train,
                                    "encoder")
    else:
      final_encoder_state = None
    # LSTM decoder.
    shifted_targets = common_layers.shift_right(targets)
    # Add 1 to account for the padding added to the left from shift_right.
    targets_length = common_layers.length_from_embedding(shifted_targets) + 1
    decoder_outputs, _ = lstm(
        common_layers.flatten4d3d(shifted_targets),
        targets_length,
        hparams,
        train,
        "decoder",
        initial_state=final_encoder_state)
    return tf.expand_dims(decoder_outputs, axis=2)
def _single_lstm(input_emb, input_len, hidden_size, is_fwd, use_cudnn):
  """Compute the outputs of a single LSTM (subroutine of stacked_bilstm).

  Be careful if used anywhere outside of stacked_bilstm, which converts the
  sequences to the time-major format expected by this function.

  Args:
    input_emb: <float32> [sequence_length, batch_size, emb]
    input_len: <int32> [batch_size]
    hidden_size: Number of units in the LSTM cell.
    is_fwd: Boolean indicating the directionality of the LSTM.
    use_cudnn: Boolean indicating whether to use cudnn.

  Returns:
    output_emb: <float32> [sequence_length, batch_size, emb]
  """
  if not is_fwd:
    input_emb = tf.reverse_sequence(
        input_emb, input_len, seq_axis=0, batch_axis=1)
  if use_cudnn:
    lstm = contrib_cudnn_rnn.CudnnLSTM(
        num_layers=1,
        num_units=hidden_size,
        input_mode=cudnn_rnn_ops.CUDNN_INPUT_LINEAR_MODE,
        direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
    lstm.build(input_emb.shape)
    output_emb, _ = lstm(input_emb)
  else:
    cell = contrib_cudnn_rnn.CudnnCompatibleLSTMCell(hidden_size)
    cell = contrib_rnn.MultiRNNCell([cell])
    output_emb, _ = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=input_emb,
        sequence_length=input_len,
        dtype=tf.float32,
        time_major=True)
  if not is_fwd:
    output_emb = tf.reverse_sequence(
        output_emb, input_len, seq_axis=0, batch_axis=1)
  return output_emb
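# The docstring above stresses that _single_lstm expects time-major input; a
# minimal sketch of the conversion a caller such as stacked_bilstm would do.
# `batch_major_emb`, `input_len` and the scope names are assumed for
# illustration; TF1-style variable scoping (as in the surrounding snippets).
time_major_emb = tf.transpose(batch_major_emb, [1, 0, 2])   # [T, B, emb]
with tf.variable_scope("fw"):
  fwd_out = _single_lstm(time_major_emb, input_len, hidden_size=128,
                         is_fwd=True, use_cudnn=False)
with tf.variable_scope("bw"):
  bwd_out = _single_lstm(time_major_emb, input_len, hidden_size=128,
                         is_fwd=False, use_cudnn=False)
# Concatenate directions and return to batch-major order: [B, T, 2 * 128].
bi_out = tf.transpose(tf.concat([fwd_out, bwd_out], axis=2), [1, 0, 2])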
def build_graph(parameters):
  """Build the graph for reverse_sequence tests."""
  input_value = tf.compat.v1.placeholder(
      dtype=parameters["input_dtype"],
      name="input",
      shape=parameters["input_shape"])
  outs = tf.reverse_sequence(
      input_value,
      seq_lengths=parameters["seq_lengths"],
      batch_axis=parameters["batch_axis"],
      seq_axis=parameters["seq_axis"])
  return [input_value], [outs]
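# For reference, a minimal concrete sketch (not part of the test above) of the
# op this graph exercises: tf.reverse_sequence flips each batch row i along
# seq_axis over its first seq_lengths[i] positions and leaves the rest alone.
import tensorflow as tf

batch = tf.constant([[1, 2, 3, 4],
                     [5, 6, 7, 8]])
flipped = tf.reverse_sequence(batch, seq_lengths=[2, 3],
                              seq_axis=1, batch_axis=0)
# flipped == [[2, 1, 3, 4],
#             [7, 6, 5, 8]]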
def birnn(cell, inputs, sequence_length, initial_state_fw=None,
          initial_state_bw=None, ff_keep_prob=1., recur_keep_prob=1.,
          enforce_dropout=False, dtype=tf.float32, scope=None):
  """Bidirectional RNN: runs `cell` forward and backward over `inputs` and
  concatenates the two output sequences."""
  # Forward direction.
  with tf.variable_scope(scope or 'BiRNN_FW') as fw_scope:
    output_fw, output_state_fw = rnn(cell, inputs, sequence_length,
                                     initial_state_fw, ff_keep_prob,
                                     recur_keep_prob, enforce_dropout, dtype,
                                     scope=fw_scope)

  # Backward direction: reverse the inputs, run the RNN, then reverse the
  # outputs back so they align with the forward outputs.
  rev_inputs = tf.reverse_sequence(inputs, sequence_length, 1, 0)
  with tf.variable_scope(scope or 'BiRNN_BW') as bw_scope:
    output_bw, output_state_bw = rnn(cell, rev_inputs, sequence_length,
                                     initial_state_bw, ff_keep_prob,
                                     recur_keep_prob, enforce_dropout, dtype,
                                     scope=bw_scope)
  output_bw = tf.reverse_sequence(output_bw, sequence_length, 1, 0)

  # Concat each of the forward/backward outputs along the feature axis.
  outputs = tf.concat([output_fw, output_bw], 2)
  return outputs, tf.tuple([output_state_fw, output_state_bw])
def testScanSumEquivalenceWithSeqLen(self):
  with self.test_session() as sess:
    sequence_lengths = [0, 2]
    bootstrap = tf.constant([0.5, 1.5], dtype=tf.float32)

    sequence = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
    decays = [[.1, .2, .3, .4, .5], [.6, .7, .8, .9, .10]]

    eq_sequence = [[0, 0, 0, 0, 0], [6, 7, 0, 0, 0]]
    eq_decays = [[0, 0, 0, 0, 0], [.6, .7, 0, 0, 0]]

    eq_reverse_sequence = [[0, 0, 0, 0, 0], [7, 6, 0, 0, 0]]
    eq_reverse_decays = [[0, 0, 0, 0, 0], [.7, .6, 0, 0, 0]]

    # We use transpose because it is easier to define the input data in
    # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
    sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32))
    decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32))
    eq_sequence_in = tf.transpose(tf.constant(eq_sequence, dtype=tf.float32))
    eq_decays_in = tf.transpose(tf.constant(eq_decays, dtype=tf.float32))
    eq_reverse_sequence_in = tf.transpose(
        tf.constant(eq_reverse_sequence, dtype=tf.float32))
    eq_reverse_decays_in = tf.transpose(
        tf.constant(eq_reverse_decays, dtype=tf.float32))

    eq_result = sequence_ops.scan_discounted_sum(
        sequence_in, decays_in, bootstrap, reverse=False,
        sequence_lengths=sequence_lengths)
    exp_eq_result = sequence_ops.scan_discounted_sum(
        eq_sequence_in, eq_decays_in, bootstrap)

    eq_reverse_result = sequence_ops.scan_discounted_sum(
        sequence_in, decays_in, bootstrap, reverse=True,
        sequence_lengths=sequence_lengths)
    exp_eq_reverse_result = sequence_ops.scan_discounted_sum(
        eq_reverse_sequence_in, eq_reverse_decays_in, bootstrap)
    exp_eq_reverse_result = tf.reverse_sequence(
        exp_eq_reverse_result, sequence_lengths, seq_axis=0, batch_axis=1)

    self.assertAllClose(sess.run(eq_result), sess.run(exp_eq_result))
    self.assertAllClose(sess.run(eq_reverse_result),
                        sess.run(exp_eq_reverse_result))
def body(self, features):
  if self._hparams.initializer == "orthogonal":
    raise ValueError("LSTM models fail with orthogonal initializer.")
  train = self._hparams.mode == tf_estimator.ModeKeys.TRAIN
  inputs = features.get("inputs")
  inputs_length = common_layers.length_from_embedding(inputs)
  # Flatten inputs.
  inputs = common_layers.flatten4d3d(inputs)
  # LSTM encoder.
  inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
  encoder_output, _ = lstm(inputs, inputs_length, self._hparams, train,
                           "encoder")
  return tf.expand_dims(encoder_output, axis=2)
def _reverse(self, t, lengths):
  """Time reverse the provided tensor or list of tensors.

  Assumes the top dimension is the time dimension.

  Args:
    t: 3D tensor or list of 2D tensors to be reversed.
    lengths: 1D tensor of lengths, or `None`.

  Returns:
    A reversed tensor or list of tensors.
  """
  if isinstance(t, list):
    return list(reversed(t))
  else:
    if lengths is None:
      return tf.reverse(t, [0])
    else:
      return tf.reverse_sequence(t, lengths, 0, 1)
def lstm_seq2seq_internal_attention(inputs, targets, hparams, train,
                                    inputs_length, targets_length):
  """LSTM seq2seq model with attention, main step used for training."""
  with tf.variable_scope("lstm_seq2seq_attention"):
    # Flatten inputs.
    inputs = common_layers.flatten4d3d(inputs)

    # LSTM encoder.
    inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
    encoder_outputs, final_encoder_state = lstm(
        inputs, inputs_length, hparams, train, "encoder")

    # LSTM decoder with attention.
    shifted_targets = common_layers.shift_right(targets)
    # Add 1 to account for the padding added to the left from shift_right.
    targets_length = targets_length + 1
    decoder_outputs = lstm_attention_decoder(
        common_layers.flatten4d3d(shifted_targets), hparams, train, "decoder",
        final_encoder_state, encoder_outputs, inputs_length, targets_length)
    return tf.expand_dims(decoder_outputs, axis=2)
def _reverse_seq(sequence, sequence_lengths=None):
  """Reverse sequence along dim 0.

  Args:
    sequence: Tensor of shape [T, B, ...].
    sequence_lengths: (optional) tensor of shape [B]. If `None`, only reverse
      along dim 0.

  Returns:
    Tensor of same shape as sequence with dim 0 reversed up to
    sequence_lengths.
  """
  if sequence_lengths is None:
    return tf.reverse(sequence, [0])

  sequence_lengths = tf.convert_to_tensor(sequence_lengths)
  with tf.control_dependencies(
      [tf.assert_equal(sequence.shape[1], sequence_lengths.shape[0])]):
    return tf.reverse_sequence(
        sequence, sequence_lengths, seq_axis=0, batch_axis=1)
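# A tiny time-major example of what the helper above returns when lengths are
# given: only the first sequence_lengths[b] steps of each column b are
# flipped (i.e. tf.reverse_sequence with seq_axis=0, batch_axis=1).  Assumes
# `import tensorflow as tf` and the _reverse_seq definition above.
seq = tf.constant([[1., 4.],
                   [2., 5.],
                   [3., 6.]])                 # [T=3, B=2]
out = _reverse_seq(seq, sequence_lengths=[2, 3])
# out == [[2., 6.],
#         [1., 5.],
#         [3., 4.]]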
def cudnn_lstm_layer(inputs,
                     batch_size,
                     num_units,
                     lengths=None,
                     stack_size=1,
                     rnn_dropout_drop_amt=0,
                     is_training=True,
                     bidirectional=True):
  """Create an LSTM layer that uses cudnn."""
  inputs_t = tf.transpose(inputs, [1, 0, 2])
  if lengths is not None:
    all_outputs = [inputs_t]
    for i in range(stack_size):
      with tf.variable_scope('stack_' + str(i)):
        with tf.variable_scope('forward'):
          lstm_fw = contrib_cudnn_rnn.CudnnLSTM(
              num_layers=1,
              num_units=num_units,
              direction='unidirectional',
              dropout=rnn_dropout_drop_amt,
              kernel_initializer=contrib_layers.variance_scaling_initializer(),
              bias_initializer=tf.zeros_initializer(),
          )
          c_fw = tf.zeros([1, batch_size, num_units], tf.float32)
          h_fw = tf.zeros([1, batch_size, num_units], tf.float32)
          outputs_fw, _ = lstm_fw(
              all_outputs[-1], (h_fw, c_fw), training=is_training)

        combined_outputs = outputs_fw

        if bidirectional:
          with tf.variable_scope('backward'):
            lstm_bw = contrib_cudnn_rnn.CudnnLSTM(
                num_layers=1,
                num_units=num_units,
                direction='unidirectional',
                dropout=rnn_dropout_drop_amt,
                kernel_initializer=contrib_layers.variance_scaling_initializer(),
                bias_initializer=tf.zeros_initializer(),
            )
            c_bw = tf.zeros([1, batch_size, num_units], tf.float32)
            h_bw = tf.zeros([1, batch_size, num_units], tf.float32)
            inputs_reversed = tf.reverse_sequence(
                all_outputs[-1], lengths, seq_axis=0, batch_axis=1)
            outputs_bw, _ = lstm_bw(
                inputs_reversed, (h_bw, c_bw), training=is_training)
            outputs_bw = tf.reverse_sequence(
                outputs_bw, lengths, seq_axis=0, batch_axis=1)

          combined_outputs = tf.concat([outputs_fw, outputs_bw], axis=2)

        all_outputs.append(combined_outputs)

    # For consistency with cudnn, here we just return the top of the stack,
    # although this can easily be altered to do other things, including be
    # more resnet like.
    return tf.transpose(all_outputs[-1], [1, 0, 2])
  else:
    lstm = contrib_cudnn_rnn.CudnnLSTM(
        num_layers=stack_size,
        num_units=num_units,
        direction='bidirectional' if bidirectional else 'unidirectional',
        dropout=rnn_dropout_drop_amt,
        kernel_initializer=contrib_layers.variance_scaling_initializer(),
        bias_initializer=tf.zeros_initializer(),
    )
    stack_multiplier = 2 if bidirectional else 1
    c = tf.zeros([stack_multiplier * stack_size, batch_size, num_units],
                 tf.float32)
    h = tf.zeros([stack_multiplier * stack_size, batch_size, num_units],
                 tf.float32)
    outputs, _ = lstm(inputs_t, (h, c), training=is_training)
    outputs = tf.transpose(outputs, [1, 0, 2])
    return outputs
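# Hedged usage sketch for cudnn_lstm_layer above; the input tensor, its shapes
# and the hyperparameter values are assumed for illustration, and a
# CUDA-capable GPU is required for the CudnnLSTM kernels.
features = tf.zeros([8, 100, 229], tf.float32)   # assumed [batch, time, feats]
lengths = tf.fill([8], 100)                      # per-example sequence lengths
rnn_out = cudnn_lstm_layer(
    features, batch_size=8, num_units=128, lengths=lengths,
    stack_size=2, is_training=True, bidirectional=True)
# rnn_out has shape [8, 100, 256]: forward and backward units concatenated.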
def _build_lstms(self):
  # now the LSTMs
  # these will collect the initial states for the forward
  # (and reverse LSTMs if we are doing bidirectional)

  # parse the options
  lstm_dim = self.options['lstm']['dim']
  projection_dim = self.options['lstm']['projection_dim']
  n_lstm_layers = self.options['lstm'].get('n_layers', 1)
  cell_clip = self.options['lstm'].get('cell_clip')
  proj_clip = self.options['lstm'].get('proj_clip')
  use_skip_connections = self.options['lstm']['use_skip_connections']
  # if use_skip_connections:
  #   print("USING SKIP CONNECTIONS", file=sys.stderr)
  # else:
  #   print("NOT USING SKIP CONNECTIONS", file=sys.stderr)

  # the sequence lengths from input mask
  if self.use_character_inputs:
    mask = tf.reduce_any(self.ids_placeholder > 0, axis=2)
  else:
    mask = self.ids_placeholder > 0
  sequence_lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
  batch_size = tf.shape(sequence_lengths)[0]

  # for each direction, we'll store tensors for each layer
  self.lstm_outputs = {'forward': [], 'backward': []}
  self.lstm_state_sizes = {'forward': [], 'backward': []}
  self.lstm_init_states = {'forward': [], 'backward': []}
  self.lstm_final_states = {'forward': [], 'backward': []}

  update_ops = []
  for direction in ['forward', 'backward']:
    if direction == 'forward':
      layer_input = self.embedding
    else:
      layer_input = tf.reverse_sequence(
          self.embedding, sequence_lengths, seq_axis=1, batch_axis=0)

    for i in range(n_lstm_layers):
      if projection_dim < lstm_dim:
        # are projecting down output
        lstm_cell = tf.nn.rnn_cell.LSTMCell(
            lstm_dim, num_proj=projection_dim,
            cell_clip=cell_clip, proj_clip=proj_clip)
      else:
        lstm_cell = tf.nn.rnn_cell.LSTMCell(
            lstm_dim, cell_clip=cell_clip, proj_clip=proj_clip)

      if use_skip_connections:
        # ResidualWrapper adds inputs to outputs
        if i == 0:
          # don't add skip connection from token embedding to
          # 1st layer output
          pass
        else:
          # add a skip connection
          lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)

      # collect the input state, run the dynamic rnn, collect
      # the output
      state_size = lstm_cell.state_size
      # the LSTMs are stateful.  To support multiple batch sizes,
      # we'll allocate size for states up to max_batch_size,
      # then use the first batch_size entries for each batch
      init_states = [
          tf.Variable(tf.zeros([self._max_batch_size, dim]), trainable=False)
          for dim in state_size
      ]
      batch_init_states = [
          state[:batch_size, :] for state in init_states
      ]

      if direction == 'forward':
        i_direction = 0
      else:
        i_direction = 1
      f_string = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'
      variable_scope_name = f_string.format(i_direction, i)
      with tf.variable_scope(variable_scope_name):
        layer_output, final_state = tf.nn.dynamic_rnn(
            lstm_cell,
            layer_input,
            sequence_length=sequence_lengths,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(*batch_init_states),
        )

      self.lstm_state_sizes[direction].append(state_size)
      self.lstm_init_states[direction].append(init_states)
      self.lstm_final_states[direction].append(final_state)
      if direction == 'forward':
        self.lstm_outputs[direction].append(layer_output)
      else:
        self.lstm_outputs[direction].append(
            tf.reverse_sequence(layer_output, sequence_lengths,
                                seq_axis=1, batch_axis=0))

      with tf.control_dependencies([layer_output]):
        # update the initial states
        for in_st in range(2):
          new_state = tf.concat([
              final_state[in_st][:batch_size, :],
              init_states[in_st][batch_size:, :]
          ], axis=0)
          state_update_op = tf.assign(init_states[in_st], new_state)
          update_ops.append(state_update_op)

      layer_input = layer_output

  self.mask = mask
  self.sequence_lengths = sequence_lengths
  self.update_state_op = tf.group(*update_ops)
def _build_ops(lm_graph):
  with tf.control_dependencies([lm_graph.update_state_op]):
    # get the LM embeddings
    token_embeddings = lm_graph.embedding
    layers = [tf.concat([token_embeddings, token_embeddings], axis=2)]

    n_lm_layers = len(lm_graph.lstm_outputs['forward'])
    for i in range(n_lm_layers):
      layers.append(
          tf.concat([
              lm_graph.lstm_outputs['forward'][i],
              lm_graph.lstm_outputs['backward'][i]
          ], axis=-1))

    # The layers include the BOS/EOS tokens.  Remove them.
    sequence_length_wo_bos_eos = lm_graph.sequence_lengths - 2
    layers_without_bos_eos = []
    for layer in layers:
      layer_wo_bos_eos = layer[:, 1:, :]
      layer_wo_bos_eos = tf.reverse_sequence(
          layer_wo_bos_eos,
          lm_graph.sequence_lengths - 1,
          seq_axis=1,
          batch_axis=0,
      )
      layer_wo_bos_eos = layer_wo_bos_eos[:, 1:, :]
      layer_wo_bos_eos = tf.reverse_sequence(
          layer_wo_bos_eos,
          sequence_length_wo_bos_eos,
          seq_axis=1,
          batch_axis=0,
      )
      layers_without_bos_eos.append(layer_wo_bos_eos)

    # concatenate the layers
    lm_embeddings = tf.concat(
        [tf.expand_dims(t, axis=1) for t in layers_without_bos_eos], axis=1)

    # get the mask op without bos/eos.
    # tf doesn't support reversing boolean tensors, so cast
    # to int then back
    mask_wo_bos_eos = tf.cast(lm_graph.mask[:, 1:], 'int32')
    mask_wo_bos_eos = tf.reverse_sequence(
        mask_wo_bos_eos,
        lm_graph.sequence_lengths - 1,
        seq_axis=1,
        batch_axis=0,
    )
    mask_wo_bos_eos = mask_wo_bos_eos[:, 1:]
    mask_wo_bos_eos = tf.reverse_sequence(
        mask_wo_bos_eos,
        sequence_length_wo_bos_eos,
        seq_axis=1,
        batch_axis=0,
    )
    mask_wo_bos_eos = tf.cast(mask_wo_bos_eos, 'bool')

  return {
      'lm_embeddings': lm_embeddings,
      'lengths': sequence_length_wo_bos_eos,
      'token_embeddings': lm_graph.embedding,
      'mask': mask_wo_bos_eos,
  }