def rnn_estimator(x, y):
    """RNN estimator with target predictor function on top."""
    x = input_op_fn(x)
    if cell_type == 'rnn':
        cell_fn = nn.rnn_cell.BasicRNNCell
    elif cell_type == 'gru':
        cell_fn = nn.rnn_cell.GRUCell
    elif cell_type == 'lstm':
        cell_fn = nn.rnn_cell.BasicLSTMCell
    else:
        raise ValueError('cell_type {} is not supported.'.format(cell_type))
    if bidirectional:
        # forward direction cell
        rnn_fw_cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] * num_layers)
        # backward direction cell
        rnn_bw_cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] * num_layers)
        # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
        _, encoding = bidirectional_rnn(rnn_fw_cell, rnn_bw_cell, x,
                                        dtype=dtypes.float32,
                                        sequence_length=sequence_length,
                                        initial_state_fw=initial_state,
                                        initial_state_bw=initial_state)
    else:
        cell = nn.rnn_cell.MultiRNNCell([cell_fn(rnn_size)] * num_layers)
        _, encoding = nn.rnn(cell, x,
                             dtype=dtypes.float32,
                             sequence_length=sequence_length,
                             initial_state=initial_state)
    return target_predictor_fn(encoding, y)

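# rnn_estimator above is a closure: cell_type, rnn_size, num_layers,
# bidirectional, sequence_length, initial_state, input_op_fn and
# target_predictor_fn all come from an enclosing scope. A minimal sketch of
# such an enclosing factory (name and signature assumed for illustration,
# not confirmed by the snippet itself):
def get_rnn_model(rnn_size, cell_type, num_layers, input_op_fn, bidirectional,
                  target_predictor_fn, sequence_length=None,
                  initial_state=None):
    def rnn_estimator(x, y):
        ...  # body as above, closing over the factory arguments
    return rnn_estimator
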
def rnn_estimator(x, y):
    """RNN estimator with target predictor function on top."""
    x = input_op_fn(x)
    if cell_type == 'rnn':
        cell_fn = nn.rnn_cell.BasicRNNCell
    elif cell_type == 'gru':
        cell_fn = nn.rnn_cell.GRUCell
    elif cell_type == 'lstm':
        cell_fn = nn.rnn_cell.BasicLSTMCell
    else:
        raise ValueError('cell_type {} is not supported.'.format(cell_type))
    # TODO: state_is_tuple=False is deprecated
    if bidirectional:
        # forward direction cell
        fw_cell = cell_fn(rnn_size)
        bw_cell = cell_fn(rnn_size)
        # attach attention cells if specified
        if attn_length is not None:
            fw_cell = contrib_rnn.AttentionCellWrapper(
                fw_cell, attn_length=attn_length, attn_size=attn_size,
                attn_vec_size=attn_vec_size, state_is_tuple=False)
            # Bug fix: wrap bw_cell (not fw_cell again) so the backward
            # direction gets its own attention wrapper.
            bw_cell = contrib_rnn.AttentionCellWrapper(
                bw_cell, attn_length=attn_length, attn_size=attn_size,
                attn_vec_size=attn_vec_size, state_is_tuple=False)
        rnn_fw_cell = nn.rnn_cell.MultiRNNCell([fw_cell] * num_layers)
        # backward direction cell
        rnn_bw_cell = nn.rnn_cell.MultiRNNCell([bw_cell] * num_layers)
        # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
        _, encoding = bidirectional_rnn(rnn_fw_cell, rnn_bw_cell, x,
                                        dtype=dtypes.float32,
                                        sequence_length=sequence_length,
                                        initial_state_fw=initial_state,
                                        initial_state_bw=initial_state)
    else:
        rnn_cell = cell_fn(rnn_size)
        if attn_length is not None:
            rnn_cell = contrib_rnn.AttentionCellWrapper(
                rnn_cell, attn_length=attn_length, attn_size=attn_size,
                attn_vec_size=attn_vec_size, state_is_tuple=False)
        cell = nn.rnn_cell.MultiRNNCell([rnn_cell] * num_layers)
        _, encoding = nn.rnn(cell, x,
                             dtype=dtypes.float32,
                             sequence_length=sequence_length,
                             initial_state=initial_state)
    return target_predictor_fn(encoding, y)

def rnn_estimator(x, y):
    """RNN estimator with target predictor function on top."""
    x = input_op_fn(x)
    if cell_type == 'rnn':
        cell_fn = nn.rnn_cell.BasicRNNCell
    elif cell_type == 'gru':
        cell_fn = nn.rnn_cell.GRUCell
    elif cell_type == 'lstm':
        cell_fn = functools.partial(
            nn.rnn_cell.BasicLSTMCell, state_is_tuple=False)
    else:
        raise ValueError('cell_type {} is not supported.'.format(cell_type))
    # TODO: state_is_tuple=False is deprecated
    if bidirectional:
        # forward direction cell
        fw_cell = cell_fn(rnn_size)
        bw_cell = cell_fn(rnn_size)
        # attach attention cells if specified
        if attn_length is not None:
            fw_cell = contrib_rnn.AttentionCellWrapper(
                fw_cell, attn_length=attn_length, attn_size=attn_size,
                attn_vec_size=attn_vec_size, state_is_tuple=False)
            # Bug fix: wrap bw_cell (not fw_cell again) so the backward
            # direction gets its own attention wrapper.
            bw_cell = contrib_rnn.AttentionCellWrapper(
                bw_cell, attn_length=attn_length, attn_size=attn_size,
                attn_vec_size=attn_vec_size, state_is_tuple=False)
        rnn_fw_cell = nn.rnn_cell.MultiRNNCell([fw_cell] * num_layers,
                                               state_is_tuple=False)
        # backward direction cell
        rnn_bw_cell = nn.rnn_cell.MultiRNNCell([bw_cell] * num_layers,
                                               state_is_tuple=False)
        # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
        _, encoding = bidirectional_rnn(rnn_fw_cell, rnn_bw_cell, x,
                                        dtype=dtypes.float32,
                                        sequence_length=sequence_length,
                                        initial_state_fw=initial_state,
                                        initial_state_bw=initial_state)
    else:
        rnn_cell = cell_fn(rnn_size)
        if attn_length is not None:
            rnn_cell = contrib_rnn.AttentionCellWrapper(
                rnn_cell, attn_length=attn_length, attn_size=attn_size,
                attn_vec_size=attn_vec_size, state_is_tuple=False)
        cell = nn.rnn_cell.MultiRNNCell([rnn_cell] * num_layers,
                                        state_is_tuple=False)
        _, encoding = nn.rnn(cell, x,
                             dtype=dtypes.float32,
                             sequence_length=sequence_length,
                             initial_state=initial_state)
    return target_predictor_fn(encoding, y)

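# The functools.partial call above pre-binds state_is_tuple=False so that
# cell_fn(rnn_size) has one uniform call signature across all three cell
# types. A self-contained illustration of the pattern (make_cell is a
# stand-in for this example, not a TF API):
import functools

def make_cell(num_units, state_is_tuple=True):
    return ('cell', num_units, state_is_tuple)

cell_fn = functools.partial(make_cell, state_is_tuple=False)
assert cell_fn(128) == ('cell', 128, False)
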
def rnn_seq2seq(encoder_inputs,
                decoder_inputs,
                encoder_cell,
                decoder_cell=None,
                dtype=dtypes.float32,
                scope=None):
    """RNN Sequence to Sequence model.

    Args:
      encoder_inputs: List of tensors, inputs for encoder.
      decoder_inputs: List of tensors, inputs for decoder.
      encoder_cell: RNN cell to use for encoder.
      decoder_cell: RNN cell to use for decoder, if None encoder_cell is used.
      dtype: Type to initialize encoder state with.
      scope: Scope to use, if None new will be produced.

    Returns:
      List of tensors for outputs and states for training and sampling
      sub-graphs.
    """
    with vs.variable_scope(scope or "rnn_seq2seq"):
        _, last_enc_state = nn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
        return rnn_decoder(decoder_inputs, last_enc_state,
                           decoder_cell or encoder_cell)

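# A minimal usage sketch for rnn_seq2seq, assuming the same TF 0.x-era
# symbols as above (nn, dtypes) plus a top-level `tf` import; encoder_inputs
# and decoder_inputs are time-major lists of [batch, depth] tensors, and the
# return value bundles the training and sampling sub-graphs produced by
# rnn_decoder (exact arity per that function's contract):
cell = nn.rnn_cell.GRUCell(32)
encoder_inputs = [tf.placeholder(dtypes.float32, [None, 8]) for _ in range(5)]
decoder_inputs = [tf.placeholder(dtypes.float32, [None, 8]) for _ in range(6)]
seq2seq_graphs = rnn_seq2seq(encoder_inputs, decoder_inputs, cell)
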
def bidirectional_rnn(cell_fw, cell_bw, inputs,
                      initial_state_fw=None, initial_state_bw=None,
                      dtype=None, sequence_length=None, scope=None):
    """Creates a bidirectional recurrent neural network.

    Similar to the unidirectional case (rnn) but takes input and builds
    independent forward and backward RNNs with the final forward and backward
    outputs depth-concatenated, such that the output will have the format
    [time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size
    of forward and backward cell must match. The initial state for both
    directions is zero by default (but can be set optionally) and no
    intermediate states are ever returned -- the network is fully unrolled
    for the given (passed in) length(s) of the sequence(s) or completely
    unrolled if length(s) is not given.

    Args:
      cell_fw: An instance of RNNCell, to be used for forward direction.
      cell_bw: An instance of RNNCell, to be used for backward direction.
      inputs: A length T list of inputs, each a tensor of shape
        [batch_size, cell.input_size].
      initial_state_fw: (optional) An initial state for the forward RNN.
        This must be a tensor of appropriate type and shape
        [batch_size x cell.state_size].
      initial_state_bw: (optional) Same as for initial_state_fw.
      dtype: (optional) The data type for the initial state. Required if
        either of the initial states are not provided.
      sequence_length: (optional) An int64 vector (tensor) of size
        [batch_size], containing the actual lengths for each of the sequences.
      scope: VariableScope for the created subgraph; defaults to "BiRNN".

    Returns:
      A pair (outputs, state) where:
        outputs is a length T list of outputs (one for each input), which
          are depth-concatenated forward and backward outputs.
        state is the concatenated final state of the forward and backward RNN.

    Raises:
      TypeError: If "cell_fw" or "cell_bw" is not an instance of RNNCell.
      ValueError: If inputs is None or an empty list.
    """
    if not isinstance(cell_fw, nn.rnn_cell.RNNCell):
        raise TypeError("cell_fw must be an instance of RNNCell")
    if not isinstance(cell_bw, nn.rnn_cell.RNNCell):
        raise TypeError("cell_bw must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    name = scope or "BiRNN"
    # Forward direction
    with vs.variable_scope(name + "_FW"):
        output_fw, state_fw = nn.rnn(cell_fw, inputs, initial_state_fw,
                                     dtype, sequence_length)
    # Backward direction
    with vs.variable_scope(name + "_BW"):
        tmp, state_bw = nn.rnn(cell_bw, _reverse_seq(inputs, sequence_length),
                               initial_state_bw, dtype, sequence_length)
    output_bw = _reverse_seq(tmp, sequence_length)
    # Concat each of the forward/backward outputs
    outputs = [array_ops_.concat(1, [fw, bw])
               for fw, bw in zip(output_fw, output_bw)]
    return outputs, array_ops_.concat(1, [state_fw, state_bw])

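# A minimal sketch of calling bidirectional_rnn, assuming the same nn /
# dtypes imports as above plus a top-level `tf` import; inputs is a
# length-T list of [batch, depth] tensors, and both the per-step outputs
# and the final state come back depth-concatenated:
fw = nn.rnn_cell.GRUCell(16)
bw = nn.rnn_cell.GRUCell(16)
inputs = [tf.placeholder(dtypes.float32, [None, 8]) for _ in range(10)]
outputs, state = bidirectional_rnn(fw, bw, inputs, dtype=dtypes.float32)
# each outputs[t] has shape [batch, 16 + 16]
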
def __init__(self, sequence_length, vocab_size, embedding_size,
             hidden_size, layer_count=1, **kw):
    assert layer_count >= 1, "An LSTM cannot have less than one layer."
    n_classes = kw.get('n_classes', 2)  # >2 not tested.
    self.input_x = tf.placeholder(tf.int32, [None, sequence_length],
                                  name="input_x")
    self.input_y = tf.placeholder(tf.float32, [None, n_classes],
                                  name="input_y")
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")

    # Layer 1: Word embeddings
    self.embeddings = tf.Variable(
        tf.random_uniform([vocab_size, embedding_size], -0.1, 0.1),
        name="embeddings")
    embedded_words = tf.nn.embedding_lookup(self.embeddings, self.input_x)

    # Funnel the words into the LSTM.
    # Current size: (batch_size, n_words, emb_dim)
    # Want:         [(batch_size, emb_dim) * n_words]
    # Since otherwise there's no way to feed information into the LSTM cell.
    # Yes, it's a bit confusing, because we want a batch of multiple
    # sequences, with each step being of 'embedding_size'.
    embedded_words = tf.transpose(embedded_words, [1, 0, 2])
    embedded_words = tf.reshape(embedded_words, [-1, embedding_size])
    # Note: 'tf.split' outputs a **Python** list.
    embedded_words = tf.split(0, sequence_length, embedded_words)

    # Layer 2: LSTM cell
    lstm_use_peepholes = True
    # 'state_is_tuple = True' should NOT be used despite the warnings
    # (which appear as of TF 0.9), since it doesn't work on the version of
    # TF installed on Euler (0.8).
    if layer_count > 1:
        print("Using deep {0}-layer LSTM with first layer size {1}"
              " (embedding size) and hidden layer size {2}."
              .format(layer_count, embedding_size, hidden_size))
        print("First cell {0}->{1}".format(embedding_size, embedding_size))
        first_cell = TextLSTM._cell(embedding_size, embedding_size,
                                    lstm_use_peepholes,
                                    self.dropout_keep_prob)
        print("Second cell {0}->{1}".format(embedding_size, hidden_size))
        second_cell = TextLSTM._cell(embedding_size, hidden_size,
                                     lstm_use_peepholes,
                                     self.dropout_keep_prob)
        print("Third cell+ {0}->{1} (if applicable)".format(hidden_size,
                                                            hidden_size))
        third_plus = TextLSTM._cell(hidden_size, hidden_size,
                                    lstm_use_peepholes,
                                    self.dropout_keep_prob)
        deep_cells = [third_plus] * (layer_count - 2)
        lstm_cells = rnn_cell.MultiRNNCell([first_cell, second_cell] +
                                           deep_cells)
    else:
        print("Using simple 1-layer LSTM with hidden layer size {0}."
              .format(hidden_size))
        lstm_cells = rnn_cell.LSTMCell(num_units=hidden_size,
                                       input_size=embedding_size,
                                       forget_bias=1.0,
                                       use_peepholes=lstm_use_peepholes)

    # Q: Can't batches end up containing both positive and negative labels?
    #    Can the LSTM batch training deal with this?
    # A: Yes. Each batch feeds each sentence into the LSTM, incurs the loss,
    #    and backpropagates the error separately. Each example in a batch
    #    is independent. Note that as opposed to language models, for
    #    instance, where we incur a loss for all outputs, in this case we
    #    only care about the final output of the RNN, since it doesn't make
    #    sense to classify incomplete tweets.
    outputs, _states = rnn(lstm_cells, inputs=embedded_words,
                           dtype=tf.float32)

    # Layer 3: Final Softmax
    out_weight = tf.Variable(tf.random_normal([hidden_size, n_classes]))
    out_bias = tf.Variable(tf.random_normal([n_classes]))
    with tf.name_scope("output"):
        lstm_final_output = outputs[-1]
        self.scores = tf.nn.xw_plus_b(lstm_final_output, out_weight,
                                      out_bias, name="scores")
        self.predictions = tf.nn.softmax(self.scores, name="predictions")
    with tf.name_scope("loss"):
        self.losses = tf.nn.softmax_cross_entropy_with_logits(self.scores,
                                                              self.input_y)
        self.loss = tf.reduce_mean(self.losses, name="loss")
    with tf.name_scope("accuracy"):
        self.correct_pred = tf.equal(tf.argmax(self.predictions, 1),
                                     tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, "float"),
                                       name="accuracy")

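# A hypothetical instantiation of the TextLSTM graph defined above
# (hyperparameter values are illustrative only):
model = TextLSTM(sequence_length=40, vocab_size=20000,
                 embedding_size=128, hidden_size=256, layer_count=2)
# model.loss / model.accuracy can then be minimised and evaluated by feeding
# input_x, input_y and dropout_keep_prob.
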
def __init__(self, config, is_training=False):
    self.config = config
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.hidden_size = hidden_size = config.hidden_size
    self.num_layers = 1
    vocab_size = config.vocab_size
    self.max_grad_norm = config.max_grad_norm
    self.use_lstm = config.use_lstm

    # Placeholders for inputs.
    self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])
    # Note: set_shape() mutates the tensor in place and returns None, so it
    # must not be chained onto the zeros() call.
    self.initial_state = array_ops.zeros(
        array_ops.pack([self.batch_size, self.num_steps]),
        dtype=tf.float32)
    self.initial_state.set_shape([None, self.num_steps])

    embedding = tf.get_variable(
        'embedding', [self.config.vocab_size, self.config.hidden_size])

    # Set up ACT cell and inner rnn-type cell for use inside the ACT cell.
    with tf.variable_scope("rnn"):
        if self.use_lstm:
            inner_cell = rnn_cell.BasicLSTMCell(self.config.hidden_size)
        else:
            inner_cell = rnn_cell.GRUCell(self.config.hidden_size)
    with tf.variable_scope("ACT"):
        act = ACTCell(self.config.hidden_size, inner_cell, config.epsilon,
                      max_computation=config.max_computation,
                      batch_size=self.batch_size)

    inputs = tf.nn.embedding_lookup(embedding, self.input_data)
    inputs = [tf.squeeze(single_input, [1])
              for single_input in tf.split(1, self.config.num_steps, inputs)]

    self.outputs, final_state = rnn(act, inputs, dtype=tf.float32)

    # Softmax to get probability distribution over vocab.
    output = tf.reshape(tf.concat(1, self.outputs), [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    # dim (num_steps * batch_size, vocab_size)
    self.logits = tf.matmul(output, softmax_w) + softmax_b

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([batch_size * num_steps])],
        vocab_size)

    # Add up loss and retrieve batch-normalised ponder cost: sum N + sum
    # Remainder.
    ponder_cost = act.calculate_ponder_cost(
        time_penalty=self.config.ponder_time_penalty)
    self.cost = (tf.reduce_sum(loss) / batch_size) + ponder_cost
    self.final_state = self.outputs[-1]

    if is_training:
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          self.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

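# The split/squeeze idiom above is the TF 0.x way of turning a batch-major
# embedding lookup into the time-major list that rnn() expects. Shape
# walk-through (illustrative only):
#   inputs after embedding_lookup     -> [batch_size, num_steps, hidden]
#   tf.split(1, num_steps, inputs)    -> num_steps tensors, each
#                                        [batch_size, 1, hidden]
#   tf.squeeze(single_input, [1])     -> [batch_size, hidden]
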
def ACTStep(self, batch_mask, prob_compare, prob, counter, state, input,
            acc_outputs, acc_states):
    # General idea: generate halting probabilities and accumulate them. Stop
    # when the accumulated probs reach a halting value, 1-eps. At each
    # timestep, multiply the prob with the rnn output/state.
    # There is a subtlety here regarding the batch_size, as clearly we will
    # have examples halting at different points in the batch. This is dealt
    # with using logical masks to protect accumulated probabilities, states
    # and outputs from a timestep t's contribution if they have already
    # reached 1-eps at a timestep s < t. On the last timestep, the remainder
    # of every example in the batch is multiplied with the state/output,
    # having been accumulated over the timesteps and correctly carried
    # through for all examples, regardless of the overall number of batch
    # timesteps.

    # If all the probs are zero, we are seeing a new input => binary flag := 1, else 0.
    binary_flag = tf.cond(
        tf.reduce_all(tf.equal(prob, 0.0)),
        lambda: tf.ones([self.batch_size, 1], dtype=tf.float32),
        lambda: tf.zeros([self.batch_size, 1], tf.float32))
    input_with_flags = tf.concat(1, [binary_flag, input])
    output, new_state = rnn(self.cell, [input_with_flags], state,
                            scope=type(self.cell).__name__)

    with tf.variable_scope('sigmoid_activation_for_pondering'):
        p = tf.squeeze(tf.sigmoid(tf.nn.rnn_cell._linear(new_state, 1, True)))

    # Multiply by the previous mask: if we stopped before, we don't want to
    # start again if we generate a p less than p_{t-1} for a given example.
    new_batch_mask = tf.logical_and(tf.less(prob + p, self.one_minus_eps),
                                    batch_mask)
    new_float_mask = tf.cast(new_batch_mask, tf.float32)

    # Only increase the prob accumulator for the examples which haven't
    # already passed the threshold. This means that we can just use the
    # final prob value per example to determine the remainder.
    prob += p * new_float_mask

    # This accumulator is used solely in the while-loop condition. We
    # multiply by the PREVIOUS batch mask, to capture probabilities that
    # have gone over 1-eps THIS iteration.
    prob_compare += p * tf.cast(batch_mask, tf.float32)

    def use_remainder():
        # Runs on the last iteration of the while loop. prob now contains
        # exactly the probability at N-1, i.e. the timestep before we go
        # over 1-eps for all elements of the batch.
        remainder = tf.constant(1.0, tf.float32, [self.batch_size]) - prob
        remainder_expanded = tf.expand_dims(remainder, 1)
        tiled_remainder = tf.tile(remainder_expanded, [1, self.output_size])
        acc_state = (new_state * tiled_remainder) + acc_states
        acc_output = (output[0] * tiled_remainder) + acc_outputs
        return acc_state, acc_output

    def normal():
        # Accumulate normally, by multiplying the batch probs with the
        # output and state of the rnn. If we passed the 1-eps threshold
        # this round, we have a zero in the batch mask, so we add no
        # contribution to acc_state or acc_output.
        p_expanded = tf.expand_dims(p * new_float_mask, 1)
        tiled_p = tf.tile(p_expanded, [1, self.output_size])
        acc_state = (new_state * tiled_p) + acc_states
        acc_output = (output[0] * tiled_p) + acc_outputs
        return acc_state, acc_output

    # Only increase the counter for those probabilities that did not go
    # over 1-eps in this iteration.
    counter += tf.constant(1.0, tf.float32, [self.batch_size]) * new_float_mask

    # Halting condition (halts, and uses the remainder, when this is FALSE):
    # if the batch mask is all zeros, then all batches have finished. If any
    # batch element still has both a prob < 1-eps AND counter < N, we
    # continue.
    counter_condition = tf.less(counter, self.N)
    condition = tf.reduce_any(tf.logical_and(new_batch_mask,
                                             counter_condition))

    acc_state, acc_output = tf.cond(condition, normal, use_remainder)

    return [new_batch_mask, prob_compare, prob, counter, new_state, input,
            acc_output, acc_state]

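# ACTStep is written as the body of a tf.while_loop. A hedged sketch of the
# surrounding driver (method name and initial-value shapes are assumed for
# illustration; acc_states sized for a GRU-style flat state):
def do_act_steps(self, input, state):
    batch_mask = tf.cast(tf.ones([self.batch_size]), tf.bool)
    prob = tf.zeros([self.batch_size])
    prob_compare = tf.zeros([self.batch_size])
    counter = tf.zeros([self.batch_size])
    acc_outputs = tf.zeros([self.batch_size, self.output_size])
    acc_states = tf.zeros([self.batch_size, self.output_size])

    def halting_cond(batch_mask, prob_compare, prob, counter, *_):
        # Keep looping while any example is below 1-eps and under the cap N.
        return tf.reduce_any(tf.logical_and(
            tf.less(prob_compare, self.one_minus_eps),
            tf.less(counter, self.N)))

    return tf.while_loop(
        halting_cond, self.ACTStep,
        [batch_mask, prob_compare, prob, counter, state, input,
         acc_outputs, acc_states])
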
def ACTStep(self, batch_mask, prob_compare, prob, counter, state, input,
            acc_outputs, acc_states):
    # TODO: leavesbreathe, implement batch norming hidden state?
    output, new_state = rnn(self.cell, [input], state,
                            scope=type(self.cell).__name__)

    if self.use_lstm:
        input_for_eval_halting_p, _ = tf.split(1, 2, new_state)
    else:
        input_for_eval_halting_p = new_state
    p = self.CalculateHaltingProbability(input_for_eval_halting_p)

    # Here we create a mask on the p vector, which we then multiply with the
    # state/output. If p[i] = 0, then we have passed the remainder point for
    # that example, so we multiply the state/output vector by this masked
    # probability (which has zeros if the prob for a batch has passed the
    # stopping point) so we carry none of it forward. If, by adding p, we
    # pass the boundary, we don't add p onto prob - this allows us to use
    # use_remainder() as normal for all steps after ALL examples have taken
    # their max time.

    # Multiply by the previous mask: if we stopped before, we don't want to
    # start again.
    new_batch_mask = tf.logical_and(tf.less(prob + p, self.one_minus_eps),
                                    batch_mask)
    float_mask = tf.cast(new_batch_mask, tf.float32)

    # Only increase the prob accumulator for the examples which haven't
    # already passed the threshold. This means that we can just use the
    # final prob value per example to determine the remainder.
    prob += p * float_mask
    prob_compare += p * tf.cast(batch_mask, tf.float32)

    def use_remainder():
        remainder = tf.constant(1.0, tf.float32, [self.batch_size]) - prob
        remainder_expanded = tf.expand_dims(remainder, 1)
        # leavesbreathe commented out the tiling below for lstm implementation
        # tiled_remainder = tf.tile(remainder_expanded, [1, self.output_size])
        acc_state = tf.add(tf.mul(new_state, remainder_expanded), acc_states)
        acc_output = tf.add(tf.mul(output[0], remainder_expanded),
                            acc_outputs)
        return acc_state, acc_output

    def normal():
        p_expanded = tf.expand_dims(p * float_mask, 1)
        # tiled_p = tf.tile(p_expanded, [1, self.output_size])
        acc_state = tf.add(tf.mul(new_state, p_expanded), acc_states)
        acc_output = tf.mul(output[0], p_expanded) + acc_outputs
        return acc_state, acc_output

    # Halting condition: if the batch mask is all zeros, then all batches
    # have finished; therefore, if the sum of the mask = 0, then we use the
    # remainder.
    counter += tf.constant(1.0, tf.float32, [self.batch_size]) * float_mask
    counter_condition = tf.less(counter, self.N)
    condition = tf.reduce_any(tf.logical_and(new_batch_mask,
                                             counter_condition))

    acc_state, acc_output = tf.cond(condition, normal, use_remainder)

    # only increment the counter for the examples which are still running
    # counter += tf.constant(1.0, tf.float32, [self.batch_size])

    return [new_batch_mask, prob_compare, prob, counter, new_state, input,
            acc_output, acc_state]

def act_step(self, batch_mask, prob_compare, prob, counter, state, input,
             acc_outputs, acc_states):
    '''
    General idea: generate halting probabilities and accumulate them. Stop
    when the accumulated probs reach a halting value, 1-eps. At each
    timestep, multiply the prob with the rnn output/state.

    There is a subtlety here regarding the batch_size, as clearly we will
    have examples halting at different points in the batch. This is dealt
    with using logical masks to protect accumulated probabilities, states
    and outputs from a timestep t's contribution if they have already
    reached 1-eps at a timestep s < t. On the last timestep for each element
    in the batch the remainder is multiplied with the state/output, having
    been accumulated over the timesteps, as this takes into account the
    epsilon value.
    '''
    # If all the probs are zero, we are seeing a new input => binary flag := 1, else 0.
    binary_flag = tf.cond(
        tf.reduce_all(tf.equal(prob, 0.0)),
        lambda: tf.ones([self.batch_size, 1], dtype=tf.float32),
        lambda: tf.zeros([self.batch_size, 1], tf.float32))
    input_with_flags = tf.concat(1, [binary_flag, input])
    output, new_state = rnn(self.cell, [input_with_flags], state,
                            scope=type(self.cell).__name__)

    with tf.variable_scope('sigmoid_activation_for_pondering'):
        p = tf.squeeze(tf.sigmoid(tf.nn.rnn_cell._linear(new_state, 1, True)))

    # Multiply by the previous mask: if we stopped before, we don't want to
    # start again if we generate a p less than p_{t-1} for a given example.
    new_batch_mask = tf.logical_and(tf.less(prob + p, self.one_minus_eps),
                                    batch_mask)
    new_float_mask = tf.cast(new_batch_mask, tf.float32)

    # Only increase the prob accumulator for the examples which haven't
    # already passed the threshold. This means that we can just use the
    # final prob value per example to determine the remainder.
    prob += p * new_float_mask

    # This accumulator is used solely in the while-loop condition. We
    # multiply by the PREVIOUS batch mask, to capture probabilities that
    # have gone over 1-eps THIS iteration.
    prob_compare += p * tf.cast(batch_mask, tf.float32)

    # Only increase the counter for those probabilities that did not go
    # over 1-eps in this iteration.
    counter += new_float_mask

    # Halting condition (halts, and uses the remainder, when this is FALSE):
    # if any batch element still has both a prob < 1-eps AND counter < N we
    # continue, using the output probability p.
    counter_condition = tf.less(counter, self.N)
    final_iteration_condition = tf.logical_and(new_batch_mask,
                                               counter_condition)
    use_remainder = tf.expand_dims(1.0 - prob, -1)
    use_probability = tf.expand_dims(p, -1)
    update_weight = tf.select(final_iteration_condition, use_probability,
                              use_remainder)
    float_mask = tf.expand_dims(tf.cast(batch_mask, tf.float32), -1)

    acc_state = (new_state * update_weight * float_mask) + acc_states
    acc_output = (output[0] * update_weight * float_mask) + acc_outputs

    return [new_batch_mask, prob_compare, prob, counter, new_state, input,
            acc_output, acc_state]

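# Note the design change relative to the earlier ACTStep variants: instead
# of branching with tf.cond between normal() and use_remainder(), this
# version computes both candidate weights and picks per example with
# tf.select (the TF 0.x precursor of tf.where), so halted and still-running
# examples can coexist within a single batch update. A toy illustration of
# the per-example selection:
#   tf.select([[True], [False]], [[0.3], [0.3]], [[0.7], [0.7]])
#     -> [[0.3], [0.7]]
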