def test(self, session, input_batch, pose_batch, initial_states=None):
    '''Get network predictions for some input sequence and compute average error.

    Parameters
    ----------
    session : tf.Session
        Session to run ops in
    input_batch : np.ndarray
        Array of shape ``(batch_size, sequence_length, h, w, 6)`` where two
        consecutive rgb images are stacked together.
    pose_batch : np.ndarray
        Array of shape ``(batch_size, sequence_length, 6)`` with poses
    initial_states : np.ndarray
        Array of shape ``(2, 2, batch_size, memory_size)``. If ``None``, the
        zero state is used.

    Returns
    -------
    tuple
        Translational predictions, rotational predictions, loss, and the final
        RNN state.
    '''
    batch_size = input_batch.shape[0]
    if initial_states is None:
        initial_states = tensor_from_lstm_tuple(
            self.get_zero_state(session, batch_size))
    fetches = [*self.predictions, self.loss, self.rnn_state]
    y_t, y_r, loss, states = session.run(fetches,
                                         feed_dict={
                                             self.batch_size: batch_size,
                                             self.target_poses: pose_batch,
                                             self.input_images: input_batch,
                                             self.lstm_states: initial_states
                                         })
    return y_t, y_r, loss, states
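# A minimal usage sketch for ``test``, assuming an instance of the surrounding
# model class and a live session; the class name ``VOModel``, the constructor
# arguments, and the array shapes below are illustrative assumptions, not taken
# from the actual training pipeline.
#
#   model = VOModel(image_shape=(64, 64, 3), memory_size=1000, sequence_length=10)
#   with tf.Session() as session:
#       session.run(tf.global_variables_initializer())
#       imgs = np.zeros((4, 10, 64, 64, 6), dtype=np.float32)   # pairs of stacked rgb frames
#       poses = np.zeros((4, 10, 6), dtype=np.float32)          # 3 translation + 3 rotation params
#       y_t, y_r, loss, states = model.test(session, imgs, poses)
#       # ``states`` can be passed back as ``initial_states`` to continue the sequence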
def train(self, session, input_batch, pose_batch, initial_states=None,
          return_prediction=False):
    '''Train the network.

    Parameters
    ----------
    session : tf.Session
        Session to execute op in
    input_batch : np.ndarray
        Array of shape ``(batch_size, sequence_length, h, w, 6)`` where two
        consecutive rgb images are stacked together.
    pose_batch : np.ndarray
        Array of shape ``(batch_size, sequence_length, 6)`` with poses
    initial_states : np.ndarray
        Array of shape ``(2, 2, batch_size, memory_size)``

    Returns
    -------
    tuple(np.ndarray)
        Outputs of the ``train_step``, ``loss``, and ``rnn_state`` operations,
        and optionally the predictions for t and r at the front
    '''
    batch_size = input_batch.shape[0]
    if initial_states is None:
        zero_state = self.get_zero_state(session, batch_size)
        initial_states = tensor_from_lstm_tuple(zero_state)
    if return_prediction:
        fetches = [
            self.y_t, self.y_r, self.train_step, self.loss, self.rnn_state
        ]
    else:
        fetches = [self.train_step, self.loss, self.rnn_state]
    return session.run(fetches,
                       feed_dict={
                           self.batch_size: batch_size,
                           self.input_images: input_batch,
                           self.target_poses: pose_batch,
                           self.lstm_states: initial_states
                       })
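# A usage sketch for ``train``: since ``rnn_state`` is returned in the same
# ``(2, 2, batch_size, memory_size)`` layout that the ``lstm_states``
# placeholder expects, a long trajectory can be split into sub-sequences and
# the final state of one chunk fed in as ``initial_states`` of the next. The
# loader function here is hypothetical.
#
#   states = None
#   for imgs, poses in iterate_subsequences(trajectory):  # hypothetical loader
#       _, loss, states = model.train(session, imgs, poses,
#                                     initial_states=states)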
def __init__(self,
             image_shape,
             memory_size,
             sequence_length,
             optimizer_spec=None,
             resize_images=False,
             use_dropout=True,
             use_flownet=False):
    '''
    Parameters
    ----------
    image_shape : tuple
    memory_size : int
        LSTM state size (identical for both layers)
    sequence_length : int
        Length of the video stream
    optimizer_spec : OptimizerSpec
        Specification of the optimizer
    resize_images : bool
        Resize images to a multiple of 64
    use_dropout : bool
        Use dropout for the LSTM cells
    use_flownet : bool
        Name CNN vars according to the flownet naming scheme. You *must* call
        :py:meth:`load_flownet` before pushing stuff through the graph.
    '''
    if not optimizer_spec:
        optimizer_spec = OptimizerSpec(kind='Adagrad', learning_rate=0.001)
    optimizer = optimizer_spec.create()
    self.use_dropout = use_dropout
    self.use_flownet = use_flownet
    self.sequence_length = sequence_length

    ############################################################################################
    #                                          Inputs                                          #
    ############################################################################################
    with tf.variable_scope('inputs'):
        h, w, c = image_shape
        self.input_images = tf.placeholder(
            tf.float32,
            shape=[None, sequence_length, h, w, 2 * c],
            name='imgs')
        if resize_images:
            self.input_images = resize_to_multiple(self.input_images, 64)

        self.target_poses = tf.placeholder(
            tf.float32, shape=[None, sequence_length, 6], name='target_poses')

        # this placeholder is used for feeding both the cell and hidden states of both lstm
        # cells. The cell state comes before the hidden state
        N_lstm = 2
        self.lstm_states = tf.placeholder(
            tf.float32,
            shape=(N_lstm, 2, None, memory_size),
            name='LSTM_states')
        self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')

    ############################################################################################
    #                                       Convolutions                                       #
    ############################################################################################
    ksizes = [7, 5, 5, 3, 3, 3, 3, 3, 3]
    strides = [2, 2, 2, 1, 2, 1, 2, 1, 2]
    n_channels = [64, 128, 256, 256, 512, 512, 512, 512, 1024]

    self.cnn_activations = []
    # we call cnn() in a loop, but the variables will be reused after first creation
    for idx in range(sequence_length):
        stacked_image = self.input_images[:, idx, :]
        cnn_activation = self.cnn(
            stacked_image, ksizes, strides, n_channels, reuse=tf.AUTO_REUSE)
        self.cnn_activations.append(cnn_activation)

    # compute number of activations for flattening the conv output
    def num_activations(conv):
        return np.prod(conv.shape[1:].as_list())

    # flatten cnn output for each batch element
    rnn_inputs = [
        tf.reshape(conv, [self.batch_size, num_activations(conv)])
        for conv in self.cnn_activations
    ]

    ############################################################################################
    #                                           LSTM                                           #
    ############################################################################################
    with tf.variable_scope('rnn'):
        '''Create all recurrent layers as specified in the paper.'''
        lstm0 = LSTMCell(memory_size, state_is_tuple=True)
        lstm1 = LSTMCell(memory_size, state_is_tuple=True)
        if self.use_dropout:
            lstm_keep_probs = [0.7, 0.8]
            lstm0 = DropoutWrapper(lstm0, output_keep_prob=lstm_keep_probs[0])
            lstm1 = DropoutWrapper(lstm1, output_keep_prob=lstm_keep_probs[1])

        self.rnn = MultiRNNCell([lstm0, lstm1])
        self.zero_state = self.rnn.zero_state(self.batch_size, tf.float32)

        # first decompose the state input into the two layers
        states0 = self.lstm_states[0, ...]
        states1 = self.lstm_states[1, ...]
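        # each states<i> now has shape (2, batch_size, memory_size); per the
        # placeholder convention above, index 0 along the first axis holds the
        # cell state and index 1 the hidden state of lstm layer <i>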
        # then retrieve the two memory_size-sized tensors from each state item
        states0_list = tf.unstack(states0, num=2)
        cell_state0 = states0_list[0]
        hidden_state0 = states0_list[1]

        states1_list = tf.unstack(states1, num=2)
        cell_state1 = states1_list[0]
        hidden_state1 = states1_list[1]

        # finally, create the state tuples (c is the cell state, h the hidden state)
        state0 = LSTMStateTuple(c=cell_state0, h=hidden_state0)
        state1 = LSTMStateTuple(c=cell_state1, h=hidden_state1)

        sequence_lengths = tf.ones(
            (self.batch_size, ), dtype=tf.int32) * sequence_length
        rnn_outputs, rnn_state = static_rnn(
            self.rnn,
            rnn_inputs,
            dtype=tf.float32,
            initial_state=(state0, state1),
            sequence_length=sequence_lengths)

        rnn_outputs = tf.reshape(
            tf.concat(rnn_outputs, 1),
            [self.batch_size, sequence_length, memory_size])

        self.rnn_state = tensor_from_lstm_tuple(rnn_state)

    ############################################################################################
    #                                       Output layer                                       #
    ############################################################################################
    with tf.variable_scope('feedforward'):
        n_rnn_output = memory_size  # number of activations per batch
        kernel_initializer = tf.random_normal_initializer(
            stddev=np.sqrt(2 / n_rnn_output))
        # predictions
        # I know we shouldn't use tf.layers but since dense + conv are easy to implement by
        # oneself we decided to remove this possible source of errors and focus on the many
        # others
        y = tf.layers.dense(rnn_outputs, 6, kernel_initializer=kernel_initializer)

        # decompose into translational and rotational components
        self.y_t, self.y_r = tf.split(y, 2, axis=2)
        self.x_t, self.x_r = tf.split(self.target_poses, 2, axis=2)
        self.predictions = (self.y_t, self.y_r)

    self.loss = self.loss_function((self.x_t, self.x_r), (self.y_t, self.y_r))

    with tf.variable_scope('optimizer'):
        self.train_step = optimizer.minimize(self.loss)
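# ``tensor_from_lstm_tuple`` is defined elsewhere in the repo; as a reference,
# the sketch below shows what it presumably does given the ``lstm_states``
# placeholder layout (layer x (cell, hidden) x batch x memory_size). The name
# and the type dispatch are assumptions, not the actual implementation.
def _tensor_from_lstm_tuple_sketch(state_tuples):
    '''Pack a tuple of per-layer ``LSTMStateTuple``s into one tensor/array of
    shape ``(n_layers, 2, batch_size, memory_size)``, cell state first.'''
    stack = tf.stack if isinstance(state_tuples[0].c, tf.Tensor) else np.stack
    return stack([stack([s.c, s.h]) for s in state_tuples])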
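# ``loss_function`` is likewise not shown here; a plausible sketch, assuming
# the weighted MSE loss of the paper (translational error plus an upweighted
# rotational error). The name and the default ``kappa`` are assumptions.
def _loss_function_sketch(targets, predictions, kappa=100.0):
    (x_t, x_r) = targets
    (y_t, y_r) = predictions
    loss_t = tf.reduce_mean(tf.squared_difference(y_t, x_t))  # translation term
    loss_r = tf.reduce_mean(tf.squared_difference(y_r, x_r))  # rotation term
    return loss_t + kappa * loss_r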