def core_net(self, inputs_im):
    self.layers['loc_mean'] = []
    self.layers['loc_sample'] = []
    self.layers['rnn_outputs'] = []
    self.layers['retina_reprsent'] = []
    cell_size = 256

    batch_size = tf.shape(inputs_im)[0]
    init_loc_mean = tf.ones((batch_size, 2))
    loc_sample = tf.random_uniform((batch_size, 2), minval=-1, maxval=1)
    glimpse_out = self.glimpse_net(inputs_im, loc_sample)

    if self.is_training:
        # replicate each image n_l_sample times so that several
        # location trajectories are sampled per input during training
        inputs_im = tf.tile(inputs_im, [self._n_l_sample, 1, 1, 1])
        glimpse_out = tf.tile(glimpse_out, [self._n_l_sample, 1])
        batch_size = tf.shape(glimpse_out)[0]
        init_loc_mean = tf.tile(init_loc_mean, [self._n_l_sample, 1])
        loc_sample = tf.tile(loc_sample, [self._n_l_sample, 1])

    self.layers['loc_mean'].append(init_loc_mean)
    self.layers['loc_sample'].append(loc_sample)

    # RNN of the core net
    h_prev = tf.zeros((batch_size, cell_size))
    for step_id in range(0, self._n_step):
        with tf.variable_scope('core_net'):
            h = tf.nn.relu(
                L.Linear(h_prev, cell_size, 'lh')
                + L.Linear(glimpse_out, cell_size, 'lg'),
                name='h')
        # the core net is not trained through the location net
        loc_mean = self.location_net(tf.stop_gradient(h))
        # sample the next location from N(loc_mean, l_std^2); the same
        # sampling is used during training and inference
        loc_sample = tf.stop_gradient(
            sample_normal_single(loc_mean, stddev=self._l_std))
        glimpse_out = self.glimpse_net(inputs_im, loc_sample)
        action = self.action_net(h)
        # do not store the last step location
        if step_id < self._n_step - 1:
            self.layers['loc_mean'].append(loc_mean)
            self.layers['loc_sample'].append(loc_sample)
        self.layers['rnn_outputs'].append(h)
        h_prev = h

    self.layers['class_logists'] = action
    self.layers['prob'] = tf.nn.softmax(logits=action, name='prob')
    self.layers['pred'] = tf.argmax(action, axis=1)
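# Hedged sketch: `sample_normal_single` is called above but not defined in
# this section. A minimal implementation consistent with its usage, i.e.
# one sample per row from N(mean, stddev^2), could look like the
# following; the actual helper in this repo may differ.
def sample_normal_single(mean, stddev):
    # mean: [b_size, 2]; returns a same-shaped Gaussian sample
    return mean + tf.random_normal(tf.shape(mean), stddev=stddev)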
def glimpse_net(self, inputs, l_sample):
    """
    Args:
        inputs: [batch, h, w, c]
        l_sample: [batch, 2]
    """
    with tf.name_scope('glimpse_sensor'):
        max_r = int(self._g_size * (2 ** (self._g_n - 2)))
        inputs_pad = tf.pad(
            inputs,
            [[0, 0], [max_r, max_r], [max_r, max_r], [0, 0]],
            'CONSTANT')
        # TODO use clipped location to compute prob or not?
        l_sample = tf.clip_by_value(l_sample, -1.0, 1.0)
        # rescale the location from [-1, 1] (in units of `unit_pixel`)
        # to the normalized coordinates of the padded image
        l_sample_adj = l_sample * 1.0 * self._unit_pixel / (
            self._im_size / 2 + max_r)

        retina_reprsent = []
        for g_id in range(0, self._g_n):
            cur_size = self._g_size * (2 ** g_id)
            cur_glimpse = tf.image.extract_glimpse(
                inputs_pad,
                size=[cur_size, cur_size],
                offsets=l_sample_adj,
                centered=True,
                normalized=True,
                uniform_noise=True,
                name='glimpse_sensor',
            )
            cur_glimpse = tf.image.resize_images(
                cur_glimpse,
                size=[self._g_size, self._g_size],
                method=tf.image.ResizeMethod.BILINEAR,
                align_corners=False,
            )
            retina_reprsent.append(cur_glimpse)
        retina_reprsent = tf.concat(retina_reprsent, axis=-1)
        self.layers['retina_reprsent'].append(retina_reprsent)

    with tf.variable_scope('glimpse_net'):
        out_dim = 128
        hg = L.Linear(retina_reprsent, out_dim, name='hg', nl=tf.nn.relu)
        hl = L.Linear(l_sample, out_dim, name='hl', nl=tf.nn.relu)
        out_dim = 256
        g = tf.nn.relu(
            L.Linear(hl, out_dim, 'lhg') + L.Linear(hg, out_dim, 'lhl'),
            name='g')
    return g
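# Minimal standalone sketch of the glimpse sensor above, assuming
# g_size = 8, g_n = 3 and 28x28 grayscale inputs (values not fixed by
# this section): patches of 8, 16 and 32 pixels are cropped around the
# same offset and each resized back to 8x8, giving a 3-channel retina.
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 28, 28, 1])
offsets = tf.placeholder(tf.float32, [None, 2])  # normalized, in [-1, 1]
scales = []
for g_id in range(3):
    size = 8 * (2 ** g_id)
    patch = tf.image.extract_glimpse(
        images, size=[size, size], offsets=offsets,
        centered=True, normalized=True)
    scales.append(tf.image.resize_images(patch, [8, 8]))
retina = tf.concat(scales, axis=-1)  # [batch, 8, 8, 3]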
def _comp_baselines(self):
    with tf.variable_scope('baseline'):
        # the core net is not trained through the baseline loss
        rnn_outputs = tf.stop_gradient(self.layers['rnn_outputs'])
        baselines = []
        for step_id in range(0, self._n_step - 1):
            b = L.Linear(rnn_outputs[step_id], 1, name='baseline')
            b = tf.squeeze(b, axis=-1)
            baselines.append(b)
        baselines = tf.stack(baselines)      # [n_step - 1, b_size]
        baselines = tf.transpose(baselines)  # [b_size, n_step - 1]
        return baselines
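# Hedged sketch of how these baselines are typically consumed (the loss
# itself is not part of this section): the baseline is regressed onto the
# reward with an MSE loss, and the advantage (reward - baseline) scales
# the log-probability of the sampled locations in the REINFORCE term.
# `reward` and `log_prob` are assumed inputs, not names from this repo.
def reinforce_terms(baselines, reward, log_prob):
    # reward:   [b_size] final reward (e.g. 1.0 if the prediction is correct)
    # log_prob: [b_size, n_loc] log-density of the sampled locations
    reward = tf.expand_dims(reward, axis=-1)  # [b_size, 1]
    baseline_mse = tf.reduce_mean(tf.squared_difference(reward, baselines))
    # stop_gradient keeps the REINFORCE term from training the baseline
    advantage = tf.stop_gradient(reward - baselines)
    reinforce_loss = -tf.reduce_mean(log_prob * advantage)
    return baseline_mse, reinforce_loss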
def action_net(self, core_state):
    with tf.variable_scope('act_net'):
        act = L.Linear(core_state, self._n_class, name='act')
    return act
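# Hedged sketch: the logits from action_net are trained with a standard
# softmax cross-entropy against the ground-truth labels; the actual loss
# code is outside this section, and `labels` is an assumed input.
def cls_loss(logits, labels):
    # logits: [b_size, n_class]; labels: [b_size] int class ids
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits))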
def location_net(self, core_state):
    with tf.variable_scope('loc_net'):
        l_mean = L.Linear(core_state, 2, name='l_mean')
        # l_mean = tf.tanh(l_mean)
        l_mean = tf.clip_by_value(l_mean, -1., 1.)
    return l_mean
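# Hedged sketch: the Gaussian location policy implied by location_net and
# sample_normal_single has a log-density that the REINFORCE term needs.
# This helper is an assumption built from the `loc_mean`/`loc_sample`
# lists collected in core_net, not code from this repo.
def loc_log_prob(loc_means, loc_samples, std):
    # loc_means, loc_samples: lists of [b_size, 2] tensors, one per step
    means = tf.stack(loc_means, axis=1)      # [b_size, n_loc, 2]
    samples = tf.stack(loc_samples, axis=1)  # [b_size, n_loc, 2]
    dist = tf.distributions.Normal(loc=means, scale=std)
    # sum the per-coordinate log-densities of each 2-D location
    return tf.reduce_sum(dist.log_prob(samples), axis=-1)  # [b_size, n_loc]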