def _build_module(self, input_layer):
    # Restore the DND from a checkpoint if a restore path was given, otherwise create a new one
    if hasattr(self.ap.task_parameters, 'checkpoint_restore_path') and \
            self.ap.task_parameters.checkpoint_restore_path:
        self.DND = differentiable_neural_dictionary.load_dnd(
            self.ap.task_parameters.checkpoint_restore_path)
    else:
        self.DND = differentiable_neural_dictionary.QDND(
            self.DND_size, input_layer.get_shape()[-1], self.num_actions,
            self.new_value_shift_coefficient,
            key_error_threshold=self.DND_key_error_threshold,
            learning_rate=self.network_parameters.learning_rate,
            num_neighbors=self.number_of_nn,
            override_existing_keys=True)

    # Retrieve info from the DND dictionary.
    # We assume that all actions have enough entries in the DND.
    self.q_values = self.output = tf.transpose([
        self._q_value(input_layer, action) for action in range(self.num_actions)
    ])

    # used in batch RL to estimate a probability distribution over actions
    self.softmax = self.add_softmax_with_temperature()
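
# Illustrative sketch only (not the library's implementation): conceptually, the DND lookup
# behind self._q_value() returns a kernel-weighted average of the values stored for the
# retrieved nearest-neighbor keys, as in Neural Episodic Control. All names below
# (dnd_q_value_sketch, query, keys, values, delta) are hypothetical for this example.
import numpy as np

def dnd_q_value_sketch(query, keys, values, delta=1e-3):
    """Inverse-distance kernel average over stored (key, value) pairs."""
    # kernel k(h, h_i) = 1 / (||h - h_i||^2 + delta)
    distances = np.sum((keys - query) ** 2, axis=1)
    kernel = 1.0 / (distances + delta)
    weights = kernel / np.sum(kernel)
    # the Q estimate is the kernel-weighted sum of the stored values
    return np.sum(weights * values)

# Example usage: 5 stored embeddings of size 3, queried with a new embedding
_keys = np.random.randn(5, 3)
_values = np.random.randn(5)
_q = dnd_q_value_sketch(np.random.randn(3), _keys, _values)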
def _build_module(self, input_layer):
    # Restore the DND from a checkpoint directory if one was given, otherwise create a new one.
    # The DND stores a single value per key here (used as the state value V); the advantage
    # stream below supplies the per-action terms.
    if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and \
            self.ap.task_parameters.checkpoint_restore_dir:
        self.DND = differentiable_neural_dictionary.load_dnd(
            self.ap.task_parameters.checkpoint_restore_dir)
    else:
        self.DND = differentiable_neural_dictionary.QDND(
            self.DND_size, input_layer.get_shape()[-1], 1,
            self.new_value_shift_coefficient,
            key_error_threshold=self.DND_key_error_threshold,
            learning_rate=self.network_parameters.learning_rate,
            num_neighbors=self.number_of_nn,
            override_existing_keys=True)

    # state value tower - V
    with tf.variable_scope("state_value"):
        self.state_value = self._v_value(input_layer)

    # action advantage tower - A
    with tf.variable_scope("action_advantage"):
        self.action_advantage = self.dense_layer(64)(
            input_layer, activation=self.activation_function, name='fc1')
        self.action_advantage = self.dense_layer(self.num_actions)(
            self.action_advantage, name='fc2')
        # subtract the mean advantage so the advantages are zero-centered per state
        self.action_mean = tf.reduce_mean(self.action_advantage, axis=1, keepdims=True)
        self.action_advantage = self.action_advantage - self.action_mean

    # merge to state-action value function Q
    self.output = tf.add(tf.expand_dims(self.state_value, axis=1),
                         self.action_advantage, name='output')
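
# Minimal sketch of the dueling merge performed above, on plain NumPy arrays
# (assumed shapes: state_value -> (batch,), action_advantage -> (batch, num_actions)):
#   Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
# The function name and example values are hypothetical, for illustration only.
import numpy as np

def dueling_q_sketch(state_value, action_advantage):
    # zero-center the advantages per state, then add the broadcasted state value
    centered = action_advantage - action_advantage.mean(axis=1, keepdims=True)
    return state_value[:, np.newaxis] + centered

# Example usage: batch of 2 states, 4 actions
_q = dueling_q_sketch(np.array([1.0, -0.5]), np.random.randn(2, 4))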