Example #1
    def _build_module(self, input_layer):
        if hasattr(self.ap.task_parameters, 'checkpoint_restore_path') and\
                self.ap.task_parameters.checkpoint_restore_path:
            self.DND = differentiable_neural_dictionary.load_dnd(
                self.ap.task_parameters.checkpoint_restore_path)
        else:
            self.DND = differentiable_neural_dictionary.QDND(
                self.DND_size,
                input_layer.get_shape()[-1],
                self.num_actions,
                self.new_value_shift_coefficient,
                key_error_threshold=self.DND_key_error_threshold,
                learning_rate=self.network_parameters.learning_rate,
                num_neighbors=self.number_of_nn,
                override_existing_keys=True)

        # Retrieve info from DND dictionary
        # We assume that all actions have enough entries in the DND
        self.q_values = self.output = tf.transpose([
            self._q_value(input_layer, action)
            for action in range(self.num_actions)
        ])

        # used in batch RL to estimate a probability distribution over actions
        self.softmax = self.add_softmax_with_temperature()
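
In batch RL, the comment above notes that the per-action Q values are turned into a probability distribution with a temperature-scaled softmax. Below is a minimal NumPy sketch of that step, not Coach's add_softmax_with_temperature; the function name and the temperature default are illustrative assumptions, and q_values is assumed to have shape (batch_size, num_actions).

import numpy as np

def softmax_with_temperature(q_values, temperature=1.0):
    # hypothetical helper, not part of Coach: temperature-scaled softmax over actions
    scaled = q_values / temperature                # higher temperature -> flatter distribution
    scaled -= scaled.max(axis=-1, keepdims=True)   # subtract the max for numerical stability
    exp = np.exp(scaled)
    return exp / exp.sum(axis=-1, keepdims=True)

# usage: a batch of 2 states with 3 actions each
q = np.array([[1.0, 2.0, 0.5],
              [0.1, 0.1, 0.1]])
probs = softmax_with_temperature(q, temperature=0.5)
assert probs.shape == q.shape and np.allclose(probs.sum(axis=-1), 1.0)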
Example #2
    def _build_module(self, input_layer):

        if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and \
                self.ap.task_parameters.checkpoint_restore_dir:
            self.DND = differentiable_neural_dictionary.load_dnd(
                self.ap.task_parameters.checkpoint_restore_dir)
        else:
            print(input_layer.get_shape()[-1])
            self.DND = differentiable_neural_dictionary.QDND(
                self.DND_size,
                input_layer.get_shape()[-1],
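                # single value head: in this variant the DND stores V(s) (read via _v_value
                # below) rather than one Q value per action as in Example #1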
                1,
                self.new_value_shift_coefficient,
                key_error_threshold=self.DND_key_error_threshold,
                learning_rate=self.network_parameters.learning_rate,
                num_neighbors=self.number_of_nn,
                override_existing_keys=True)

        # state value tower - V

        with tf.variable_scope("state_value"):
            #self.state_value = self.dense_layer(512)(input_layer, activation=self.activation_function, name='fc1')
            #self.state_value = self.dense_layer(1)(self.state_value, name='fc2')
            #self.state_value = self.dense_layer(64)(input_layer, activation=None, name='fc1')
            self.state_value = self._v_value(input_layer)

        # action advantage tower - A
        with tf.variable_scope("action_advantage"):
            self.action_advantage = self.dense_layer(64)(
                input_layer, activation=self.activation_function, name='fc1')
            self.action_advantage = self.dense_layer(self.num_actions)(
                self.action_advantage, name='fc2')
            self.action_mean = tf.reduce_mean(self.action_advantage,
                                              axis=1,
                                              keepdims=True)
            self.action_advantage = self.action_advantage - self.action_mean

        # merge to state-action value function Q
        self.output = tf.add(tf.expand_dims(self.state_value, axis=1),
                             self.action_advantage,
                             name='output')
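
The final tf.add implements the standard dueling aggregation Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)), broadcasting the scalar state value over the action dimension. Here is a minimal NumPy sketch of the same shape arithmetic; the variable names and sizes are illustrative assumptions, not Coach code.

import numpy as np

batch_size, num_actions = 4, 3
state_value = np.random.randn(batch_size)             # V(s): one scalar per state
advantage = np.random.randn(batch_size, num_actions)  # A(s, a): one value per action

# centre the advantages so they average to zero over actions
# (mirrors the reduce_mean subtraction in the action_advantage tower)
advantage -= advantage.mean(axis=1, keepdims=True)

# broadcast V(s) across actions, mirroring tf.expand_dims(state_value, axis=1) + action_advantage
q_values = state_value[:, None] + advantage
assert q_values.shape == (batch_size, num_actions)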