Example #1
    def init_network(self):
        # The model structure
        # TODO: decide how to handle reuse; as written, the loss would be registered twice
        self.train_policy = self.policy(self.sess,
                                        self.X_input_train_shape,
                                        self.num_actions,
                                        self.layer_collection,
                                        reuse=False)

        self.step_policy = self.policy(self.sess,
                                       self.X_input_step_shape,
                                       self.num_actions,
                                       reuse=True)

        with tf.variable_scope('train_output'):
            negative_log_prob_action = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.train_policy.policy_logits, labels=self.actions)
            self.policy_gradient_loss = tf.reduce_mean(
                self.advantage * negative_log_prob_action)
            self.value_function_loss = tf.reduce_mean(
                mse(tf.squeeze(self.train_policy.value_function), self.reward))
            self.entropy = tf.reduce_mean(
                openai_entropy(self.train_policy.policy_logits))
            self.loss = (self.policy_gradient_loss
                         - self.entropy * self.entropy_coeff
                         + self.value_function_loss * self.vf_coeff)

            # Compute gradients (the trust-region clipping is handled by the
            # KFAC norm_constraint / kl_clip below, not by explicit clipping here)
            params = find_trainable_variables("policy")
            grads = tf.gradients(self.loss, params)

            # Apply Gradients
            grads = list(zip(grads, params))
            optimizer = opt.KfacOptimizer(
                learning_rate=self.learning_rate,
                cov_ema_decay=self.moving_average,
                damping=self.damping,
                layer_collection=self.layer_collection,
                norm_constraint=self.kl_clip,
                momentum=self.momentum)
            self.optimize = optimizer.apply_gradients(grads)
            self.cov_update_op = optimizer.cov_update_op
            self.inv_update_op = optimizer.inv_update_op
            self.inv_update_dict = optimizer.inv_updates_dict
            self.factors = self.layer_collection.get_factors()
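
Example #1 only builds the graph; the KFAC-specific ops it exposes (optimize, cov_update_op, inv_update_op) still need to be run explicitly. Below is a minimal, hypothetical sketch of how a training step might drive them from a TF1 session; the agent handle and the X_input placeholder name are assumptions for illustration, not part of the original snippet.

# Hypothetical driver for the graph built in init_network() above.
# Assumes `agent` exposes the attributes created there (sess, loss, optimize,
# cov_update_op, inv_update_op, actions, advantage, reward, learning_rate)
# and that the train policy exposes its observation placeholder as X_input.
def kfac_train_step(agent, obs_batch, action_batch, advantage_batch,
                    reward_batch, lr, step, inv_update_every=20):
    feed = {
        agent.train_policy.X_input: obs_batch,  # assumed placeholder name
        agent.actions: action_batch,
        agent.advantage: advantage_batch,
        agent.reward: reward_batch,
        agent.learning_rate: lr,
    }
    # One parameter update plus one covariance-statistics update per step.
    loss, _, _ = agent.sess.run(
        [agent.loss, agent.optimize, agent.cov_update_op], feed_dict=feed)
    # Inverting the Fisher blocks is expensive, so refresh the inverses
    # only every few steps.
    if step % inv_update_every == 0:
        agent.sess.run(agent.inv_update_op, feed_dict=feed)
    return loss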
Example #2
    def init_network(self):
        # The model structure
        self.step_policy = self.policy(self.sess,
                                       self.X_input_step_shape,
                                       self.num_actions,
                                       reuse=False,
                                       is_training=False)

        self.train_policy = self.policy(self.sess,
                                        self.X_input_train_shape,
                                        self.num_actions,
                                        reuse=True,
                                        is_training=self.is_training)

        with tf.variable_scope('train_output'):
            negative_log_prob_action = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.train_policy.policy_logits, labels=self.actions)
            self.policy_gradient_loss = tf.reduce_mean(
                self.advantage * negative_log_prob_action)
            self.value_function_loss = tf.reduce_mean(
                mse(tf.squeeze(self.train_policy.value_function), self.reward))
            self.entropy = tf.reduce_mean(
                openai_entropy(self.train_policy.policy_logits))
            self.loss = (self.policy_gradient_loss
                         - self.entropy * self.entropy_coeff
                         + self.value_function_loss * self.vf_coeff)

            # Gradient Clipping
            params = find_trainable_variables("policy")
            grads = tf.gradients(self.loss, params)
            if self.max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(
                    grads, self.max_grad_norm)

            # Apply Gradients
            grads = list(zip(grads, params))
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=self.learning_rate,
                decay=self.alpha,
                epsilon=self.epsilon)

            # Reopen the current scope with AUTO_REUSE so the RMSProp slot
            # variables can be created (or reused) without a reuse conflict.
            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                self.optimize = optimizer.apply_gradients(grads)
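
Example #2 clips the gradients by their global norm before handing them to RMSProp. As a self-contained illustration of what tf.clip_by_global_norm does (it rescales all gradients jointly so their combined L2 norm does not exceed the threshold), here is a small TF1 sketch with toy values chosen purely for demonstration:

import numpy as np
import tensorflow as tf

g1 = tf.constant([3.0, 4.0])    # L2 norm 5
g2 = tf.constant([0.0, 12.0])   # L2 norm 12  ->  global norm = 13
clipped, global_norm = tf.clip_by_global_norm([g1, g2], clip_norm=6.5)

with tf.Session() as sess:
    out, norm = sess.run([clipped, global_norm])
    # norm == 13.0; both tensors are scaled by 6.5 / 13 = 0.5,
    # i.e. out == [[1.5, 2.0], [0.0, 6.0]]
    print(norm, [np.round(v, 2) for v in out])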
Example #3
    def prepare_loss(self):
        self.X_input_train_shape = (None, self.img_height, self.img_width,
                                    self.num_classes * self.num_stack)
        self.X_input_step_shape = (None, self.img_height, self.img_width,
                                   self.num_classes * self.num_stack)

        self.actions = tf.placeholder(tf.int32, [None])  # actions
        self.advantage = tf.placeholder(tf.float32,
                                        [None])  # advantage function
        self.reward = tf.placeholder(tf.float32, [None])  # reward
        self.learning_rate = tf.placeholder(tf.float32, [])  # learning rate
        self.is_training = tf.placeholder(tf.bool)  # is_training

        # The model structure
        self.actor_network = self.policy(self.sess,
                                         self.X_input_step_shape,
                                         self.num_actions,
                                         reuse=False,
                                         is_training=False)

        self.critic_network = self.policy(self.sess,
                                          self.X_input_train_shape,
                                          self.num_actions,
                                          reuse=True,
                                          is_training=self.is_training)

        with tf.variable_scope('train_output'):
            negative_log_prob_action = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.critic_network.policy_logits, labels=self.actions)
            self.policy_gradient_loss = tf.reduce_mean(
                self.advantage * negative_log_prob_action)
            self.value_function_loss = tf.reduce_mean(
                mse(tf.squeeze(self.critic_network.value_function),
                    self.reward))
            self.entropy = tf.reduce_mean(
                openai_entropy(self.critic_network.policy_logits))
            self.loss = (self.policy_gradient_loss
                         - self.entropy * self.entropy_coeff
                         + self.value_function_loss * self.vf_coeff)

            # Gradient Clipping
            params = find_trainable_variables("policy")
            grads = tf.gradients(self.loss, params)
            # Keep grad_norm defined even when clipping is disabled, since the
            # 'train/gradnorm' summary below always uses it.
            grad_norm = tf.global_norm(grads)
            if self.max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(
                    grads, self.max_grad_norm)
            # Apply Gradients
            grads = list(zip(grads, params))
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=self.learning_rate,
                decay=self.alpha,
                epsilon=self.epsilon)
            self.optimize = optimizer.apply_gradients(grads)

            # monitor training
            summaries = []
            summaries.append(
                tf.summary.scalar('loss/policy_gradient_loss',
                                  self.policy_gradient_loss))
            summaries.append(
                tf.summary.scalar('loss/value_function_loss',
                                  self.value_function_loss))
            summaries.append(tf.summary.scalar('loss/entropy', self.entropy))
            summaries.append(tf.summary.scalar('loss/total_loss', self.loss))
            summaries.append(tf.summary.scalar('train/gradnorm', grad_norm))
            self.summary = tf.summary.merge(summaries)
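
Example #3 merges the scalar summaries into self.summary but does not show how they reach TensorBoard. A minimal, hypothetical sketch of consuming that op might look like the following; the agent handle, feed dict, and log directory are assumptions for illustration.

import tensorflow as tf

# Write the graph once, then stream summaries as training proceeds.
writer = tf.summary.FileWriter('./logs', graph=tf.get_default_graph())

def train_and_log(agent, feed, global_step):
    # Run the parameter update and the merged summaries in one session call,
    # then append the serialized summary to the event file.
    summary_str, _ = agent.sess.run([agent.summary, agent.optimize],
                                    feed_dict=feed)
    writer.add_summary(summary_str, global_step)
    writer.flush()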