Example #1
 def _create_continuous_trainer(self, optimizer=tf.train.AdamOptimizer()):
     """
     Creates a function for vanilla policy training with a continuous action space
     """
     # Placeholders for the actions that were actually taken and the observed returns
     self.act_holders = tf.placeholder(tf.float32, shape=[None, self.out_op.shape[1].value])
     self.reward_holders = tf.placeholder(tf.float32, shape=[None])

     # Learnable, state-independent standard deviation; actions are sampled by adding
     # Gaussian noise to the mean output of the policy network
     self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value), dtype=tf.float32)
     self.out_act = self.out_op + tf.random_normal(tf.shape(self.out_op), dtype=tf.float32) * self.std

     # Log-probability of the taken actions under the current Gaussian policy
     self.log_probs = gaussian_likelihood(self.act_holders, self.out_op, self.std)

     # Advantage estimate: observed return minus the critic's value prediction
     self.advantages = self.reward_holders - tf.squeeze(self.v_out_op)

     # Vanilla policy-gradient loss
     self.actor_loss = -tf.reduce_mean(self.log_probs * self.advantages)
     
     self.optimizer = optimizer
     self.actor_update = self.optimizer.minimize(self.actor_loss)
     
     # Apply the critic update only after the actor update has run
     with tf.control_dependencies([self.actor_update]):
         self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
         self.value_update = self.optimizer.minimize(self.value_loss)
     
     update_func = lambda train_data: self.sess.run([self.actor_update, self.value_update],
                                                    feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                               self.act_holders: reshape_train_var(train_data[:, 1]),
                                                               self.reward_holders: train_data[:, 2]})
     
     self.sess.run(tf.global_variables_initializer())
     
     return update_func
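
Example #1 calls a `gaussian_likelihood` helper that is not shown in these listings. A minimal sketch of such a helper, assuming it returns the log-density of a diagonal Gaussian summed over the action dimensions and takes the raw standard deviation (the actual helper may instead take the log of the standard deviation), could look like this:

 import numpy as np
 import tensorflow as tf

 def gaussian_likelihood(x, mu, std):
     """Log-probability of x under a diagonal Gaussian N(mu, std^2), summed over action dims."""
     eps = 1e-8  # keeps the division and the logarithm well-defined
     pre_sum = -0.5 * (((x - mu) / (std + eps)) ** 2 + 2 * tf.log(std + eps) + np.log(2 * np.pi))
     return tf.reduce_sum(pre_sum, axis=1)

With `x` and `mu` of shape [None, act_dim] and `std` of shape [act_dim], broadcasting yields a per-step log-probability of shape [None], which is what the actor loss above multiplies by the advantages.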
Example #2
 def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
     """
     Creates a function for vanilla policy training with a discrete action space
     """
     # Placeholders for the (integer) actions that were taken and the observed returns
     self.act_holders = tf.placeholder(tf.int32, shape=[None])
     self.reward_holders = tf.placeholder(tf.float32, shape=[None])

     # One-hot masks select the log-probability of the action that was actually taken
     self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float32)
     self.log_probs = tf.log(self.out_op)

     # Advantage estimate: observed return minus the critic's value prediction
     self.advantages = self.reward_holders - tf.squeeze(self.v_out_op)

     # Vanilla policy-gradient loss over the taken actions
     self.resp_acts = tf.reduce_sum(self.act_masks * self.log_probs, axis=1)
     self.loss = -tf.reduce_mean(self.resp_acts * self.advantages)
     
     self.optimizer = optimizer
     self.actor_update = self.optimizer.minimize(self.loss)
     
     with tf.control_dependencies([self.actor_update]):
         self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
         self.value_update = self.optimizer.minimize(self.value_loss)
     
     update_func = lambda train_data: self.sess.run([self.actor_update, self.value_update],
                                                    feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                               self.act_holders: reshape_train_var(train_data[:, 1]),
                                                               self.reward_holders: train_data[:, 2]})
     
     self.sess.run(tf.global_variables_initializer())
     
     return update_func
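
Both trainers return an update function whose `train_data` argument is indexed as `train_data[:, 0]`, `[:, 1]`, and `[:, 2]`, i.e. a 2-D array whose columns are observation, action, and return. A hedged usage sketch (the observations, actions, and rewards below are made up, and the returns are assumed to be discounted rewards-to-go computed by a hypothetical `discount_rewards` helper) might look like this:

 import numpy as np

 def discount_rewards(rewards, gamma=0.99):
     """Turn per-step rewards into discounted returns, which reward_holders is regressed against."""
     returns = np.zeros(len(rewards), dtype=np.float32)
     running = 0.0
     for t in reversed(range(len(rewards))):
         running = rewards[t] + gamma * running
         returns[t] = running
     return returns

 # Hypothetical two-step episode: one row per step -> [observation, action, discounted return]
 observations = [np.array([0.1, -0.2, 0.0, 0.3]), np.array([0.2, -0.1, 0.1, 0.2])]
 actions = [1, 0]
 returns = discount_rewards([0.0, 1.0])
 train_data = np.array(list(zip(observations, actions, returns)), dtype=object)

 update_func(train_data)  # one actor step followed by one critic step on this batch

The object dtype keeps each column intact (arrays in column 0, scalars in columns 1 and 2), which matches the `reshape_train_var(train_data[:, 0])` call inside the returned update function.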
Example #3
 def update_func(train_data):
     # Snapshot the log-probabilities and advantages under the current (soon-to-be "old")
     # policy; PPO keeps these fixed while taking several gradient steps on the same batch
     self.old_probs, self.old_advantages = self.sess.run([self.resp_acts, self.advantages],
                             feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                        self.act_holders: train_data[:, 1],
                                        self.reward_holders: train_data[:, 2]})

     for i in range(self.ppo_iters):
         kl_div, _ = self.sess.run([self.kl_divergence, self.combined_update],
                        feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                   self.act_holders: reshape_train_var(train_data[:, 1]),
                                   self.reward_holders: train_data[:, 2],
                                   self.old_prob_holders: self.old_probs,
                                   self.advatange_holders: self.old_advantages})
         # Early stopping: abort further updates once the new policy has drifted too far
         # (in approximate KL divergence) from the policy that collected the data
         if kl_div > 1.5 * self.target_kl:
             break
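
Example #3 shows only the PPO update loop; the ops it feeds and runs (`self.old_prob_holders`, `self.advatange_holders`, `self.kl_divergence`, `self.combined_update`) are built elsewhere. A hedged sketch of how a PPO-clip objective with those names could be wired up, assuming the discrete graph from Example #2 already exists (so `self.resp_acts` is the log-probability of the taken action) and introducing a hypothetical `_create_ppo_graph` method and `clip_eps` parameter:

 def _create_ppo_graph(self, clip_eps=0.2):
     """
     Hypothetical sketch of the graph pieces that update_func above expects.
     """
     self.old_prob_holders = tf.placeholder(tf.float32, shape=[None])   # log-probs under the old policy
     self.advatange_holders = tf.placeholder(tf.float32, shape=[None])  # advantages computed at rollout time

     # Probability ratio pi_new(a|s) / pi_old(a|s) and the clipped surrogate objective
     ratio = tf.exp(self.resp_acts - self.old_prob_holders)
     clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
     self.actor_loss = -tf.reduce_mean(tf.minimum(ratio * self.advatange_holders,
                                                  clipped_ratio * self.advatange_holders))

     # Sample-based approximation of the KL divergence between old and new policies,
     # used by the early-stopping check in the loop above
     self.kl_divergence = tf.reduce_mean(self.old_prob_holders - self.resp_acts)

     # Single combined op so that one sess.run per iteration updates both actor and critic
     self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
     self.combined_update = self.optimizer.minimize(self.actor_loss + self.value_loss)

The placeholder name `advatange_holders` is spelled as in Example #3; the exact loss composition and hyperparameters in the original implementation may differ.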