def _create_continuous_trainer(self, optimizer=tf.train.AdamOptimizer()):
    """ Creates a function for vanilla policy training with a continuous action space """
    self.act_holders = tf.placeholder(tf.float32, shape=[None, self.out_op.shape[1].value])
    self.reward_holders = tf.placeholder(tf.float32, shape=[None])

    # Learnable, state-independent standard deviation for the Gaussian policy
    self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value), dtype=tf.float32)
    # Sampled action: the network's mean output plus Gaussian noise scaled by std
    self.out_act = self.out_op + tf.random_normal(tf.shape(self.out_op), dtype=tf.float32) * self.std

    self.log_probs = gaussian_likelihood(self.act_holders, self.out_op, self.std)
    self.advantages = self.reward_holders - tf.squeeze(self.v_out_op)
    self.actor_loss = -tf.reduce_mean(self.log_probs * self.advantages)

    self.optimizer = optimizer
    self.actor_update = self.optimizer.minimize(self.actor_loss)

    # Update the value function only after the policy-gradient step has run
    with tf.control_dependencies([self.actor_update]):
        self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
        self.value_update = self.optimizer.minimize(self.value_loss)

    update_func = lambda train_data: self.sess.run(
        [self.actor_update, self.value_update],
        feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                   self.act_holders: reshape_train_var(train_data[:, 1]),
                   self.reward_holders: train_data[:, 2]})

    self.sess.run(tf.global_variables_initializer())
    return update_func
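# The continuous trainer relies on a `gaussian_likelihood` helper that is not shown in
# this section. A minimal sketch of what it is assumed to compute is given below: the
# per-sample log-density of each action under a diagonal Gaussian with mean `mu` and
# standard deviation `std`, summed over action dimensions. The actual helper in this
# codebase may differ (for example, it may take a log-std instead of a std).
def gaussian_likelihood(x, mu, std):
    """ Log-likelihood of x under a diagonal Gaussian with mean mu and std (assumed helper). """
    eps = 1e-8  # guards against division by zero and log(0)
    pre_sum = -0.5 * (((x - mu) / (std + eps)) ** 2
                      + 2.0 * tf.log(std + eps)
                      + np.log(2.0 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)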
def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
    """ Creates a function for vanilla policy training with a discrete action space """
    self.act_holders = tf.placeholder(tf.int32, shape=[None])
    self.reward_holders = tf.placeholder(tf.float32, shape=[None])

    # One-hot mask used to select the log-probability of the action actually taken
    self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float32)
    self.log_probs = tf.log(self.out_op)
    # Squeeze the value output so the advantages stay a vector (matches the continuous trainer)
    self.advantages = self.reward_holders - tf.squeeze(self.v_out_op)
    self.resp_acts = tf.reduce_sum(self.act_masks * self.log_probs, axis=1)
    self.loss = -tf.reduce_mean(self.resp_acts * self.advantages)

    self.optimizer = optimizer
    self.actor_update = self.optimizer.minimize(self.loss)

    # Update the value function only after the policy-gradient step has run
    with tf.control_dependencies([self.actor_update]):
        self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
        self.value_update = self.optimizer.minimize(self.value_loss)

    update_func = lambda train_data: self.sess.run(
        [self.actor_update, self.value_update],
        feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                   self.act_holders: reshape_train_var(train_data[:, 1]),
                   self.reward_holders: train_data[:, 2]})

    self.sess.run(tf.global_variables_initializer())
    return update_func
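# Both trainers also assume a `reshape_train_var` helper that is not shown here. The
# indexing pattern `train_data[:, 0]` suggests the batch is an object-dtype NumPy array
# with one (state, action, reward) row per step, so the helper is assumed to stack an
# object column into a regular numeric array that can be fed to a placeholder. A minimal
# sketch under that assumption; the real helper may behave differently.
def reshape_train_var(column):
    """ Stack an object-dtype column of per-step values into one numeric batch (assumed helper). """
    return np.stack([np.asarray(item) for item in column])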
def update_func(train_data):
    # Cache the log-probabilities and advantages from the policy that collected the data
    self.old_probs, self.old_advantages = self.sess.run(
        [self.resp_acts, self.advantages],
        feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                   self.act_holders: train_data[:, 1],
                   self.reward_holders: train_data[:, 2]})

    # Take several update steps on the same batch, stopping early once the new
    # policy drifts too far (in KL divergence) from the one that collected the data
    for i in range(self.ppo_iters):
        kl_div, _ = self.sess.run(
            [self.kl_divergence, self.combined_update],
            feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                       self.act_holders: reshape_train_var(train_data[:, 1]),
                       self.reward_holders: train_data[:, 2],
                       self.old_prob_holders: self.old_probs,
                       self.advatange_holders: self.old_advantages})
        if kl_div > 1.5 * self.target_kl:
            break
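# The PPO update loop above references graph pieces (`self.old_prob_holders`,
# `self.advatange_holders`, `self.kl_divergence`, `self.combined_update`, along with the
# `self.ppo_iters` and `self.target_kl` hyperparameters) that are built elsewhere in the
# class. The sketch below shows one plausible way those tensors could be defined for a
# clipped-surrogate PPO objective, assuming `self.resp_acts` holds the current
# log-probabilities of the taken actions; it is an illustration, not the codebase's
# actual definition.
def _create_ppo_graph_sketch(self, clip_ratio=0.2):
    self.old_prob_holders = tf.placeholder(tf.float32, shape=[None])   # old log-probs
    self.advatange_holders = tf.placeholder(tf.float32, shape=[None])  # frozen advantages

    # Probability ratio between the current policy and the data-collecting policy
    ratio = tf.exp(self.resp_acts - self.old_prob_holders)
    clipped = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    self.actor_loss = -tf.reduce_mean(
        tf.minimum(ratio * self.advatange_holders, clipped * self.advatange_holders))

    # Sample-based estimate of the KL divergence from the old policy, used for early stopping
    self.kl_divergence = tf.reduce_mean(self.old_prob_holders - self.resp_acts)

    self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
    # Single op that steps both the policy and the value function
    self.combined_update = self.optimizer.minimize(self.actor_loss + self.value_loss)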