def train_option(self, bootstrap_value_mix):
    rollout = np.array(self.episode_buffer_option)
    observations = rollout[:, 0]
    option_directions = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    eigen_rewards = rollout[:, 4]
    next_observations = rollout[:, 5]

    """Construct the list of discounted returns using the mixed reward signal
    for the entire n-step trajectory"""
    eigen_rewards_plus = np.asarray(eigen_rewards.tolist() + [bootstrap_value_mix])
    discounted_eigen_returns = reward_discount(eigen_rewards_plus, self.config.discount)[:-1]

    feed_dict = {
        self.local_network.target_eigen_return: discounted_eigen_returns,
        self.local_network.observation: np.stack(observations, axis=0),
        self.local_network.actions_placeholder: actions,
        self.local_network.matrix_sf: [self.global_network.sf_matrix_buffer]
        # self.local_network.current_option_direction: option_directions,
    }

    """Do an update on the intra-option policies"""
    _, self.summaries_option = self.sess.run([
        self.local_network.apply_grads_option,
        self.local_network.merged_summary_option,
    ], feed_dict=feed_dict)

    """Store the bootstrap target return at the end of the trajectory"""
    self.eigen_R = discounted_eigen_returns[-1]
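# `reward_discount` is called throughout this section but not defined in it. A
# minimal sketch, under the assumption that it computes the standard discounted
# cumulative sum G_t = sum_k gamma^k * r_{t+k} over the reward array (whose last
# entry is the bootstrap value appended by the caller), as is common in
# A3C-style rollout code:
import numpy as np
import scipy.signal

def reward_discount(x, gamma):
    """Discounted cumulative sum over a 1-D reward array.

    Filtering the reversed array with (1, -gamma) implements the backward
    recursion G_t = x_t + gamma * G_{t+1}.
    """
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]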
def train_option(self, bootstrap_value, bootstrap_value_mix):
    rollout = np.array(self.episode_buffer_option)  # s, option, action, r, r_i, primitive
    observations = rollout[:, 0]
    options = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    eigen_rewards = rollout[:, 4]
    primitive_actions = rollout[:, 5]

    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_returns = reward_discount(rewards_plus, self.config.discount)[:-1]

    options1, options2, actions1, actions2, discounted_returns1, discounted_returns2, \
        observations1, observations2 = [], [], [], [], [], [], [], []
    if self.config.eigen:
        eigen_rewards_plus = np.asarray(eigen_rewards.tolist() + [bootstrap_value_mix])
        discounted_eigen_returns = reward_discount(eigen_rewards_plus, self.config.discount)[:-1]
        discounted_eigen_returns1, discounted_eigen_returns2 = [], []

    """Split the rollout into transitions taken with primitive actions and
    transitions taken under intra-option policies"""
    for i, primitive in enumerate(primitive_actions):
        if primitive:
            options1.append(options[i])
            actions1.append(actions[i])
            discounted_returns1.append(discounted_returns[i])
            if self.config.eigen:
                discounted_eigen_returns1.append(discounted_eigen_returns[i])
            observations1.append(observations[i])
        else:
            options2.append(options[i])
            actions2.append(actions[i])
            discounted_returns2.append(discounted_returns[i])
            if self.config.eigen:
                discounted_eigen_returns2.append(discounted_eigen_returns[i])
            observations2.append(observations[i])

    if len(observations1) > 0:
        feed_dict = {
            self.local_network.target_return: discounted_returns1,
            self.local_network.observation: np.stack(observations1, axis=0),
            self.local_network.options_placeholder: options1
        }
        to_run = [self.local_network.apply_grads_primitive_option]
        _ = self.sess.run(to_run, feed_dict=feed_dict)

    if len(observations2) > 0:
        feed_dict = {
            self.local_network.target_return: discounted_returns2,
            self.local_network.observation: np.stack(observations2, axis=0),
            self.local_network.actions_placeholder: actions2,
            self.local_network.options_placeholder: options2
        }
        to_run = [
            self.local_network.apply_grads_option,
            self.local_network.merged_summary_option,
            self.local_network.option_loss,
            self.local_network.policy_loss,
            self.local_network.entropy_loss,
            self.local_network.critic_loss,
            self.local_network.term_loss
        ]
        if self.config.eigen:
            feed_dict[self.local_network.target_eigen_return] = discounted_eigen_returns2
            to_run.append(self.local_network.eigen_critic_loss)
        results = self.sess.run(to_run, feed_dict=feed_dict)
        results.append(discounted_returns[-1])
        if self.config.eigen:
            results.append(discounted_eigen_returns[-1])
    else:
        return None

    return results[1:]
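# The per-step split in the loop above can be written more compactly with a
# boolean mask over the rollout columns. A sketch, assuming `primitive_actions`
# holds one boolean per transition (names taken from the function above; the
# eigen-return split would follow the same pattern):
mask = np.asarray(primitive_actions, dtype=bool)
options1, options2 = options[mask], options[~mask]
actions1, actions2 = actions[mask], actions[~mask]
observations1, observations2 = observations[mask], observations[~mask]
discounted_returns1, discounted_returns2 = discounted_returns[mask], discounted_returns[~mask]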
def train_option(self, bootstrap_value, bootstrap_value_mix):
    rollout = np.array(self.episode_buffer_option)
    observations = rollout[:, 0]
    options = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    eigen_rewards = rollout[:, 4]
    primitive_actions = rollout[:, 5]
    next_observations = rollout[:, 6]

    """Construct the list of discounted returns for the entire n-step trajectory"""
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_returns = reward_discount(rewards_plus, self.config.discount)[:-1]

    """Construct the list of discounted returns using the mixed reward signal
    for the entire n-step trajectory"""
    eigen_rewards_plus = np.asarray(eigen_rewards.tolist() + [bootstrap_value_mix])
    discounted_eigen_returns = reward_discount(eigen_rewards_plus, self.config.discount)[:-1]

    """Get the directions actually executed in the environment, not the ones
    corresponding to the options chosen by the high-level policy, since the
    latter might not be the ones that should be assigned credit for the return"""
    feed_dict = {
        self.local_network.observation: np.concatenate(
            (np.stack(observations, 0), np.stack(next_observations, 0)), axis=0)
    }
    fi = self.sess.run(self.local_network.fi, feed_dict=feed_dict)
    fi_next = fi[len(observations):]
    fi = fi[:len(observations)]
    real_directions = fi_next - fi

    real_approx_options, directions = [], []
    for i, d in enumerate(real_directions):
        if primitive_actions[i]:
            real_approx_options.append(options[i])
            directions.append(np.zeros(self.config.sf_layers[-1]))
        else:
            directions.append(self.global_network.directions[options[i]])
            real_approx_options.append(
                np.argmax([self.cosine_similarity(d, self.directions[o])
                           for o in range(self.nb_options)])
                if self.total_episodes > 0 else options[i])

    """Do an update on the option-value function critic"""
    feed_dict = {
        self.local_network.target_return: discounted_returns,
        self.local_network.observation: np.stack(observations, axis=0),
        # self.local_network.options_placeholder: real_approx_options,
        self.local_network.options_placeholder: options,
        # self.local_network.option_direction_placeholder: real_directions,
        self.local_network.option_direction_placeholder: directions
    }
    _, self.summaries_critic = self.sess.run([
        self.local_network.apply_grads_critic,
        self.local_network.merged_summary_critic,
    ], feed_dict=feed_dict)

    """Do an update on the option termination conditions"""
    feed_dict = {
        self.local_network.observation: np.stack(next_observations, axis=0),
        # self.local_network.options_placeholder: real_approx_options,
        self.local_network.options_placeholder: options,
        # self.local_network.option_direction_placeholder: real_directions,
        self.local_network.option_direction_placeholder: directions,
        self.local_network.primitive_actions_placeholder: primitive_actions
    }
    _, self.summaries_termination = self.sess.run([
        self.local_network.apply_grads_term,
        self.local_network.merged_summary_term,
    ], feed_dict=feed_dict)

    """Do an update on the intra-option policies"""
    feed_dict = {
        self.local_network.target_return: discounted_returns,
        self.local_network.target_eigen_return: discounted_eigen_returns,
        self.local_network.observation: np.stack(observations, axis=0),
        self.local_network.actions_placeholder: actions,
        self.local_network.options_placeholder: options,
        self.local_network.option_direction_placeholder: directions,
        self.local_network.primitive_actions_placeholder: primitive_actions
    }
    _, self.summaries_option = self.sess.run([
        self.local_network.apply_grads_option,
        self.local_network.merged_summary_option,
    ], feed_dict=feed_dict)

    """Store the bootstrap target returns at the end of the trajectory"""
    self.R = discounted_returns[-1]
    self.eigen_R = discounted_eigen_returns[-1]
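# `self.cosine_similarity` is used above to match an executed feature-space
# direction to the closest learned option direction, but it is not defined in
# this section. A minimal sketch, assuming the usual definition with a guard
# against zero-norm feature differences:
def cosine_similarity(self, a, b):
    """Cosine of the angle between direction vectors a and b."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / denom if denom > 0 else 0.0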
def train_option(self, bootstrap_value, bootstrap_value_mix):
    rollout = np.array(self.episode_buffer_option)
    observations = rollout[:, 0]
    options = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    eigen_rewards = rollout[:, 4]
    primitive_actions = rollout[:, 5]
    next_observations = rollout[:, 6]

    """Construct the list of discounted returns for the entire n-step trajectory"""
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_returns = reward_discount(rewards_plus, self.config.discount)[:-1]

    """Construct the list of discounted returns using the mixed reward signal
    for the entire n-step trajectory"""
    eigen_rewards_plus = np.asarray(eigen_rewards.tolist() + [bootstrap_value_mix])
    discounted_eigen_returns = reward_discount(eigen_rewards_plus, self.config.discount)[:-1]

    """Do an update on the option-value function critic"""
    feed_dict = {
        self.local_network.target_return: discounted_returns,
        self.local_network.observation: np.stack(observations, axis=0),
        self.local_network.options_placeholder: options,
    }
    _, self.summaries_critic = self.sess.run([
        self.local_network.apply_grads_critic,
        self.local_network.merged_summary_critic,
    ], feed_dict=feed_dict)

    """Do an update on the option termination conditions"""
    feed_dict = {
        self.local_network.observation: np.stack(next_observations, axis=0),
        self.local_network.options_placeholder: options,
        self.local_network.primitive_actions_placeholder: primitive_actions
    }
    _, self.summaries_termination = self.sess.run([
        self.local_network.apply_grads_term,
        self.local_network.merged_summary_term,
    ], feed_dict=feed_dict)

    """Do an update on the intra-option policies"""
    feed_dict = {
        self.local_network.target_return: discounted_returns,
        self.local_network.target_eigen_return: discounted_eigen_returns,
        self.local_network.observation: np.stack(observations, axis=0),
        self.local_network.actions_placeholder: actions,
        self.local_network.options_placeholder: options,
        self.local_network.primitive_actions_placeholder: primitive_actions
    }
    _, self.summaries_option = self.sess.run([
        self.local_network.apply_grads_option,
        self.local_network.merged_summary_option,
    ], feed_dict=feed_dict)

    """Store the bootstrap target returns at the end of the trajectory"""
    self.R = discounted_returns[-1]
    self.eigen_R = discounted_eigen_returns[-1]
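# Hypothetical call site for the variant above, not confirmed by this section:
# at the end of an n-step segment the worker would bootstrap from the two
# critics at the last next observation, using zero on episode termination. The
# tensor names `value`, `eigen_value`, and the locals `done` /
# `last_observation` are assumptions introduced only for illustration:
if done:
    bootstrap_value, bootstrap_value_mix = 0.0, 0.0
else:
    feed_dict = {self.local_network.observation: np.stack([last_observation], axis=0)}
    bootstrap_value, bootstrap_value_mix = self.sess.run(
        [self.local_network.value, self.local_network.eigen_value],
        feed_dict=feed_dict)
self.train_option(bootstrap_value, bootstrap_value_mix)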