def train(self, rollout, sess, bootstrap_value, bootstrap_sf, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    options = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    timesteps = rollout[:, 4]
    done = rollout[:, 5]
    option_term = rollout[:, 6]
    values = rollout[:, 7]
    q_values = rollout[:, 8]
    niu = rollout[:, 9]
    sf = rollout[:, 10]

    # Append the bootstrap estimates and build discounted n-step targets for
    # both the return and the successor features.
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    sf_plus = np.asarray(sf.tolist() + [bootstrap_sf])
    discounted_rewards = discount(rewards_plus, self.config.discount)[:-1]
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {
        self.local_network.target_return: discounted_rewards,
        self.local_network.target_r: rewards,
        self.local_network.target_sf: np.stack(discounted_sf, axis=0),
        self.local_network.delib: niu,
        self.local_network.observation: np.stack(observations, axis=0),
        self.local_network.actions_placeholder: actions,
        self.local_network.options_placeholder: options
    }

    _, ms, img_summ, loss, policy_loss, entropy_loss, sf_loss, instant_r_loss, auto_loss, term_loss = \
        sess.run([self.local_network.apply_grads,
                  self.local_network.merged_summary,
                  self.local_network.image_summaries,
                  self.local_network.loss,
                  self.local_network.policy_loss,
                  self.local_network.entropy_loss,
                  self.local_network.sf_loss,
                  self.local_network.instant_r_loss,
                  self.local_network.auto_loss,
                  self.local_network.term_loss],
                 feed_dict=feed_dict)
    # sess.run(self.update_local_vars)

    return ms, img_summ, loss, policy_loss, entropy_loss, sf_loss, instant_r_loss, auto_loss, term_loss
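# The train methods in this file rely on a `discount` helper that is not shown
# here. Below is a minimal sketch of what it is assumed to compute (the standard
# A3C-style backward discounted sum, along the time axis, for scalar rewards or
# stacked feature vectors); the repository's actual implementation may differ.
import numpy as np
import scipy.signal


def discount(x, gamma):
    # lfilter with coefficients [1] / [1, -gamma] accumulates gamma-discounted
    # sums; reversing the input twice makes the accumulation run back-to-front,
    # so entry t holds x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]


# Tiny check: discount([1, 0, 1], 0.9) -> [1.81, 0.9, 1.0]
assert np.allclose(discount(np.array([1.0, 0.0, 1.0]), 0.9), [1.81, 0.9, 1.0])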
def train(self, rollout, sess, bootstrap_sf, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    fi = rollout[:, 1]
    next_observations = rollout[:, 2]
    actions = rollout[:, 3]

    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {
        self.local_network.target_sf: np.stack(discounted_sf, axis=0),
        self.local_network.observation: np.stack(observations, axis=0),
        self.local_network.target_next_obs: np.stack(next_observations, axis=0),
        self.local_network.actions_placeholder: actions
    }

    _, ms, loss, sf_loss, aux_loss = \
        sess.run([self.local_network.apply_grads,
                  self.local_network.merged_summary,
                  self.local_network.loss,
                  self.local_network.sf_loss,
                  self.local_network.aux_loss],
                 feed_dict=feed_dict)

    return ms, loss, sf_loss, aux_loss
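# Hedged illustration (pure NumPy, not repository code) of the successor-feature
# target built above: per-step features `fi` are stacked, the bootstrap vector is
# appended, and each feature dimension is discounted independently along time.
import numpy as np
import scipy.signal


def _discount(x, gamma):  # same sketch as the helper assumed earlier
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]


fi_example = np.random.rand(5, 8)                                            # 5 steps, 8-dim features
bootstrap_sf_example = np.random.rand(8)
sf_plus_example = np.asarray(fi_example.tolist() + [bootstrap_sf_example])   # shape (6, 8)
discounted_sf_example = _discount(sf_plus_example, 0.99)[:-1]                # shape (5, 8)
assert discounted_sf_example.shape == fi_example.shape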
def train(self, rollout, sess, bootstrap_value, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    timesteps = rollout[:, 3]
    values = rollout[:, 5]

    # The advantage function uses "Generalized Advantage Estimation"
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(rewards_plus, self.config.discount)[:-1]

    feed_dict = {
        self.local_network.target_return: discounted_rewards,
        self.local_network.observation: np.stack(observations, axis=0),
        self.local_network.actions_placeholder: actions
    }

    _, ms, img_summ, loss, option_policy_loss, option_entropy_loss, option_critic_loss = \
        sess.run([self.local_network.apply_grads,
                  self.local_network.merged_summary,
                  self.local_network.image_summaries,
                  self.local_network.loss,
                  self.local_network.option_policy_loss,
                  self.local_network.option_entropy_loss,
                  self.local_network.option_critic_loss],
                 feed_dict=feed_dict)

    return ms, img_summ, loss, option_policy_loss, option_entropy_loss, option_critic_loss
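# The comment above mentions "Generalized Advantage Estimation", but the feed
# dict only carries plain bootstrapped discounted returns. Below is a hedged
# sketch of what a GAE(lambda) advantage computation over the same rollout
# quantities (rewards, values, bootstrap_value) could look like; the lambda_
# parameter and how the result would be wired into the loss are assumptions,
# not repository code.
import numpy as np
import scipy.signal


def gae_advantages(rewards, values, bootstrap_value, gamma, lambda_=0.95):
    values_plus = np.asarray(list(values) + [bootstrap_value], dtype=np.float64)
    rewards = np.asarray(rewards, dtype=np.float64)
    # One-step TD residuals, then a (gamma * lambda)-discounted backward sum.
    deltas = rewards + gamma * values_plus[1:] - values_plus[:-1]
    return scipy.signal.lfilter([1], [1, -gamma * lambda_], deltas[::-1])[::-1]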
def train_sf(self, bootstrap_sf):
    rollout = np.array(self.episode_buffer_sf)
    observations = rollout[:, 0]

    # First pass: run the network to obtain the per-step features fi.
    feed_dict = {self.local_network.observation: np.stack(observations, axis=0)}
    fi = self.sess.run(self.local_network.fi, feed_dict=feed_dict)

    # Build the discounted successor-feature targets from fi and the bootstrap.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    # Second pass: apply the successor-feature gradient update.
    feed_dict = {
        self.local_network.target_sf: np.stack(discounted_sf, axis=0),
        self.local_network.observation: np.stack(observations, axis=0)
    }
    _, ms, sf_loss = \
        self.sess.run([self.local_network.apply_grads_sf,
                       self.local_network.merged_summary_sf,
                       self.local_network.sf_loss],
                      feed_dict=feed_dict)

    return ms, sf_loss
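# Hedged usage sketch (not from the repository): one plausible way the caller of
# train_sf() could obtain bootstrap_sf. It assumes the network exposes an `sf`
# output tensor and that self.config.sf_layers[-1] gives the feature size; both
# names are assumptions. A zero bootstrap is the conventional choice when the
# episode has terminated.
def bootstrap_and_train_sf(self, last_observation, episode_done):
    import numpy as np
    if episode_done:
        bootstrap_sf = np.zeros(self.config.sf_layers[-1])   # assumed config field
    else:
        bootstrap_sf = self.sess.run(
            self.local_network.sf,                            # assumed output tensor
            feed_dict={self.local_network.observation:
                       np.expand_dims(last_observation, axis=0)})[0]
    return self.train_sf(bootstrap_sf)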
def train(self, rollout, sess, bootstrap_sf, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0].astype(np.int32)  # tabular state indices (cast for one-hot indexing)
    # actions = rollout[:, 1]
    # sf = rollout[:, 2]
    # fi = rollout[:, 3]

    # One-hot state indicators serve as the features in the tabular case.
    fi = np.identity(self.nb_states)[observations]

    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {
        self.local_network.target_sf: np.stack(discounted_sf, axis=0),
        self.local_network.observation: np.identity(self.nb_states)[observations]
    }

    _, ms, loss, sf_loss = \
        sess.run([self.local_network.apply_grads,
                  self.local_network.merged_summary,
                  self.local_network.loss,
                  self.local_network.sf_loss],
                 feed_dict=feed_dict)

    return ms, loss, sf_loss
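# Hedged illustration (pure NumPy, not repository code): in this tabular variant
# the target is a discounted sum of one-hot state indicators plus a bootstrap,
# i.e. a sample of the successor representation psi(s) = E[sum_t gamma^t 1[s_t = .]].
import numpy as np

nb_states_demo, gamma_demo = 4, 0.9
trajectory = np.array([0, 1, 2])                      # state indices of a short rollout
fi_demo = np.identity(nb_states_demo)[trajectory]     # one-hot features, shape (3, 4)
running = np.zeros(nb_states_demo)                    # zero bootstrap: episode terminated
for phi in fi_demo[::-1]:                             # accumulate backwards in time
    running = phi + gamma_demo * running
# `running` is the discounted-sf target for the first step of the trajectory.
assert np.allclose(running, [1.0, 0.9, 0.81, 0.0])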
def train_option(self, bootstrap_value, bootstrap_value_mix):
    # Buffer layout: s, option, action, r, r_i (eigen reward), primitive-action flag
    rollout = np.array(self.episode_buffer_option)
    observations = rollout[:, 0]
    options = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    eigen_rewards = rollout[:, 4]
    primitive_actions = rollout[:, 5]

    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_returns = reward_discount(rewards_plus, self.config.discount)[:-1]

    options1, options2, actions1, actions2, discounted_returns1, discounted_returns2, \
        observations1, observations2 = [], [], [], [], [], [], [], []

    if self.config.eigen:
        eigen_rewards_plus = np.asarray(eigen_rewards.tolist() + [bootstrap_value_mix])
        discounted_eigen_returns = discount(eigen_rewards_plus, self.config.discount)[:-1]
        discounted_eigen_returns1, discounted_eigen_returns2 = [], []

    # Split the rollout into primitive-action transitions (updated through the
    # primitive-option head) and intra-option transitions.
    for i, primitive in enumerate(primitive_actions):
        if primitive:
            options1.append(options[i])
            actions1.append(actions[i])
            discounted_returns1.append(discounted_returns[i])
            if self.config.eigen:
                discounted_eigen_returns1.append(discounted_eigen_returns[i])
            observations1.append(observations[i])
        else:
            options2.append(options[i])
            actions2.append(actions[i])
            discounted_returns2.append(discounted_returns[i])
            if self.config.eigen:
                discounted_eigen_returns2.append(discounted_eigen_returns[i])
            observations2.append(observations[i])

    if len(observations1) > 0:
        feed_dict = {
            self.local_network.target_return: discounted_returns1,
            self.local_network.observation: np.stack(observations1, axis=0),
            self.local_network.options_placeholder: options1
        }
        to_run = [self.local_network.apply_grads_primitive_option]
        _ = self.sess.run(to_run, feed_dict=feed_dict)

    if len(observations2) > 0:
        feed_dict = {
            self.local_network.target_return: discounted_returns2,
            self.local_network.observation: np.stack(observations2, axis=0),
            self.local_network.actions_placeholder: actions2,
            self.local_network.options_placeholder: options2
        }
        to_run = [self.local_network.apply_grads_option,
                  self.local_network.merged_summary_option,
                  self.local_network.option_loss,
                  self.local_network.policy_loss,
                  self.local_network.entropy_loss,
                  self.local_network.critic_loss,
                  self.local_network.term_loss]
        if self.config.eigen:
            feed_dict[self.local_network.target_eigen_return] = discounted_eigen_returns2
            to_run.append(self.local_network.eigen_critic_loss)

        results = self.sess.run(to_run, feed_dict=feed_dict)
        results.append(discounted_returns[-1])
        if self.config.eigen:
            results.append(discounted_eigen_returns[-1])
    else:
        return None

    # Drop the train-op result; return the summaries, losses and final returns.
    return results[1:]
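# Hedged sketch (not repository code): one common definition of the per-step
# eigen (intrinsic) reward r_i stored in episode_buffer_option in eigenoption-style
# methods is the projection of the feature change onto the option's eigendirection.
# The names `phi_s`, `phi_s_next` and `eigendirection` are illustrative only; the
# repository may compute this quantity differently.
import numpy as np


def eigen_reward(phi_s, phi_s_next, eigendirection):
    # Positive when the transition moves the features along the eigendirection,
    # negative when it moves against it.
    return float(np.dot(phi_s_next - phi_s, eigendirection))


# Example: a feature change aligned with the eigendirection yields +0.5.
assert eigen_reward(np.zeros(2), np.array([0.5, 0.2]), np.array([1.0, 0.0])) == 0.5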