def infer(self, rollout, sess, gamma, bootstrap_value):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    next_observations = rollout[:, 3]
    values = rollout[:, 5]

    # Here we take the rewards and values from the rollout, and use them to
    # generate the advantage and discounted returns.
    # The advantage function uses "Generalized Advantage Estimation".
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = utils.discount(rewards_plus, gamma)[:-1]
    value_plus = np.asarray(values.tolist() + [bootstrap_value])
    advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
    advantages = utils.discount(advantages, gamma)

    # Update the global network using gradients from the loss.
    # Generate network statistics to periodically save.
    feed_dict = {
        self.local_AC_network.target_v: discounted_rewards,
        self.local_AC_network.inputs: np.stack(observations),
        self.local_AC_network.actions: actions,
        self.local_AC_network.advantages: advantages
    }
    l, v_l, p_l, e_l, g_n, v_n, _ = sess.run([
        self.local_AC_network.loss,
        self.local_AC_network.value_loss,
        self.local_AC_network.policy_loss,
        self.local_AC_network.entropy,
        self.local_AC_network.grad_norms,
        self.local_AC_network.var_norms,
        self.local_AC_network.apply_grads
    ], feed_dict=feed_dict)
    return (l / len(rollout), v_l / len(rollout), p_l / len(rollout),
            e_l / len(rollout), g_n, v_n)
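# None of these snippets ship the `discount` helper they all call. A minimal
# sketch of the cumulative form most of them appear to assume (discounted
# suffix sums, computed with the common scipy.signal.lfilter trick); this is
# an illustration only, not any of these repos' actual implementation.
import numpy as np
import scipy.signal


def discount(x, gamma):
    """Return y with y[t] = x[t] + gamma * y[t + 1] (and y[T] = x[T])."""
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]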
def train(self, rollout, sess, gamma, bootstrap_value):
    rollout = np.array(rollout)
    states = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    prev_rewards = [0] + rewards[:-1].tolist()
    prev_actions = [0] + actions[:-1].tolist()
    values = rollout[:, 3]
    self.pr = prev_rewards
    self.pa = prev_actions

    # Here we take the rewards and values from the rollout, and use them to
    # generate the advantage and discounted returns.
    # The advantage function uses "Generalized Advantage Estimation".
    self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = ut.discount(self.rewards_plus, gamma)[:-1]
    self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
    advantages = (rewards
                  + gamma * self.value_plus[1:]
                  - self.value_plus[:-1])
    advantages = ut.discount(advantages, gamma)

    # Update the global network using gradients from the loss.
    # Generate network statistics to periodically save.
    rnn_state = self.local_AC.st_init
    if self.network == 'lstm':
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            self.local_AC.state: np.stack(states, axis=0),
            self.local_AC.prev_rewards: np.vstack(prev_rewards),
            self.local_AC.prev_actions: prev_actions,
            self.local_AC.actions: actions,
            self.local_AC.advantages: advantages,
            self.local_AC.state_in[0]: rnn_state[0],
            self.local_AC.state_in[1]: rnn_state[1]
        }
    elif self.network in ('relu', 'gru', 'ugru'):
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            self.local_AC.st: np.stack(states, axis=0),
            self.local_AC.prev_rewards: np.vstack(prev_rewards),
            self.local_AC.prev_actions: prev_actions,
            self.local_AC.actions: actions,
            self.local_AC.advantages: advantages,
            self.local_AC.st_in: rnn_state
        }
    v_l, p_l, e_l, g_n, v_n, _ = sess.run([
        self.local_AC.value_loss,
        self.local_AC.policy_loss,
        self.local_AC.entropy,
        self.local_AC.grad_norms,
        self.local_AC.var_norms,
        self.local_AC.apply_grads
    ], feed_dict=feed_dict)
    aux = len(rollout)
    return v_l / aux, p_l / aux, e_l / aux, g_n, v_n
def train(self, rollout, sess, bootstrap_value, settings, summaries=False):
    rollout = np.array(rollout)
    actions = rollout[:, 0]
    rewards = rollout[:, 1]
    timesteps = rollout[:, 2]
    # prev_actions is needed in both feed_dicts below, so build it
    # unconditionally (the flattened original only defined it under
    # FLAGS.meta, which would raise a NameError in the non-meta branch).
    prev_actions = [0] + actions[:-1].tolist()
    if FLAGS.meta:
        prev_rewards = [0] + rewards[:-1].tolist()
    values = rollout[:, 4]

    # The advantage function uses "Generalized Advantage Estimation".
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(rewards_plus, settings["gamma"])[:-1]
    value_plus = np.asarray(values.tolist() + [bootstrap_value])
    policy_target = discounted_rewards - value_plus[:-1]
    if FLAGS.gen_adv:
        td_residuals = (rewards + settings["gamma"] * value_plus[1:]
                        - value_plus[:-1])
        advantages = discount(td_residuals, settings["gamma"])
        policy_target = advantages

    rnn_state = self.local_AC.state_init
    if FLAGS.meta:
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            self.local_AC.prev_rewards: np.vstack(prev_rewards),
            self.local_AC.prev_actions: prev_actions,
            self.local_AC.actions: actions,
            self.local_AC.timestep: np.vstack(timesteps),
            self.local_AC.advantages: policy_target,
            self.local_AC.state_in[0]: rnn_state[0],
            self.local_AC.state_in[1]: rnn_state[1]
        }
    else:
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            self.local_AC.prev_actions: prev_actions,
            self.local_AC.actions: actions,
            self.local_AC.timestep: np.vstack(timesteps),
            self.local_AC.advantages: policy_target,
            self.local_AC.state_in[0]: rnn_state[0],
            self.local_AC.state_in[1]: rnn_state[1]
        }
    if summaries:
        l, v_l, p_l, e_l, g_n, v_n, _, ms = sess.run([
            self.local_AC.loss,
            self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads,
            self.local_AC.merged_summary
        ], feed_dict=feed_dict)
        return (l / len(rollout), v_l / len(rollout), p_l / len(rollout),
                e_l / len(rollout), g_n, v_n, ms)
    else:
        _ = sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
        return None
def train(self, rollout, sess, bootstrap_value, settings, summaries=False):
    rollout = np.array(rollout)
    actions = rollout[:, 0]
    rewards = rollout[:, 1]
    timesteps = rollout[:, 2]
    # As in the previous variant, prev_actions is used in both branches
    # below, so it is built unconditionally here to avoid a NameError.
    prev_actions = [0] + actions[:-1].tolist()
    if FLAGS.meta:
        prev_rewards = [0] + rewards[:-1].tolist()
        reward_multiplier = [10 for _ in prev_rewards]
    values = rollout[:, 4]

    # The advantage function uses "Generalized Advantage Estimation".
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(rewards_plus, settings["gamma"])[:-1]
    value_plus = np.asarray(values.tolist() + [bootstrap_value])
    policy_target = discounted_rewards - value_plus[:-1]
    if FLAGS.gen_adv:
        td_residuals = (rewards + settings["gamma"] * value_plus[1:]
                        - value_plus[:-1])
        advantages = discount(td_residuals, settings["gamma"])
        policy_target = advantages

    rnn_state = self.local_AC.state_init
    if FLAGS.meta:
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            self.local_AC.prev_rewards: prev_rewards,
            self.local_AC.reward_multiplier: reward_multiplier,
            self.local_AC.prev_actions: prev_actions,
            self.local_AC.actions: actions,
            self.local_AC.timestep: np.vstack(timesteps),
            self.local_AC.advantages: policy_target,
            self.local_AC.state_in[0]: rnn_state[0],
            self.local_AC.state_in[1]: rnn_state[1]
        }
    else:
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            self.local_AC.prev_actions: prev_actions,
            self.local_AC.actions: actions,
            self.local_AC.timestep: np.vstack(timesteps),
            self.local_AC.advantages: policy_target,
            self.local_AC.state_in[0]: rnn_state[0],
            self.local_AC.state_in[1]: rnn_state[1]
        }
    if summaries:
        l, v_l, p_l, e_l, g_n, v_n, _, ms = sess.run([
            self.local_AC.loss,
            self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads,
            self.local_AC.merged_summary
        ], feed_dict=feed_dict)
        return (l / len(rollout), v_l / len(rollout), p_l / len(rollout),
                e_l / len(rollout), g_n, v_n, ms)
    else:
        _ = sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
        return None
def run_n_step(self):
    obs_list, action_list, reward_list, next_obs_list, value_list, rnn_states = \
        [], [], [], [], [], []
    # Lazily initialise the environment on the first call. (The flattened
    # original tested `self.obs == None` inside a bare try/except to swallow
    # the ambiguous-truth-value error numpy arrays raise; `is None` is the
    # idiomatic, exception-free test.)
    if self.obs is None:
        self.obs = self.env.reset()
        self.rnn_state = self.zero_rnn_init
    for _ in range(self.n_steps):
        obs_list.append(self.obs)
        rnn_states.append(self.rnn_state)
        action, reward, next_obs, self.done, value, next_rnn_state = \
            self._run_one_step()
        action_list.append(action)
        reward_list.append(reward)
        next_obs_list.append(next_obs)
        value_list.append(value)
        if self.done:
            ## reset env
            self.obs = self.env.reset()
            self.rnn_state = self.zero_rnn_init
            ## write reward and length to server variables
            self.server.last_reward.value = int(self.total_reward)
            self.server.last_length.value = int(self.total_length)
            self.total_length = 0
            self.total_reward = 0
            break
        else:
            self.obs = next_obs
            self.rnn_state = next_rnn_state
    ## calculate reference values and advantages
    if self.done:
        value_p1 = 0
    else:
        self.predict_q.put((self.id, self.obs,
                            (self.rnn_state[0][0], self.rnn_state[1][0])))
        _, value_p1, _ = self.return_q.get()
    rewards_plus = np.asarray(reward_list + [value_p1])
    reward_list = discount(rewards_plus, self.gamma)[:-1]
    value_plus = np.asarray(value_list + [value_p1])
    advs = rewards_plus[:-1] + self.gamma * value_plus[1:] - value_plus[:-1]
    lamb = 1
    advs = discount(advs, self.gamma * lamb)
    self.training_q.put((np.array(obs_list), np.array(action_list), advs,
                         reward_list, rnn_states[0]))
def _cal_adv_and_old_v(self, traj: Trajectories):
    """Complete adv and old_v in traj."""
    v = self.critic(traj.s)
    v_s = v[:-1, :]
    v_s_next = v[1:, :]
    td = traj.r - v_s + self.parms.gamma * v_s_next * traj.not_done
    advs = np.zeros_like(td)

    # Find the indices where episodes terminate
    done_index = np.where(traj.not_done == 0)[0]

    # Compute the advantage within each episode interval
    start_index = 0
    multiplier = self.parms.gamma * self.parms.lamda
    for end_index in done_index:
        advs[start_index:end_index + 1, ...] = discount(
            td[start_index:end_index + 1, ...], multiplier)
        start_index = end_index + 1

    traj.adv = advs
    traj.old_v = v_s.numpy()
def test_discount(self):
    rewards = np.array([5, 10, 15, 20, 30, 50])
    gamma = 0.9
    discounted_rewards = np.array([5, 9, 12.15, 14.58, 19.683, 29.5245])
    self.assertTrue(
        np.allclose(discounted_rewards, utils.discount(rewards, gamma)))
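# Note that the expected values in this test pin down a different `discount`
# semantics than the cumulative sketch above: each reward is simply scaled by
# gamma**t (5 * 0.9**0 = 5, 10 * 0.9**1 = 9, 15 * 0.9**2 = 12.15, and so on),
# with no summation. A minimal equivalent, assuming that reading is correct:
import numpy as np


def discount_per_step(rewards, gamma):
    """Scale reward t by gamma**t; matches the test above, not suffix sums."""
    return rewards * gamma ** np.arange(len(rewards))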
def play_episode(self, train=True):
    # TODO
    state = self.env.reset(train)
    e_states = []
    e_actions = []
    e_rewards = []
    e_newstates = []
    e_dones = []
    for step_count in range(MAX_STEPS):
        self.loss = 0
        self.epsilon -= 1.0 / EXPLORE
        actions = self.actor.model.predict(
            state.reshape(1, 1, state.shape[0])).ravel()
        actions = self.noise_actions(actions, self.epsilon)
        new_state, reward, done = self.env.step(actions)
        e_states.append(state)
        e_actions.append(actions)
        e_rewards.append(reward)
        e_newstates.append(new_state)
        e_dones.append(done)
        state = new_state
        if not step_count % 50:
            logging.info("Step: {0} Reward: {1} Actions: {2}".format(
                step_count, reward, actions))
        if done:
            self.save_weights()
            break
    # for st, act, rew, nst, d in zip(e_states, e_actions,
    #         discount(e_rewards).tolist(), e_newstates, e_dones):
    self.buff.add([e_states, e_actions, e_rewards, e_newstates, e_dones])
    if train:
        self.loss = self._train_episode()
    logging.info("Total reward for episode: {0}".format(
        discount(e_rewards).sum()))
    return discount(e_rewards).sum()
def gae(self, rewards, values, next_vals, dones, gamma, lambda_):
    """
    Performs Generalized Advantage Estimation.

    rewards - torch FloatTensor of actual rewards collected. Size = L
    values - torch FloatTensor of value predictions. Size = L
    next_vals - torch FloatTensor of value predictions. Size = L
    dones - torch FloatTensor of done signals. Size = L
    gamma - float discount factor
    lambda_ - float gae moving average factor

    Returns
        advantages - torch FloatTensor of generalized advantage
            estimations. Size = L
    """
    deltas = rewards + gamma * next_vals * (1 - dones) - values
    return cuda_if(discount(deltas, dones, gamma * lambda_))
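# The three-argument discount(x, dones, rate) used here (and in the
# update_model snippet further down) is also not shown. A plausible reading,
# given how gae() calls it, is a discounted suffix sum that resets at episode
# boundaries; a minimal torch sketch under that assumption:
import torch


def discount(x, dones, rate):
    """y[t] = x[t] + rate * (1 - dones[t]) * y[t + 1]; resets at done steps."""
    out = torch.zeros_like(x)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + rate * running * (1 - dones[t])
        out[t] = running
    return out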
def trajectory(batch):
    last_transition = batch[-1]
    is_not_terminal = (1 - last_transition['is_done'])
    target = sess.run(name_to_ops['value'], feed_dict={
        name_to_ops['state']: last_transition['new_state'][np.newaxis]
    })
    R = is_not_terminal * target
    batch = utils.chunk_maps(batch)
    clipped = np.clip(batch['reward'], -1, 1)
    rollout = np.append(clipped, R)
    discounted_reward = utils.discount(rollout, hyper_params['gamma'])
    batch['target'] = discounted_reward[:-1]
    return batch
def collect_trajs(self, video):
    """Run episodes and concatenate data."""
    size = 0
    trajs = []
    lengths = []
    for _ in range(self.episodes_per_batch):
        traj = self.do_episode(video)
        trajs.append(traj)
        length = len(traj["rewards"])
        size += length
        lengths.append(length)
    obs = np.concatenate([traj["obs"] for traj in trajs])
    rewards = np.concatenate([traj["rewards"] for traj in trajs])
    actions = np.concatenate([traj["actions"] for traj in trajs])
    returns = np.concatenate(
        [discount(traj["rewards"], self.gamma) for traj in trajs])
    return dict(obs=obs, rewards=rewards, actions=actions,
                returns=returns, lengths=np.array(lengths))
if terminal:
    # Normalise rewards
    rewards = np.array(buffer_r)
    rewards = np.clip(rewards / rolling_r.std, -10, 10)
    batch_rewards = batch_rewards + buffer_r

    v_final = [v * (1 - terminal)]  # v = 0 if terminal, otherwise use the predicted v
    values = np.array(buffer_v + v_final)
    terminals = np.array(buffer_terminal + [terminal])

    # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
    delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
    advantage = discount(delta, GAMMA * LAMBDA, terminals)
    returns = advantage + np.array(buffer_v)
    # Per episode normalisation of advantages
    # advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)

    bs, ba, br, badv = np.reshape(buffer_s, (len(buffer_s),) + ppo.s_dim), np.vstack(buffer_a), \
        np.vstack(returns), np.vstack(advantage)
    experience.append([bs, ba, br, badv])

    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []

    # Update ppo
    if t >= BATCH:
        # Per batch normalisation of advantages
        advs = np.concatenate(list(zip(*experience))[3])
        for x in experience:
        # Track observation for when we return to this environment in the
        # next episode
        obs_bookmarks[i] = observation
        prev_bookmarks[i] = prev_obs
        break

net.train(mode=True)
print("T=" + str(T), "– Episode", episode,
      "–– Avg Reward:", avg_reward,
      "–– Avg Action:", np.mean(actions))
if reward_count > 100 and avg_reward > rew_cutoff:
    rew_cutoff += 0.1
    entropy_const *= .8
    max_norm *= .8
    clip_const *= .8

# Generalized Advantage Estimation (the flattened original labelled this
# "Generalized Value Estimation", but it is the usual GAE discount of deltas)
advantages = discount(advantages, gamma * lambda_, mask)
fit_batch_size = len(advantages) // n_minibatches
data = [actions, observs, rewards, advantages, old_pis, old_vals, mask]
fit_obj.fit_policy(net, data, optimizer, epochs=n_epochs,
                   clip_const=clip_const, batch_size=fit_batch_size,
                   entropy_const=entropy_const, val_const=val_const,
                   gamma=gamma, lambda_=lambda_)
if episode % (ep_batch_size * 5) == 0:
    torch.save(net.state_dict(), net_save_file)
    torch.save(optimizer.state_dict(), optim_save_file)

# Check for memory leaks
gc.collect()
max_mem_used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("Memory Used: {:.2f} MB".format(max_mem_used / 1024))

episode_reward = 0
def train(self, rollout, bootstrap_value, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    next_observations = rollout[:, 3]
    if FLAGS.meta:
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
    values = rollout[:, 5]

    # The advantage function uses "Generalized Advantage Estimation".
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(rewards_plus, FLAGS.gamma)[:-1]
    value_plus = np.asarray(values.tolist() + [bootstrap_value])
    policy_target = discounted_rewards - value_plus[:-1]
    if FLAGS.gen_adv:
        td_residuals = (rewards + FLAGS.gamma * value_plus[1:]
                        - value_plus[:-1])
        advantages = discount(td_residuals, FLAGS.gamma)
        policy_target = advantages

    if FLAGS.lstm:
        rnn_state = self.local_AC.state_init
        if FLAGS.meta:
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.prev_rewards: np.vstack(prev_rewards),
                self.local_AC.prev_actions: prev_actions,
                self.local_AC.actions: actions,
                self.local_AC.inputs: np.stack(observations, axis=0),
                self.local_AC.advantages: policy_target,
                self.local_AC.state_in[0]: rnn_state[0],
                self.local_AC.state_in[1]: rnn_state[1]
            }
        else:
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.inputs: np.stack(observations, axis=0),
                self.local_AC.actions: actions,
                self.local_AC.advantages: policy_target,
                self.local_AC.state_in[0]: rnn_state[0],
                self.local_AC.state_in[1]: rnn_state[1]
            }
    else:
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            self.local_AC.inputs: np.stack(observations, axis=0),
            self.local_AC.actions: actions,
            self.local_AC.advantages: policy_target
        }
    if summaries:
        (l, v_l, p_l, e_l, g_n, v_n, _, ms, img_summ,
         max_v, min_v, mean_v, max_r, min_r, mean_r) = self.sess.run([
            self.local_AC.loss,
            self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads,
            self.local_AC.merged_summary,
            self.local_AC.image_summaries,
            self.local_AC.max_value,
            self.local_AC.min_value,
            self.local_AC.mean_value,
            self.local_AC.max_reward,
            self.local_AC.min_reward,
            self.local_AC.mean_reward
        ], feed_dict=feed_dict)
        return (l / len(rollout), v_l / len(rollout), p_l / len(rollout),
                e_l / len(rollout), g_n, v_n, ms, img_summ,
                max_v, min_v, mean_v, max_r, min_r, mean_r)
    else:
        _ = self.sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
        return None
def train(self):
    start_time = time.time()
    self.episodes = self.env.generate_episodes(config.NUM_EPISODES, self)

    # Computing returns and estimating advantage function.
    for episode in self.episodes:
        episode["baseline"] = self.value_func.predict(episode)
        episode["returns"] = utils.discount(episode["rewards"], config.GAMMA)
        episode["advantage"] = episode["returns"] - episode["baseline"]

    # Updating policy.
    actions_dist_n = np.concatenate(
        [episode["actions_dist"] for episode in self.episodes])
    states_n = np.concatenate(
        [episode["states"] for episode in self.episodes])
    actions_n = np.concatenate(
        [episode["actions"] for episode in self.episodes])
    baseline_n = np.concatenate(
        [episode["baseline"] for episode in self.episodes])
    returns_n = np.concatenate(
        [episode["returns"] for episode in self.episodes])

    # Standardize the advantage function to have mean=0 and std=1.
    advantage_n = np.concatenate(
        [episode["advantage"] for episode in self.episodes])
    advantage_n -= advantage_n.mean()
    advantage_n /= (advantage_n.std() + 1e-8)

    # Computing baseline function for next iter.
    print(states_n.shape, actions_n.shape, advantage_n.shape,
          actions_dist_n.shape)
    feed = {
        self.policy.state: states_n,
        self.action: actions_n,
        self.advantage: advantage_n,
        self.policy.pi_theta_old: actions_dist_n
    }
    episoderewards = np.array(
        [episode["rewards"].sum() for episode in self.episodes])
    # print("\n********** Iteration %i ************" % i)
    self.value_func.fit(self.episodes)
    self.theta_old = self.current_theta()

    def fisher_vector_product(p):
        feed[self.flat_tangent] = p
        return self.session.run(self.fisher_vect_prod,
                                feed) + config.CG_DAMP * p

    self.g = self.session.run(self.surr_loss_grad, feed_dict=feed)
    self.grad_step = utils.conjugate_gradient(fisher_vector_product, -self.g)
    self.sAs = .5 * self.grad_step.dot(
        fisher_vector_product(self.grad_step))
    self.beta_inv = np.sqrt(self.sAs / config.MAX_KL)
    self.full_grad_step = self.grad_step / self.beta_inv
    self.negdot_grad_step = -self.g.dot(self.grad_step)

    def loss(th):
        self.set_theta(th)
        return self.session.run(self.surr_loss, feed_dict=feed)

    self.theta = utils.line_search(loss, self.theta_old,
                                   self.full_grad_step,
                                   self.negdot_grad_step / self.beta_inv)
    self.set_theta(self.theta)

    surr_loss_new = -self.session.run(self.surr_loss, feed_dict=feed)
    KL_old_new = self.session.run(self.KL, feed_dict=feed)
    entropy = self.session.run(self.entropy, feed_dict=feed)
    old_new_norm = np.sum((self.theta - self.theta_old) ** 2)

    if np.abs(KL_old_new) > 2.0 * config.MAX_KL:
        print("Keeping old theta")
        self.set_theta(self.theta_old)

    stats = {}
    stats["L2 of old - new"] = old_new_norm
    stats["Total number of episodes"] = len(self.episodes)
    stats["Average sum of rewards per episode"] = episoderewards.mean()
    stats["Entropy"] = entropy
    exp = utils.explained_variance(np.array(baseline_n), np.array(returns_n))
    stats["Baseline explained"] = exp
    stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
    stats["KL between old and new distribution"] = KL_old_new
    stats["Surrogate loss"] = surr_loss_new
    self.stats.append(stats)
    utils.write_dict(stats)
    save_path = self.saver.save(self.session, "./checkpoints/model.ckpt")
    print('Saved checkpoint to %s' % save_path)
    for k, v in stats.items():
        print(k + ": " + " " * (40 - len(k)) + str(v))
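# utils.conjugate_gradient is not included in any of these snippets. TRPO-style
# code conventionally uses plain conjugate gradient on Ax = b where A (the
# damped Fisher matrix) is only available as a matrix-vector product; a minimal
# sketch along those lines (iteration and tolerance defaults are assumptions):
import numpy as np


def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve Ax = b given only the product function f_Ax(v) = A @ v."""
    x = np.zeros_like(b)
    r = b.copy()   # residual b - Ax, starting from x = 0
    p = r.copy()   # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x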
def update(self, paths):
    self.time_step += 1
    acts = np.concatenate([path["action"] for path in paths])
    obs_scan = np.concatenate([path["obs_scan"] for path in paths])
    obs_goal = np.concatenate([path["obs_goal"] for path in paths])
    obs_vel = np.concatenate([path["obs_vel"] for path in paths])
    obs_image = np.concatenate([path["obs_image"] for path in paths])
    baseline_value = self.baseline.predict(
        [obs_scan, obs_goal, obs_vel, obs_image])

    last_path_size = 0
    for _, path in enumerate(paths):
        # The flattened original had a bare `np.array(path["reward"])` whose
        # result was discarded; assigning it back is presumably what was
        # intended, since `path["reward"].sum()` is used below.
        path["reward"] = np.array(path["reward"])
        path["return"] = discount(path["reward"], self.args.gamma)
        b = path["baseline"] = baseline_value[
            last_path_size:last_path_size + path["done_id"]]
        b1 = np.append(b, 0 if path["terminated"] else b[-1])
        deltas = path["reward"] + self.args.gamma * b1[1:] - b1[:-1]
        path["advantage"] = discount(deltas,
                                     self.args.gamma * self.args.lamda)
        last_path_size = path["done_id"]

    rets = np.concatenate([path["return"] for path in paths])
    advs = np.concatenate([path["advantage"] for path in paths])
    advs = (advs - advs.mean()) / (advs.std() + 1e-6)

    if self.time_step > 1:  # train the actor only after the critic has been trained
        kl = self.actor_update(obs_scan, obs_image, obs_goal, obs_vel,
                               acts, advs)
    self.critic_update(obs_scan, obs_image, obs_goal, obs_vel, rets)

    stats = OrderedDict()
    epRewards = np.array([path["reward"].sum() for path in paths])
    epPathLengths = np.array([len(path["reward"]) for path in paths])
    stats["EpRewardsMean"] = epRewards.mean()
    stats["EpRewardsMax"] = epRewards.max()
    stats["EpRewardsMin"] = epRewards.min()
    stats["EpPathLengthsMean"] = epPathLengths.mean()
    stats["EpPathLengthsMax"] = epPathLengths.max()
    stats["EpPathLengthsMin"] = epPathLengths.min()
    stats["RewardPerStep"] = epRewards.sum() / epPathLengths.sum()
    if self.time_step > 1:
        stats["Beta"] = self.beta
        stats["ActorLearningRate"] = self.actor_lr * self.lr_multiplier
        stats["KL-Divergence"] = kl
        feed_dict = {
            self.obs_scan: obs_scan,
            self.obs_goal: obs_goal,
            self.obs_vel: obs_vel,
            self.obs_image: obs_image,
            self.obs_scan_value: obs_scan,
            self.obs_image_value: obs_image,
            self.obs_goal_value: obs_goal,
            self.obs_vel_value: obs_vel,
            self.act_ph: acts,
            self.advantages_ph: advs,
            self.beta_ph: self.beta,
            self.eta_ph: self.eta,
            self.lr_ph: self.actor_lr * self.lr_multiplier,
            self.ret_ph: rets,
            self.visual_kl: kl,
            self.visual_reward: epRewards.mean()
        }
        summary = self.session.run(self.merge_all, feed_dict)
        self.writer.add_summary(summary, self.time_step)

    if epRewards.mean() > self.best_score:
        self.actor.save_network('best')
        self.baseline.save_network('best')
        self.best_score = epRewards.mean()
    self.actor.save_network('last')
    self.baseline.save_network('last')
    return stats
def train(self, rollout, sess, bootstrap_value_w, bootstrap_value_m,
          summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    timesteps = rollout[:, 3]
    w_values = rollout[:, 5]
    m_values = rollout[:, 6]
    sum_of_prev_goals = rollout[:, 7]
    intr_rewards = rollout[:, 8]
    goals = rollout[:, 9]

    # if FLAGS.meta:
    prev_rewards = [0] + rewards[:-1].tolist()
    prev_actions = [0] + actions[:-1].tolist()
    prev_goals = [np.random.normal(size=(FLAGS.hidden_dim,))] + \
        goals[:-1].tolist()

    # The advantage function uses "Generalized Advantage Estimation".
    rewards_plus_w = np.asarray(rewards.tolist() + [bootstrap_value_w])
    rewards_plus_m = np.asarray(rewards.tolist() + [bootstrap_value_m])
    intr_rewards_plus = np.asarray(intr_rewards.tolist() +
                                   [bootstrap_value_w])
    w_discounted_rewards = discount(rewards_plus_w, FLAGS.w_gamma)[:-1]
    m_discounted_rewards = discount(rewards_plus_m, FLAGS.m_gamma)[:-1]
    w_discounted_intr_rewards = discount(intr_rewards_plus,
                                         FLAGS.w_gamma)[:-1]
    # w_value_plus = np.asarray(w_values.tolist() + [bootstrap_value])
    # m_value_plus = np.asarray(m_values.tolist() + [bootstrap_value])

    w_rnn_state = self.local_AC.w_state_init
    m_rnn_state = self.local_AC.m_state_init
    feed_dict = {
        self.local_AC.w_extrinsic_return: w_discounted_rewards,
        self.local_AC.m_extrinsic_return: m_discounted_rewards,
        self.local_AC.inputs: np.stack(observations, axis=0),
        self.local_AC.prev_rewards: prev_rewards,
        self.local_AC.prev_actions: prev_actions,
        self.local_AC.prev_goal: prev_goals,
        self.local_AC.sum_prev_goals: np.stack(sum_of_prev_goals, axis=0),
        self.local_AC.w_intrinsic_return: w_discounted_intr_rewards,
        self.local_AC.actions: actions,
        self.local_AC.w_state_in[0]: w_rnn_state[0],
        self.local_AC.w_state_in[1]: w_rnn_state[1],
        self.local_AC.m_state_in[0]: m_rnn_state[0],
        self.local_AC.m_state_in[1]: m_rnn_state[1]
    }
    if summaries:
        (l, w_v_l, m_v_l, p_l, g_l, e_l, g_n, v_n, _, ms, img_summ,
         cos_sim_state_diff) = sess.run([
            self.local_AC.loss,
            self.local_AC.w_value_loss,
            self.local_AC.m_value_loss,
            self.local_AC.w_policy_loss,
            self.local_AC.goals_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads,
            self.local_AC.merged_summary,
            self.local_AC.image_summaries,
            self.local_AC.cos_sim_state_diff,
        ], feed_dict=feed_dict)
        return (l / len(rollout), w_v_l / len(rollout),
                m_v_l / len(rollout), p_l / len(rollout),
                g_l / len(rollout), e_l / len(rollout), g_n, v_n, ms,
                img_summ, m_discounted_rewards, w_discounted_rewards,
                w_discounted_intr_rewards, cos_sim_state_diff)
    else:
        _ = sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
        return None
def learn(self, paths):
    # is it possible to replace A(s,a) with Q(s,a)?
    for path in paths:
        path["baseline"] = self.vf.predict(path)
        path["returns"] = utils.discount(path["rewards"], self.args.gamma)
        path["advantage"] = path["returns"] - path["baseline"]
        # path["advantage"] = path["returns"]

    # puts all the experiences in a matrix: total_timesteps x options
    action_dist_mu = np.concatenate(
        [path["action_dists_mu"] for path in paths])
    action_dist_logstd = np.concatenate(
        [path["action_dists_logstd"] for path in paths])
    obs_n = np.concatenate([path["obs"] for path in paths])
    action_n = np.concatenate([path["actions"] for path in paths])

    # standardize to mean 0 stddev 1
    advant_n = np.concatenate([path["advantage"] for path in paths])
    advant_n -= advant_n.mean()
    advant_n /= (advant_n.std() + 1e-8)

    # train value function / baseline on rollout paths
    self.vf.fit(paths)

    feed_dict = {
        self.obs: obs_n,
        self.action: action_n,
        self.advantage: advant_n,
        self.oldaction_dist_mu: action_dist_mu,
        self.oldaction_dist_logstd: action_dist_logstd
    }

    # parameters
    thprev = self.gf()

    # computes fisher vector product: F * [self.pg]
    def fisher_vector_product(p):
        feed_dict[self.flat_tangent] = p
        return self.session.run(self.fvp,
                                feed_dict) + p * self.args.cg_damping

    g = self.session.run(self.pg, feed_dict)

    # solve Ax = g, where A is the Fisher information matrix and g is the
    # gradient of the parameters: stepdir = A_inverse * g = x
    stepdir = utils.conjugate_gradient(fisher_vector_product, -g)

    # let stepdir = change in theta / direction that theta changes in.
    # KL divergence is approximated by 0.5 * stepdir_transpose *
    # [Fisher Information Matrix] * stepdir, where the Fisher information
    # matrix acts like a metric. ([Fisher Information Matrix] * stepdir) is
    # computed using the function, and then stepdir * [above] is computed
    # manually.
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))
    lm = np.sqrt(shs / self.args.max_kl)
    # if self.args.max_kl > 0.001:
    #     self.args.max_kl *= self.args.kl_anneal
    fullstep = stepdir / lm
    negative_g_dot_stepdir = -g.dot(stepdir)

    def loss(th):
        self.sff(th)
        # surrogate loss: policy gradient loss
        return self.session.run(self.losses[0], feed_dict)

    # finds the best parameters by starting with a big step and working
    # backwards
    theta = utils.linesearch(loss, thprev, fullstep,
                             negative_g_dot_stepdir / lm)
    # i guess we just take a fullstep no matter what
    theta = thprev + fullstep
    self.sff(theta)

    surrogate_after, kl_after, entropy_after = self.session.run(
        self.losses, feed_dict)

    episoderewards = np.array([path["rewards"].sum() for path in paths])
    stats = {}
    stats["Average sum of rewards per episode"] = episoderewards.mean()
    stats["Entropy"] = entropy_after
    stats["max KL"] = self.args.max_kl
    stats["Timesteps"] = sum([len(path["rewards"]) for path in paths])
    # stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
    stats["KL between old and new distribution"] = kl_after
    stats["Surrogate loss"] = surrogate_after
    # print(("\n********** Iteration {} ************".format(i)))
    for k, v in stats.items():
        print(k + ": " + " " * (40 - len(k)) + str(v))
    return stats["Average sum of rewards per episode"]
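# utils.linesearch / utils.line_search is likewise not included. TRPO-style
# implementations typically use a backtracking search that accepts the first
# shrunken step with a sufficient actual-to-expected improvement ratio; a
# minimal sketch under that assumption (names and constants are illustrative):
import numpy as np


def linesearch(f, x, fullstep, expected_improve_rate,
               max_backtracks=10, accept_ratio=0.1):
    """Backtrack from x + fullstep until the loss f improves enough."""
    fval = f(x)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        newfval = f(xnew)
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        if expected_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return xnew
    return x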
def _loop_listening(self):
    """ Ensures that the program will continue listening until closure """
    episode = 1
    collection_length = self.batch_size * self.sequence_length
    while self.listening:
        # Record the keys and game frames while recording is enabled
        while self.playing:
            while episode <= self.episodes and self.playing:
                state = self.sr_game.reset()
                terminal = False
                while not terminal and self.playing:
                    states = []
                    actions = []
                    rewards = []
                    values = []
                    while (len(states) < collection_length
                           and self.playing and not terminal):
                        start = time()
                        state = self.sr_game.state
                        tens_state = torch.FloatTensor([state]).to(
                            self.model.device)
                        tens_state = (tens_state / 255.0).permute(0, 3, 1, 2)
                        action, policy, value, rnd = self.model.step(
                            tens_state)
                        next_state, reward, terminal = self.sr_game.step(
                            action)
                        reward = reward + rnd
                        states.append(state)
                        actions.append(action)
                        rewards.append(reward)
                        values.append(value)
                        # print("Loop time:", time() - start)
                    if len(states) == collection_length:
                        states = (np.stack(states) / 255.0).astype(np.float32)
                        actions = np.array(actions, dtype=np.float32)
                        rewards = np.array(rewards, dtype=np.float32)
                        values = np.array(values, dtype=np.float32)
                        returns = discount(rewards, decay)
                        advantages = returns - values
                        advantages = normalize(advantages,
                                               1e-5).astype(np.float32)
                        loss = self.model.train_reinforce(
                            [states, actions, rewards, advantages])
                        print("Loss:", loss)
                    """ Just training RND for now
                    supervised = self.data_handler.sequenced_sample(
                        self.batch_size, self.sequence_length,
                        str(self.model.device) == "cuda"
                    )
                    supervised = [tens.view(-1, *tens.shape[2:])
                                  for tens in supervised]
                    self.model.train_supervised(*supervised)
                    """
                    if episode % self.save_interval == 0:
                        self.model.save(self.save_path)
                    if episode == self.episodes:
                        self.stop()
def run_vanilla_policy_gradient_experiment(args, vf_params, logdir, env,
                                           sess, continuous_control):
    """
    General purpose method to run vanilla policy gradients. Works for both
    continuous and discrete environments.

    Roughly inspired by starter code for this homework and
    https://github.com/DanielTakeshi/rl_algorithms/blob/master/vpg/main.py
    Thanks!

    Params
    ------
    args: arguments for vanilla policy gradient.
    vf_params: dict of params for value function
    logdir: where to store outputs or None if you don't want to store anything
    env: openai gym env
    sess: TF session
    continuous_control: boolean, if true then we do gaussian continuous control
    """
    ob_dim = env.observation_space.shape[0]
    if args.vf_type == 'linear':
        value_function = LinearValueFunction(**vf_params)
    elif args.vf_type == 'nn':
        value_function = NnValueFunction(session=sess, ob_dim=ob_dim)
    # value_function = LinearValueFunction()

    if continuous_control:
        ac_dim = env.action_space.shape[0]
        policy_fn = policies.GaussianPolicy(sess, ob_dim, ac_dim)
    else:
        ac_dim = env.action_space.n
        policy_fn = policies.DisceretePolicy(sess, ob_dim, ac_dim)

    sess.__enter__()  # equivalent to `with sess:`, to reduce indentation
    tf.global_variables_initializer().run()
    total_timesteps = 0
    stepsize = args.initial_stepsize
    filterAction = 0.1
    stepMax = 100

    for i in range(args.n_iter):
        print("\n********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps.
        timesteps_this_batch = 0
        paths = []
        step = 0
        # if filterAction > 1.0:
        #     filterAction = 1.0
        # else:
        #     filterAction = filterAction * 1.1
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0)
                                    and args.render)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = policy_fn.sample_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                step = step + 1
                if done:
                    step = 0
                    # print "done "
                    break
                # if done or step > stepMax:
                #     print "max steps: {}".format(stepMax)
                #     step = 0
                #     stepMax = stepMax + 2
                #     break
            path = {"observation": np.array(obs),
                    "terminated": terminated,
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += utils.pathlength(path)
            if timesteps_this_batch > args.min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Estimate advantage function using baseline vf (these are lists!).
        # return_t: list of sums of discounted rewards (to end of episode),
        #     one per timestep
        # vpred_t: list of the value function's predictions of components
        #     of return_t
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = utils.discount(rew_t, args.gamma)
            vpred_t = value_function.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update and **re-fit the baseline**.
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        std_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        value_function.fit(ob_no, vtarg_n)

        # Policy update, plus diagnostics stuff. Is there a better way to
        # handle the continuous vs discrete control cases?
        if continuous_control:
            surr_loss, oldmean_na, oldlogstd_a = policy_fn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policy_fn.kldiv_and_entropy(ob_no, oldmean_na,
                                                  oldlogstd_a)
        else:
            surr_loss, oldlogits_na = policy_fn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policy_fn.kldiv_and_entropy(ob_no, oldlogits_na)

        # Step size heuristic to ensure that we don't take too large steps.
        if args.use_kl_heuristic:
            if kl > args.desired_kl * 2:
                stepsize /= 1.5
                print('PG stepsize -> %s' % stepsize)
            elif kl < args.desired_kl / 2:
                stepsize *= 1.5
                print('PG stepsize -> %s' % stepsize)
            else:
                print('PG stepsize OK')

        # Log diagnostics
        if i % args.log_every_t_iter == 0:
            logz.log_tabular("EpRewMean",
                             np.mean([path["reward"].sum()
                                      for path in paths]))
            logz.log_tabular("EpLenMean",
                             np.mean([utils.pathlength(path)
                                      for path in paths]))
            logz.log_tabular("KLOldNew", kl)
            logz.log_tabular("Entropy", ent)
            logz.log_tabular("EVBefore",
                             utils.explained_variance_1d(vpred_n, vtarg_n))
            logz.log_tabular("EVAfter",
                             utils.explained_variance_1d(
                                 value_function.predict(ob_no), vtarg_n))
            logz.log_tabular("SurrogateLoss", surr_loss)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            # If you're overfitting, EVAfter will be way larger than EVBefore.
            # Note that we fit the value function AFTER using it to compute
            # the advantage function, to avoid introducing bias.
            logz.dump_tabular()
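# utils.explained_variance_1d is another helper these snippets assume. The
# conventional definition is 1 - Var[y - ypred] / Var[y]; a minimal sketch
# under that assumption:
import numpy as np


def explained_variance_1d(ypred, y):
    """1 when ypred matches y exactly; 0 when it is no better than mean(y)."""
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary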
def work(self):
    hooks = [self.ppo.sync_replicas_hook]
    sess = tf.train.MonitoredTrainingSession(master=self.server.target,
                                             is_chief=(self.wid == 0),
                                             checkpoint_dir=SUMMARY_DIR,
                                             save_summaries_steps=None,
                                             save_summaries_secs=None,
                                             hooks=hooks)
    if self.wid == 0:
        writer = SummaryWriterCache.get(SUMMARY_DIR)

    t, episode, terminal = 0, 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    while not sess.should_stop() and not (episode > EP_MAX and self.wid == 0):
        s = self.env.reset()
        ep_r, ep_t, ep_a = 0, 0, []

        while True:
            a, v = self.ppo.evaluate_state(s, sess)

            # Update ppo
            if t == BATCH:  # or (terminal and t < BATCH):
                # Normalise rewards
                rewards = np.array(buffer_r)
                rolling_r.update(rewards)
                rewards = np.clip(rewards / rolling_r.std, -10, 10)

                v_final = [v * (1 - terminal)]  # v = 0 if terminal, otherwise use the predicted v
                values = np.array(buffer_v + v_final)
                terminals = np.array(buffer_terminal + [terminal])

                # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
                delta = (rewards + GAMMA * values[1:] * (1 - terminals[1:])
                         - values[:-1])
                advantage = discount(delta, GAMMA * LAMBDA, terminals)
                returns = advantage + np.array(buffer_v)
                advantage = (advantage - advantage.mean()) / np.maximum(
                    advantage.std(), 1e-6)

                bs, ba, br, badv = np.reshape(buffer_s, (t,) + self.ppo.s_dim), np.vstack(buffer_a), \
                    np.vstack(returns), np.vstack(advantage)

                graph_summary = self.ppo.update(bs, ba, br, badv, sess)
                buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
                t = 0

            buffer_s.append(s)
            buffer_a.append(a)
            buffer_v.append(v)
            buffer_terminal.append(terminal)
            ep_a.append(a)

            if not self.ppo.discrete:
                a = np.clip(a, self.env.action_space.low,
                            self.env.action_space.high)
            s, r, terminal, _ = self.env.step(a)
            buffer_r.append(r)
            ep_r += r
            ep_t += 1
            t += 1

            if terminal:
                # End of episode summary
                print('Worker_%i' % self.wid,
                      '| Episode: %i' % episode,
                      "| Reward: %.2f" % ep_r,
                      '| Steps: %i' % ep_t)

                if self.wid == 0:
                    worker_summary = tf.Summary()
                    worker_summary.value.add(tag="Reward", simple_value=ep_r)

                    # Create action histograms for each dimension
                    actions = np.array(ep_a)
                    if self.ppo.discrete:
                        add_histogram(writer, "Action", actions, episode,
                                      bins=self.ppo.a_dim)
                    else:
                        for a in range(self.ppo.a_dim):
                            add_histogram(writer, "Action/Dim" + str(a),
                                          actions[:, a], episode)
                    try:
                        writer.add_summary(graph_summary, episode)
                    except NameError:
                        pass
                    writer.add_summary(worker_summary, episode)
                    writer.flush()

                episode += 1
                break

    self.env.close()
    print("Worker_%i finished" % self.wid)
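# The discount(delta, GAMMA * LAMBDA, terminals) call above passes the rate
# second and the done flags third (the opposite order from the torch gae()
# sketch earlier), and `terminals` here is one element longer than `delta`.
# A minimal numpy sketch of one plausible reading, matching the GAE recursion
# A_t = delta_t + gamma * lambda * (1 - terminal_{t+1}) * A_{t+1}:
import numpy as np


def discount(x, rate, terminals):
    """Discounted suffix sums of x that reset after terminal steps."""
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        # terminals has len(x) + 1 entries, as built in the worker above
        running = x[t] + rate * running * (1 - terminals[t + 1])
        out[t] = running
    return out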
def update_model(self, shared_data):
    """
    This function accepts the data collected from a rollout and performs
    Q value update iterations on the neural net.

    shared_data - dict of torch tensors with shared memory to collect data.
        Each tensor contains indices from idx*n_tsteps to (idx+1)*n_tsteps.
        Keys (assume string keys):
            "states" - MDP states at each timestep t
                type: FloatTensor
                shape: (n_states, *state_shape)
            "deltas" - gae deltas collected at timestep t+1
                type: FloatTensor
                shape: (n_states,)
            "h_states" - Recurrent states at timestep t+1
                type: FloatTensor
                shape: (n_states, h_size)
            "rewards" - Collects float rewards collected at each timestep t
                type: FloatTensor
                shape: (n_states,)
            "dones" - Collects the dones collected at each timestep t
                type: FloatTensor
                shape: (n_states,)
            "actions" - Collects actions performed at each timestep t
                type: LongTensor
                shape: (n_states,)
    """
    hyps = self.hyps
    net = self.net
    net.req_grads(True)
    states = shared_data['states']
    rewards = shared_data['rewards']
    dones = shared_data['dones']
    actions = shared_data['actions']
    deltas = shared_data['deltas']
    advs = cuda_if(discount(deltas.squeeze(), dones.squeeze(),
                            hyps['gamma'] * hyps['lambda_']))

    # Forward Pass
    if 'h_states' in shared_data:
        h_states = Variable(cuda_if(shared_data['h_states']))
        if hyps['use_bptt']:
            vals, logits = self.bptt(states, h_states, dones)
        else:
            vals, logits, _ = net(Variable(cuda_if(states)), h_states)
    else:
        vals, logits = net(Variable(cuda_if(states)))

    # Log Probabilities
    log_softs = F.log_softmax(logits, dim=-1)
    logprobs = log_softs[torch.arange(len(actions)).long(), actions]

    # Returns
    if hyps['use_nstep_rets']:
        returns = advs + vals.data.squeeze()
    else:
        returns = cuda_if(discount(rewards.squeeze(), dones.squeeze(),
                                   hyps['gamma']))

    # Advantages
    if hyps['norm_advs']:
        advs = (advs - advs.mean()) / (advs.std() + 1e-6)

    # A2C Losses
    pi_loss = -(logprobs.squeeze() * Variable(advs.squeeze())).mean()
    val_loss = hyps['val_coef'] * F.mse_loss(vals.squeeze(), returns)
    entr_loss = -hyps['entr_coef'] * (
        (log_softs * F.softmax(logits, dim=-1)).sum(-1)).mean()
    loss = pi_loss + val_loss - entr_loss

    loss.backward()
    self.norm = nn.utils.clip_grad_norm_(net.parameters(), hyps['max_norm'])
    self.optim.step()
    self.optim.zero_grad()
    self.info = {
        "Loss": loss.item(),
        "Pi_Loss": pi_loss.item(),
        "ValLoss": val_loss.item(),
        "Entropy": entr_loss.item(),
        "GradNorm": self.norm.item()
    }
    return self.info
    if not done:
        rewards.append(model.value(obs).item())
    else:
        rewards.append(0.0)

    # RESHAPING
    states = torch.stack(states)
    actions = torch.stack(actions)
    logprobs = torch.stack(logprobs).detach()
    rewards = torch.tensor(rewards, dtype=torch.float, device=opts["device"])
    with torch.no_grad():
        values = model.value(states).reshape(-1)
    if done:
        values[-1] = 0.0

    advantages = rewards[:-1] + opts["gamma"] * values[1:] - values[:-1]
    discounted_adv = utils.discount(advantages,
                                    opts["gamma"] * opts["lambda"])
    cumrew = rewards[:-1].sum().item()
    rewards = utils.discount(rewards, gamma=opts["gamma"])[:-1]
    temp_history.append(cumrew)
    print("Episode: %d, reward: %.3f, std: %.3f, %.3f"
          % (it, cumrew, *torch.exp(model.log_std).detach()))

    # ADD TO MEMORY
    for i in range(states.shape[0] - 1):
        model.record(states[i], actions[i], logprobs[i], rewards[i],
                     discounted_adv[i])

    # UPDATE
    if model.memory.size >= opts["update_iter"]:
        loss = model.update()