with torch.no_grad():
    actor_states = posterior_states.detach()
    actor_beliefs = beliefs.detach()
with FreezeParameters(model_modules):
    imagination_traj = imagine_ahead(actor_states, actor_beliefs, actor_model,
                                     transition_model, args.planning_horizon)
imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj
with FreezeParameters(model_modules + value_model.modules):
    imged_reward = bottle(reward_model, (imged_beliefs, imged_prior_states))
    value_pred = bottle(value_model, (imged_beliefs, imged_prior_states))
returns = lambda_return(imged_reward, value_pred, bootstrap=value_pred[-1],
                        discount=args.discount, lambda_=args.disclam)
actor_loss = -torch.mean(returns)
# Update model parameters
actor_optimizer.zero_grad()
actor_loss.backward()
nn.utils.clip_grad_norm_(actor_model.parameters(), args.grad_clip_norm, norm_type=2)
actor_optimizer.step()

# Dreamer implementation: value loss calculation and optimization
with torch.no_grad():
    value_beliefs = imged_beliefs.detach()
    value_prior_states = imged_prior_states.detach()
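# The actor objective above relies on a `lambda_return` helper. A minimal sketch of the
# TD(lambda)-style return it is assumed to compute (the body below is an assumption, not
# taken from this codebase; only the call signature mirrors the usage above):
#   R_t^lambda = r_t + discount * ((1 - lambda) * V(s_{t+1}) + lambda * R_{t+1}^lambda),
# bootstrapped at the final imagined step with `bootstrap`. With lambda = 1 this reduces to
# a discounted Monte Carlo return; with lambda = 0 it is the one-step TD target.
import torch

def lambda_return_sketch(rewards, values, bootstrap, discount=0.99, lambda_=0.95):
    # rewards, values: (horizon, batch, ...) tensors; bootstrap: value of the final state
    next_values = torch.cat([values[1:], bootstrap.unsqueeze(0)], dim=0)
    inputs = rewards + discount * next_values * (1 - lambda_)
    outputs, last = [], bootstrap
    for t in reversed(range(rewards.shape[0])):  # backward recursion over the horizon
        last = inputs[t] + discount * lambda_ * last
        outputs.append(last)
    return torch.stack(list(reversed(outputs)), dim=0)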
def run(self) -> None:
    # print("children process {} waiting to get data".format(self.process_id))
    # Run = self.child_conn.recv()
    # print("children process {} got data from parent".format(self.process_id))
    actor_loss, value_loss = None, None
    while self.child_conn.recv() == 1:
        # print("Start Multi actor-critic Processing, The Process ID is {}".format(self.process_id))
        for _ in range(args.sub_traintime):
            with FreezeParameters(self.env_model_modules):
                # Load the latest posterior states/beliefs written by the parent process
                actor_states = torch.load(os.path.join(os.getcwd(), self.results_dir + '/actor_states.pt'))
                actor_beliefs = torch.load(os.path.join(os.getcwd(), self.results_dir + '/actor_beliefs.pt'))
                actor_states = actor_states.cuda() if torch.cuda.is_available() and not args.disable_cuda else actor_states.cpu()
                actor_beliefs = actor_beliefs.cuda() if torch.cuda.is_available() and not args.disable_cuda else actor_beliefs.cpu()
                imagination_traj = imagine_ahead(actor_states, actor_beliefs, self.actor_l,
                                                 self.transition_model, args.planning_horizon,
                                                 action_scale=self.process_id)
            imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

            # Actor loss calculation and optimization
            with FreezeParameters(self.env_model_modules + self.value_model_l_modules):
                imged_reward = bottle(self.reward_model, (imged_beliefs, imged_prior_states))
                value_pred = bottle(self.value_l, (imged_beliefs, imged_prior_states))
            returns = lambda_return(imged_reward, value_pred, bootstrap=value_pred[-1],
                                    discount=args.discount, lambda_=args.disclam)
            actor_loss = -torch.mean(returns)
            # Calculate local gradients and update the local actor
            self.actor_optimizer_l.zero_grad()
            actor_loss.backward()
            nn.utils.clip_grad_norm_(self.actor_l.parameters(), args.grad_clip_norm, norm_type=2)
            # for la, ga in zip(self.actor_l.parameters(), self.actor_g.parameters()):
            #     ga._grad = la.grad  # push local gradients to the global actor
            self.actor_optimizer_l.step()
            # push global parameters
            # self.actor_l.load_state_dict(self.actor_g.state_dict())

            # Dreamer implementation: value loss calculation and optimization
            with torch.no_grad():
                value_beliefs = imged_beliefs.detach()
                value_prior_states = imged_prior_states.detach()
                target_return = returns.detach()
            # Detach the input tensors from the transition network
            value_dist = Normal(bottle(self.value_l, (value_beliefs, value_prior_states)), 1)
            value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
            # Update value parameters
            self.value_optimizer_l.zero_grad()
            value_loss.backward()
            nn.utils.clip_grad_norm_(self.value_l.parameters(), args.grad_clip_norm, norm_type=2)
            self.value_optimizer_l.step()

        # Save the loss data
        self.losses.append([actor_loss.item(), value_loss.item()])
        if self.count == args.collect_interval - 1:
            losses = tuple(zip(*self.losses))
            self.metrics['actor_loss'].append(losses[0])
            self.metrics['value_loss'].append(losses[1])
            Save_Txt(self.metrics['episodes'][-1], self.metrics['actor_loss'][-1],
                     'actor_loss' + str(self.process_id), self.results_dir)
            Save_Txt(self.metrics['episodes'][-1], self.metrics['value_loss'][-1],
                     'value_loss' + str(self.process_id), self.results_dir)
            self.count = 0
            self.losses = []
            self.metrics['episodes'].append(self.metrics['episodes'][-1] + 1)
        self.count += 1
        # print("End Multi actor-critic Processing, The Process ID is {}".format(self.process_id))
        self.child_conn.send(1)
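# `FreezeParameters` is used above as a context manager around imagined rollouts and
# reward/value predictions. A minimal sketch of the assumed behaviour (the class name comes
# from the code; this body is an assumption): temporarily disable gradients for every
# parameter of the given modules, so that backpropagating the actor loss does not update
# the world model or the value network, then restore the original flags on exit.
class FreezeParametersSketch:
    def __init__(self, modules):
        self.modules = modules
        # Remember each parameter's original requires_grad flag
        self.saved_flags = [p.requires_grad for m in modules for p in m.parameters()]

    def __enter__(self):
        for m in self.modules:
            for p in m.parameters():
                p.requires_grad = False

    def __exit__(self, exc_type, exc_value, traceback):
        params = [p for m in self.modules for p in m.parameters()]
        for p, flag in zip(params, self.saved_flags):
            p.requires_grad = flag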
def train_algorithm(self, actor_states, actor_beliefs):
    # Parent pipe: signal every worker through its pipe, then wait for all children to finish
    [self.actor_pipes[i][0].send(1) for i, w in enumerate(self.workers_actor)]
    [self.actor_pipes[i][0].recv() for i, _ in enumerate(self.actor_pool)]

    with FreezeParameters(self.model_modules):
        imagination_traj = self.imagine_merge_ahead(prev_state=actor_states,
                                                    prev_belief=actor_beliefs,
                                                    policy_pool=self.actor_pool,
                                                    transition_model=self.transition_model,
                                                    merge_model=self.merge_actor_model)
    imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

    with FreezeParameters(self.model_modules + self.merge_value_model_modules):
        imged_reward = bottle(self.reward_model, (imged_beliefs, imged_prior_states))
        value_pred = bottle(self.merge_value_model, (imged_beliefs, imged_prior_states))
    with FreezeParameters(self.actor_pool_modules):
        returns = lambda_return(imged_reward, value_pred, bootstrap=value_pred[-1],
                                discount=args.discount, lambda_=args.disclam)
        merge_actor_loss = -torch.mean(returns)
        # Update merged actor parameters
        self.merge_actor_optimizer.zero_grad()
        merge_actor_loss.backward()
        nn.utils.clip_grad_norm_(self.merge_actor_model.parameters(), args.grad_clip_norm, norm_type=2)
        self.merge_actor_optimizer.step()

    # Dreamer implementation: value loss calculation and optimization
    with torch.no_grad():
        value_beliefs = imged_beliefs.detach()
        value_prior_states = imged_prior_states.detach()
        target_return = returns.detach()
    # Detach the input tensors from the transition network
    value_dist = Normal(bottle(self.merge_value_model, (value_beliefs, value_prior_states)), 1)
    merge_value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
    # Update merged value parameters
    self.merge_value_optimizer.zero_grad()
    merge_value_loss.backward()
    nn.utils.clip_grad_norm_(self.merge_value_model.parameters(), args.grad_clip_norm, norm_type=2)
    self.merge_value_optimizer.step()

    self.merge_losses.append([merge_actor_loss.item(), merge_value_loss.item()])
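# `bottle` is used above to apply the reward and value networks to sequences. A minimal
# sketch of the assumed behaviour (the helper name comes from the code; this body is an
# assumption): flatten the leading (time, batch) dimensions, run the network once on the
# flattened batch, then restore the leading dimensions on the output.
import torch

def bottle_sketch(f, x_tuple):
    T, B = x_tuple[0].shape[:2]
    flat = [x.reshape(T * B, *x.shape[2:]) for x in x_tuple]
    y = f(*flat)
    return y.reshape(T, B, *y.shape[1:])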
def train(agent, env, sess, worker_id, k_steps=20, DISCOUNT=0.99, step_limit=5000000,
          verbose_every=50, net_saver=None, TB_DIR=None):
    print("Starting Agent", worker_id)
    rewardlist = []
    runningreward = 0
    bestreward = 0
    RETURN_STEPS = k_steps
    b_states = [None]
    step = 0
    done = True
    write_summary = False
    if worker_id == 0:
        if TB_DIR is not None:
            summary_writer = tf.summary.FileWriter(TB_DIR + "/tb", sess.graph, flush_secs=30)
            write_summary = True
    while step < step_limit:
        # Start the next rollout from the last state of the previous one
        b_states, b_actions, b_rewards = [b_states[-1]], [], []
        if done:
            agent.update_target()
            b_states = [env.reset()]
            done = False
            runningreward = 0.9 * runningreward + 0.1 * np.sum(rewardlist)
            bestreward = np.maximum(bestreward, np.sum(rewardlist))
            rewardlist = []
        # Collect up to RETURN_STEPS transitions
        while not done and len(b_states) <= RETURN_STEPS:
            action = agent.get_action(b_states[-1])
            state, reward, done, _ = env.step(action)
            rewardlist.append(reward)
            b_actions.append(action)
            b_states.append(state)
            b_rewards.append(reward)
        b_values = agent.get_values(b_states)
        b_targets = lambda_return(b_rewards, b_values, done, DISCOUNT, 1)
        b_values = np.reshape(b_values[:-1], [-1])
        b_advantages = np.subtract(b_targets, b_values)
        if step < 5000:
            # Pretrain the value function: zero the advantages early on so large
            # advantages do not produce strong policy gradients and early divergence
            b_advantages = np.zeros_like(b_advantages)
        b_ppo_pi = agent.GlobalNet.get_ppo_pi_for_actions(b_states[:-1], b_actions)
        agent.update_ppo()
        summary, step = agent.update_step(b_states[:-1], b_actions, b_targets,
                                          b_advantages, b_ppo_pi, write_summary)
        if step % verbose_every == 0:
            print("Worker ", worker_id, "At ", step, " Running/Max: ", runningreward, bestreward)
        if step % 2500 == 0:
            print("Saving Model")
            net_saver.save(sess, TB_DIR + "checkpoints/model" + str(step) + ".cptk")
        if step % 1000 == 0:
            print(agent.get_pi(b_states[-1]), b_values[-1])
        if write_summary:
            summary_writer.add_summary(summary, step)
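# The `lambda_return` used by this TensorFlow worker operates on a Python list of rewards
# and a list of predicted values (one more value than rewards, the last being the bootstrap).
# A minimal sketch of the bootstrapped return it is assumed to produce (the body and argument
# names are assumptions; with lambda_=1, as called above, it reduces to k-step discounted
# returns, bootstrapped from the final value unless the episode terminated):
import numpy as np

def lambda_return_np_sketch(rewards, values, done, discount, lambda_):
    last = 0.0 if done else values[-1]  # no bootstrap after a terminal state
    targets = []
    for t in reversed(range(len(rewards))):
        last = rewards[t] + discount * ((1 - lambda_) * values[t + 1] + lambda_ * last)
        targets.append(last)
    return np.asarray(list(reversed(targets)), dtype=np.float32)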