Example #1
        # Dreamer implementation: actor loss calculation and optimization
        with torch.no_grad():
            actor_states = posterior_states.detach()
            actor_beliefs = beliefs.detach()
        with FreezeParameters(model_modules):
            imagination_traj = imagine_ahead(actor_states, actor_beliefs,
                                             actor_model, transition_model,
                                             args.planning_horizon)
        imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj
        with FreezeParameters(model_modules + value_model.modules):
            imged_reward = bottle(reward_model,
                                  (imged_beliefs, imged_prior_states))
            value_pred = bottle(value_model,
                                (imged_beliefs, imged_prior_states))
        returns = lambda_return(imged_reward,
                                value_pred,
                                bootstrap=value_pred[-1],
                                discount=args.discount,
                                lambda_=args.disclam)
        actor_loss = -torch.mean(returns)
        # Update model parameters
        actor_optimizer.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(actor_model.parameters(),
                                 args.grad_clip_norm,
                                 norm_type=2)
        actor_optimizer.step()

        # Dreamer implementation: value loss calculation and optimization
        with torch.no_grad():
            value_beliefs = imged_beliefs.detach()
            value_prior_states = imged_prior_states.detach()
            target_return = returns.detach()
        # Detached inputs keep the value loss from backpropagating into the
        # transition network; value_optimizer is assumed to exist alongside
        # actor_optimizer, mirroring Examples #2 and #3.
        value_dist = Normal(
            bottle(value_model, (value_beliefs, value_prior_states)), 1)
        value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
        value_optimizer.zero_grad()
        value_loss.backward()
        nn.utils.clip_grad_norm_(value_model.parameters(),
                                 args.grad_clip_norm,
                                 norm_type=2)
        value_optimizer.step()
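
All four examples compute actor targets with the same lambda_return helper, whose body is not shown here. The sketch below is an assumed, minimal implementation of the TD(λ) return used by Dreamer for the time-major layout implied above (tensors of shape (horizon, batch), with value_pred[-1] passed as the bootstrap); the helper in the original repositories may differ in detail.

import torch

def lambda_return(rewards, values, bootstrap, discount=0.99, lambda_=0.95):
    # rewards, values: (horizon, batch) tensors from the imagined rollout;
    # bootstrap: value estimate one step past the horizon.
    # lambda_=1 recovers the discounted Monte Carlo return,
    # lambda_=0 the one-step TD target.
    next_values = torch.cat([values[1:], bootstrap.unsqueeze(0)], dim=0)
    inputs = rewards + discount * (1 - lambda_) * next_values
    last = bootstrap
    outputs = []
    for t in reversed(range(rewards.shape[0])):
        last = inputs[t] + discount * lambda_ * last
        outputs.append(last)
    return torch.stack(outputs[::-1], dim=0)

With discount=args.discount and lambda_=args.disclam this yields one return per imagined step, and -torch.mean(returns) is the actor loss used in Examples #1 to #3.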
Example #2
    def run(self) -> None:
        # print("children process {} waiting to get data".format(self.process_id))
        # Run = self.child_conn.recv()
        # print("children process {} Geted data form parent".format(self.process_id))
        actor_loss, value_loss = None, None
        while self.child_conn.recv() == 1:
            # print("Start Multi actor-critic Processing, The Process ID is {} -------------------------------".format(self.process_id))
            for _ in range(args.sub_traintime):
                with FreezeParameters(self.env_model_modules):
                    # Load the latest posterior states/beliefs written by the parent process
                    actor_states = torch.load(
                        os.path.join(os.getcwd(), self.results_dir, 'actor_states.pt'))
                    actor_beliefs = torch.load(
                        os.path.join(os.getcwd(), self.results_dir, 'actor_beliefs.pt'))
                    use_cuda = torch.cuda.is_available() and not args.disable_cuda
                    actor_states = actor_states.cuda() if use_cuda else actor_states.cpu()
                    actor_beliefs = actor_beliefs.cuda() if use_cuda else actor_beliefs.cpu()

                    imagination_traj = imagine_ahead(
                        actor_states,
                        actor_beliefs,
                        self.actor_l,
                        self.transition_model,
                        args.planning_horizon,
                        action_scale=self.process_id)

                imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

                # Update model parameters
                with FreezeParameters(self.env_model_modules +
                                      self.value_model_l_modules):
                    imged_reward = bottle(self.reward_model,
                                          (imged_beliefs, imged_prior_states))
                    value_pred = bottle(self.value_l,
                                        (imged_beliefs, imged_prior_states))

                returns = lambda_return(imged_reward,
                                        value_pred,
                                        bootstrap=value_pred[-1],
                                        discount=args.discount,
                                        lambda_=args.disclam)
                actor_loss = -torch.mean(returns)

                # calculate local gradients and update the local actor
                # (the push to the global network below is disabled)
                self.actor_optimizer_l.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor_l.parameters(),
                                         args.grad_clip_norm,
                                         norm_type=2)
                # for la, ga in zip(self.actor_l.parameters(), self.actor_g.parameters()):
                #     ga._grad = la.grad
                self.actor_optimizer_l.step()

                # push global parameters
                # self.actor_l.load_state_dict(self.actor_g.state_dict())

                # Dreamer implementation: value loss calculation and optimization
                with torch.no_grad():
                    value_beliefs = imged_beliefs.detach()
                    value_prior_states = imged_prior_states.detach()
                    target_return = returns.detach()

                # The inputs were detached above, so the value loss does not
                # backpropagate into the transition network.
                value_dist = Normal(
                    bottle(self.value_l, (value_beliefs, value_prior_states)), 1)
                value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
                # Update model parameters
                self.value_optimizer_l.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.value_l.parameters(),
                                         args.grad_clip_norm,
                                         norm_type=2)
                self.value_optimizer_l.step()

            # save the loss data
            self.losses.append([actor_loss.item(), value_loss.item()])
            if self.count == args.collect_interval - 1:
                losses = tuple(zip(*self.losses))
                self.metrics['actor_loss'].append(losses[0])
                self.metrics['value_loss'].append(losses[1])
                Save_Txt(self.metrics['episodes'][-1],
                         self.metrics['actor_loss'][-1],
                         'actor_loss' + str(self.process_id), self.results_dir)
                Save_Txt(self.metrics['episodes'][-1],
                         self.metrics['value_loss'][-1],
                         'value_loss' + str(self.process_id), self.results_dir)
                self.count = 0
                self.losses = []
                self.metrics['episodes'].append(self.metrics['episodes'][-1] +
                                                1)
            self.count += 1

            # print("End Multi actor-critic Processing, The Process ID is {} -------------------------------".format(self.process_id))

            self.child_conn.send(1)
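
Examples #1 to #3 also rely on two helpers from the surrounding Dreamer/PlaNet-style code base that are not reproduced here: FreezeParameters, which temporarily disables gradients for the listed modules so that only the network being optimized receives gradients, and bottle, which applies a network across the time and batch dimensions at once. A minimal sketch of both, under those assumptions:

class FreezeParameters:
    # Temporarily sets requires_grad=False for every parameter of the given
    # modules and restores the original flags on exit.
    def __init__(self, modules):
        self.params = [p for m in modules for p in m.parameters()]
        self.saved = [p.requires_grad for p in self.params]

    def __enter__(self):
        for p in self.params:
            p.requires_grad = False

    def __exit__(self, exc_type, exc_value, traceback):
        for p, flag in zip(self.params, self.saved):
            p.requires_grad = flag


def bottle(f, x_tuple):
    # Flattens the leading (time, batch) dimensions, applies f, and restores
    # them on the output.
    sizes = [x.size() for x in x_tuple]
    flat = [x.reshape(s[0] * s[1], *s[2:]) for x, s in zip(x_tuple, sizes)]
    y = f(*flat)
    return y.reshape(sizes[0][0], sizes[0][1], *y.size()[1:])

For example, bottle(self.value_l, (value_beliefs, value_prior_states)) evaluates the value network on every imagined (time, batch) pair in a single forward pass.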
Example #3
    def train_algorithm(self, actor_states, actor_beliefs):

        # Signal every worker over its pipe to run one training round, then
        # wait for all of them to report back.
        for i, _ in enumerate(self.workers_actor):
            self.actor_pipes[i][0].send(1)
        for i, _ in enumerate(self.actor_pool):
            self.actor_pipes[i][0].recv()

        with FreezeParameters(self.model_modules):
            imagination_traj = self.imagine_merge_ahead(
                prev_state=actor_states,
                prev_belief=actor_beliefs,
                policy_pool=self.actor_pool,
                transition_model=self.transition_model,
                merge_model=self.merge_actor_model)
        imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

        with FreezeParameters(self.model_modules +
                              self.merge_value_model_modules):
            imged_reward = bottle(self.reward_model,
                                  (imged_beliefs, imged_prior_states))
            value_pred = bottle(self.merge_value_model,
                                (imged_beliefs, imged_prior_states))

        with FreezeParameters(self.actor_pool_modules):
            returns = lambda_return(imged_reward,
                                    value_pred,
                                    bootstrap=value_pred[-1],
                                    discount=args.discount,
                                    lambda_=args.disclam)
            merge_actor_loss = -torch.mean(returns)
            # Update model parameters
            self.merge_actor_optimizer.zero_grad()
            merge_actor_loss.backward()
            nn.utils.clip_grad_norm_(self.merge_actor_model.parameters(),
                                     args.grad_clip_norm,
                                     norm_type=2)
            self.merge_actor_optimizer.step()

        # Dreamer implementation: value loss calculation and optimization
        with torch.no_grad():
            value_beliefs = imged_beliefs.detach()
            value_prior_states = imged_prior_states.detach()
            target_return = returns.detach()

        # The inputs were detached above, so the value loss does not
        # backpropagate into the transition network.
        value_dist = Normal(
            bottle(self.merge_value_model,
                   (value_beliefs, value_prior_states)), 1)
        merge_value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
        # Update model parameters
        self.merge_value_optimizer.zero_grad()
        merge_value_loss.backward()
        nn.utils.clip_grad_norm_(self.merge_value_model.parameters(),
                                 args.grad_clip_norm,
                                 norm_type=2)
        self.merge_value_optimizer.step()

        self.merge_losses.append(
            [merge_actor_loss.item(),
             merge_value_loss.item()])
Example #4
def train(agent,
          env,
          sess,
          worker_id,
          k_steps=20,
          DISCOUNT=0.99,
          step_limit=5000000,
          verbose_every=50,
          net_saver=None,
          TB_DIR=None):
    print("Starting Agent", worker_id)
    rewardlist = []
    runningreward = 0
    bestreward = 0
    RETURN_STEPS = k_steps
    b_states = [None]
    step = 0
    done = True
    write_summary = False
    if worker_id == 0:
        if TB_DIR is not None:
            summary_writer = tf.summary.FileWriter(TB_DIR + "/tb",
                                                   sess.graph,
                                                   flush_secs=30)
            write_summary = True
    while step < step_limit:
        b_states, b_actions, b_rewards = [b_states[-1]], [], []
        if done:
            agent.update_target()
            b_states = [env.reset()]
            done = False
            runningreward = 0.9 * runningreward + 0.1 * np.sum(rewardlist)
            bestreward = np.maximum(bestreward, np.sum(rewardlist))
            rewardlist = []
        while not done and len(b_states) <= RETURN_STEPS:
            action = agent.get_action(b_states[-1])
            state, reward, done, _ = env.step(action)
            rewardlist.append(reward)
            b_actions.append(action)
            b_states.append(state)
            b_rewards.append(reward)

        b_values = agent.get_values(b_states)
        b_targets = lambda_return(b_rewards, b_values, done, DISCOUNT, 1)
        b_values = np.reshape(b_values[:-1], [-1])
        b_advantages = np.subtract(b_targets, b_values)
        if step < 5000:
            # Zero the advantages early on to effectively pretrain the value
            # function and avoid divergence from large advantage-driven gradients.
            b_advantages = np.zeros_like(b_advantages)
        b_ppo_pi = agent.GlobalNet.get_ppo_pi_for_actions(
            b_states[:-1], b_actions)
        agent.update_ppo()
        summary, step = agent.update_step(b_states[:-1], b_actions, b_targets,
                                          b_advantages, b_ppo_pi,
                                          write_summary)
        if step % verbose_every == 0:
            print("Worker ", worker_id, "At ", step, " Running/Max: ",
                  runningreward, bestreward)

        if step % 2500 == 0:
            print("Saving Model")
            net_saver.save(
                sess, TB_DIR + "/checkpoints/model" + str(step) + ".cptk")
        if step % 1000 == 0:
            print(agent.get_pi(b_states[-1]), b_values[-1])
        if write_summary:
            summary_writer.add_summary(summary, step)
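
Example #4 calls lambda_return with a Python list of rewards, value estimates for every visited state (the last one acting as bootstrap), the done flag, a discount, and lambda = 1, so the targets reduce to discounted returns bootstrapped from the final value. A minimal NumPy sketch consistent with that call signature (the original helper may differ):

import numpy as np

def lambda_return(rewards, values, done, discount, lam):
    # values holds one estimate per visited state (len(values) == len(rewards) + 1);
    # the final entry is the bootstrap and is zeroed if the episode terminated.
    T = len(rewards)
    next_values = np.asarray(values[1:], dtype=np.float32).reshape(-1)
    if done:
        next_values[-1] = 0.0
    targets = np.zeros(T, dtype=np.float32)
    last = next_values[-1]
    for t in reversed(range(T)):
        last = rewards[t] + discount * ((1.0 - lam) * next_values[t] + lam * last)
        targets[t] = last
    return targets

The resulting targets line up with b_values = np.reshape(b_values[:-1], [-1]), so b_advantages = b_targets - b_values gives the advantage estimates fed to the update step.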