Example #1
    def get_action(self, ob, sample=True, *args, **kwargs):
        self.eval_mode()
        t_ob = {key: torch_float(ob[key], device=cfg.alg.device) for key in ob}
        act_dist_cont, act_dist_disc, val = self.get_act_val(t_ob)
        action_cont = action_from_dist(act_dist_cont, sample=sample)
        action_discrete = action_from_dist(act_dist_disc, sample=sample)
        log_prob_disc = action_log_prob(action_discrete, act_dist_disc)
        log_prob_cont = action_log_prob(action_cont, act_dist_cont)
        entropy_disc = action_entropy(act_dist_disc, log_prob_disc)
        entropy_cont = action_entropy(act_dist_cont, log_prob_cont)
        #print("cont:", torch_to_np(log_prob_cont).reshape(-1, 1))
        log_prob = log_prob_cont + torch.sum(log_prob_disc, axis=1)
        #print(log_prob_cont.shape, log_prob_disc.shape)
        entropy = entropy_cont + torch.sum(entropy_disc, axis=1)

        action_info = dict(log_prob=torch_to_np(log_prob),
                           entropy=torch_to_np(entropy),
                           val=torch_to_np(val))
        #print("cd", action_cont.shape, action_discrete.shape)
        action = np.concatenate(
            (torch_to_np(action_cont), torch_to_np(action_discrete)), axis=1)
        #print("action:", action)

        return action, action_info
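
The returned action packs the continuous sub-action first and the discrete sub-action (as indices) after it, so downstream code has to split it back. A toy, self-contained illustration of that layout; the sizes are made up and not part of the source:

import numpy as np

n_env, n_cont, n_disc = 4, 3, 2   # assumed sizes, for illustration only
action = np.concatenate(
    (np.random.randn(n_env, n_cont),                 # continuous part
     np.random.randint(0, 5, (n_env, n_disc))),      # discrete indices
    axis=1)
cont_part = action[:, :n_cont]
disc_part = action[:, n_cont:].astype(np.int64)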
Example #2
 def update_q(self, obs, actions, next_obs, rewards, dones):
     q1 = self.q1((obs, actions))[0]
     q2 = self.q2((obs, actions))[0]
     with torch.no_grad():
         # Soft Bellman target:
         # r + gamma * (1 - done) * (min(Q1', Q2') - alpha * log_prob(a'|s'))
         next_act_dist = self.actor(next_obs)[0]
         next_actions = action_from_dist(next_act_dist, sample=True)
         nlog_prob = action_log_prob(next_actions,
                                     next_act_dist).unsqueeze(-1)
         nq1_tgt_val = self.q1_tgt((next_obs, next_actions))[0]
         nq2_tgt_val = self.q2_tgt((next_obs, next_actions))[0]
         nq_tgt_val = torch.min(nq1_tgt_val,
                                nq2_tgt_val) - self.alpha * nlog_prob
         q_tgt_val = rewards + cfg.alg.rew_discount * (1 - dones) * nq_tgt_val
     loss_q1 = F.mse_loss(q1, q_tgt_val)
     loss_q2 = F.mse_loss(q2, q_tgt_val)
     loss_q = loss_q1 + loss_q2
     self.q_optimizer.zero_grad()
     loss_q.backward()
     grad_norm = clip_grad(self.q_params, cfg.alg.max_grad_norm)
     self.q_optimizer.step()
     q_info = dict(
         q1_loss=loss_q1.item(),
         q2_loss=loss_q2.item(),
         vec_q1_val=torch_to_np(q1),
         vec_q2_val=torch_to_np(q2),
         vec_q_tgt_val=torch_to_np(q_tgt_val),
     )
     q_info['q_grad_norm'] = grad_norm
     return q_info
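
A minimal sketch of how update_q might be driven from a replay buffer. The buffer object, cfg.alg.batch_size, and the batch keys are assumptions, not shown in the source:

# Hypothetical training-loop fragment; rewards/dones are assumed to be (N, 1)
# column vectors so they broadcast against the (N, 1) target Q values.
batch = buffer.sample(cfg.alg.batch_size)
q_info = agent.update_q(obs=batch['obs'],
                        actions=batch['actions'],
                        next_obs=batch['next_obs'],
                        rewards=batch['rewards'],
                        dones=batch['dones'])
print(q_info['q1_loss'], q_info['q2_loss'], q_info['q_grad_norm'])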
Example #3
 def get_action(self, ob, sample=True, *args, **kwargs):
     self.eval_mode()
     t_ob = torch_float(ob, device=cfg.alg.device)
     act_dist, val = self.get_act_val(t_ob)
     action = action_from_dist(act_dist, sample=sample)
     log_prob = action_log_prob(action, act_dist)
     entropy = action_entropy(act_dist, log_prob)
     action_info = dict(log_prob=torch_to_np(log_prob),
                        entropy=torch_to_np(entropy),
                        val=torch_to_np(val))
     return torch_to_np(action), action_info
Example #4
 def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs):
     self.eval_mode()
     # Add a time dimension of length 1 for the recurrent policy: (num_envs, 1, ob_dim).
     t_ob = torch.from_numpy(ob).float().to(cfg.alg.device).unsqueeze(dim=1)
     act_dist, val, out_hidden_state = self.get_act_val(
         t_ob, hidden_state=hidden_state)
     action = action_from_dist(act_dist, sample=sample)
     log_prob = action_log_prob(action, act_dist)
     entropy = action_entropy(act_dist, log_prob)
     action_info = dict(
         log_prob=torch_to_np(log_prob.squeeze(1)),
         entropy=torch_to_np(entropy.squeeze(1)),
         val=torch_to_np(val.squeeze(1)),
     )
     return torch_to_np(action.squeeze(1)), action_info, out_hidden_state
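
The unsqueeze(dim=1) above adds a time dimension of length 1 so the recurrent get_act_val sees input of shape (num_envs, 1, ob_dim), and the matching squeeze(1) calls drop it again before returning. A toy shape check with made-up sizes:

import numpy as np
import torch

ob = np.zeros((4, 17), dtype=np.float32)             # 4 envs, 17-dim observation (made-up sizes)
t_ob = torch.from_numpy(ob).float().unsqueeze(dim=1)
print(t_ob.shape)                                    # torch.Size([4, 1, 17])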
Example #5
    def optimize(self, data, *args, **kwargs):
        processed_data = self.optim_preprocess(data)
        processed_data['entropy'] = torch.mean(processed_data['entropy'])
        loss_res = self.cal_loss(**processed_data)
        loss, pg_loss, vf_loss, ratio = loss_res
        self.optimizer.zero_grad()
        loss.backward()

        grad_norm = clip_grad(self.all_params, cfg.alg.max_grad_norm)
        self.optimizer.step()
        with torch.no_grad():
            approx_kl = 0.5 * torch.mean(
                torch.pow(
                    processed_data['old_log_prob'] -
                    processed_data['log_prob'], 2))
            clip_frac = np.mean(
                np.abs(torch_to_np(ratio) - 1.0) > cfg.alg.clip_range)
        optim_info = dict(pg_loss=pg_loss.item(),
                          vf_loss=vf_loss.item(),
                          total_loss=loss.item(),
                          entropy=processed_data['entropy'].item(),
                          approx_kl=approx_kl.item(),
                          clip_frac=clip_frac)
        optim_info['grad_norm'] = grad_norm
        return optim_info
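
Here approx_kl is the simple 0.5 * E[(log_prob_old - log_prob_new)^2] estimator and clip_frac is the fraction of samples whose probability ratio falls outside the clipping band. A standalone illustration on dummy data; the value 0.2 stands in for cfg.alg.clip_range and all numbers are made up:

import numpy as np
import torch

old_log_prob = torch.randn(8)
log_prob = old_log_prob + 0.05 * torch.randn(8)
ratio = torch.exp(log_prob - old_log_prob)
approx_kl = 0.5 * torch.mean(torch.pow(old_log_prob - log_prob, 2))
clip_frac = np.mean(np.abs(ratio.numpy() - 1.0) > 0.2)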
Example #6
 def get_action(self, ob, sample=True, *args, **kwargs):
     self.eval_mode()
     ob = torch_float(ob, device=cfg.alg.device)
     act_dist = self.actor(ob)[0]
     action = action_from_dist(act_dist, sample=sample)
     action_info = dict()
     return torch_to_np(action), action_info
Example #7
    def get_action(self, ob, sample=True, hidden_state=None, *args, **kwargs):
        self.eval_mode()

        if isinstance(ob, dict):
            t_ob = {
                key: torch_float(ob[key], device=cfg.alg.device)
                for key in ob
            }
        else:
            t_ob = torch.from_numpy(ob).float().to(
                cfg.alg.device).unsqueeze(dim=1)

        act_dist, val, out_hidden_state = self.get_act_val(
            t_ob, hidden_state=hidden_state)
        action = action_from_dist(act_dist, sample=sample)
        log_prob = action_log_prob(action, act_dist)
        entropy = action_entropy(act_dist, log_prob)
        in_hidden_state = torch_to_np(
            hidden_state) if hidden_state is not None else hidden_state
        action_info = dict(log_prob=torch_to_np(log_prob.squeeze(1)),
                           entropy=torch_to_np(entropy.squeeze(1)),
                           val=torch_to_np(val.squeeze(1)),
                           in_hidden_state=in_hidden_state)
        return torch_to_np(action.squeeze(1)), action_info, out_hidden_state
Example #8
    def __call__(self,
                 time_steps,
                 sample=True,
                 evaluation=False,
                 return_on_done=False,
                 render=False,
                 render_image=False,
                 sleep_time=0,
                 reset_first=False,
                 reset_kwargs=None,
                 action_kwargs=None,
                 get_last_val=False):
        traj = Trajectory()
        if reset_kwargs is None:
            reset_kwargs = {}
        if action_kwargs is None:
            action_kwargs = {}
        if evaluation:
            env = self.eval_env
        else:
            env = self.train_env
        # In RL^2, we should always reset at the beginning of a rollout
        if self.obs is None or reset_first or evaluation:
            self.reset(**reset_kwargs)
        ob = self.obs
        hidden_state = self.hidden_states
        # For some environments the returned ob is reused or mutated in place,
        # so deepcopy() it to avoid adding the same object to the trajectory.
        # Only deepcopy() when a new ob is generated, so that traj[t].next_ob
        # remains the same instance as traj[t+1].ob.
        ob = deepcopy(ob)
        if return_on_done:
            all_dones = np.zeros(env.num_envs, dtype=bool)
        else:
            all_dones = None
        done = None
        for t in range(time_steps):
            if render:
                env.render()
                if sleep_time > 0:
                    time.sleep(sleep_time)
            if render_image:
                # get render images at the same time step as ob
                imgs = deepcopy(env.get_images())

            action, action_info, hidden_state = self.agent.get_action(
                ob, sample=sample, hidden_state=hidden_state, **action_kwargs)
            if self.hidden_state_shape is None:
                self.hidden_state_shape = hidden_state.shape
            next_ob, reward, done, info = env.step(action)

            if render_image:
                for img, inf in zip(imgs, info):
                    inf['render_image'] = deepcopy(img)

            true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
                next_ob, done, reward, info, all_dones)

            sd = StepData(
                ob=ob,
                action=action,
                action_info=action_info,
                next_ob=true_next_ob,
                reward=reward,
                done=true_done,
                info=info,
                # Save the raw done flag in "extra" so later code can tell
                # whether the environment was reset and therefore whether the
                # hidden state needs to be reset as well.
                extra=done,
            )
            ob = next_ob
            traj.add(sd)
            if return_on_done and np.all(all_dones):
                break

            # the order of the next few lines matters; do not reorder them
            if get_last_val and not evaluation and t == time_steps - 1:
                last_val, _ = self.agent.get_val(traj[-1].next_ob_raw,
                                                 hidden_state=hidden_state)
                if last_val is not None:
                    traj.add_extra('last_val', torch_to_np(last_val))
                else:
                    traj.add_extra('last_val', None)
            hidden_state = self.check_hidden_state(hidden_state, done=done)
        self.obs = ob if not evaluation else None
        self.hidden_states = hidden_state.detach() if not evaluation else None
        return traj
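
A minimal sketch of how a recurrent runner like this might be invoked during a training iteration. The runner instance and the step count are assumptions, not shown in the source:

# Hypothetical invocation: collect a fixed-length rollout and bootstrap the
# value of the final observation for advantage estimation.
traj = runner(time_steps=128,
              sample=True,
              evaluation=False,
              get_last_val=True)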
Example #9
    def __call__(self,
                 time_steps,
                 sample=True,
                 evaluation=False,
                 return_on_done=False,
                 render=False,
                 render_image=False,
                 sleep_time=0,
                 reset_first=False,
                 reset_kwargs=None,
                 action_kwargs=None,
                 random_action=False,
                 get_last_val=False):
        traj = Trajectory()
        if reset_kwargs is None:
            reset_kwargs = {}
        if action_kwargs is None:
            action_kwargs = {}
        if evaluation:
            env = self.eval_env
        else:
            env = self.train_env
        if self.obs is None or reset_first or evaluation:
            self.reset(env=env, **reset_kwargs)
        ob = self.obs
        # For some environments the returned ob is reused or mutated in place,
        # so deepcopy() it to avoid adding the same object to the trajectory.
        # Only deepcopy() when a new ob is generated, so that traj[t].next_ob
        # remains the same instance as traj[t+1].ob.
        ob = deepcopy(ob)
        if return_on_done:
            all_dones = np.zeros(env.num_envs, dtype=bool)
        else:
            all_dones = None
        for t in range(time_steps):
            if render:
                env.render()
                if sleep_time > 0:
                    time.sleep(sleep_time)
            if render_image:
                # get render images at the same time step as ob
                imgs = get_render_images(env)

            if random_action:
                action = env.random_actions()
                action_info = dict()
            else:
                action, action_info = self.agent.get_action(ob,
                                                            sample=sample,
                                                            **action_kwargs)
            next_ob, reward, done, info = env.step(action)

            if render_image:
                for img, inf in zip(imgs, info):
                    inf['render_image'] = deepcopy(img)

            true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
                next_ob, done, reward, info, all_dones, skip_record=evaluation)
            sd = StepData(ob=ob,
                          action=action,
                          action_info=action_info,
                          next_ob=true_next_ob,
                          reward=reward,
                          done=true_done,
                          info=info)
            ob = next_ob
            traj.add(sd)
            if return_on_done and np.all(all_dones):
                break

        if get_last_val and not evaluation:
            last_val = self.agent.get_val(traj[-1].next_ob)
            traj.add_extra('last_val', torch_to_np(last_val))
        self.obs = ob if not evaluation else None
        return traj
Example #10
    def __call__(self,
                 time_steps,
                 sample=True,
                 evaluation=False,
                 return_on_done=False,
                 render=False,
                 render_image=False,
                 sleep_time=0,
                 reset_kwargs=None,
                 action_kwargs=None):
        traj = Trajectory()
        if reset_kwargs is None:
            reset_kwargs = {}
        if action_kwargs is None:
            action_kwargs = {}
        if evaluation:
            env = self.eval_env
        else:
            env = self.train_env
        ob = env.reset(**reset_kwargs)
        # For some environments the returned ob is reused or mutated in place,
        # so deepcopy() it to avoid adding the same object to the trajectory.
        # Only deepcopy() when a new ob is generated, so that traj[t].next_ob
        # remains the same instance as traj[t+1].ob.
        ob = deepcopy(ob)
        if return_on_done:
            all_dones = np.zeros(env.num_envs, dtype=bool)
        for t in range(time_steps):
            if render:
                env.render()
                if sleep_time > 0:
                    time.sleep(sleep_time)
            if render_image:
                # get render images at the same time step as ob
                imgs = deepcopy(env.get_images())

            action, action_info = self.agent.get_action(ob,
                                                        sample=sample,
                                                        **action_kwargs)
            next_ob, reward, done, info = env.step(action)
            next_ob = deepcopy(next_ob)
            if render_image:
                for img, inf in zip(imgs, info):
                    inf['render_image'] = deepcopy(img)

            done_idx = np.argwhere(done).flatten()
            if done_idx.size > 0 and return_on_done:
                # The vectorized env resets an environment automatically once it
                # is done, so the returned next_ob is not the true next observation.
                all_dones[done_idx] = True
            sd = StepData(ob=ob,
                          action=deepcopy(action),
                          action_info=deepcopy(action_info),
                          next_ob=next_ob,
                          reward=deepcopy(reward),
                          done=deepcopy(done),
                          info=deepcopy(info))
            ob = next_ob
            traj.add(sd)
            if return_on_done and np.all(all_dones):
                break
        if not evaluation:
            #print("next_ob:", traj[-1].next_ob)
            last_val = self.agent.get_val(traj[-1].next_ob_raw)
            traj.add_extra('last_val', torch_to_np(last_val))
        return traj
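
The last_val extra stored above is what lets a truncated rollout be bootstrapped when computing returns. A toy illustration of that bootstrap, with made-up numbers:

import numpy as np

rewards = np.array([1.0, 1.0, 1.0])       # rewards of a rollout cut off after 3 steps
last_val = 5.0                            # value estimate of the final next_ob
gamma = 0.99
ret = 0.0
for r in rewards[::-1]:                   # discounted sum of the collected rewards
    ret = r + gamma * ret
ret += gamma ** len(rewards) * last_val   # bootstrap the tail with last_val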