Example #1
    def _preprocess_trajectory(
            self, trajectory: Tuple[np.ndarray, ...]) -> Tuple[torch.Tensor, ...]:
        """Preprocess a collected trajectory for PyTorch training."""
        states, actions, rewards = trajectory

        # Move arrays onto the worker's device; actions and rewards become column vectors.
        states = np2tensor(states, self.worker.device)
        actions = np2tensor(actions.reshape(-1, 1), self.worker.device)
        rewards = np2tensor(rewards.reshape(-1, 1), self.worker.device)

        # Discrete actions must be integer (long) tensors for indexing-based losses.
        if self.worker.experiment_info.is_discrete:
            actions = actions.long()

        trajectory = (states, actions, rewards)

        return trajectory
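
Every example on this page calls the helper np2tensor, which is not reproduced here. As a rough sketch (an assumption, not the actual RLcycle implementation; the examples pass either a torch.device or a use_cuda flag as the second argument), it amounts to moving a NumPy array onto the training device as a float tensor:

import numpy as np
import torch

def np2tensor(array: np.ndarray, device: torch.device) -> torch.Tensor:
    # Sketch only: convert a NumPy array to a float32 tensor on the given device.
    return torch.from_numpy(array).float().to(device)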
Example #2
    def __call__(self, policy: nn.Module,
                 state: np.ndarray) -> np.ndarray:
        """Generate action via policy."""
        # Add a batch dimension before the forward pass.
        if state.ndim == 1:
            state = state.reshape(1, -1)
        action = policy.forward(np2tensor(state, self.use_cuda))
        action_np = action.cpu().detach().view(-1).numpy()
        return action_np
Example #3
    def __call__(self, policy: nn.Module,
                 state: np.ndarray) -> np.ndarray:
        """Generate action via policy."""
        if state.ndim == 1:
            state = state.reshape(1, -1)
        # Sample from the policy's Gaussian, then squash the action into [-1, 1].
        mu, sigma, z, log_pi = policy.sample(np2tensor(state, self.use_cuda))
        action = torch.tanh(z)
        action_np = action.cpu().detach().view(-1).numpy()
        return action_np
Example #4
    def __call__(self, policy: nn.Module,
                 state: np.ndarray) -> np.int64:
        """Generate action via policy."""
        state = np2tensor(state, self.use_cuda).unsqueeze(0)
        with torch.no_grad():
            # Expected Q-values: categorical distribution weighted by its support.
            dist = policy.forward(state)
            weights = dist * policy.support
            qvals = weights.sum(dim=2).cpu().numpy()
        action = np.argmax(qvals)

        return action
Example #5
    def __call__(self, policy: nn.Module,
                 state: np.ndarray) -> np.int64:
        """Generate action via policy."""
        if state.ndim == 1:
            state = state.reshape(1, -1)
        state = np2tensor(state, self.use_cuda).unsqueeze(0)
        with torch.no_grad():
            # Q-values are the mean over the quantile dimension of the distributional output.
            qvals = policy.forward(state).mean(dim=2)
            qvals = qvals.cpu().numpy()
        action = np.argmax(qvals)
        return action
Example #6
    def __call__(self, policy: BaseModel, state: np.ndarray) -> int:
        """Generate action via policy."""
        if state.ndim == 1:
            state = state.reshape(1, -1)
        state = np2tensor(state, self.device)
        dist = policy.forward(state)
        categorical_dist = Categorical(dist)
        if self.exploration:
            # Stochastic action: sample from the categorical distribution.
            action = categorical_dist.sample().cpu().detach().numpy()
        else:
            # Greedy action: take the argmax of the action probabilities.
            action = dist.argmax(dim=-1).cpu().detach().numpy()
        return action.item()
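
For context, the __call__ action selectors above take a raw NumPy state from the environment and return something env.step can consume. A minimal, self-contained sketch of the pattern in Example #6 (the SoftmaxSelector class and the toy policy below are illustrative assumptions, not RLcycle code):

import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical

class SoftmaxSelector:
    """Illustrative stand-in for the discrete action selector in Example #6."""

    def __init__(self, device: torch.device, exploration: bool = True):
        self.device = device
        self.exploration = exploration

    def __call__(self, policy: nn.Module, state: np.ndarray) -> int:
        if state.ndim == 1:
            state = state.reshape(1, -1)
        probs = policy(torch.from_numpy(state).float().to(self.device))
        if self.exploration:
            action = Categorical(probs).sample()  # stochastic action
        else:
            action = probs.argmax(dim=-1)  # greedy action
        return int(action.item())

# Toy softmax policy over 2 actions from a 4-dimensional state.
policy = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2), nn.Softmax(dim=-1))
selector = SoftmaxSelector(device=torch.device("cpu"))
action = selector(policy, np.random.randn(4).astype(np.float32))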
Example #7
File: agent.py Project: LaoKpa/RLcycle
    def _preprocess_experience(
            self, experience: Tuple[np.ndarray, ...]) -> Tuple[torch.Tensor, ...]:
        """Convert collected experience to PyTorch tensors."""
        states, actions, rewards, next_states, dones = experience[:5]
        if self.hyper_params.use_per:
            # Prioritized replay also carries buffer indices and importance-sampling weights.
            indices, weights = experience[-2:]

        states = np2tensor(states, self.device)
        actions = np2tensor(actions, self.device)
        rewards = np2tensor(rewards.reshape(-1, 1), self.device)
        next_states = np2tensor(next_states, self.device)
        dones = np2tensor(dones.reshape(-1, 1), self.device)

        experience = (states, actions, rewards, next_states, dones)

        if self.hyper_params.use_per:
            weights = np2tensor(weights.reshape(-1, 1), self.device)
            experience = experience + (indices, weights,)

        return experience
Example #8
    def _preprocess_experience(
            self, experience: Tuple[np.ndarray, ...]) -> Tuple[torch.Tensor, ...]:
        """Convert collected experience to PyTorch tensors."""
        states, actions, rewards, next_states, dones = experience[:5]
        if self.hyper_params.use_per:
            # Prioritized replay also carries buffer indices and importance-sampling weights.
            indices, weights = experience[-2:]

        states = np2tensor(states, self.use_cuda)
        actions = np2tensor(actions, self.use_cuda)
        rewards = np2tensor(rewards.reshape(-1, 1), self.use_cuda)
        next_states = np2tensor(next_states, self.use_cuda)
        dones = np2tensor(dones.reshape(-1, 1), self.use_cuda)

        experience = (states, actions, rewards, next_states, dones)

        if self.hyper_params.use_per:
            weights = np2tensor(weights.reshape(-1, 1), self.use_cuda)
            experience = experience + (
                indices,
                weights,
            )

        return experience
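
As a usage note, the experience tuple these methods expect is a batch sampled from a replay buffer. A standalone sketch of the same conversion, assuming a batch of 32 transitions with 4-dimensional states (all names and shapes below are illustrative):

import numpy as np
import torch

batch_size, state_dim = 32, 4
states = np.random.randn(batch_size, state_dim).astype(np.float32)
actions = np.random.randint(0, 2, size=batch_size).astype(np.float32)
rewards = np.random.randn(batch_size).astype(np.float32)
next_states = np.random.randn(batch_size, state_dim).astype(np.float32)
dones = np.zeros(batch_size, dtype=np.float32)

device = torch.device("cpu")
# Mirror of the conversion above: rewards and dones become (batch_size, 1) column tensors.
states_t = torch.from_numpy(states).float().to(device)
actions_t = torch.from_numpy(actions).float().to(device)
rewards_t = torch.from_numpy(rewards.reshape(-1, 1)).float().to(device)
next_states_t = torch.from_numpy(next_states).float().to(device)
dones_t = torch.from_numpy(dones.reshape(-1, 1)).float().to(device)

print(states_t.shape, rewards_t.shape)  # torch.Size([32, 4]) torch.Size([32, 1])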