def _preprocess_trajectory(
    self, trajectory: Tuple[np.ndarray, ...]
) -> Tuple[torch.Tensor, ...]:
    """Preprocess trajectory for pytorch training."""
    states, actions, rewards = trajectory

    states = np2tensor(states, self.worker.device)
    actions = np2tensor(actions.reshape(-1, 1), self.worker.device)
    rewards = np2tensor(rewards.reshape(-1, 1), self.worker.device)

    # Discrete actions are used as indices, so they must be integer tensors.
    if self.worker.experiment_info.is_discrete:
        actions = actions.long()

    trajectory = (states, actions, rewards)
    return trajectory
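The snippets above and below rely on a project-local np2tensor helper that is not shown here. The sketch below is an assumption about what it does, not the project's actual implementation; some call sites pass a torch.device while others pass a use_cuda flag, so the sketch accepts either.

import numpy as np
import torch


def np2tensor(array: np.ndarray, device) -> torch.Tensor:
    """Hypothetical helper: convert a numpy array to a float tensor on a device."""
    # Interpret a boolean argument as a use_cuda flag, otherwise as a device.
    if isinstance(device, bool):
        device = torch.device("cuda" if device and torch.cuda.is_available() else "cpu")
    return torch.from_numpy(np.asarray(array)).float().to(device)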
def __call__(self, policy: nn.Module, state: np.ndarray) -> np.ndarray:
    """Generate action via policy."""
    if state.ndim == 1:
        state = state.reshape(1, -1)
    action = policy.forward(np2tensor(state, self.use_cuda))
    action_np = action.cpu().detach().view(-1).numpy()
    return action_np
def __call__(self, policy: nn.Module, state: np.ndarray) -> np.ndarray:
    """Generate action via policy."""
    if state.ndim == 1:
        state = state.reshape(1, -1)
    mu, sigma, z, log_pi = policy.sample(np2tensor(state, self.use_cuda))
    # Squash the reparameterized sample into the bounded action range.
    action = torch.tanh(z)
    action_np = action.cpu().detach().view(-1).numpy()
    return action_np
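The policy.sample() call above is assumed to be a SAC-style squashed-Gaussian head returning (mu, sigma, z, log_pi). The minimal sketch below illustrates that assumed contract; it is not the project's actual network.

import torch
import torch.nn as nn
from torch.distributions import Normal


class GaussianPolicySketch(nn.Module):
    """Hypothetical policy head illustrating the assumed sample() contract."""

    def __init__(self, state_dim: int, action_dim: int):
        super().__init__()
        self.mu_layer = nn.Linear(state_dim, action_dim)
        self.log_sigma_layer = nn.Linear(state_dim, action_dim)

    def sample(self, state: torch.Tensor):
        mu = self.mu_layer(state)
        sigma = self.log_sigma_layer(state).clamp(-20, 2).exp()
        dist = Normal(mu, sigma)
        z = dist.rsample()  # reparameterized sample, keeps gradients
        # Log-probability with the tanh-squashing correction used by SAC.
        log_pi = (dist.log_prob(z) - torch.log(1 - torch.tanh(z).pow(2) + 1e-7)).sum(
            dim=-1, keepdim=True
        )
        return mu, sigma, z, log_pi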
def __call__(self, policy: nn.Module, state: np.ndarray) -> np.int64:
    """Generate action via policy."""
    state = np2tensor(state, self.use_cuda).unsqueeze(0)
    with torch.no_grad():
        # Expected Q-value per action: sum the return distribution over its support.
        dist = policy.forward(state)
        weights = dist * policy.support
        qvals = weights.sum(dim=2).cpu().numpy()
    action = np.argmax(qvals)
    return action
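The distributional selector above turns a categorical return distribution into expected Q-values. The sketch below reproduces that computation with hypothetical shapes (batch, n_actions, n_atoms) and made-up support values; the real policy defines its own support.

import torch

n_atoms, n_actions = 51, 4
support = torch.linspace(-10.0, 10.0, n_atoms)                   # assumed atom values
dist = torch.softmax(torch.randn(1, n_actions, n_atoms), dim=2)  # fake distribution
qvals = (dist * support).sum(dim=2)                              # expected return per action
action = qvals.argmax(dim=1)                                     # greedy action index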
def __call__(self, policy: nn.Module, state: np.ndarray) -> np.int64:
    """Generate action via policy."""
    if state.ndim == 1:
        state = state.reshape(1, -1)
    state = np2tensor(state, self.use_cuda).unsqueeze(0)
    with torch.no_grad():
        # Expected Q-value per action: average over the quantile dimension.
        qvals = policy.forward(state).mean(dim=2)
        qvals = qvals.cpu().numpy()
    action = np.argmax(qvals)
    return action
def __call__(self, policy: BaseModel, state: np.ndarray) -> int:
    """Generate action via policy."""
    if state.ndim == 1:
        state = state.reshape(1, -1)
    state = np2tensor(state, self.device)
    dist = policy.forward(state)
    categorical_dist = Categorical(dist)

    if self.exploration:
        # Stochastic action: sample from the categorical distribution.
        action = categorical_dist.sample().cpu().detach().numpy()
    else:
        # Greedy action: take the most probable action directly.
        action = dist.argmax(dim=-1).cpu().detach().numpy()

    return action.item()
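A quick illustration, with made-up probabilities, of the difference between the exploratory and greedy branches above: sampling is stochastic, while argmax over the action probabilities is deterministic.

import torch
from torch.distributions import Categorical

probs = torch.tensor([[0.1, 0.7, 0.2]])    # hypothetical action probabilities
dist = Categorical(probs)
exploratory = dist.sample()                # may return any of 0, 1, 2
greedy = probs.argmax(dim=-1)              # always returns 1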
def _preprocess_experience(
    self, experience: Tuple[np.ndarray, ...]
) -> Tuple[torch.Tensor, ...]:
    """Convert collected experience to pytorch tensors."""
    states, actions, rewards, next_states, dones = experience[:5]
    if self.hyper_params.use_per:
        indices, weights = experience[-2:]

    states = np2tensor(states, self.device)
    actions = np2tensor(actions, self.device)
    rewards = np2tensor(rewards.reshape(-1, 1), self.device)
    next_states = np2tensor(next_states, self.device)
    dones = np2tensor(dones.reshape(-1, 1), self.device)

    experience = (states, actions, rewards, next_states, dones)

    if self.hyper_params.use_per:
        # Keep PER indices as-is; only the importance weights become tensors.
        weights = np2tensor(weights.reshape(-1, 1), self.device)
        experience = experience + (indices, weights)

    return experience
def _preprocess_experience(
    self, experience: Tuple[np.ndarray, ...]
) -> Tuple[torch.Tensor, ...]:
    """Convert collected experience to pytorch tensors."""
    states, actions, rewards, next_states, dones = experience[:5]
    if self.hyper_params.use_per:
        indices, weights = experience[-2:]

    states = np2tensor(states, self.use_cuda)
    actions = np2tensor(actions, self.use_cuda)
    rewards = np2tensor(rewards.reshape(-1, 1), self.use_cuda)
    next_states = np2tensor(next_states, self.use_cuda)
    dones = np2tensor(dones.reshape(-1, 1), self.use_cuda)

    experience = (states, actions, rewards, next_states, dones)

    if self.hyper_params.use_per:
        # Keep PER indices as-is; only the importance weights become tensors.
        weights = np2tensor(weights.reshape(-1, 1), self.use_cuda)
        experience = experience + (indices, weights)

    return experience
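A hedged usage sketch of the preprocessing above: the experience tuple produced by a prioritized replay buffer is assumed, from the indexing in the code, to be (states, actions, rewards, next_states, dones, indices, weights). The shapes and the learner instance below are illustrative, not the project's actual API.

import numpy as np

batch = (
    np.random.randn(32, 4).astype(np.float32),             # states
    np.random.randint(0, 2, size=32).astype(np.float32),   # actions
    np.random.randn(32).astype(np.float32),                # rewards
    np.random.randn(32, 4).astype(np.float32),             # next_states
    np.zeros(32, dtype=np.float32),                        # dones
    np.arange(32),                                         # PER tree indices
    np.ones(32, dtype=np.float32),                         # PER importance weights
)
# states, actions, rewards, next_states, dones, indices, weights = \
#     learner._preprocess_experience(batch)  # "learner" is a hypothetical instance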