def next_state_reward_dist(self, state: State, action: Action) -> StateDistribution:
    # The chosen action keeps probability 1 - epsilon; every other action
    # receives the residual noise probability.
    probs = [self.noise_prob] * len(self.action_space)
    probs[action.value] = 1 - self.epsilon
    states = {}
    for a in self.action_space:
        sr = self._gridworld._next_state_reward(state, a)
        if sr.state in states:
            # Several actions can lead to the same successor state:
            # accumulate their rewards and probabilities.
            rp = states[sr.state]
            states[sr.state] = RewardProbability(
                rp.reward + sr.reward, rp.prob + probs[a.value]
            )
        else:
            states[sr.state] = RewardProbability(sr.reward, probs[a.value])
    return states
def next_state_reward_dist(self, state: State, action: Action) -> StateDistribution:
    probs = [self.noise_prob] * len(self.action_space)
    assert isinstance(action.value, int), f"got type {type(action.value)} instead of int"
    # pyre-fixme[16]: `int` has no attribute `__setitem__`.
    probs[action.value] = 1 - self.epsilon
    states = {}
    for a in self.action_space:
        sr = self._gridworld._next_state_reward(state, a)
        if sr.state in states:
            rp = states[sr.state]
            states[sr.state] = RewardProbability(
                rp.reward + sr.reward,
                # pyre-fixme[16]: `int` has no attribute `__getitem__`.
                rp.prob + probs[a.value],
            )
        else:
            states[sr.state] = RewardProbability(sr.reward, probs[a.value])
    return states
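# A minimal usage sketch for the noisy variants above (an assumption, not part
# of the source): if `self.noise_prob` equals `epsilon / (len(action_space) - 1)`,
# the per-action probabilities sum to 1, and so do the probabilities in the
# returned distribution. `model`, `state`, and `action` are hypothetical here.
def check_noise_dist(model, state, action) -> None:
    """Sanity-check that the returned distribution is properly normalized."""
    dist = model.next_state_reward_dist(state, action)
    total = sum(rp.prob for rp in dist.values())
    assert abs(total - 1.0) < 1e-6, f"probabilities sum to {total}, expected 1.0"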
def next_state_reward_dist(self, state: State, action: Action) -> StateDistribution:
    self._model.eval()
    # Run the learned dynamics model on the current state and a one-hot
    # encoding of the action (batch of one), then flatten back to a vector.
    state_reward_tensor = (
        self._model(
            torch.tensor(state.value, dtype=torch.float)
            .reshape(-1, self._model._state_dim)
            .to(self._device),
            torch.nn.functional.one_hot(
                torch.tensor(action.value, dtype=torch.long),
                self._model._action_dim,
            )
            .reshape(-1, self._model._action_dim)
            .float()
            .to(self._device),
        )
        .reshape(-1)
        .cpu()
    )
    # The first `_state_dim` entries are the predicted next state and the
    # last entry is the predicted reward; the model yields a single
    # deterministic successor.
    return {
        State(state_reward_tensor[: self._model._state_dim]): RewardProbability(
            state_reward_tensor[-1].item()
        )
    }
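# Hedged sketch of the I/O contract the method above appears to assume (not
# from the source): a module taking a [batch, state_dim] state tensor and a
# [batch, action_dim] one-hot action tensor, and returning a
# [batch, state_dim + 1] tensor whose last column is the predicted reward.
# `DummyDynamicsModel` is a hypothetical stand-in for `self._model`.
import torch

class DummyDynamicsModel(torch.nn.Module):
    def __init__(self, state_dim: int, action_dim: int):
        super().__init__()
        self._state_dim = state_dim
        self._action_dim = action_dim
        self._net = torch.nn.Linear(state_dim + action_dim, state_dim + 1)

    def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        # Concatenate state features with the one-hot action and predict
        # the next state together with the scalar reward.
        return self._net(torch.cat((state, action), dim=1))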
def next_state_reward_dist(self, state: State, action: Action) -> StateDistribution:
    # Deterministic dynamics: the single successor state has probability 1.
    sr = self._next_state_reward(state, action)
    return {sr.state: RewardProbability(sr.reward, 1.0)}
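# Hedged helper (an assumption, not part of the source): the deterministic
# variant above returns a point-mass distribution. A quick structural check:
def is_point_mass(dist: StateDistribution) -> bool:
    """True iff the distribution has one outcome carrying probability 1."""
    return len(dist) == 1 and abs(next(iter(dist.values())).prob - 1.0) < 1e-9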