Example #1
def next_state_reward_dist(self, state: State,
                           action: Action) -> StateDistribution:
    # Each unintended action is taken with probability `noise_prob`;
    # the requested action keeps probability 1 - epsilon.
    probs = [self.noise_prob] * len(self.action_space)
    probs[action.value] = 1 - self.epsilon
    states = {}
    for a in self.action_space:
        # Query the underlying deterministic gridworld for the outcome of `a`.
        sr = self._gridworld._next_state_reward(state, a)
        if sr.state in states:
            # Merge outcomes that land on the same next state.
            rp = states[sr.state]
            states[sr.state] = RewardProbability(rp.reward + sr.reward,
                                                 rp.prob + probs[a.value])
        else:
            states[sr.state] = RewardProbability(sr.reward, probs[a.value])
    return states
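
The pattern above is an epsilon-noisy wrapper around a deterministic gridworld: the requested action is executed with probability 1 - epsilon, and every action contributes its outcome weighted by its probability. Below is a minimal, self-contained sketch of the same idea; the toy `step` function, the 4-action space, and the assumption that `noise_prob` equals epsilon split evenly over the unintended actions are all illustrative stand-ins, not part of the library code shown above.

# Illustrative sketch only; `step`, EPSILON, and the action space are assumptions.
EPSILON = 0.2
ACTIONS = [0, 1, 2, 3]
NOISE_PROB = EPSILON / (len(ACTIONS) - 1)  # assumed split of the noise mass

def step(state, action):
    # Deterministic toy dynamics on states 0..3: actions 0/1 move right, 2/3 move left.
    next_state = max(0, min(3, state + (1 if action in (0, 1) else -1)))
    return next_state, (1.0 if next_state == 3 else 0.0)

def noisy_dist(state, action):
    probs = [NOISE_PROB] * len(ACTIONS)
    probs[action] = 1.0 - EPSILON
    dist = {}
    for a in ACTIONS:
        s, r = step(state, a)
        reward, prob = dist.get(s, (0.0, 0.0))
        dist[s] = (reward + r, prob + probs[a])  # merge outcomes on the same state
    return dist

print(noisy_dist(1, 0))  # probabilities across next states sum to 1.0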
Example #2
def next_state_reward_dist(self, state: State,
                           action: Action) -> StateDistribution:
    # Same epsilon-noisy transition logic as Example #1, with a runtime type
    # check on the action index and pyre type-checker suppressions.
    probs = [self.noise_prob] * len(self.action_space)
    assert isinstance(action.value,
                      int), f"got type {type(action.value)} instead of int"
    # pyre-fixme[16]: `int` has no attribute `__setitem__`.
    probs[action.value] = 1 - self.epsilon
    states = {}
    for a in self.action_space:
        sr = self._gridworld._next_state_reward(state, a)
        if sr.state in states:
            rp = states[sr.state]
            states[sr.state] = RewardProbability(
                rp.reward + sr.reward,
                # pyre-fixme[16]: `int` has no attribute `__getitem__`.
                rp.prob + probs[a.value],
            )
        else:
            states[sr.state] = RewardProbability(sr.reward, probs[a.value])
    return states
Example #3
def next_state_reward_dist(self, state: State,
                           action: Action) -> StateDistribution:
    # Predict the transition with a learned model instead of exact dynamics.
    self._model.eval()
    state_reward_tensor = (self._model(
        # Batch of one state vector.
        torch.tensor(state.value, dtype=torch.float).reshape(
            -1, self._model._state_dim).to(self._device),
        # One-hot encoding of the action, as a float batch of one.
        torch.nn.functional.one_hot(
            torch.tensor(action.value, dtype=torch.long),
            self._model._action_dim,
        ).reshape(-1, self._model._action_dim).float().to(self._device),
    ).reshape(-1).cpu())
    # The model output concatenates the predicted next state and the reward;
    # return it as a point-mass distribution.
    return {
        State(state_reward_tensor[:self._model._state_dim]):
        RewardProbability(state_reward_tensor[-1].item())
    }
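
Example #3 swaps the exact gridworld dynamics for a learned model: the state and a one-hot encoding of the action are batched, pushed through the network, and the flattened output is split into a predicted next state and a scalar reward. A small sketch of just the input encoding, assuming a hypothetical 2-dimensional state and 4-dimensional action space:

import torch

state_dim, action_dim = 2, 4                    # hypothetical dimensions
state_value, action_value = [0.5, -1.0], 2      # hypothetical inputs

# Shape the state as a batch of one float vector.
state_tensor = torch.tensor(state_value, dtype=torch.float).reshape(-1, state_dim)
# One-hot the action index and shape it as a batch of one float vector.
action_tensor = (torch.nn.functional.one_hot(
    torch.tensor(action_value, dtype=torch.long), action_dim)
    .reshape(-1, action_dim).float())

print(state_tensor.shape, action_tensor.shape)  # torch.Size([1, 2]) torch.Size([1, 4])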
Example #4
def next_state_reward_dist(self, state: State,
                           action: Action) -> StateDistribution:
    # Deterministic dynamics: a single successor state with probability 1.
    sr = self._next_state_reward(state, action)
    return {sr.state: RewardProbability(sr.reward, 1.0)}
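
Whatever the backing dynamics, the examples all return the same shape: a mapping from next state to a RewardProbability, so callers can consume it uniformly. A sketch of two typical consumers, assuming only that RewardProbability exposes `reward` and `prob` fields as the examples above suggest (`dist` stands for the value returned by next_state_reward_dist):

import random

def expected_reward(dist):
    # Probability-weighted one-step reward over the returned distribution.
    return sum(rp.prob * rp.reward for rp in dist.values())

def sample_next_state(dist):
    # Draw a successor state according to the returned probabilities.
    states = list(dist.keys())
    weights = [dist[s].prob for s in states]
    return random.choices(states, weights=weights, k=1)[0]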