Example #1
import gym

# State, Action, Transition, and RLPolicy are assumed to be imported from the
# enclosing OPE estimator module; the classic Gym API (4-tuple step()) is assumed.
def generate_logs(episodes: int, max_horizon: int, policy: RLPolicy):
    """Roll out `policy` on CartPole-v0 and record the logged transitions.

    Args:
        episodes: number of episodes to generate
        max_horizon: max horizon of each episode
        policy: RLPolicy which uses real-valued states
    """
    log = []
    env = gym.make("CartPole-v0")
    for _ in range(episodes):
        cur_state = env.reset()
        mdp = []
        for _ in range(max_horizon):
            action_dist = policy(State(cur_state))
            action = action_dist.sample()[0].value
            action_prob = action_dist.probability(Action(action))
            next_state, _, done, _ = env.step(action)
            mdp.append(
                Transition(
                    last_state=State(cur_state),
                    action=Action(action),
                    action_prob=action_prob,
                    state=State(next_state),
                    reward=1.0,  # CartPole-v0 pays +1 for every surviving step
                    status=Transition.Status.NORMAL,
                )
            )
            cur_state = next_state
            if done:
                break
        log.append(mdp)  # append each episode exactly once, even when it ends early
    return log
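For comparison, here is a minimal, self-contained sketch of the same rollout-and-logging loop using only the classic Gym API (gym < 0.26, where step() returns a 4-tuple) and a uniform-random behavior policy; the function name and the plain-dict record layout are illustrative stand-ins for the State/Action/Transition types above.

import gym

def log_random_episodes(episodes: int, max_horizon: int):
    """Illustrative only: mirrors generate_logs without the library's types."""
    env = gym.make("CartPole-v0")
    log = []
    for _ in range(episodes):
        cur_state = env.reset()
        trajectory = []
        for _ in range(max_horizon):
            action = env.action_space.sample()      # uniform-random behavior policy
            action_prob = 1.0 / env.action_space.n  # its known propensity
            next_state, reward, done, _ = env.step(action)
            trajectory.append({
                "last_state": cur_state,
                "action": action,
                "action_prob": action_prob,
                "state": next_state,
                "reward": reward,
            })
            cur_state = next_state
            if done:
                break
        log.append(trajectory)
    return log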
Example #2
    def edp_to_rl_input(edp: EvaluationDataPage,
                        gamma,
                        device=None) -> RLEstimatorInput:
        assert edp.model_values is not None
        eq_len = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
            edp.mdp_id,
            edp.action_mask.cpu().numpy(),
            edp.logged_rewards.cpu().numpy().flatten(),
            edp.logged_propensities.cpu().numpy().flatten(),
            edp.model_propensities.cpu().numpy(),
            edp.model_values.cpu().numpy(),
        )

        (
            actions,
            rewards,
            logged_propensities,
            target_propensities,
            estimated_q_values,
        ) = (torch.tensor(x,
                          dtype=torch.double,
                          device=device,
                          requires_grad=True) for x in eq_len)

        num_examples = logged_propensities.shape[0]
        horizon = logged_propensities.shape[1]

        # All trajectories are grouped under a single dummy initial state, State(0);
        # each inner list is one logged trajectory of Transitions.
        log = {}
        for traj in range(num_examples):
            log.setdefault(State(0), []).append([
                Transition(
                    last_state=State((traj, i)),
                    action=torch.argmax(actions[traj, i]).item(),
                    action_prob=logged_propensities[traj, i].item(),
                    state=State((traj, i + 1)),
                    reward=rewards[traj, i].item(),
                )
                for i in range(horizon - 1)
                # skip padded steps whose one-hot action row is all zeros
                if actions[traj, i][torch.argmax(actions[traj, i]).item()] != 0.0
            ])

        return RLEstimatorInput(
            gamma=gamma,
            log=log,
            target_policy=SequentialOPEstimatorAdapter.EDPSeqPolicy(
                actions.shape[2], target_propensities),
            value_function=SequentialOPEstimatorAdapter.EDPValueFunc(
                estimated_q_values, target_propensities),
            ground_truth=None,
            horizon=horizon,
        )
Example #3
    def _next_state_reward(self, state: State, action: Action) -> StateReward:
        x, y = state.value
        if state.value in self.walls or state.value == self.goal:
            return StateReward(State((x, y), state.is_terminal), 0.0)
        if action.value == 0:
            to_pos, reward, is_end = self._transit((x, y), (x + 1, y))
        elif action.value == 1:
            to_pos, reward, is_end = self._transit((x, y), (x, y + 1))
        elif action.value == 2:
            to_pos, reward, is_end = self._transit((x, y), (x - 1, y))
        else:
            to_pos, reward, is_end = self._transit((x, y), (x, y - 1))
        return StateReward(State(to_pos, is_end), reward)
Example #4
    def dump_value_func(self, valfunc: ValueFunction) -> str:
        dump = ""
        for x in range(self.size[0]):
            for y in range(self.size[1]):
                dump += "{:6.3}".format(valfunc(State((x, y))))
            dump += "\n"
        return dump
Example #5
    def _next_state_reward(self, state: State, action: Action) -> StateReward:
        value = state.value
        assert isinstance(value, tuple), f"got type {type(value)} instead of tuple"
        (x, y) = value
        assert isinstance(x, int) and isinstance(y, int), \
            "Gridworld expects states to be Tuple[int, int]"
        # Walls and the goal are absorbing: stay in place with zero reward.
        if state.value in self.walls or state.value == self.goal:
            return StateReward(State((x, y), state.is_terminal), 0.0)
        # Actions 0-3 move down, right, up, and left respectively
        # (matching the arrow glyphs used by dump_policy).
        if action.value == 0:
            to_pos, reward, is_end = self._transit((x, y), (x + 1, y))
        elif action.value == 1:
            to_pos, reward, is_end = self._transit((x, y), (x, y + 1))
        elif action.value == 2:
            to_pos, reward, is_end = self._transit((x, y), (x - 1, y))
        else:
            to_pos, reward, is_end = self._transit((x, y), (x, y - 1))
        return StateReward(State(to_pos, is_end), reward)
Example #6
    def dump_state_values(self, state_values) -> str:
        dump = ""
        for x in range(self.size[0]):
            for y in range(self.size[1]):
                pos = State((x, y))
                value = 0.0
                if pos in state_values:
                    value = state_values[pos]
                dump += "{:6.3}".format(value)
            dump += "\n"
        return dump
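As a standalone illustration of the same "{:6.3}" row-by-row formatting, the hypothetical helper below uses plain (x, y) tuples in place of State keys; it is a sketch, not part of the library.

def dump_grid(values: dict, size: tuple) -> str:
    """Illustrative only: format a size[0] x size[1] grid of values, one row per line."""
    return "\n".join(
        "".join("{:6.3}".format(values.get((x, y), 0.0)) for y in range(size[1]))
        for x in range(size[0])
    ) + "\n"

# e.g. dump_grid({(0, 0): 1.0, (1, 2): 0.25}, (2, 3)) produces two rows of six-character cells.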
Example #7
    def next_state_reward_dist(self, state: State,
                               action: Action) -> StateDistribution:
        self._model.eval()
        # Encode the current state and a one-hot action for the learned model.
        state_tensor = (torch.tensor(state.value, dtype=torch.float)
                        .reshape(-1, self._model._state_dim)
                        .to(self._device))
        action_tensor = (torch.nn.functional.one_hot(
            torch.tensor(action.value, dtype=torch.long),
            self._model._action_dim,
        ).reshape(-1, self._model._action_dim).float().to(self._device))
        # The model outputs one flat vector: the predicted next state followed by the reward.
        state_reward_tensor = (
            self._model(state_tensor, action_tensor).reshape(-1).cpu()
        )
        # Deterministic prediction: a single next state mapped to its predicted reward.
        return {
            State(state_reward_tensor[:self._model._state_dim]):
                RewardProbability(state_reward_tensor[-1].item())
        }
Example #8
import gym

# RunningAverage, RLPolicy, and State are assumed to be imported from the
# enclosing OPE module; the classic Gym API (4-tuple step()) is assumed.
def estimate_value(episodes: int, max_horizon: int, policy: RLPolicy, gamma: float):
    """Monte-Carlo estimate of the policy's discounted return on CartPole-v0."""
    avg = RunningAverage()
    env = gym.make("CartPole-v0")
    for _ in range(episodes):
        cur_state = env.reset()
        r = 0.0
        discount = 1.0
        for _ in range(max_horizon):
            action_dist = policy(State(cur_state))
            action = action_dist.sample()[0].value
            next_state, _, done, _ = env.step(action)
            reward = 1.0  # CartPole-v0 pays +1 for every surviving step
            r += reward * discount
            discount *= gamma
            if done:
                break
            cur_state = next_state
        avg.add(r)
    return avg.average
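The RunningAverage helper used above is assumed to come from the enclosing utilities; a minimal incremental-mean stand-in that matches the add()/average usage in this snippet might look as follows (illustrative, not the library's implementation).

class RunningAverage:
    """Illustrative stand-in: incremental mean supporting add() and .average."""

    def __init__(self):
        self._count = 0
        self._total = 0.0

    def add(self, value: float) -> "RunningAverage":
        self._count += 1
        self._total += float(value)
        return self

    @property
    def average(self) -> float:
        # Return NaN rather than dividing by zero when nothing was added.
        return self._total / self._count if self._count else float("nan")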
Example #9
    def dump_policy(self, policy) -> str:
        dump = ""
        for x in range(self.size[0]):
            for y in range(self.size[1]):
                pos = (x, y)
                if pos == self.start:
                    dump += "\u2b28"  # open lozenge: start cell
                elif pos == self.goal:
                    dump += "\u2b27"  # filled lozenge: goal cell
                elif pos in self.walls:
                    dump += "\u2588"  # full block: wall
                else:
                    # Draw the greedy action as an arrow: down, right, up, left.
                    action = policy(State(pos)).greedy()
                    if action.value == 0:
                        dump += "\u21e9"
                    elif action.value == 1:
                        dump += "\u21e8"
                    elif action.value == 2:
                        dump += "\u21e7"
                    else:
                        dump += "\u21e6"
            dump += "\n"
        return dump
Example #10
    def reset(self, state: Optional[State] = None):
        super().reset(state)
        if self._current_state is None:
            self._current_state = State(self.start)
        return self._current_state
Example #11
    def states(self):
        for x in range(self.size[0]):
            for y in range(self.size[1]):
                state = (x, y)
                if state != self.goal and state not in self.walls:
                    yield State(state)