def generate_logs(episodes: int, max_horizon: int, policy: RLPolicy):
    """
    Args:
        episodes: number of episodes to generate
        max_horizon: max horizon of each episode
        policy: RLPolicy which uses real-valued states
    """
    log = []
    env = gym.make("CartPole-v0")
    for _ in range(episodes):
        init_state = env.reset()
        cur_state = init_state
        mdp = []
        for _ in range(max_horizon):
            action_dist = policy(State(cur_state))
            action = action_dist.sample()[0].value
            action_prob = action_dist.probability(Action(action))
            next_state, _, done, _ = env.step(action)
            mdp.append(
                Transition(
                    last_state=State(cur_state),
                    action=Action(action),
                    action_prob=action_prob,
                    state=State(next_state),
                    reward=1.0,
                    status=Transition.Status.NORMAL,
                )
            )
            cur_state = next_state
            if done:
                break
        # Append each episode exactly once, whether it terminated early or
        # ran to max_horizon.
        log.append(mdp)
    return log
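# Illustration only: a minimal sketch of driving generate_logs, assuming just
# the duck-typed policy interface used above (the policy is called on a State
# and returns a distribution exposing sample() and probability()).
# UniformStubPolicy and _StubActionDistribution are hypothetical helpers,
# not part of the library.
import random


class _StubActionDistribution:
    def __init__(self, num_actions: int):
        self._num_actions = num_actions

    def sample(self):
        # generate_logs reads sample()[0].value, so return a list of Actions.
        return [Action(random.randrange(self._num_actions))]

    def probability(self, action: Action) -> float:
        return 1.0 / self._num_actions


class UniformStubPolicy:
    def __call__(self, state: State):
        return _StubActionDistribution(2)  # CartPole-v0 has two discrete actions


logs = generate_logs(episodes=10, max_horizon=200, policy=UniformStubPolicy())
print(f"generated {len(logs)} episodes; first episode has {len(logs[0])} transitions")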
def edp_to_rl_input(edp: EvaluationDataPage, gamma, device=None) -> RLEstimatorInput:
    assert edp.model_values is not None
    eq_len = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
        edp.mdp_id,
        edp.action_mask.cpu().numpy(),
        edp.logged_rewards.cpu().numpy().flatten(),
        edp.logged_propensities.cpu().numpy().flatten(),
        edp.model_propensities.cpu().numpy(),
        edp.model_values.cpu().numpy(),
    )
    (
        actions,
        rewards,
        logged_propensities,
        target_propensities,
        estimated_q_values,
    ) = (
        torch.tensor(x, dtype=torch.double, device=device, requires_grad=True)
        for x in eq_len
    )

    num_examples = logged_propensities.shape[0]
    horizon = logged_propensities.shape[1]

    # All trajectories are keyed under a single dummy initial state.
    log = {}
    for traj in range(num_examples):
        if State(0) not in log:
            log[State(0)] = []
        log[State(0)].append(
            [
                Transition(
                    last_state=State((traj, i)),
                    action=torch.argmax(actions[traj, i]).item(),
                    action_prob=logged_propensities[traj, i].item(),
                    state=State((traj, i + 1)),
                    reward=rewards[traj, i].item(),
                )
                for i in range(horizon - 1)
                # Skip padded timesteps, i.e. all-zero one-hot action rows.
                if actions[traj, i][torch.argmax(actions[traj, i]).item()] != 0.0
            ]
        )
    return RLEstimatorInput(
        gamma=gamma,
        log=log,
        target_policy=SequentialOPEstimatorAdapter.EDPSeqPolicy(
            actions.shape[2], target_propensities
        ),
        value_function=SequentialOPEstimatorAdapter.EDPValueFunc(
            estimated_q_values, target_propensities
        ),
        ground_truth=None,
        horizon=horizon,
    )
def _next_state_reward(self, state: State, action: Action) -> StateReward:
    x, y = state.value
    # Walls and the goal are absorbing: stay put with zero reward.
    if state.value in self.walls or state.value == self.goal:
        return StateReward(State((x, y), state.is_terminal), 0.0)
    # Actions 0-3 attempt moves to (x + 1, y), (x, y + 1), (x - 1, y), (x, y - 1).
    if action.value == 0:
        to_pos, reward, is_end = self._transit((x, y), (x + 1, y))
    elif action.value == 1:
        to_pos, reward, is_end = self._transit((x, y), (x, y + 1))
    elif action.value == 2:
        to_pos, reward, is_end = self._transit((x, y), (x - 1, y))
    else:
        to_pos, reward, is_end = self._transit((x, y), (x, y - 1))
    return StateReward(State(to_pos, is_end), reward)
def dump_value_func(self, valfunc: ValueFunction) -> str:
    dump = ""
    for x in range(self.size[0]):
        for y in range(self.size[1]):
            dump += "{:6.3}".format(valfunc(State((x, y))))
        dump += "\n"
    return dump
def _next_state_reward(self, state: State, action: Action) -> StateReward:
    value = state.value
    assert isinstance(value, tuple), f"got type {type(value)} instead of tuple"
    (x, y) = value
    assert isinstance(x, int) and isinstance(
        y, int
    ), "Gridworld expects states to be Tuple[int, int]"
    if state.value in self.walls or state.value == self.goal:
        return StateReward(State((x, y), state.is_terminal), 0.0)
    if action.value == 0:
        to_pos, reward, is_end = self._transit((x, y), (x + 1, y))
    elif action.value == 1:
        to_pos, reward, is_end = self._transit((x, y), (x, y + 1))
    elif action.value == 2:
        to_pos, reward, is_end = self._transit((x, y), (x - 1, y))
    else:
        to_pos, reward, is_end = self._transit((x, y), (x, y - 1))
    return StateReward(State(to_pos, is_end), reward)
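# Hedged sketch of the _transit contract that _next_state_reward assumes:
# given the current position and a proposed position, it returns a
# (to_pos, reward, is_end) triple. The wall handling and reward values here
# are illustrative assumptions, not the library's actual implementation.
def _transit_sketch(self, frm, to):
    x, y = to
    # Assumed: moves that leave the grid or hit a wall keep the agent in place.
    if not (0 <= x < self.size[0] and 0 <= y < self.size[1]) or to in self.walls:
        to = frm
    if to == self.goal:
        return to, 1.0, True  # assumed goal reward and terminal flag
    return to, 0.0, False  # assumed per-step reward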
def dump_state_values(self, state_values) -> str:
    dump = ""
    for x in range(self.size[0]):
        for y in range(self.size[1]):
            pos = State((x, y))
            value = 0.0
            if pos in state_values:
                value = state_values[pos]
            dump += "{:6.3}".format(value)
        dump += "\n"
    return dump
def next_state_reward_dist(self, state: State, action: Action) -> StateDistribution:
    self._model.eval()
    state_reward_tensor = (
        self._model(
            torch.tensor(state.value, dtype=torch.float)
            .reshape(-1, self._model._state_dim)
            .to(self._device),
            torch.nn.functional.one_hot(
                torch.tensor(action.value, dtype=torch.long),
                self._model._action_dim,
            )
            .reshape(-1, self._model._action_dim)
            .float()
            .to(self._device),
        )
        .reshape(-1)
        .cpu()
    )
    return {
        State(state_reward_tensor[: self._model._state_dim]): RewardProbability(
            state_reward_tensor[-1].item()
        )
    }
def estimate_value(episodes: int, max_horizon: int, policy: RLPolicy, gamma: float):
    avg = RunningAverage()
    env = gym.make("CartPole-v0")
    for _ in range(episodes):
        init_state = env.reset()
        cur_state = init_state
        r = 0.0
        discount = 1.0
        for _ in range(max_horizon):
            action_dist = policy(State(cur_state))
            action = action_dist.sample()[0].value
            next_state, _, done, _ = env.step(action)
            reward = 1.0
            r += reward * discount
            discount *= gamma
            if done:
                break
            cur_state = next_state
        avg.add(r)
    return avg.average
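# estimate_value relies on a RunningAverage helper exposing add() and an
# average property. RunningAverageSketch below is a minimal stand-in showing
# the assumed interface via the standard incremental-mean update; the
# library's own class may differ in detail.
class RunningAverageSketch:
    def __init__(self):
        self._count = 0
        self._average = 0.0

    def add(self, value: float) -> None:
        # Incremental mean: avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n
        self._count += 1
        self._average += (value - self._average) / self._count

    @property
    def average(self) -> float:
        return self._average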
def dump_policy(self, policy) -> str:
    dump = ""
    for x in range(self.size[0]):
        for y in range(self.size[1]):
            pos = (x, y)
            if pos == self.start:
                dump += "\u2b28"  # start marker
            elif pos == self.goal:
                dump += "\u2b27"  # goal marker
            elif pos in self.walls:
                dump += "\u2588"  # wall
            else:
                # Arrow glyph for the greedy action: ⇩ ⇨ ⇧ ⇦ for actions 0-3.
                action = policy(State(pos)).greedy()
                if action.value == 0:
                    dump += "\u21e9"
                elif action.value == 1:
                    dump += "\u21e8"
                elif action.value == 2:
                    dump += "\u21e7"
                else:
                    dump += "\u21e6"
        dump += "\n"
    return dump
def reset(self, state: Optional[State] = None):
    super().reset(state)
    if self._current_state is None:
        self._current_state = State(self.start)
    return self._current_state
def states(self):
    for x in range(self.size[0]):
        for y in range(self.size[1]):
            state = (x, y)
            if state != self.goal and state not in self.walls:
                yield State((x, y))