def update_grid_policy_ace(self, algorithm: TabularAlgorithm, usable_ace: bool): policy: TabularPolicy = algorithm.target_policy # policy_: policy.Deterministic for s, state in enumerate(self.states): if not state.is_terminal and state.usable_ace == usable_ace: # dealer_card is x, player_sum is y : following the table in the book x = state.dealers_card - self.dealers_card_min y = state.player_sum - self.player_sum_min position: common.XY = common.XY(x, y) action: Action = policy.get_action(s) # type: ignore policy_value: int = int(action.hit) # print(position, transfer_1_to_2) self.grid_world.set_policy_value( position=position, policy_value=policy_value, ) if algorithm.Q: policy_a: int = policy[s] is_terminal: bool = self.is_terminal[s] for a, action in enumerate(self.actions): if self.s_a_compatibility[s, a]: is_policy: bool = (not is_terminal and policy_a == a) if action.hit: y = 1 else: y = -1 move: common.XY = common.XY(0, y) self.grid_world.set_move_q_value( position=position, move=move, q_value=algorithm.Q[s, a], is_policy=is_policy )
def q_test() -> bool:
    """Exercise StateActionFunction: index lookup, assignment, in-place update."""
    parameters = EnvironmentParameters(
        environment_type=common.EnvironmentType.CLIFF,
        actions_list=common.ActionsList.FOUR_MOVES,
    )
    env = Environment(parameters)
    env.build()

    q = state_action_function.StateActionFunction(env, initial_value=-7.0)

    probe_state = State(is_terminal=False, position=common.XY(x=4, y=2))
    s = env.state_index[probe_state]
    print(f"state_.index {s}")

    probe_action = Action(common.XY(x=1, y=0))
    a = env.action_index[probe_action]
    print(f"action_.index {a}")

    print(q[s, a])
    q[s, a] = 2.0
    q[s, a] += 0.5
    print(q[s, a])
    # noinspection PyProtectedMember
    print(f"Q: {q.matrix}")
    return True
def cliff_test() -> bool:
    """Smoke-test the cliff Environment: list S and A, then perform two steps."""
    environment = Environment(EnvironmentParameters())
    environment.build()
    print(type(environment))

    for s_ in environment.states:
        print(f"{s_} \t index={environment.state_index[s_]}")
    print()
    for a_ in environment.actions:
        print(f"{a_} \t index={environment.action_index[a_]}")
    print()

    steps = (
        (common.XY(x=4, y=2), common.XY(x=1, y=0)),
        (common.XY(x=6, y=1), common.XY(x=0, y=-1)),
    )
    for position, move in steps:
        probe_state = state.State(is_terminal=False, position=position)
        probe_action = action.Action(move)
        observation = environment.from_state_perform_action(probe_state, probe_action)
        print(probe_state, probe_action)
        print(observation)
    return True
def _four_cliff_friendly_moves() -> list[Action]:
    """The four compass moves, in order: right, up, left, down."""
    deltas = ((+1, 0), (0, +1), (-1, 0), (0, -1))
    return [Action(move=common.XY(dx, dy)) for dx, dy in deltas]
def _build_actions(self):
    """Build the action set: the zero acceleration plus every other (ax, ay).

    The (0, 0) acceleration is appended first: it is the default action for
    e-greedy selection, without which an episode may never terminate.
    """
    zero_action: Action = Action(acceleration=common.XY(x=0, y=0))
    self.actions.append(zero_action)
    for ax in range(self._min_ax, self._max_ax + 1):
        for ay in range(self._min_ay, self._max_ay + 1):
            # BUG FIX: the original condition was `ax != 0 and ay != 0`,
            # which also excluded every action with exactly one zero
            # component (e.g. (1, 0) or (0, -1)). Only the (0, 0) action,
            # already appended above, should be skipped.
            if ax != 0 or ay != 0:
                self.actions.append(Action(acceleration=common.XY(x=ax, y=ay)))
def _build_states(self):
    """set S: one state per (position, velocity) combination on the grid."""
    for x in range(self.grid_world.max_x + 1):
        for y in range(self.grid_world.max_y + 1):
            pos = common.XY(x=x, y=y)
            # a state is terminal exactly when its position is a goal square
            terminal = self.grid_world.is_at_goal(pos)
            self.states.extend(
                State(is_terminal=terminal,
                      position=pos,
                      velocity=common.XY(x=vx, y=vy))
                for vx in range(self._min_vx, self._max_vx + 1)
                for vy in range(self._min_vy, self._max_vy + 1)
            )
def change_request(self, current_position: common.XY, move: Optional[common.XY]) -> common.XY:
    """Return the next position under random-walk dynamics.

    NOTE: the requested `move` is deliberately ignored; an actual movement is
    drawn from the random-move distribution instead.
    """
    actual_move = self._get_random_movement()
    requested = common.XY(x=current_position.x + actual_move.x,
                          y=current_position.y + actual_move.y)
    # project back to grid if outside
    return self.project_back_to_grid(requested)
def _load_gridworld(self):
    """Draw every grid square onto the grid surface, then copy it to the background."""
    self._set_sizes()
    self._grid_surface.fill(self._background_color)
    for col in range(self._max_x + 1):
        for row in range(self._max_y + 1):
            self._draw_square(self._grid_surface,
                              common.XY(col, row),
                              draw_background=True)
    self._copy_grid_into_background()
def change_request(self, position: common.XY, velocity: common.XY, acceleration: common.XY)\
        -> tuple[common.XY, common.XY]:
    """Apply the acceleration (unless the car skids) and advance one time step.

    Returns the new (position, velocity). The position is NOT projected back
    onto the grid here — the caller is responsible for off-track handling.
    """
    if utils.uniform() > self.skid_probability:
        # not skidding: acceleration takes effect this step
        new_velocity = common.XY(x=velocity.x + acceleration.x,
                                 y=velocity.y + acceleration.y)
    else:
        # skid: velocity is unchanged
        new_velocity = velocity
    new_position = common.XY(x=position.x + new_velocity.x,
                             y=position.y + new_velocity.y)
    return new_position, new_velocity
def change_request(self, current_position: common.XY, move: common.XY) -> common.XY:
    """Apply the move plus the wind at the current column, clipped to the grid."""
    wind = self._get_wind(current_position)
    requested = common.XY(x=current_position.x + move.x + wind.x,
                          y=current_position.y + move.y + wind.y)
    # project back to grid if outside
    return self.project_back_to_grid(requested)
def _kings_moves(include_center: bool = False) -> list[Action]:
    """The eight king's moves, optionally including the (0, 0) stay-put move."""
    moves: list[Action] = []
    for dx in (-1, 0, 1):
        for dy in (-1, 0, 1):
            if (dx, dy) == (0, 0) and not include_center:
                continue
            moves.append(Action(move=common.XY(dx, dy)))
    return moves
def _get_wind(self, current_position: common.XY) -> common.XY:
    """Upward wind for the current column, plus an optional random gust."""
    gust: int = self.random_wind_distribution.draw_one() if self.random_wind else 0
    return common.XY(x=0, y=self.upward_wind[current_position.x] + gust)
def get_start_states(self) -> list[State]:
    """One non-terminal state per start position, all with zero velocity."""
    zero_velocity = common.XY(x=0, y=0)
    return [
        State(is_terminal=False, position=start, velocity=zero_velocity)
        for start in self._grid_world.get_start_positions()
    ]
def change_request(self, current_position: common.XY, move: Optional[common.XY]) -> common.XY:
    """Apply an optional move and clip the result to the grid (None = stay put)."""
    if move is None:
        requested = current_position
    else:
        requested = common.XY(x=current_position.x + move.x,
                              y=current_position.y + move.y)
    # project back to grid if outside
    return self.project_back_to_grid(requested)
def _build_states(self):
    """set S: one state per grid position; goal squares are terminal."""
    for col in range(self.grid_world.max_x + 1):
        for row in range(self.grid_world.max_y + 1):
            pos = common.XY(x=col, y=row)
            self.states.append(
                State(position=pos,
                      is_terminal=self.grid_world.is_at_goal(pos))
            )
def project_back_to_grid(self, requested_position: common.XY) -> common.XY:
    """Clamp a position to the grid bounds [0, max_x] x [0, max_y]."""
    clamped_x = min(max(requested_position.x, 0), self.max_x)
    clamped_y = min(max(requested_position.y, 0), self.max_y)
    return common.XY(x=clamped_x, y=clamped_y)
def update_grid_policy(self, policy: TabularPolicy):
    """Write each state's transfer action into the grid world display."""
    for s, state in enumerate(self.states):
        # axes reversed like in the book: cars_2 on x, cars_1 on y
        position = common.XY(x=state.ending_cars_2, y=state.ending_cars_1)
        chosen: Action = policy.get_action(s)   # type: ignore
        # print(position, transfer_1_to_2)
        self.grid_world.set_policy_value(
            position=position,
            policy_value=chosen.transfer_1_to_2,
        )
def _load_gridworld(self):
    """Draw every square (with its state value when enabled), then copy to background."""
    self._set_sizes()
    self._grid_surface.fill(self._background_color)
    for x in range(self._max_x + 1):
        for y in range(self._max_y + 1):
            sq: common.Square = self._grid_world.get_square(
                position=common.XY(x, y))
            if self._display_v:
                # note: the v array is indexed row-major, [y, x]
                self._draw_square(x, y, sq, self._grid_surface,
                                  v=self._grid_world.v[y, x])
            else:
                self._draw_square(x, y, sq, self._grid_surface)
    self._copy_grid_into_background()
def _draw_policy(self, surface: pygame.Surface, rect: pygame.Rect, output_square: common.OutputSquare):
    """Fill the square with the policy colour and label it Hit or Stick."""
    policy_value = output_square.policy_value
    if policy_value is None:
        return
    pygame.draw.rect(surface, self._get_policy_value_color(policy_value), rect)
    text = "Hit" if policy_value == 1 else "Stick"
    centre_rect = self._get_sub_rect(rect, move=common.XY(x=0, y=0))
    self._center_text(surface, centre_rect, text)
def racetrack_test() -> bool:
    """Smoke-test the racetrack Environment: list S and A, then perform three steps."""
    environment = Environment(EnvironmentParameters(grid=grids.TRACK_1))
    environment.build()

    for s_ in environment.states:
        print(f"{s_} \t index={environment.state_index[s_]}")
    print()
    for a_ in environment.actions:
        print(f"{a_} \t index={environment.action_index[a_]}")
    print()

    steps = (
        (common.XY(x=4, y=0), common.XY(x=0, y=1), common.XY(x=1, y=0)),
        (common.XY(x=5, y=4), common.XY(x=1, y=0), common.XY(x=0, y=0)),
        (common.XY(x=0, y=0), common.XY(x=0, y=3), common.XY(x=0, y=-1)),
    )
    for position, velocity, acceleration in steps:
        probe_state = state.State(is_terminal=False,
                                  position=position,
                                  velocity=velocity)
        probe_action = action.Action(acceleration)
        response = environment.from_state_perform_action(probe_state, probe_action)
        print(probe_state, probe_action)
        print(response)
    return True
def draw_response(self, state: State, action: Action) -> tuple[float, State]:
    """
    draw a single outcome for a single state and action
    standard call for episodic algorithms
    """
    self._draw_next_state(state, action)
    # reward of 1.0 only on reaching the goal square at (max_x, 0)
    goal = common.XY(x=self._grid_world.max_x, y=0)
    reward = 1.0 if self._next_state.position == goal else 0.0
    return reward, self._next_state
def grid_test() -> bool:
    """Print the grid world as a cartesian numpy array of square codes."""
    environment = Environment(EnvironmentParameters(
        actions_list=common.ActionsList.FOUR_MOVES))
    world = environment.grid_world
    rows, cols = world.max_y + 1, world.max_x + 1
    cartesian_grid = np.empty(shape=(rows, cols), dtype=int)
    # noinspection PyTypeChecker
    for y, x in np.ndindex(cartesian_grid.shape):
        cartesian_grid[y, x] = world.get_square(common.XY(x, y))
    print(cartesian_grid)
    return True
def _draw_frame_on_background(self,
                              agent_position: Optional[common.XY] = None,
                              agent_move: Optional[common.XY] = None,
                              prev_position: Optional[common.XY] = None,
                              prev_move: Optional[common.XY] = None
                              ):
    """Draw one frame: the agent square, the previous square, and all others plain."""
    for x in range(self._max_x + 1):
        for y in range(self._max_y + 1):
            here = common.XY(x, y)
            if here == agent_position:
                self._draw_agent_on_background(agent_position, agent_move)
            elif here == prev_position:
                self._draw_prev_on_background(prev_position, prev_move)
            else:
                self._draw_square(surface=self._background,
                                  position=here
                                  )
def random_walk_test() -> bool:
    """Smoke-test the random-walk Environment: list S and A, then perform three steps."""
    environment = Environment(EnvironmentParameters(
        actions_list=common.ActionsList.NO_ACTIONS
    ))
    environment.build()

    for s_ in environment.states:
        print(f"{s_} \t index={environment.state_index[s_]}")
    print()
    for a_ in environment.actions:
        print(f"{a_} \t index={environment.action_index[a_]}")
    print()

    steps = (
        (common.XY(x=4, y=0), common.XY(x=1, y=0)),
        (common.XY(x=5, y=0), common.XY(x=1, y=0)),
        (common.XY(x=0, y=0), common.XY(x=-1, y=0)),
    )
    for position, move in steps:
        probe_state = state.State(is_terminal=False, position=position)
        probe_action = action.Action(move)
        observation = environment.from_state_perform_action(probe_state, probe_action)
        print(probe_state, probe_action)
        print(observation)
    return True
def _draw_policy(self, surface: pygame.Surface, rect: pygame.Rect, output_square: common.OutputSquare):
    """Write the numeric policy value, if any, centred in the square."""
    value = output_square.policy_value
    if value is None:
        return
    centre_rect = self._get_sub_rect(rect, move=common.XY(x=0, y=0))
    self._center_text(surface, centre_rect, f"{value:.1f}")
def _get_random_movement(self) -> common.XY:
    """Draw a random horizontal step; the vertical component is always zero."""
    step: int = self._random_move_distribution.draw_one()
    return common.XY(x=step, y=0)
def _draw_square(self, x: int, y: int, square: common.Square, surface: pygame.Surface,
                 v: Optional[float] = None) -> pygame.Rect:
    """Draw one grid square at cartesian (x, y) and return its pygame rect.

    Removed a large block of commented-out font-positioning experiments that
    was left in the body; the working `_get_sub_rect` + `_center_text` path
    replaced it.
    """
    # cartesian y grows upward; pygame rows grow downward, so flip the row
    row = self._max_y - y
    col = x
    color: pygame.Color = self._color_lookup[square]
    left: int = col * self._cell_pixels
    top: int = row * self._cell_pixels
    width: int = self._cell_pixels - 1
    height: int = self._cell_pixels - 1
    # pygame.Rect doesn't like named parameters
    square_rect: pygame.Rect = pygame.Rect(left, top, width, height)
    pygame.draw.rect(surface, color, square_rect)
    # NOTE(review): hard-coded placeholder text is drawn in every square —
    # this looks like leftover debug output; confirm before shipping.
    text: str = "12.3"
    move: common.XY = common.XY(x=-1, y=1)
    sub_rect = self._get_sub_rect(square_rect, move)
    self._center_text(surface, sub_rect, text)
    if v is not None:
        # TODO: write v in square_rect — not yet implemented
        pass
    return square_rect
def get_start_positions(self) -> list[common.XY]:
    """All START squares in the grid, flipped into cartesian positions."""
    # np.nonzero yields (row, col) index arrays in row-major order,
    # matching flatnonzero + unravel_index on the same boolean mask
    row_indexes, col_indexes = np.nonzero(self._grid[:, :] == common.Square.START)
    return [self._position_flip(common.XY(c, r))
            for r, c in zip(row_indexes, col_indexes)]
def _move_flip(self, xy_in: common.XY) -> common.XY:
    """Flip a move between cartesian and screen coordinates by negating y."""
    flipped_y = -xy_in.y
    return common.XY(x=xy_in.x, y=flipped_y)
def _position_flip(self, xy_in: common.XY) -> common.XY:
    """Flip a position's y between cartesian rows and screen rows (y -> max_y - y)."""
    flipped_y = self.max_y - xy_in.y
    return common.XY(x=xy_in.x, y=flipped_y)