class CartPole(Environment): """ The Inverted Pendulum on a Cart environment as presented in: "Least-Squares Policy Iteration". Lagoudakis M. G. and Parr R.. 2003. """ def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10., horizon=3000, gamma=.95): """ Constructor. Args: m (float, 2.0): mass of the pendulum; M (float, 8.0): mass of the cart; l (float, .5): length of the pendulum; g (float, 9.8): gravity acceleration constant; max_u (float, 50.): maximum allowed input torque; noise_u (float, 10.): maximum noise on the action; horizon (int, 3000): horizon of the problem; gamma (float, .95): discount factor. """ # MDP parameters self._m = m self._M = M self._l = l self._g = g self._alpha = 1 / (self._m + self._M) self._mu = mu self._dt = .1 self._max_u = max_u self._noise_u = noise_u high = np.array([np.inf, np.inf]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Discrete(3) mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(2.5 * l, 2.5 * l) self._last_u = None self._state = None super().__init__(mdp_info) def reset(self, state=None): if state is None: angle = np.random.uniform(-np.pi / 8., np.pi / 8.) self._state = np.array([angle, 0.]) else: self._state = state self._state[0] = normalize_angle(self._state[0]) self._last_u = 0 return self._state def step(self, action): if action == 0: u = -self._max_u elif action == 1: u = 0. else: u = self._max_u self._last_u = u u += np.random.uniform(-self._noise_u, self._noise_u) new_state = odeint(self._dynamics, self._state, [0, self._dt], (u, )) self._state = np.array(new_state[-1]) self._state[0] = normalize_angle(self._state[0]) if np.abs(self._state[0]) > np.pi * .5: reward = -1. absorbing = True else: reward = 0. absorbing = False return self._state, reward, absorbing, {} def render(self, mode='human'): start = 1.25 * self._l * np.ones(2) end = 1.25 * self._l * np.ones(2) end[0] += self._l * np.sin(self._state[0]) end[1] += self._l * np.cos(self._state[0]) self._viewer.line(start, end) self._viewer.square(start, 0, self._l / 10) self._viewer.circle(end, self._l / 20) direction = -np.sign(self._last_u) * np.array([1, 0]) value = np.abs(self._last_u) self._viewer.force_arrow(start, direction, value, self._max_u, self._l / 5) self._viewer.display(self._dt) def stop(self): self._viewer.close() def _dynamics(self, state, t, u): theta = state[0] omega = state[1] d_theta = omega d_omega = (self._g * np.sin(theta) - self._alpha * self._m * self._l * .5 * d_theta**2 * np.sin(2 * theta) * .5 - self._alpha * np.cos(theta) * u) / ( 2 / 3 * self._l - self._alpha * self._m * self._l * .5 * np.cos(theta)**2) return d_theta, d_omega
class AbstractGridWorld(Environment): """ Abstract class to build a grid world. """ def __init__(self, mdp_info, height, width, start, goal): """ Constructor. Args: height (int): height of the grid; width (int): width of the grid; start (tuple): x-y coordinates of the goal; goal (tuple): x-y coordinates of the goal. """ assert not np.array_equal(start, goal) assert goal[0] < height and goal[1] < width,\ 'Goal position not suitable for the grid world dimension.' self._state = None self._height = height self._width = width self._start = start self._goal = goal # Visualization self._viewer = Viewer(self._width, self._height, 500, self._height * 500 // self._width) super().__init__(mdp_info) def reset(self, state=None): if state is None: state = self.convert_to_int(self._start, self._width) self._state = state return self._state def step(self, action): state = self.convert_to_grid(self._state, self._width) new_state, reward, absorbing, info = self._step(state, action) self._state = self.convert_to_int(new_state, self._width) return self._state, reward, absorbing, info def render(self): for row in range(1, self._height): for col in range(1, self._width): self._viewer.line(np.array([col, 0]), np.array([col, self._height])) self._viewer.line(np.array([0, row]), np.array([self._width, row])) goal_center = np.array( [.5 + self._goal[1], self._height - (.5 + self._goal[0])]) self._viewer.square(goal_center, 0, 1, (0, 255, 0)) start_grid = self.convert_to_grid(self._start, self._width) start_center = np.array( [.5 + start_grid[1], self._height - (.5 + start_grid[0])]) self._viewer.square(start_center, 0, 1, (255, 0, 0)) state_grid = self.convert_to_grid(self._state, self._width) state_center = np.array( [.5 + state_grid[1], self._height - (.5 + state_grid[0])]) self._viewer.circle(state_center, .4, (0, 0, 255)) self._viewer.display(.1) def _step(self, state, action): raise NotImplementedError('AbstractGridWorld is an abstract class.') def _grid_step(self, state, action): if action == 0: if state[0] > 0: state[0] -= 1 elif action == 1: if state[0] + 1 < self._height: state[0] += 1 elif action == 2: if state[1] > 0: state[1] -= 1 elif action == 3: if state[1] + 1 < self._width: state[1] += 1 @staticmethod def convert_to_grid(state, width): return np.array([state[0] // width, state[0] % width]) @staticmethod def convert_to_int(state, width): return np.array([state[0] * width + state[1]])