예제 #1
0
    def __init__(self, mdp_info, height, width, start, goal):
        """
        Constructor.

        Args:
            height (int): height of the grid;
            width (int): width of the grid;
            start (tuple): x-y coordinates of the goal;
            goal (tuple): x-y coordinates of the goal.

        """
        assert not np.array_equal(start, goal)

        assert goal[0] < height and goal[1] < width,\
            'Goal position not suitable for the grid world dimension.'

        self._state = None
        self._height = height
        self._width = width
        self._start = start
        self._goal = goal

        # Visualization
        self._viewer = Viewer(self._width, self._height, 500,
                              self._height * 500 // self._width)

        super().__init__(mdp_info)
예제 #2
0
    def __init__(self, n_steps_action=3, viz_speed=100, small=False):

        self.__name__ = 'ShipSteeringMultiGate'
        self.n_steps_action = n_steps_action
        self.viz_speed = viz_speed

        # MDP parameters
        self.no_of_gates = 4

        self.small = small

        self.field_size = 500 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12., 0])
        high = np.array([
            self.field_size, self.field_size, np.pi, np.pi / 12.,
            self.no_of_gates
        ])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2

        gate_1s = np.array([75, 175]) if small else np.array([150, 350])
        gate_1e = np.array([125, 175]) if small else np.array([250, 350])

        gate_1 = np.array([gate_1s, gate_1e])

        gate_2s = np.array([150, 300]) if small else np.array([300, 600])
        gate_2e = np.array([200, 300]) if small else np.array([400, 600])

        gate_2 = np.array([gate_2s, gate_2e])

        gate_3s = np.array([250, 350]) if small else np.array([500, 700])
        gate_3e = np.array([300, 350]) if small else np.array([600, 700])

        gate_3 = np.array([gate_3s, gate_3e])

        gate_4s = np.array([150, 425]) if small else np.array([300, 850])
        gate_4e = np.array([200, 425]) if small else np.array([400, 850])

        gate_4 = np.array([gate_4s, gate_4e])

        self._gate_list = gate_1, gate_2, gate_3, gate_4

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        self._out_reward = -10000
        self.correct_order = False

        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size,
                              self.field_size,
                              background=(66, 131, 237))

        super(ShipSteeringMultiGate, self).__init__(mdp_info)
예제 #3
0
    def __init__(self,
                 start=None,
                 goal=None,
                 goal_threshold=.1,
                 noise_step=.025,
                 noise_reward=0,
                 reward_goal=0.,
                 thrust=.05,
                 puddle_center=None,
                 puddle_width=None,
                 gamma=.99,
                 horizon=5000):
        """
        Constructor.

        Args:
            start (np.array, None): starting position of the agent;
            goal (np.array, None): goal position;
            goal_threshold (float, .1): distance threshold of the agent from the
                goal to consider it reached;
            noise_step (float, .025): noise in actions;
            noise_reward (float, 0): standard deviation of gaussian noise in reward;
            reward_goal (float, 0): reward obtained reaching goal state;
            thrust (float, .05): distance walked during each action;
            puddle_center (np.array, None): center of the puddle;
            puddle_width (np.array, None): width of the puddle;

        """
        # MDP parameters
        self._start = np.array([.2, .4]) if start is None else start
        self._goal = np.array([1., 1.]) if goal is None else goal
        self._goal_threshold = goal_threshold
        self._noise_step = noise_step
        self._noise_reward = noise_reward
        self._reward_goal = reward_goal
        self._thrust = thrust
        puddle_center = [[.3, .6], [.4, .5], [.8, .9]
                         ] if puddle_center is None else puddle_center
        self._puddle_center = [np.array(center) for center in puddle_center]
        puddle_width = [[.1, .03], [.03, .1], [.03, .1]
                        ] if puddle_width is None else puddle_width
        self._puddle_width = [np.array(width) for width in puddle_width]

        self._actions = [np.zeros(2) for _ in range(5)]
        for i in range(4):
            self._actions[i][i // 2] = thrust * (i % 2 * 2 - 1)

        # MDP properties
        action_space = Discrete(5)
        observation_space = Box(0., 1., shape=(2, ))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._pixels = None
        self._viewer = Viewer(1.0, 1.0)

        super().__init__(mdp_info)
예제 #4
0
    def __init__(self,
                 m=2.,
                 M=8.,
                 l=.5,
                 g=9.8,
                 mu=1e-2,
                 max_u=50.,
                 noise_u=10.,
                 horizon=3000,
                 gamma=.95):
        """
        Constructor.

        Args:
            m (float, 2.0): mass of the pendulum;
            M (float, 8.0): mass of the cart;
            l (float, .5): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 50.): maximum allowed input torque;
            noise_u (float, 10.): maximum noise on the action;
            horizon (int, 3000): horizon of the problem;
            gamma (int, .95): discount factor.

        """
        # MDP parameters
        self._m = m
        self._M = M
        self._l = l
        self._g = g
        self._alpha = 1 / (self._m + self._M)
        self._mu = mu
        self._dt = .1
        self._max_u = max_u
        self._noise_u = noise_u
        high = np.array([np.inf, np.inf])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None
        self._state = None

        super().__init__(mdp_info)
예제 #5
0
    def __init__(self,
                 random_start=False,
                 m=1.,
                 l=1.,
                 g=9.8,
                 mu=1e-2,
                 max_u=5.,
                 horizon=5000,
                 gamma=.99):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random position
                or from the horizontal one;
            m (float, 1.0): mass of the pendulum;
            l (float, 1.0): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 5.0): maximum allowed input torque;
            horizon (int, 5000): horizon of the problem;
            gamma (int, .99): discount factor.

        """
        # MDP parameters
        self._m = m
        self._l = l
        self._g = g
        self._mu = mu
        self._random = random_start
        self._dt = .01
        self._max_u = max_u
        self._max_omega = 5 / 2 * np.pi
        high = np.array([np.pi, self._max_omega])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-max_u]),
                                  high=np.array([max_u]))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None

        super().__init__(mdp_info)
예제 #6
0
    def __init__(self, random_start=False, goal_distance=1.0):
        """
        Constructor.

        Args:
            random_start: whether to start from a random position or from the
                          horizontal one

        """
        # MDP parameters

        gamma = 0.99

        self.Mr = 0.3 * 2
        self.Mp = 2.55
        self.Ip = 2.6e-2
        self.Ir = 4.54e-4 * 2
        self.l = 13.8e-2
        self.r = 5.5e-2
        self.dt = 1e-2
        self.g = 9.81
        self.max_u = 5

        self._random = random_start
        self._goal_distance = goal_distance

        high = np.array([2 * self._goal_distance, np.pi, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self.max_u]),
                                  high=np.array([self.max_u]))
        horizon = 1500
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        env_width = 4 * goal_distance
        env_height = 2.5 * 2 * self.l
        width = 800
        height = int(width * env_height / env_width)

        self._viewer = Viewer(env_width, env_height, width, height)

        super(SegwayLinearMotion, self).__init__(mdp_info)
예제 #7
0
    def __init__(self, small=True, n_steps_action=3):
        """
        Constructor.

        Args:
             small (bool, True): whether to use a small state space or not.
             n_steps_action (int, 3): number of integration intervals for each
                                      step of the mdp.

        """
        # MDP parameters
        self.field_size = 150 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12.])
        high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2
        self._gate_s = np.empty(2)
        self._gate_e = np.empty(2)
        self._gate_s[0] = 100 if small else 350
        self._gate_s[1] = 120 if small else 400
        self._gate_e[0] = 120 if small else 450
        self._gate_e[1] = 100 if small else 400
        self._out_reward = -100
        self._success_reward = 0
        self._small = small
        self._state = None
        self.n_steps_action = n_steps_action

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size,
                              self.field_size,
                              background=(66, 131, 237))

        super(ShipSteering, self).__init__(mdp_info)
예제 #8
0
    def __init__(self, random_start=False):
        """
        Constructor.

        Args:
            random_start: whether to start from a random position or from the
                          horizontal one

        """
        # MDP parameters

        gamma = 0.97

        self._Mr = 0.3 * 2
        self._Mp = 2.55
        self._Ip = 2.6e-2
        self._Ir = 4.54e-4 * 2
        self._l = 13.8e-2
        self._r = 5.5e-2
        self._dt = 1e-2
        self._g = 9.81
        self._max_u = 5

        self._random = random_start

        high = np.array([-np.pi / 2, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self._max_u]),
                                  high=np.array([self._max_u]))
        horizon = 300
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(5 * self._l, 5 * self._l)
        self._last_x = 0

        super(Segway, self).__init__(mdp_info)
예제 #9
0
    def __init__(self, horizon=100, gamma=.95):
        """
        Constructor.

        """
        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1.
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(1, 1)

        super().__init__(mdp_info)
예제 #10
0
    def __init__(self, random_start=False):
        """
        Constructor.

        Args:
            random_start: whether to start from a random position or from the
                          horizontal one

        """
        # MDP parameters

        gamma = 0.97

        self._Mr = 0.3*2
        self._Mp = 2.55
        self._Ip = 2.6e-2
        self._Ir = 4.54e-4*2
        self._l = 13.8e-2
        self._r = 5.5e-2
        self._dt = 1e-2
        self._g = 9.81
        self._max_u = 5

        self._random = random_start

        high = np.array([-np.pi/2, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self._max_u]),
                                  high=np.array([self._max_u]))
        horizon = 300
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(5*self._l, 5*self._l)
        self._last_x = 0

        super(Segway, self).__init__(mdp_info)
예제 #11
0
    def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10.,
                 horizon=3000):
        """
        Constructor.

        Args:
            m (float, 2.0): mass of the pendulum;
            M (float, 8.0): mass of the cart;
            l (float, .5): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 50.): maximum allowed input torque;
            noise_u (float, 10.): maximum noise on the action;
            horizon (int, 3000): horizon of the problem.

        """
        # MDP parameters
        self._m = m
        self._M = M
        self._l = l
        self._g = g
        self._alpha = 1 / (self._m + self._M)
        self._mu = mu
        self._dt = .1
        self._max_u = max_u
        self._noise_u = noise_u
        high = np.array([np.inf, np.inf])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        gamma = .95
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None

        super().__init__(mdp_info)
예제 #12
0
    def __init__(self, random_start=False, m=1.0, l=1.0, g=9.8, mu=1e-2,
                 max_u=5.0, horizon=5000):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random position
                or from the horizontal one;
            m (float, 1.0): mass of the pendulum;
            l (float, 1.0): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 5.0): maximum allowed input torque;
            horizon (int, 5000): horizon of the problem.

        """
        # MDP parameters
        self._m = m
        self._l = l
        self._g = g
        self._mu = mu
        self._random = random_start
        self._dt = 0.01
        self._max_u = max_u
        self._max_omega = 5 / 2 * np.pi
        high = np.array([np.pi, self._max_omega])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-max_u]),
                                  high=np.array([max_u]))
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None

        super().__init__(mdp_info)
예제 #13
0
class ShipSteering(Environment):
    """
    The Ship Steering environment as presented in:
    "Hierarchical Policy Gradient Algorithms". Ghavamzadeh M. and Mahadevan S..
    2013.

    """
    def __init__(self, small=True, n_steps_action=3):
        """
        Constructor.

        Args:
             small (bool, True): whether to use a small state space or not.
             n_steps_action (int, 3): number of integration intervals for each
                                      step of the mdp.

        """
        # MDP parameters
        self.field_size = 150 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12.])
        high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2
        self._gate_s = np.empty(2)
        self._gate_e = np.empty(2)
        self._gate_s[0] = 100 if small else 350
        self._gate_s[1] = 120 if small else 400
        self._gate_e[0] = 120 if small else 450
        self._gate_e[1] = 100 if small else 400
        self._out_reward = -100
        self._success_reward = 0
        self._small = small
        self._state = None
        self.n_steps_action = n_steps_action

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size,
                              self.field_size,
                              background=(66, 131, 237))

        super(ShipSteering, self).__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            if self._small:
                self._state = np.zeros(4)
                self._state[2] = np.pi / 2
            else:
                low = self.info.observation_space.low
                high = self.info.observation_space.high
                self._state = (high - low) * np.random.rand(4) + low
        else:
            self._state = state

        return self._state

    def step(self, action):

        r = self._bound(action[0], -self.omega_max, self.omega_max)

        new_state = self._state

        for _ in range(self.n_steps_action):
            state = new_state
            new_state = np.empty(4)
            new_state[0] = state[0] + self._v * np.cos(state[2]) * self._dt
            new_state[1] = state[1] + self._v * np.sin(state[2]) * self._dt
            new_state[2] = normalize_angle(state[2] + state[3] * self._dt)
            new_state[3] = state[3] + (r - state[3]) * self._dt / self._T

            if new_state[0] > self.field_size \
               or new_state[1] > self.field_size \
               or new_state[0] < 0 or new_state[1] < 0:

                new_state[0] = self._bound(new_state[0], 0, self.field_size)
                new_state[1] = self._bound(new_state[1], 0, self.field_size)

                reward = self._out_reward
                absorbing = True
                break

            elif self._through_gate(state[:2], new_state[:2]):
                reward = self._success_reward
                absorbing = True
                break
            else:
                reward = -1
                absorbing = False

        self._state = new_state

        return self._state, reward, absorbing, {}

    def render(self, mode='human'):
        self._viewer.line(self._gate_s, self._gate_e, width=3)

        boat = [[-4, -4], [-4, 4], [4, 4], [8, 0.0], [4, -4]]
        self._viewer.polygon(self._state[:2],
                             self._state[2],
                             boat,
                             color=(32, 193, 54))

        self._viewer.display(self._dt)

    def stop(self):
        self._viewer.close()

    def _through_gate(self, start, end):
        r = self._gate_e - self._gate_s
        s = end - start
        den = self._cross_2d(vecr=r, vecs=s)

        if den == 0:
            return False

        t = self._cross_2d((start - self._gate_s), s) / den
        u = self._cross_2d((start - self._gate_s), r) / den

        return 1 >= u >= 0 and 1 >= t >= 0

    @staticmethod
    def _cross_2d(vecr, vecs):
        return vecr[0] * vecs[1] - vecr[1] * vecs[0]
예제 #14
0
class CarOnHill(Environment):
    """
    The Car On Hill environment as presented in:
    "Tree-Based Batch Mode Reinforcement Learning". Ernst D. et al.. 2005.

    """
    def __init__(self, horizon=100, gamma=.95):
        """
        Constructor.

        """
        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1.
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(1, 1)

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            self._state = np.array([-0.5, 0])
        else:
            self._state = state

        return self._state

    def step(self, action):
        action = self._discrete_actions[action[0]]
        sa = np.append(self._state, action)
        new_state = odeint(self._dpds, sa, [0, self._dt])

        self._state = new_state[-1, :-1]

        if self._state[0] < -self.max_pos or \
                np.abs(self._state[1]) > self.max_velocity:
            reward = -1
            absorbing = True
        elif self._state[0] > self.max_pos and \
                np.abs(self._state[1]) <= self.max_velocity:
            reward = 1
            absorbing = True
        else:
            reward = 0
            absorbing = False

        return self._state, reward, absorbing, {}

    def render(self):
        # Slope
        self._viewer.function(0, 1, self._height)

        # Car
        car_body = [[-3e-2, 0], [-3e-2, 2e-2], [-2e-2, 2e-2], [-1e-2, 3e-2],
                    [1e-2, 3e-2], [2e-2, 2e-2], [3e-2, 2e-2], [3e-2, 0]]

        x_car = (self._state[0] + 1) / 2
        y_car = self._height(x_car)
        c_car = [x_car, y_car]
        angle = self._angle(x_car)
        self._viewer.polygon(c_car, angle, car_body, color=(32, 193, 54))

        self._viewer.display(self._dt)

    @staticmethod
    def _angle(x):
        if x < 0.5:
            m = 4 * x - 1
        else:
            m = 1 / ((20 * x**2 - 20 * x + 6)**1.5)

        return np.arctan(m)

    @staticmethod
    def _height(x):
        y_neg = 4 * x**2 - 2 * x
        y_pos = (2 * x - 1) / np.sqrt(5 * (2 * x - 1)**2 + 1)
        y = np.zeros_like(x)

        mask = x < .5
        neg_mask = np.logical_not(mask)
        y[mask] = y_neg[mask]
        y[neg_mask] = y_pos[neg_mask]

        y_norm = (y + 1) / 2

        return y_norm

    def _dpds(self, state_action, t):
        position = state_action[0]
        velocity = state_action[1]
        u = state_action[-1]

        if position < 0.:
            diff_hill = 2 * position + 1
            diff_2_hill = 2
        else:
            diff_hill = 1 / ((1 + 5 * position**2)**1.5)
            diff_2_hill = (-15 * position) / ((1 + 5 * position**2)**2.5)

        dp = velocity
        ds = (u - self._g * self._m * diff_hill - velocity**2 * self._m *
              diff_hill * diff_2_hill) / (self._m * (1 + diff_hill**2))

        return dp, ds, 0.
예제 #15
0
class Segway(Environment):
    """
    The Segway environment (continuous version) as presented in:
    "Deep Learning for Actor-Critic Reinforcement Learning". Xueli Jia. 2015.
    """
    def __init__(self, random_start=False):
        """
        Constructor.

        Args:
            random_start: whether to start from a random position or from the
                          horizontal one

        """
        # MDP parameters

        gamma = 0.97

        self._Mr = 0.3*2
        self._Mp = 2.55
        self._Ip = 2.6e-2
        self._Ir = 4.54e-4*2
        self._l = 13.8e-2
        self._r = 5.5e-2
        self._dt = 1e-2
        self._g = 9.81
        self._max_u = 5

        self._random = random_start

        high = np.array([-np.pi/2, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self._max_u]),
                                  high=np.array([self._max_u]))
        horizon = 300
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(5*self._l, 5*self._l)
        self._last_x = 0

        super(Segway, self).__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            if self._random:
                angle = np.random.uniform(-np.pi/2, np.pi/2)
            else:
                angle = -np.pi/8

            self._state = np.array([angle, 0., 0.])
        else:
            self._state = state
            self._state[0] = normalize_angle(self._state[0])

        self._last_x = 0

        return self._state

    def step(self, action):
        u = self._bound(action[0], -self._max_u, self._max_u)
        new_state = odeint(self._dynamics, self._state, [0, self._dt],
                           (u,))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])

        if abs(self._state[0]) > np.pi / 2:
            absorbing = True
            reward = -10000
        else:
            absorbing = False
            Q = np.diag([3.0, 0.1, 0.1])

            x = self._state

            J = x.dot(Q).dot(x)

            reward = -J

        return self._state, reward, absorbing, {}

    def _dynamics(self, state, t, u):

        alpha = state[0]
        d_alpha = state[1]

        h1 = (self._Mr + self._Mp) * (self._r ** 2) + self._Ir
        h2 = self._Mp * self._r * self._l * np.cos(alpha)
        h3 = self._l ** 2 * self._Mp + self._Ip

        omegaP = d_alpha

        dOmegaP = -(h2*self._l*self._Mp*self._r*np.sin(alpha)*omegaP**2 -
                    self._g*h1*self._l*self._Mp*np.sin(alpha) + (h2 + h1)*u)\
                  / (h1*h3-h2**2)
        dOmegaR = (h3*self._l*self._Mp*self._r*np.sin(alpha)*omegaP**2 -
                   self._g*h2*self._l*self._Mp*np.sin(alpha) + (h3 + h2)*u)\
                  / (h1*h3-h2**2)

        dx = list()
        dx.append(omegaP)
        dx.append(dOmegaP)
        dx.append(dOmegaR)

        return dx

    def render(self, mode='human'):
        start = 2.5 * self._l * np.ones(2)
        end = 2.5 * self._l * np.ones(2)

        dx = -self._state[2] * self._r * self._dt

        self._last_x += dx

        if self._last_x > 2.5*self._l or self._last_x < -2.5*self._l:
            self._last_x = (2.5*self._l + self._last_x)%(5*self._l) \
                           - 2.5*self._l

        start[0] += self._last_x
        end[0] += -2*self._l*np.sin(self._state[0]) + self._last_x
        end[1] += 2*self._l*np.cos(self._state[0])

        if (start[0] > 5 * self._l and end[0] > 5 * self._l) \
                or (start[0] < 0 and end[0] < 0):
            start[0] = start[0]%5*self._l
            end[0] = end[0]%5*self._l

        self._viewer.line(start, end)
        self._viewer.circle(start, self._r)

        self._viewer.display(self._dt)
예제 #16
0
class Segway(Environment):
    """
    The Segway environment (continuous version) as presented in:
    "Deep Learning for Actor-Critic Reinforcement Learning". Xueli Jia. 2015.
    """
    def __init__(self, random_start=False):
        """
        Constructor.

        Args:
            random_start: whether to start from a random position or from the
                          horizontal one

        """
        # MDP parameters

        gamma = 0.97

        self._Mr = 0.3 * 2
        self._Mp = 2.55
        self._Ip = 2.6e-2
        self._Ir = 4.54e-4 * 2
        self._l = 13.8e-2
        self._r = 5.5e-2
        self._dt = 1e-2
        self._g = 9.81
        self._max_u = 5

        self._random = random_start

        high = np.array([-np.pi / 2, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self._max_u]),
                                  high=np.array([self._max_u]))
        horizon = 300
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(5 * self._l, 5 * self._l)
        self._last_x = 0

        super(Segway, self).__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            if self._random:
                angle = np.random.uniform(-np.pi / 2, np.pi / 2)
            else:
                angle = -np.pi / 8

            self._state = np.array([angle, 0., 0.])
        else:
            self._state = state
            self._state[0] = normalize_angle(self._state[0])

        self._last_x = 0

        return self._state

    def step(self, action):
        u = self._bound(action[0], -self._max_u, self._max_u)
        new_state = odeint(self._dynamics, self._state, [0, self._dt], (u, ))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])

        if abs(self._state[0]) > np.pi / 2:
            absorbing = True
            reward = -10000
        else:
            absorbing = False
            Q = np.diag([3.0, 0.1, 0.1])

            x = self._state

            J = x.dot(Q).dot(x)

            reward = -J

        return self._state, reward, absorbing, {}

    def _dynamics(self, state, t, u):

        alpha = state[0]
        d_alpha = state[1]

        h1 = (self._Mr + self._Mp) * (self._r**2) + self._Ir
        h2 = self._Mp * self._r * self._l * np.cos(alpha)
        h3 = self._l**2 * self._Mp + self._Ip

        omegaP = d_alpha

        dOmegaP = -(h2*self._l*self._Mp*self._r*np.sin(alpha)*omegaP**2 -
                    self._g*h1*self._l*self._Mp*np.sin(alpha) + (h2 + h1)*u)\
                  / (h1*h3-h2**2)
        dOmegaR = (h3*self._l*self._Mp*self._r*np.sin(alpha)*omegaP**2 -
                   self._g*h2*self._l*self._Mp*np.sin(alpha) + (h3 + h2)*u)\
                  / (h1*h3-h2**2)

        dx = list()
        dx.append(omegaP)
        dx.append(dOmegaP)
        dx.append(dOmegaR)

        return dx

    def render(self, mode='human'):
        start = 2.5 * self._l * np.ones(2)
        end = 2.5 * self._l * np.ones(2)

        dx = -self._state[2] * self._r * self._dt

        self._last_x += dx

        if self._last_x > 2.5 * self._l or self._last_x < -2.5 * self._l:
            self._last_x = (2.5*self._l + self._last_x)%(5*self._l) \
                           - 2.5*self._l

        start[0] += self._last_x
        end[0] += -2 * self._l * np.sin(self._state[0]) + self._last_x
        end[1] += 2 * self._l * np.cos(self._state[0])

        if (start[0] > 5 * self._l and end[0] > 5 * self._l) \
                or (start[0] < 0 and end[0] < 0):
            start[0] = start[0] % 5 * self._l
            end[0] = end[0] % 5 * self._l

        self._viewer.line(start, end)
        self._viewer.circle(start, self._r)

        self._viewer.display(self._dt)
예제 #17
0
class SegwayLinearMotion(Environment):
    """
    The Segway environment (continuous version) as presented in:
    "Deep Learning for Actor-Critic Reinforcement Learning". Xueli Jia. 2015.
    """
    def __init__(self, random_start=False, goal_distance=1.0):
        """
        Constructor.

        Args:
            random_start: whether to start from a random position or from the
                          horizontal one

        """
        # MDP parameters

        gamma = 0.99

        self.Mr = 0.3 * 2
        self.Mp = 2.55
        self.Ip = 2.6e-2
        self.Ir = 4.54e-4 * 2
        self.l = 13.8e-2
        self.r = 5.5e-2
        self.dt = 1e-2
        self.g = 9.81
        self.max_u = 5

        self._random = random_start
        self._goal_distance = goal_distance

        high = np.array([2 * self._goal_distance, np.pi, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self.max_u]),
                                  high=np.array([self.max_u]))
        horizon = 1500
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        env_width = 4 * goal_distance
        env_height = 2.5 * 2 * self.l
        width = 800
        height = int(width * env_height / env_width)

        self._viewer = Viewer(env_width, env_height, width, height)

        super(SegwayLinearMotion, self).__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            if self._random:
                angle = np.random.uniform(-np.pi / 4, np.pi / 4)
            else:
                angle = -np.pi / 8

            self._state = np.array([-self._goal_distance, angle, 0., 0.])
        else:
            self._state = state
            self._state[1] = normalize_angle(self._state[1])

        return self._state

    def step(self, action):

        u = np.maximum(-self.max_u, np.minimum(self.max_u, action[0]))
        new_state = odeint(self._dynamics, self._state, [0, self.dt], (u, ))
        self._state = np.array(new_state[-1])
        self._state[1] = normalize_angle(self._state[1])

        if abs(self._state[1]) > np.pi / 2 \
                or abs(self._state[0]) > 2*self._goal_distance:
            absorbing = True
            reward = -10000
        else:
            absorbing = False
            Q = np.diag([10.0, 3.0, 0.1, 0.1])

            x = self._state
            J = x.dot(Q).dot(x)
            reward = -J

        return self._state, reward, absorbing, {}

    def _dynamics(self, state, t, u):

        position = state[0]

        alpha = state[1]
        d_alpha = state[2]
        d_beta = state[3]

        h1 = (self.Mr + self.Mp) * (self.r**2) + self.Ir
        h2 = self.Mp * self.r * self.l * np.cos(alpha)
        h3 = self.l**2 * self.Mp + self.Ip

        omegaP = d_alpha
        omegaR = d_beta

        velocity = -omegaR * self.r

        dOmegaP = -(h2*self.l*self.Mp*self.r*np.sin(alpha)*omegaP**2 -
                    self.g*h1*self.l*self.Mp*np.sin(alpha) + (h2+h1)*u)\
                  / (h1*h3-h2**2)
        dOmegaR = (h3*self.l*self.Mp*self.r*np.sin(alpha)*omegaP**2 -
                   self.g*h2*self.l*self.Mp*np.sin(alpha) + (h3+h2)*u)\
                  / (h1*h3-h2**2)

        dx = list()
        dx.append(velocity)
        dx.append(omegaP)
        dx.append(dOmegaP)
        dx.append(dOmegaR)

        return dx

    def render(self, mode='human'):
        start = np.array([2 * self._goal_distance, 1.25 * 2 * self.l])
        end = np.array(start)

        goal = start + np.array([0, -self.r])
        position = self._state[0]
        start[0] += position
        end[0] += -2 * self.l * np.sin(self._state[1]) + position
        end[1] += 2 * self.l * np.cos(self._state[1])

        self._viewer.line(start, end)
        self._viewer.circle(start, self.r)

        self._viewer.circle(goal, radius=0.01, color=(255, 0, 0))

        self._viewer.display(self.dt)
예제 #18
0
class AbstractGridWorld(Environment):
    """
    Abstract class to build a grid world.

    """
    def __init__(self, mdp_info, height, width, start, goal):
        """
        Constructor.

        Args:
            height (int): height of the grid;
            width (int): width of the grid;
            start (tuple): x-y coordinates of the goal;
            goal (tuple): x-y coordinates of the goal.

        """
        assert not np.array_equal(start, goal)

        assert goal[0] < height and goal[1] < width,\
            'Goal position not suitable for the grid world dimension.'

        self._state = None
        self._height = height
        self._width = width
        self._start = start
        self._goal = goal

        # Visualization
        self._viewer = Viewer(self._width, self._height, 500,
                              self._height * 500 // self._width)

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            state = self.convert_to_int(self._start, self._width)

        self._state = state

        return self._state

    def step(self, action):
        state = self.convert_to_grid(self._state, self._width)

        new_state, reward, absorbing, info = self._step(state, action)
        self._state = self.convert_to_int(new_state, self._width)

        return self._state, reward, absorbing, info

    def render(self):
        for row in range(1, self._height):
            for col in range(1, self._width):
                self._viewer.line(np.array([col, 0]),
                                  np.array([col, self._height]))
                self._viewer.line(np.array([0, row]),
                                  np.array([self._width, row]))

        goal_center = np.array(
            [.5 + self._goal[1], self._height - (.5 + self._goal[0])])
        self._viewer.square(goal_center, 0, 1, (0, 255, 0))

        start_grid = self.convert_to_grid(self._start, self._width)
        start_center = np.array(
            [.5 + start_grid[1], self._height - (.5 + start_grid[0])])
        self._viewer.square(start_center, 0, 1, (255, 0, 0))

        state_grid = self.convert_to_grid(self._state, self._width)
        state_center = np.array(
            [.5 + state_grid[1], self._height - (.5 + state_grid[0])])
        self._viewer.circle(state_center, .4, (0, 0, 255))

        self._viewer.display(.1)

    def _step(self, state, action):
        raise NotImplementedError('AbstractGridWorld is an abstract class.')

    @staticmethod
    def convert_to_grid(state, width):
        return np.array([state[0] // width, state[0] % width])

    @staticmethod
    def convert_to_int(state, width):
        return np.array([state[0] * width + state[1]])
    def __init__(self):
        self._rotation_radius = 0.6
        self._catch_radius = 0.4

        self._v_prey = 0.13
        self._v_predator = 0.1
        self._dt = 0.1

        self._omega_prey = self._v_prey / self._rotation_radius
        self._omega_predator = self._v_predator / self._rotation_radius

        self._max_x = 5.0
        self._max_y = 5.0

        self._obstacles = [
            (np.array([self._max_x/5,
                       self._max_y - 3.8*self._catch_radius]),
             np.array([self._max_x,
                       self._max_y - 3.8*self._catch_radius])),

            (np.array([-3/5*self._max_x,
                       self._max_y/4]),
             np.array([-3/5*self._max_x,
                       -3/10*self._max_y])),

            (np.array([-3/5*self._max_x + 3.8*self._catch_radius,
                       self._max_y / 4]),
             np.array([-3/5*self._max_x + 3.8*self._catch_radius,
                       -3/10*self._max_y])),

            (np.array([-3/5*self._max_x,
                       self._max_y/4]),
             np.array([-3/5*self._max_x + 3.8*self._catch_radius,
                       self._max_y/4]))
            ]

        # Add bounds of the map
        self._obstacles += [(np.array([-self._max_x, -self._max_y]),
                             np.array([-self._max_x, self._max_y])),

                            (np.array([-self._max_x, -self._max_y]),
                            np.array([self._max_x, -self._max_y])),

                            (np.array([self._max_x, self._max_y]),
                             np.array([-self._max_x, self._max_y])),

                            (np.array([self._max_x, self._max_y]),
                             np.array([self._max_x, -self._max_y]))
                            ]

        high = np.array([self._max_x, self._max_y, np.pi,
                         self._max_x, self._max_y, np.pi])


        # MDP properties
        horizon = 500
        gamma = 0.99

        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([0,
                                                -self._omega_predator]),
                                  high=np.array([self._v_predator,
                                                 self._omega_predator]))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        width = 500
        height = int(width * self._max_y / self._max_x)
        self._viewer = Viewer(2*self._max_x, 2*self._max_y, width, height)

        super(PreyPredator, self).__init__(mdp_info)
class PreyPredator(Environment):
    """
    A prey-predator environment environment. A Predator must catch a faster prey
    in an environment with obstacles.
    """
    def __init__(self):
        self._rotation_radius = 0.6
        self._catch_radius = 0.4

        self._v_prey = 0.13
        self._v_predator = 0.1
        self._dt = 0.1

        self._omega_prey = self._v_prey / self._rotation_radius
        self._omega_predator = self._v_predator / self._rotation_radius

        self._max_x = 5.0
        self._max_y = 5.0

        self._obstacles = [
            (np.array([self._max_x/5,
                       self._max_y - 3.8*self._catch_radius]),
             np.array([self._max_x,
                       self._max_y - 3.8*self._catch_radius])),

            (np.array([-3/5*self._max_x,
                       self._max_y/4]),
             np.array([-3/5*self._max_x,
                       -3/10*self._max_y])),

            (np.array([-3/5*self._max_x + 3.8*self._catch_radius,
                       self._max_y / 4]),
             np.array([-3/5*self._max_x + 3.8*self._catch_radius,
                       -3/10*self._max_y])),

            (np.array([-3/5*self._max_x,
                       self._max_y/4]),
             np.array([-3/5*self._max_x + 3.8*self._catch_radius,
                       self._max_y/4]))
            ]

        # Add bounds of the map
        self._obstacles += [(np.array([-self._max_x, -self._max_y]),
                             np.array([-self._max_x, self._max_y])),

                            (np.array([-self._max_x, -self._max_y]),
                            np.array([self._max_x, -self._max_y])),

                            (np.array([self._max_x, self._max_y]),
                             np.array([-self._max_x, self._max_y])),

                            (np.array([self._max_x, self._max_y]),
                             np.array([self._max_x, -self._max_y]))
                            ]

        high = np.array([self._max_x, self._max_y, np.pi,
                         self._max_x, self._max_y, np.pi])


        # MDP properties
        horizon = 500
        gamma = 0.99

        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([0,
                                                -self._omega_predator]),
                                  high=np.array([self._v_predator,
                                                 self._omega_predator]))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        width = 500
        height = int(width * self._max_y / self._max_x)
        self._viewer = Viewer(2*self._max_x, 2*self._max_y, width, height)

        super(PreyPredator, self).__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            self._state = np.array([0., 0., 0.,
                                    self._max_x/2, self._max_y/2, np.pi/2])
            self._state = np.array([3., 1., np.pi/2,
                                    3., 2., np.pi/2])

            ok = False

            while not ok:
                self._state = np.random.uniform(
                    low=self.info.observation_space.low,
                    high=self.info.observation_space.high)

                delta_norm = np.linalg.norm(self._state[:2] - self._state[3:5])
                ok = delta_norm > self._catch_radius

        else:
            self._state = state
            self._state[2] = normalize_angle(self._state[2])
            self._state[5] = normalize_angle(self._state[5])

        return self._state

    def step(self, action):
        # compute new predator state
        u = self._bound(action,
                        self.info.action_space.low,
                        self.info.action_space.high)

        state_predator = self._state[:3]
        state_predator = self._differential_drive_dynamics(state_predator, u)

        # Compute new prey state
        u_prey = self._prey_controller(self._state)
        state_prey = self._state[3:]
        state_prey = self._differential_drive_dynamics(state_prey, u_prey)

        # Update state
        self._state = np.concatenate([state_predator, state_prey], 0)

        delta_norm_new = np.linalg.norm(self._state[:2]-self._state[3:5])

        if delta_norm_new < self._catch_radius:
            collision, _ = self._check_collision(self._state[:2],
                                                 self._state[3:5])
            if collision is None:
                absorbing = True
            else:
                absorbing = False
        else:
            absorbing = False
        reward = -delta_norm_new

        return self._state, reward, absorbing, {}

    def _prey_controller(self, state):
        delta_norm = np.linalg.norm(state[:2] - state[3:5])

        if delta_norm > 3.0:
            velocity_prey = 0
        elif delta_norm > 1.5:
            velocity_prey = self._v_prey / 2
        else:
            velocity_prey = self._v_prey

        attack_angle = normalize_angle(np.arctan2(state[4] - state[1],
                                                  state[3] - state[0]))

        angle_current = shortest_angular_distance(state[5], attack_angle)

        if velocity_prey > 0:
            # check attack angle collision
            cos_theta = np.cos(attack_angle)
            sin_theta = np.sin(attack_angle)
            increment = 2.5*self._rotation_radius*np.array([cos_theta,
                                                            sin_theta])

            collision, i = self._check_collision(state[3:5],
                                                 state[3:5]+increment)

            if collision is not None:
                obstacle = self._obstacles[i]

                v_obst = self._segment_to_vector(*obstacle)
                v_attack = state[3:5] - state[0:2]

                angle = self._vector_angle(v_obst, v_attack)

                if 0 <= angle <= np.pi/2 or angle <= -np.pi/2:
                    rotation_sign = +1
                else:
                    rotation_sign = -1

                evasion_angle = attack_angle + rotation_sign * np.pi/2
                angle_current = shortest_angular_distance(state[5],
                                                          evasion_angle)

                alpha = normalize_angle(state[5] + rotation_sign*np.pi/2)

                cos_alpha = np.cos(alpha)
                sin_alpha = np.sin(alpha)
                increment = 1.5*self._rotation_radius * np.array(
                    [cos_alpha, sin_alpha])

                lateral_collision, _ = self._check_collision(state[3:5],
                                                             state[3:5] +
                                                             increment)

                if lateral_collision is not None:

                    rotation_sign *= -1
                    angle_current = rotation_sign * np.pi

        omega_prey = angle_current / np.pi

        u_prey = np.empty(2)
        u_prey[0] = self._bound(velocity_prey, 0, self._v_prey)
        u_prey[1] = self._bound(omega_prey, -self._omega_prey, self._omega_prey)

        return u_prey

    @staticmethod
    def _cross_2d(vecr, vecs):
        return vecr[0] * vecs[1] - vecr[1] * vecs[0]

    @staticmethod
    def _segment_to_vector(*segment):
        a = segment[0]
        b = segment[1]

        if (b[0] > a[0]) or (b[0] == a[0] and b[1] > a[1]):
            return b - a
        else:
            return a - b

    @staticmethod
    def _vector_angle(x, y):
        angle = np.arctan2(x[1], x[0]) - np.arctan2(y[1], y[0])

        return normalize_angle(angle)

    def _check_collision(self, start, end):
        collision = None
        index = None

        min_u = np.inf

        for i, obstacle in enumerate(self._obstacles):
            r = obstacle[1] - obstacle[0]
            s = end - start
            den = self._cross_2d(vecr=r, vecs=s)

            if den != 0:
                t = self._cross_2d((start - obstacle[0]), s) / den
                u = self._cross_2d((start - obstacle[0]), r) / den

                if 1 >= u >= 0 and 1 >= t >= 0:

                    if u < min_u:
                        collision = start + (u-1e-2)*s
                        min_u = u
                        index = i

        return collision, index

    def _differential_drive_dynamics(self, state, u):
        delta = np.empty(3)

        delta[0] = np.cos(state[2]) * u[0]
        delta[1] = np.sin(state[2]) * u[0]
        delta[2] = u[1]

        new_state = state + delta

        collision, _ = self._check_collision(state[:2], new_state[:2])

        if collision is not None:
            new_state[:2] = collision

        new_state[0] = self._bound(new_state[0], -self._max_x, self._max_x)
        new_state[1] = self._bound(new_state[1], -self._max_y, self._max_y)
        new_state[2] = normalize_angle(new_state[2])

        return new_state

    def render(self, mode='human'):
        center = np.array([self._max_x, self._max_y])

        predator_pos = self._state[:2]
        predator_theta = self._state[2]

        prey_pos = self._state[3:5]
        prey_theta = self._state[5]


        # Predator
        self._viewer.circle(center + predator_pos, self._catch_radius,
                            (255, 255, 255))
        self._viewer.arrow_head(center + predator_pos, self._catch_radius,
                                predator_theta, (255, 0, 0))

        # Prey
        self._viewer.arrow_head(center + prey_pos, self._catch_radius,
                                prey_theta, (0, 0, 255))

        # Obstacles
        for obstacle in self._obstacles:
            start = obstacle[0]
            end = obstacle[1]
            self._viewer.line(center + start, center + end)

        self._viewer.display(self._dt)
예제 #21
0
class PuddleWorld(Environment):
    """
    Puddle world as presented in:
    "Off-Policy Actor-Critic". Degris T. et al.. 2012.

    """
    def __init__(self,
                 start=None,
                 goal=None,
                 goal_threshold=.1,
                 noise_step=.025,
                 noise_reward=0,
                 reward_goal=0.,
                 thrust=.05,
                 puddle_center=None,
                 puddle_width=None,
                 gamma=.99,
                 horizon=5000):
        """
        Constructor.

        Args:
            start (np.array, None): starting position of the agent;
            goal (np.array, None): goal position;
            goal_threshold (float, .1): distance threshold of the agent from the
                goal to consider it reached;
            noise_step (float, .025): noise in actions;
            noise_reward (float, 0): standard deviation of gaussian noise in reward;
            reward_goal (float, 0): reward obtained reaching goal state;
            thrust (float, .05): distance walked during each action;
            puddle_center (np.array, None): center of the puddle;
            puddle_width (np.array, None): width of the puddle;

        """
        # MDP parameters
        self._start = np.array([.2, .4]) if start is None else start
        self._goal = np.array([1., 1.]) if goal is None else goal
        self._goal_threshold = goal_threshold
        self._noise_step = noise_step
        self._noise_reward = noise_reward
        self._reward_goal = reward_goal
        self._thrust = thrust
        puddle_center = [[.3, .6], [.4, .5], [.8, .9]
                         ] if puddle_center is None else puddle_center
        self._puddle_center = [np.array(center) for center in puddle_center]
        puddle_width = [[.1, .03], [.03, .1], [.03, .1]
                        ] if puddle_width is None else puddle_width
        self._puddle_width = [np.array(width) for width in puddle_width]

        self._actions = [np.zeros(2) for _ in range(5)]
        for i in range(4):
            self._actions[i][i // 2] = thrust * (i % 2 * 2 - 1)

        # MDP properties
        action_space = Discrete(5)
        observation_space = Box(0., 1., shape=(2, ))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._pixels = None
        self._viewer = Viewer(1.0, 1.0)

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            self._state = self._start.copy()
        else:
            self._state = state

        return self._state

    def step(self, action):
        idx = action[0]
        self._state += self._actions[idx] + np.random.uniform(
            low=-self._noise_step, high=self._noise_step, size=(2, ))
        self._state = np.clip(self._state, 0., 1.)

        absorbing = np.linalg.norm(
            (self._state - self._goal), ord=1) < self._goal_threshold

        if not absorbing:
            reward = np.random.randn() * self._noise_reward + self._get_reward(
                self._state)
        else:
            reward = self._reward_goal

        return self._state, reward, absorbing, {}

    def render(self):
        if self._pixels is None:
            img_size = 100
            pixels = np.zeros((img_size, img_size, 3))
            for i in range(img_size):
                for j in range(img_size):
                    x = i / img_size
                    y = j / img_size
                    pixels[i, img_size - 1 - j] = self._get_reward(
                        np.array([x, y]))

            pixels -= pixels.min()
            pixels *= 255. / pixels.max()
            self._pixels = np.floor(255 - pixels)

        self._viewer.background_image(self._pixels)
        self._viewer.circle(self._state, 0.01, color=(0, 255, 0))

        goal_area = [[-self._goal_threshold, 0], [0, self._goal_threshold],
                     [self._goal_threshold, 0], [0, -self._goal_threshold]]
        self._viewer.polygon(self._goal,
                             0,
                             goal_area,
                             color=(255, 0, 0),
                             width=1)

        self._viewer.display(0.1)

    def stop(self):
        if self._viewer is not None:
            self._viewer.close()

    def _get_reward(self, state):
        reward = -1.
        for cen, wid in zip(self._puddle_center, self._puddle_width):
            reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * norm.pdf(
                state[1], cen[1], wid[1])

        return reward
예제 #22
0
class InvertedPendulumDiscrete(Environment):
    """
    The Inverted Pendulum environment as presented in:
    "Least-Squares Policy Iteration". Lagoudakis M. G. and Parr R.. 2003.

    """
    def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10.,
                 horizon=3000):
        """
        Constructor.

        Args:
            m (float, 2.0): mass of the pendulum;
            M (float, 8.0): mass of the cart;
            l (float, .5): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 50.): maximum allowed input torque;
            noise_u (float, 10.): maximum noise on the action;
            horizon (int, 3000): horizon of the problem.

        """
        # MDP parameters
        self._m = m
        self._M = M
        self._l = l
        self._g = g
        self._alpha = 1 / (self._m + self._M)
        self._mu = mu
        self._dt = .1
        self._max_u = max_u
        self._noise_u = noise_u
        high = np.array([np.inf, np.inf])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        gamma = .95
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            angle = np.random.uniform(-np.pi / 8., np.pi / 8.)

            self._state = np.array([angle, 0.])
        else:
            self._state = state
            self._state[0] = normalize_angle(self._state[0])

        return self._state

    def step(self, action):
        if action == 0:
            u = -self._max_u
        elif action == 1:
            u = 0.
        else:
            u = self._max_u
        u += np.random.uniform(-self._noise_u, self._noise_u)
        new_state = odeint(self._dynamics, self._state, [0, self._dt],
                           (u,))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])

        if np.abs(self._state[0]) > np.pi * .5:
            reward = -1.
            absorbing = True
        else:
            reward = 0.
            absorbing = False

        self._last_u = u

        return self._state, reward, absorbing, {}

    def render(self, mode='human'):
        start = 1.25 * self._l * np.ones(2)
        end = 1.25 * self._l * np.ones(2)

        end[0] += self._l * np.sin(self._state[0])
        end[1] += self._l * np.cos(self._state[0])

        self._viewer.line(start, end)
        self._viewer.circle(start, self._l / 40)
        self._viewer.circle(end, self._l / 20)
        self._viewer.torque_arrow(start, -self._last_u, self._max_u,
                                  self._l / 5)

        self._viewer.display(self._dt)

    def stop(self):
        self._viewer.close()

    def _dynamics(self, state, t, u):
        theta = state[0]
        omega = state[1]

        d_theta = omega
        d_omega = (self._g * np.sin(theta) - self._alpha * self._m * self._l *
                   d_theta ** 2 * np.sin(2 * theta) * .5 - self._alpha * np.cos(
                    theta) * u) / (4 / 3 * self._l - self._alpha * self._m *
                                   self._l * np.cos(theta) ** 2)

        return d_theta, d_omega
예제 #23
0
class InvertedPendulum(Environment):
    """
    The Inverted Pendulum environment (continuous version) as presented in:
    "Reinforcement Learning In Continuous Time and Space". Doya K.. 2000.
    "Off-Policy Actor-Critic". Degris T. et al.. 2012.
    "Deterministic Policy Gradient Algorithms". Silver D. et al. 2014.

    """
    def __init__(self, random_start=False, m=1.0, l=1.0, g=9.8, mu=1e-2,
                 max_u=5.0, horizon=5000):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random position
                or from the horizontal one;
            m (float, 1.0): mass of the pendulum;
            l (float, 1.0): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 5.0): maximum allowed input torque;
            horizon (int, 5000): horizon of the problem.

        """
        # MDP parameters
        self._m = m
        self._l = l
        self._g = g
        self._mu = mu
        self._random = random_start
        self._dt = 0.01
        self._max_u = max_u
        self._max_omega = 5 / 2 * np.pi
        high = np.array([np.pi, self._max_omega])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-max_u]),
                                  high=np.array([max_u]))
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            if self._random:
                angle = np.random.uniform(-np.pi, np.pi)
            else:
                angle = np.pi / 2

            self._state = np.array([angle, 0.])
        else:
            self._state = state
            self._state[0] = normalize_angle(self._state[0])
            self._state[1] = self._bound(self._state[1], -self._max_omega,
                                         self._max_omega)

        return self._state

    def step(self, action):
        u = self._bound(action[0], -self._max_u, self._max_u)
        new_state = odeint(self._dynamics, self._state, [0, self._dt],
                           (u,))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])
        self._state[1] = self._bound(self._state[1], -self._max_omega,
                                     self._max_omega)

        reward = np.cos(self._state[0])

        self._last_u = u

        return self._state, reward, False, {}

    def render(self, mode='human'):
        start = 1.25 * self._l * np.ones(2)
        end = 1.25 * self._l * np.ones(2)

        end[0] += self._l * np.sin(self._state[0])
        end[1] += self._l * np.cos(self._state[0])

        self._viewer.line(start, end)
        self._viewer.circle(start, self._l / 40)
        self._viewer.circle(end, self._l / 20)
        self._viewer.torque_arrow(start, -self._last_u, self._max_u,
                                  self._l / 5)

        self._viewer.display(self._dt)

    def stop(self):
        self._viewer.close()

    def _dynamics(self, state, t, u):
        theta = state[0]
        omega = self._bound(state[1], -self._max_omega, self._max_omega)

        d_theta = omega
        d_omega = (-self._mu * omega + self._m * self._g * self._l * np.sin(
            theta) + u) / (self._m * self._l**2)

        return d_theta, d_omega
예제 #24
0
class ShipSteeringMultiGate(Environment):
    """
    The Ship Steering environment as presented in:
    "Hierarchical Policy Gradient Algorithms". Ghavamzadeh M. and Mahadevan S..
    2013 with multiple gates.

    """
    def __init__(self, n_steps_action=3, viz_speed=100, small=False):

        self.__name__ = 'ShipSteeringMultiGate'
        self.n_steps_action = n_steps_action
        self.viz_speed = viz_speed

        # MDP parameters
        self.no_of_gates = 4

        self.small = small

        self.field_size = 500 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12., 0])
        high = np.array([
            self.field_size, self.field_size, np.pi, np.pi / 12.,
            self.no_of_gates
        ])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2

        gate_1s = np.array([75, 175]) if small else np.array([150, 350])
        gate_1e = np.array([125, 175]) if small else np.array([250, 350])

        gate_1 = np.array([gate_1s, gate_1e])

        gate_2s = np.array([150, 300]) if small else np.array([300, 600])
        gate_2e = np.array([200, 300]) if small else np.array([400, 600])

        gate_2 = np.array([gate_2s, gate_2e])

        gate_3s = np.array([250, 350]) if small else np.array([500, 700])
        gate_3e = np.array([300, 350]) if small else np.array([600, 700])

        gate_3 = np.array([gate_3s, gate_3e])

        gate_4s = np.array([150, 425]) if small else np.array([300, 850])
        gate_4e = np.array([200, 425]) if small else np.array([400, 850])

        gate_4 = np.array([gate_4s, gate_4e])

        self._gate_list = gate_1, gate_2, gate_3, gate_4

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        self._out_reward = -10000
        self.correct_order = False

        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size,
                              self.field_size,
                              background=(66, 131, 237))

        super(ShipSteeringMultiGate, self).__init__(mdp_info)

    def reset(self, state=None):

        if state is None:
            self._state = np.zeros(8)
            #self._state[0] = 500
            #self._state[1] = 500
        else:
            self._state = state

        return self._state

    def step(self, action):

        r = np.maximum(-self.omega_max, np.minimum(self.omega_max, action[0]))
        new_state = self._state

        for _ in range(self.n_steps_action):

            state = new_state

            new_state = np.empty(8)

            new_state[0] = state[0] + self._v * np.cos(state[2]) * self._dt
            new_state[1] = state[1] + self._v * np.sin(state[2]) * self._dt
            new_state[2] = normalize_angle(state[2] + state[3] * self._dt)
            new_state[3] = state[3] + (r - state[3]) * self._dt / self._T
            new_state[4:] = state[4:]
            absorbing = False

            reward = 0

            if new_state[0] > self.field_size \
               or new_state[1] > self.field_size \
               or new_state[0] < 0 or new_state[1] < 0:
                reward = self._out_reward
                absorbing = True
                break
            else:
                for i, gate in enumerate(self._gate_list):
                    if self._through_gate(state[:2], new_state[:2], gate):
                        new_state[4 + i] += 1
                        if new_state[4 + i] == 1:
                            reward = 10

            if np.all(new_state[5:] > 0):
                absorbing = True
                break

        self._state = new_state

        return self._state, reward, absorbing, {}

    def _through_gate(self, start, end, gate):
        gate_e = gate[1]
        gate_s = gate[0]

        r = gate_e - gate_s
        s = end - start
        den = self._cross_2d(vecr=r, vecs=s)

        if den == 0:
            return False

        t = self._cross_2d((start - gate_s), s) / den
        u = self._cross_2d((start - gate_s), r) / den

        return 1 >= u >= 0 and 1 >= t >= 0

    @staticmethod
    def _cross_2d(vecr, vecs):
        return vecr[0] * vecs[1] - vecr[1] * vecs[0]

    def render(self, mode='human'):

        for gate in self._gate_list:
            gate_s = gate[0]
            gate_e = gate[1]
            self._viewer.line(gate_s, gate_e, width=3)

        boat = [[-4, -4], [-4, 4], [4, 4], [8, 0.0], [4, -4]]
        self._viewer.polygon(self._state[:2],
                             self._state[2],
                             boat,
                             color=(250, 55, 54))

        self._viewer.display(self._dt / self.viz_speed)