Example #1
    def step(self, action):
        """See `AcrobotEnv.step()'."""
        s = self.state
        torque = self.clip(action, -self.max_torque, self.max_torque)

        # Add noise to the force action
        if self.torque_noise_max > 0:
            torque += self.rand(-self.torque_noise_max, self.torque_noise_max)

        # Now, augment the state with our force action so it can be passed to
        # _dsdt
        s_augmented = self.cat((s, torque), -1)

        ns = rk4(self._dsdt, s_augmented, [0, self.dt])
        # only care about final timestep of integration returned by integrator
        ns = ns[-1]
        ns = ns[..., :4]  # omit action

        ns[..., 0] = angle_normalize(ns[..., 0])
        ns[..., 1] = angle_normalize(ns[..., 1])
        ns[..., 2] = self.clip(ns[..., 2], -self.MAX_VEL_1, self.MAX_VEL_1)
        ns[..., 3] = self.clip(ns[..., 3], -self.MAX_VEL_2, self.MAX_VEL_2)
        self.state = ns
        terminal = self._terminal()

        reward = -self.bk.ones(terminal.shape) * (~terminal)
        return self._get_ob(), reward, terminal, {}
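
Example #1 integrates the augmented state with an rk4 helper that is not shown in the snippet. A minimal NumPy sketch of a fixed-step fourth-order Runge-Kutta integrator with the same calling convention (the derivative function takes only the state, and the last row of the result is the state at the final time) could look like this; the exact helper used by the environment may differ.

import numpy as np

def rk4(derivs, y0, t):
    """Fixed-step 4th-order Runge-Kutta over the time points in t.

    derivs(y) must return dy/dt; the result holds one row per time
    point, so result[-1] is the state after the final step.
    """
    yout = np.zeros((len(t),) + np.shape(y0))
    yout[0] = y0
    for i in range(len(t) - 1):
        dt = t[i + 1] - t[i]
        y = yout[i]
        k1 = np.asarray(derivs(y))
        k2 = np.asarray(derivs(y + dt / 2.0 * k1))
        k3 = np.asarray(derivs(y + dt / 2.0 * k2))
        k4 = np.asarray(derivs(y + dt * k3))
        yout[i + 1] = y + dt / 6.0 * (k1 + 2.0 * k2 + 2.0 * k3 + k4)
    return yout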
Example #2
    def step(self, action):
        """See `PendulumEnv.step()'."""
        g = self.g
        m = self.m
        length = self.l
        inertia = m * length**2
        bk = self.bk
        dt = self.dt
        theta, theta_dot = self.state[..., 0], self.state[..., 1]

        u = self.clip(action, -self.max_torque, self.max_torque)[..., 0]

        if not u.shape:
            self.last_u = u  # for rendering
        costs = angle_normalize(theta)**2 + 0.1 * theta_dot**2 + 0.001 * (u**2)

        theta_d_dot = -3 * g / (2 * length) * bk.sin(theta +
                                                     np.pi) + 3.0 / inertia * u
        new_theta_dot = theta_dot + theta_d_dot * dt
        new_theta = theta + new_theta_dot * dt
        new_theta_dot = self.clip(new_theta_dot, -self.max_speed,
                                  self.max_speed)

        self.state = self.bk.stack((new_theta, new_theta_dot), -1)

        done = bk.zeros_like(costs, dtype=bk.bool)
        return self._get_obs(), -costs, done, {}
Example #3
    def reward_sparse(self, th, thdot, u):
        angle = np.degrees(angle_normalize(th))
        done = False
        reward = 0

        if self.check_angle_speed_limit(angle, thdot):
            reward = self.max_speed - (np.absolute(thdot) / 6.0)

        return reward, done
Example #4
    def reward_binary(self, th, thdot, u):
        angle = np.degrees(angle_normalize(th))
        done = False
        reward = 0

        if not self.check_angle_speed_limit(angle, thdot):
            SparsePendulumRBFEnv.balance_counter = 0
            return reward, done

        SparsePendulumRBFEnv.balance_counter += 1

        if SparsePendulumRBFEnv.balance_counter >= self.balance_counter:
            reward = 1

        if not self.cumulative:
            done = True

        return reward, done
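
Examples #3 and #4 depend on a check_angle_speed_limit helper that is not part of the snippets, so its exact thresholds are unknown. A purely hypothetical sketch of such a check, with angle_limit (degrees) and speed_limit as assumed attribute names, might be:

    def check_angle_speed_limit(self, angle, thdot):
        # Hypothetical: angle_limit and speed_limit are assumed attribute
        # names, not taken from the original environment. The check passes
        # when the pole is near upright and the velocity is within bounds.
        return abs(angle) <= self.angle_limit and abs(thdot) <= self.speed_limit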
Example #5
    def step(self, u):
        th, thdot = self.state  # th := theta
        g = self.g
        m = 1.
        l = 1.
        dt = self.dt

        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u  # for rendering
        costs = angle_normalize(th)**2 + .1 * thdot**2 + .001 * (u**2)

        newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. /
                            (m * l**2) * u) * dt
        newth = th + newthdot * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)  #pylint: disable=E1111

        self.state = np.array([newth, newthdot])
        return self._get_obs(), -costs, False, {}
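
Every snippet in this section calls angle_normalize without defining it. The helper used by the classic-control environments wraps an angle into [-pi, pi) and is essentially:

import numpy as np

def angle_normalize(x):
    # Wrap an angle (radians) into the interval [-pi, pi).
    return ((x + np.pi) % (2 * np.pi)) - np.pi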
Example #6
    def step(self, action):
        # get state before executing action for feature calculation:
        th, thdot = self.realenv.state

        # execute action
        next_state, reward, terminated, info = self.env.step(action)

        # obtain features used for reward function:
        action = np.clip(action, -self.realenv.max_torque,
                         self.realenv.max_torque)[0]
        info['features'] = np.array(
            [angle_normalize(th)**2, thdot**2, action**2])

        assert np.abs(
            reward -
            np.dot(info['features'], np.array([-1, -.1, -.001]))) < 1e-10

        return next_state, reward, terminated, info
Example #7
    def simulate(self, u, env):
        from gym.envs.classic_control.pendulum import angle_normalize
        th, thdot = env.state  # th := theta
        g = 10.
        m = 1.0
        l = 1.
        dt = env.dt
        env.scale_action = 100.0
        u = np.clip(u, -env.max_torque, env.max_torque)[0]

        env.last_u = u  # for rendering
        costs = angle_normalize(th)**2 + .1 * thdot**2 + .001 * (u**2)

        newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. /
                            (m * l**2) * u) * dt
        newth = th + newthdot * dt
        newthdot = np.clip(newthdot, -env.max_speed, env.max_speed)  #pylint: disable=E1111

        env.state = np.array([newth, newthdot])
        return env._get_obs(), -costs, False, {}
Example #8
    def _step_reward(self, observation: tf.Tensor, action: tf.Tensor,
                     next_observation: tf.Tensor) -> tf.Tensor:
        # observation is [cos(theta), sin(theta), theta_dot]
        angles = tf.math.atan2(observation[:, 1], observation[:, 0])
        angles = angle_normalize(angles)
        return -(angles**2 + 0.1 * observation[:, 2]**2 + 0.001 * action**2)[0]
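
Example #8 computes the reward from the observation rather than the raw state: the Pendulum observation is [cos(theta), sin(theta), theta_dot], so atan2 of the second and first components recovers the normalized angle. A small check of that identity:

import numpy as np

theta = 2.3
obs = np.array([np.cos(theta), np.sin(theta), 0.0])
recovered = np.arctan2(obs[1], obs[0])  # equal to angle_normalize(theta) for this value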
Example #9
    def features(self, current_state, action, next_state):
        # (theta, theta_dot) of the wrapped PendulumEnv before the step
        th, thdot = utils.utils.unwrap_env(self.env.env, PendulumEnv).state
        action = np.clip(action, -self.env.env.max_torque,
                         self.env.env.max_torque)[0]
        return np.array([angle_normalize(th)**2, thdot**2, action**2])
Example #10
    def state_non_sparse_reward(theta, omega):
        """Dense (non-sparse) reward based on angle and angular velocity."""
        theta = angle_normalize(theta)
        return -(theta**2 + 0.1 * omega**2)