def step(self, action): """See `AcrobotEnv.step()'.""" s = self.state torque = self.clip(action, -self.max_torque, self.max_torque) # Add noise to the force action if self.torque_noise_max > 0: torque += self.rand(-self.torque_noise_max, self.torque_noise_max) # Now, augment the state with our force action so it can be passed to # _dsdt s_augmented = self.cat((s, torque), -1) ns = rk4(self._dsdt, s_augmented, [0, self.dt]) # only care about final timestep of integration returned by integrator ns = ns[-1] ns = ns[..., :4] # omit action ns[..., 0] = angle_normalize(ns[..., 0]) ns[..., 1] = angle_normalize(ns[..., 1]) ns[..., 2] = self.clip(ns[..., 2], -self.MAX_VEL_1, self.MAX_VEL_1) ns[..., 3] = self.clip(ns[..., 3], -self.MAX_VEL_2, self.MAX_VEL_2) self.state = ns terminal = self._terminal() reward = -self.bk.ones(terminal.shape) * (~terminal) return self._get_ob(), reward, terminal, {}
def step(self, action): """See `PendulumEnv.step()'.""" g = self.g m = self.m length = self.l inertia = m * length**2 bk = self.bk dt = self.dt theta, theta_dot = self.state[..., 0], self.state[..., 1] u = self.clip(action, -self.max_torque, self.max_torque)[..., 0] if not u.shape: self.last_u = u # for rendering costs = angle_normalize(theta)**2 + 0.1 * theta_dot**2 + 0.001 * (u**2) theta_d_dot = -3 * g / (2 * length) * bk.sin(theta + np.pi) + 3.0 / inertia * u new_theta_dot = theta_dot + theta_d_dot * dt new_theta = theta + new_theta_dot * dt new_theta_dot = self.clip(new_theta_dot, -self.max_speed, self.max_speed) self.state = self.bk.stack((new_theta, new_theta_dot), -1) done = bk.zeros_like(costs, dtype=bk.bool) return self._get_obs(), -costs, done, {}
def reward_sparse(self, th, thdot, u):
    angle = np.degrees(angle_normalize(th))
    done = False
    reward = 0
    # Reward only while the pendulum is inside the angle/speed limits,
    # and more for slower rotation.
    if self.check_angle_speed_limit(angle, thdot):
        reward = self.max_speed - (np.absolute(thdot) / 6.0)
    return reward, done
def reward_binary(self, th, thdot, u):
    angle = np.degrees(angle_normalize(th))
    done = False
    reward = 0

    # Reset the balance counter as soon as the pendulum leaves the target region.
    if not self.check_angle_speed_limit(angle, thdot):
        SparsePendulumRBFEnv.balance_counter = 0
        return reward, done

    # Pay out a reward of 1 only after the pendulum has stayed balanced for
    # `self.balance_counter` consecutive steps.
    SparsePendulumRBFEnv.balance_counter += 1
    if SparsePendulumRBFEnv.balance_counter >= self.balance_counter:
        reward = 1
        if not self.cumulative:
            done = True
    return reward, done
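# Both sparse rewards above call `check_angle_speed_limit`, whose definition
# is not part of this excerpt. A plausible sketch, assuming it simply
# thresholds the absolute angle (in degrees) and angular speed against env
# attributes; `angle_limit` and `speed_limit` are guessed names, not the
# original API:
def check_angle_speed_limit(self, angle, thdot):
    """Return True while the pendulum is near upright and rotating slowly."""
    return abs(angle) <= self.angle_limit and abs(thdot) <= self.speed_limit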
def step(self, u):
    th, thdot = self.state  # th := theta

    g = self.g
    m = 1.
    l = 1.
    dt = self.dt

    u = np.clip(u, -self.max_torque, self.max_torque)[0]
    self.last_u = u  # for rendering
    costs = angle_normalize(th)**2 + .1 * thdot**2 + .001 * (u**2)

    newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l**2) * u) * dt
    newth = th + newthdot * dt
    newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)  # pylint: disable=E1111

    self.state = np.array([newth, newthdot])
    return self._get_obs(), -costs, False, {}
def step(self, action):
    # get state before executing action for feature calculation:
    th, thdot = self.realenv.state

    # execute action
    next_state, reward, terminated, info = self.env.step(action)

    # obtain features used for reward function:
    action = np.clip(action, -self.realenv.max_torque, self.realenv.max_torque)[0]
    info['features'] = np.array(
        [angle_normalize(th)**2, thdot**2, action**2])

    # sanity check: the env reward must equal the linear combination of the features
    assert np.abs(
        reward - np.dot(info['features'], np.array([-1, -.1, -.001]))) < 1e-10
    return next_state, reward, terminated, info
def simulate(self, u, env):
    from gym.envs.classic_control.pendulum import angle_normalize

    th, thdot = env.state  # th := theta

    g = 10.
    m = 1.0
    l = 1.
    dt = env.dt

    env.scale_action = 100.0
    u = np.clip(u, -env.max_torque, env.max_torque)[0]
    env.last_u = u  # for rendering

    costs = angle_normalize(th)**2 + .1 * thdot**2 + .001 * (u**2)

    newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l**2) * u) * dt
    newth = th + newthdot * dt
    newthdot = np.clip(newthdot, -env.max_speed, env.max_speed)  # pylint: disable=E1111

    env.state = np.array([newth, newthdot])
    return env._get_obs(), -costs, False, {}
def _step_reward(self, observation: tf.Tensor, action: tf.Tensor,
                 next_observation: tf.Tensor) -> tf.Tensor:
    # observation uses the pendulum layout [cos(theta), sin(theta), theta_dot]
    angles = tf.math.atan2(observation[:, 1], observation[:, 0])
    angles = angle_normalize(angles)
    return -(angles**2 + 0.1 * observation[:, 2]**2 + 0.001 * action**2)[0]
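# Illustration only (not part of the original module): the TensorFlow reward
# above expects batched pendulum observations laid out as
# [cos(theta), sin(theta), theta_dot] and reproduces the usual quadratic cost.
import numpy as np
import tensorflow as tf

theta, theta_dot, torque = 0.1, 0.5, 0.0
obs = tf.constant([[np.cos(theta), np.sin(theta), theta_dot]], dtype=tf.float32)
act = tf.constant([torque], dtype=tf.float32)

# Scalar value _step_reward returns for this batch element:
expected = -(theta**2 + 0.1 * theta_dot**2 + 0.001 * torque**2)
# reward = model._step_reward(obs, act, obs)   # `model` is whatever class defines it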
def features(self, current_state, action, next_state):
    th, thdot = utils.utils.unwrap_env(self.env.env, PendulumEnv).state
    action = np.clip(action, -self.env.env.max_torque,
                     self.env.env.max_torque)[0]
    return np.array([angle_normalize(th)**2, thdot**2, action**2])
def state_non_sparse_reward(theta, omega):
    """Get non-sparse reward."""
    theta = angle_normalize(theta)
    return -(theta**2 + 0.1 * omega**2)
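# All of the snippets above rely on `angle_normalize`; in gym's classic-control
# pendulum module it wraps an angle (in radians) into [-pi, pi):
import numpy as np

def angle_normalize(x):
    return ((x + np.pi) % (2 * np.pi)) - np.pi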