def compute_reward(self, achieved_goal, goal, info):
    # Penalise large actuation commands.
    reward_ctrl = -0.05 * np.square(self.action).sum()
    # Dense shaping: move the palm towards the end location.
    dist_to_end_location = np.linalg.norm(
        self.sim.data.get_site_xpos('gripperpalm') - self.end_location)
    reward_dist = tolerance(dist_to_end_location, margin=0.8, bounds=(0., 0.02),
                            sigmoid='linear', value_at_margin=0.)
    reward = 0.25 * reward_dist

    if self.sim.data.get_site_xpos('tar')[2] < 0.1:
        # if z < 0.1, then the target dropped out; restart it
        self._restart_target()

    sparse_reward = 0.
    dist = np.linalg.norm(
        self.sim.data.get_site_xpos('gripperpalm') -
        self.sim.data.get_site_xpos('tar'))
    if dist < 0.05:
        # Successful catch: bonus reward, then respawn the target.
        reward += 20.
        sparse_reward += 10.
        self._restart_target()

    reward += reward_ctrl
    info = dict(scoring_reward=sparse_reward)
    return reward, False, info
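# Each listing in this section shapes its distance term with `tolerance`. The helper
# below, `tolerance_linear`, is a hypothetical name for a minimal sketch of the linear
# case only; the actual function is assumed to follow a dm_control-style
# rewards.tolerance interface, and this is illustrative rather than the exact
# implementation used here.
import numpy as np

def tolerance_linear(x, bounds=(0., 0.02), margin=0.8, value_at_margin=0.):
    """Return 1.0 inside `bounds`; decay linearly towards `value_at_margin` over `margin`."""
    lower, upper = bounds
    if lower <= x <= upper:
        return 1.0
    # Distance outside the bounds, normalised by the margin.
    d = (lower - x if x < lower else x - upper) / margin
    if d >= 1.0:
        return value_at_margin
    # Linear interpolation from 1.0 at the bound edge to value_at_margin at the margin.
    return 1.0 + (value_at_margin - 1.0) * d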
def compute_reward(self, achieved_goal, goal, info):
    # Control penalty, optionally stronger to discourage fast motions.
    if not self.high_motion_penalty:
        reward_ctrl = -0.05 * np.square(self.action).sum()
    else:
        reward_ctrl = -0.075 * np.square(self.action).sum()

    # Dense shaping: keep the gripper above the target in the x-y plane.
    dist = np.linalg.norm(self.sim.data.get_site_xpos('robot0:grip')[:2] -
                          self.sim.data.get_site_xpos('tar')[:2])
    reward_dist = tolerance(dist, margin=0.5, bounds=(0., 0.02),
                            sigmoid='linear', value_at_margin=0.)
    reward = 0.2 * reward_dist + reward_ctrl

    done = False
    if self.sim.data.get_site_xpos('tar')[2] < 0.4:
        # Target fell below the threshold height: terminate with a penalty.
        done = True
        reward = -1.

    sparse_reward = 0.
    if self.give_reflection_reward:
        # One-off bonus granted when the target is successfully reflected.
        sparse_reward = 1.
        self.give_reflection_reward = False
    reward += 0.2 * sparse_reward

    info = dict(scoring_reward=sparse_reward)
    return reward, done, info
def step(self, a):
    if self.no_movement:
        a = np.zeros([2], np.float32)
    self.n_total_steps += 1

    dist = np.linalg.norm(self.get_body_com("fingertip") -
                          self.get_body_com("target"))
    reward_dist = tolerance(dist, margin=0.3, bounds=(0., 0.009),
                            sigmoid='cosine', value_at_margin=0.)
    reward_ctrl = -0.1 * np.square(a).sum()
    reward = 0.25 * (reward_dist + reward_ctrl)
    self.do_simulation(a, self.frame_skip)

    # target wall reflection: bounce the target off the side walls
    if self.sim.data.qpos[2] < 0.028 or 0.207 < self.sim.data.qpos[2]:
        self.sim.data.qvel[2] = -self.sim.data.qvel[2]

    def restart():
        # Respawn the target at the far wall with a random angle and speed.
        initial_target_position = self.np_random.uniform(low=0.035, high=.185, size=2)  # low=0.05, high=.2, size=2
        initial_target_position[1] = .25
        self.sim.data.qpos[-2:] = initial_target_position
        target_angle = self.np_random.uniform(low=1.0833 * np.pi, high=1.9167 * np.pi)
        target_velocity = self.target_velocity + \
            self.np_random.uniform(low=-self.target_velocity_delta,
                                   high=self.target_velocity_delta)
        self.sim.data.qvel[2] = target_velocity * np.cos(target_angle)
        self.sim.data.qvel[3] = target_velocity * np.sin(target_angle)
        self.previous_target_position = copy(initial_target_position)

    if self.sim.data.qpos[3] < -0.25:  # failed catch (target gone)
        restart()

    sparse_reward = 0.
    if dist < 0.011:
        # Successful catch: bonus reward, then respawn the target.
        reward += 2
        sparse_reward += 1.
        restart()

    ob = self._get_obs()
    done = False
    return ob, reward, done, dict(scoring_reward=sparse_reward)
def step(self, a):
    if self.no_movement:
        a = np.zeros([2], np.float32)

    # Periodically re-randomise the visual appearance of the scene.
    if self.random_4_background and \
            self.random_frequency != -1 and \
            not self.n_total_steps % self.random_frequency:
        self._randomize_env()
    self.n_total_steps += 1

    dist = np.linalg.norm(
        self.get_body_com("fingertip") - self.get_body_com("target"))
    reward_dist = tolerance(dist, margin=0.3, bounds=(0., 0.009),
                            sigmoid='cosine', value_at_margin=0.)
    reward_ctrl = -0.1 * np.square(a).sum()
    reward = reward_dist + reward_ctrl

    sparse_reward = 0.
    if dist < 0.02:
        sparse_reward = 1.
    reward += sparse_reward
    self.do_simulation(a, self.frame_skip)

    # target wall reflection: bounce the target off the arena walls
    if np.abs(self.sim.data.qpos[2]) > 0.205:
        self.sim.data.qvel[2] = -self.sim.data.qvel[2]
    if np.abs(self.sim.data.qpos[3]) > 0.205:
        self.sim.data.qvel[3] = -self.sim.data.qvel[3]

    ob = self._get_obs()
    done = False
    return ob, reward, done, dict(scoring_reward=sparse_reward)
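# A minimal, hedged rollout sketch showing how the dense `reward` and the sparse
# `info['scoring_reward']` returned by `step` would typically be consumed during
# evaluation. `make_env` is a hypothetical factory for one of the environments above
# (constructor names are not given here), and a gym-style interface is assumed.
import numpy as np

def evaluate(make_env, n_episodes=10, horizon=200, seed=0):
    env = make_env()
    rng = np.random.default_rng(seed)
    scores = []
    for _ in range(n_episodes):
        env.reset()
        score = 0.
        for _ in range(horizon):
            a = rng.uniform(-1., 1., size=env.action_space.shape)  # random policy placeholder
            _, reward, done, info = env.step(a)
            score += info['scoring_reward']  # count successful catches / reflections
            if done:
                break
        scores.append(score)
    return np.mean(scores)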