    def compute_reward(self, achieved_goal, goal, info):
        reward_ctrl = -0.05 * np.square(self.action).sum()

        dist_to_end_location = np.linalg.norm(
            self.sim.data.get_site_xpos('gripperpalm') - self.end_location)
        reward_dist = tolerance(dist_to_end_location,
                                margin=0.8,
                                bounds=(0., 0.02),
                                sigmoid='linear',
                                value_at_margin=0.)

        reward = 0.25 * reward_dist

        # target dropped below z = 0.1: respawn it
        if self.sim.data.get_site_xpos('tar')[2] < 0.1:
            self._restart_target()

        sparse_reward = 0.
        dist = np.linalg.norm(
            self.sim.data.get_site_xpos('gripperpalm') -
            self.sim.data.get_site_xpos('tar'))
        if dist < 0.05:
            reward += 20.
            sparse_reward += 10.
            self._restart_target()

        reward += reward_ctrl

        info = dict(scoring_reward=sparse_reward)

        return reward, False, info
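
These snippets assume import numpy as np (Example #3 also uses copy from the standard library) and a shaping helper tolerance whose signature matches dm_control.utils.rewards.tolerance. As a point of reference, here is a minimal sketch of such a helper, simplified to the linear and cosine sigmoids and the value_at_margin=0. setting used in these snippets; it is an assumption, not the snippets' actual implementation:

import numpy as np

def tolerance(x, bounds=(0.0, 0.0), margin=0.0,
              sigmoid='linear', value_at_margin=0.0):
    """Return 1.0 inside bounds, decaying to value_at_margin at a
    distance of margin outside them, and 0.0 beyond that."""
    lower, upper = bounds
    if lower <= x <= upper:
        return 1.0
    if margin == 0.0:
        return 0.0
    # distance outside the bounds, normalized by the margin
    d = ((lower - x) if x < lower else (x - upper)) / margin
    if sigmoid == 'linear':
        return max(1.0 - (1.0 - value_at_margin) * d, 0.0)
    if sigmoid == 'cosine':
        # simplified: exact only for value_at_margin == 0., as used here
        return (1.0 + np.cos(np.pi * d)) / 2.0 if d < 1.0 else 0.0
    raise ValueError(f"unsupported sigmoid: {sigmoid!r}")
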
Example #2
    def compute_reward(self, achieved_goal, goal, info):
        if not self.high_motion_penalty:
            reward_ctrl = -0.05 * np.square(self.action).sum()
        else:
            reward_ctrl = -0.075 * np.square(self.action).sum()

        dist = np.linalg.norm(self.sim.data.get_site_xpos('robot0:grip')[:2] -
                              self.sim.data.get_site_xpos('tar')[:2])
        reward_dist = tolerance(dist, margin=0.5, bounds=(0., 0.02),
                                sigmoid='linear',
                                value_at_margin=0.)

        reward = 0.2 * reward_dist + reward_ctrl

        done = False
        if self.sim.data.get_site_xpos('tar')[2] < 0.4:
            done = True
            reward = -1.

        sparse_reward = 0.
        if self.give_reflection_reward:
            sparse_reward = 1.
            self.give_reflection_reward = False

        reward += 0.2 * sparse_reward

        info = dict(scoring_reward=sparse_reward)

        return reward, done, info
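
As a quick sanity check of the shaping in Example #2 (using the tolerance sketch above), the dense term is worth 0.2 inside the 0.02 bound and fades linearly to zero once the gripper-to-target distance reaches 0.02 + 0.5 = 0.52:

for d in (0.0, 0.02, 0.1, 0.3, 0.52):
    print(d, round(0.2 * tolerance(d, bounds=(0., 0.02), margin=0.5,
                                   sigmoid='linear', value_at_margin=0.), 3))
# prints: 0.0 0.2 / 0.02 0.2 / 0.1 0.168 / 0.3 0.088 / 0.52 0.0
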
Example #3
    def step(self, a):
        if self.no_movement:
            a = np.zeros([2], np.float32)

        self.n_total_steps += 1

        dist = np.linalg.norm(self.get_body_com("fingertip") - self.get_body_com("target"))
        reward_dist = tolerance(dist, margin=0.3, bounds=(0., 0.009),
                                sigmoid='cosine',
                                value_at_margin=0.)
        reward_ctrl = -0.1 * np.square(a).sum()
        reward = 0.25 * (reward_dist + reward_ctrl)

        self.do_simulation(a, self.frame_skip)

        # target wall reflection:
        if self.sim.data.qpos[2] < 0.028 or self.sim.data.qpos[2] > 0.207:
            self.sim.data.qvel[2] = -self.sim.data.qvel[2]

        def restart():
            # respawn the target at a random x and a fixed y = .25
            initial_target_position = self.np_random.uniform(low=0.035, high=.185, size=2)
            initial_target_position[1] = .25
            self.sim.data.qpos[-2:] = initial_target_position
            # relaunch it into the arena: random heading, jittered speed
            target_angle = self.np_random.uniform(low=1.0833 * np.pi, high=1.9167 * np.pi)
            target_velocity = self.target_velocity + \
                self.np_random.uniform(low=-self.target_velocity_delta,
                                       high=self.target_velocity_delta)
            self.sim.data.qvel[2] = target_velocity * np.cos(target_angle)
            self.sim.data.qvel[3] = target_velocity * np.sin(target_angle)
            self.previous_target_position = copy(initial_target_position)

        if self.sim.data.qpos[3] < -0.25:  # failed catch (target gone)
            restart()

        sparse_reward = 0.
        if dist < 0.011:
            reward += 2
            sparse_reward += 1.
            restart()

        ob = self._get_obs()
        done = False
        return ob, reward, done, dict(scoring_reward=sparse_reward)
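
The relaunch math inside restart() converts a random heading and a jittered speed into the target's two velocity components. A standalone sketch of just that step (the function name and rng are placeholders, not from the source):

import numpy as np

rng = np.random.default_rng()

def sample_target_launch(base_speed, speed_delta):
    # heading in (1.0833*pi, 1.9167*pi), i.e. pointing back into the arena
    angle = rng.uniform(1.0833 * np.pi, 1.9167 * np.pi)
    # speed jittered uniformly around the base speed
    speed = base_speed + rng.uniform(-speed_delta, speed_delta)
    return speed * np.cos(angle), speed * np.sin(angle)
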
Example #4
    def step(self, a):
        if self.no_movement:
            a = np.zeros([2], np.float32)

        if self.random_4_background and \
                self.random_frequency != -1 and \
                not self.n_total_steps % self.random_frequency:
            self._randomize_env()

        self.n_total_steps += 1

        dist = np.linalg.norm(
            self.get_body_com("fingertip") - self.get_body_com("target"))
        reward_dist = tolerance(dist,
                                margin=0.3,
                                bounds=(0., 0.009),
                                sigmoid='cosine',
                                value_at_margin=0.)
        reward_ctrl = -0.1 * np.square(a).sum()
        reward = reward_dist + reward_ctrl
        sparse_reward = 0.
        if dist < 0.02:
            sparse_reward = 1.
            reward += sparse_reward

        self.do_simulation(a, self.frame_skip)

        # target wall reflection:
        if np.abs(self.sim.data.qpos[2]) > 0.205:
            self.sim.data.qvel[2] = -self.sim.data.qvel[2]
        if np.abs(self.sim.data.qpos[3]) > 0.205:
            self.sim.data.qvel[3] = -self.sim.data.qvel[3]

        ob = self._get_obs()
        done = False
        return ob, reward, done, dict(scoring_reward=sparse_reward)
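
Finally, a minimal rollout sketch showing how these envs are typically consumed: the dense reward drives learning, while info['scoring_reward'] carries the sparse success signal that all four examples report separately. The environment id is hypothetical:

import gym

env = gym.make('Catcher-v0')  # hypothetical id for one of the envs above
ob = env.reset()
score = 0.0
for _ in range(1000):
    ob, reward, done, info = env.step(env.action_space.sample())
    score += info['scoring_reward']  # sparse success signal, tracked separately
    if done:
        ob = env.reset()
print('total scoring reward:', score)
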