    def compute_reward(self, action, obs):
        obj = obs[4:7]
        gripper = self.tcp_center

        obj_to_target = np.linalg.norm(obj - self._target_pos)
        tcp_to_obj = np.linalg.norm(obj - gripper)
        in_place_margin = np.linalg.norm(self.obj_init_pos - self._target_pos)

        threshold = 0.03
        # floor is a 3D funnel centered on the initial object pos
        radius = np.linalg.norm(gripper[:2] - self.obj_init_pos[:2])
        if radius <= threshold:
            floor = 0.0
        else:
            floor = 0.015 * np.log(radius - threshold) + 0.15
        # prevent the hand from running into the cliff edge by staying above the floor
        above_floor = 1.0 if gripper[2] >= floor else reward_utils.tolerance(
            max(floor - gripper[2], 0.0),
            bounds=(0.0, 0.01),
            margin=0.02,
            sigmoid='long_tail',
        )
        object_grasped = self._gripper_caging_reward(
            action,
            obj,
            object_reach_radius=0.01,
            obj_radius=0.015,
            pad_success_thresh=0.02,
            xz_thresh=0.03,
            desired_gripper_effort=0.1,
            high_density=True)
        in_place = reward_utils.tolerance(obj_to_target,
                                          bounds=(0, 0.02),
                                          margin=in_place_margin,
                                          sigmoid='long_tail')
        reward = reward_utils.hamacher_product(object_grasped, in_place)

        near_object = tcp_to_obj < 0.04
        pinched_without_obj = obs[3] < 0.33
        lifted = obj[2] - 0.02 > self.obj_init_pos[2]
        # Increase reward when the object is properly grasped
        grasp_success = near_object and lifted and not pinched_without_obj
        if grasp_success:
            reward += 1. + 5. * reward_utils.hamacher_product(
                in_place, above_floor)
        # Maximize reward on success
        if obj_to_target < self.TARGET_RADIUS:
            reward = 10.

        return (
            reward,
            tcp_to_obj,
            grasp_success,
            obj_to_target,
            object_grasped,
            in_place,
        )
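All of these examples lean on two helpers from `reward_utils` that the listing never shows. Below is a minimal sketch of what they compute, assuming Meta-World's adaptation of the dm_control-style `tolerance` (the exact signature, the `value_at_margin=0.1` default, and the sigmoid formulas are assumptions, not taken from this listing):

import numpy as np

def hamacher_product(a, b):
    # Smooth "AND" of two rewards in [0, 1]: 0 if either input is 0,
    # 1 only when both inputs are 1.
    denominator = a + b - (a * b)
    return (a * b) / denominator if denominator > 0 else 0.0

def tolerance(x, bounds=(0.0, 0.0), margin=0.0,
              sigmoid='gaussian', value_at_margin=0.1):
    # 1.0 while `x` lies inside `bounds`; outside, decays toward 0 over
    # `margin`, hitting `value_at_margin` once `x` is `margin` past a bound.
    lower, upper = bounds
    if lower <= x <= upper:
        return 1.0
    if margin == 0:
        return 0.0
    d = (lower - x if x < lower else x - upper) / margin
    if sigmoid == 'long_tail':
        scale = np.sqrt(1.0 / value_at_margin - 1.0)
        return 1.0 / ((d * scale) ** 2 + 1.0)
    # 'gaussian'
    scale = np.sqrt(-2.0 * np.log(value_at_margin))
    return float(np.exp(-0.5 * (d * scale) ** 2))

`hamacher_product` is the smooth AND the examples use whenever two sub-rewards must hold simultaneously.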
Example #2
    def compute_reward(self, action, obs):
        _TARGET_RADIUS = 0.05
        tcp = self.tcp_center
        obj = obs[4:7]
        tcp_opened = obs[3]
        target = self._target_pos

        obj_to_target = np.linalg.norm(obj - target)
        tcp_to_obj = np.linalg.norm(obj - tcp)
        in_place_margin = np.linalg.norm(self.obj_init_pos - target)

        in_place = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, _TARGET_RADIUS),
            margin=in_place_margin,
            sigmoid='long_tail',
        )

        object_grasped = self._gripper_caging_reward(action=action,
                                                     obj_pos=obj,
                                                     obj_radius=0.02,
                                                     pad_success_thresh=0.05,
                                                     object_reach_radius=0.01,
                                                     xz_thresh=0.01,
                                                     high_density=False)
        reward = reward_utils.hamacher_product(object_grasped, in_place)

        if (0.0 < obj[2] < 0.24 and \
                (target[0]-0.15 < obj[0] < target[0]+0.15) and \
                ((target[1] - 3*_TARGET_RADIUS) < obj[1] < target[1])):
            z_scaling = (0.24 - obj[2]) / 0.24
            y_scaling = (obj[1] -
                         (target[1] - 3 * _TARGET_RADIUS)) / (3 *
                                                              _TARGET_RADIUS)
            bound_loss = reward_utils.hamacher_product(y_scaling, z_scaling)
            in_place = np.clip(in_place - bound_loss, 0.0, 1.0)

        if ((0.0 < obj[2] < 0.24) and \
                (target[0]-0.15 < obj[0] < target[0]+0.15) and \
                (obj[1] > target[1])):
            in_place = 0.0

        if tcp_to_obj < 0.025 and (tcp_opened > 0) and \
                (obj[2] - 0.01 > self.obj_init_pos[2]):
            reward += 1. + 5. * in_place

        if obj_to_target < _TARGET_RADIUS:
            reward = 10.

        return [
            reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place
        ]
Example #3
    def _gripper_caging_reward(self, action, obj_position):
        pad_success_margin = 0.05
        x_z_success_margin = 0.005
        obj_radius = 0.015
        tcp = self.tcp_center
        left_pad = self.get_body_com('leftpad')
        right_pad = self.get_body_com('rightpad')
        delta_object_y_left_pad = left_pad[1] - obj_position[1]
        delta_object_y_right_pad = obj_position[1] - right_pad[1]
        right_caging_margin = abs(
            abs(obj_position[1] - self.init_right_pad[1]) - pad_success_margin)
        left_caging_margin = abs(
            abs(obj_position[1] - self.init_left_pad[1]) - pad_success_margin)

        right_caging = reward_utils.tolerance(delta_object_y_right_pad,
                                bounds=(obj_radius, pad_success_margin),
                                margin=right_caging_margin,
                                sigmoid='long_tail',)
        left_caging = reward_utils.tolerance(delta_object_y_left_pad,
                                bounds=(obj_radius, pad_success_margin),
                                margin=left_caging_margin,
                                sigmoid='long_tail',)

        y_caging = reward_utils.hamacher_product(left_caging,
                                                 right_caging)

        # compute the tcp_obj distance in the x_z plane
        tcp_xz = tcp + np.array([0., -tcp[1], 0.])
        obj_position_x_z = np.copy(obj_position) + np.array([0., -obj_position[1], 0.])
        tcp_obj_norm_x_z = np.linalg.norm(tcp_xz - obj_position_x_z, ord=2)

        # used for computing the tcp-to-object margin in the x_z plane
        init_obj_x_z = self.obj_init_pos + np.array([0., -self.obj_init_pos[1], 0.])
        init_tcp_x_z = self.init_tcp + np.array([0., -self.init_tcp[1], 0.])
        tcp_obj_x_z_margin = np.linalg.norm(init_obj_x_z - init_tcp_x_z, ord=2) - x_z_success_margin

        x_z_caging = reward_utils.tolerance(tcp_obj_norm_x_z,
                                bounds=(0, x_z_success_margin),
                                margin=tcp_obj_x_z_margin,
                                sigmoid='long_tail',)

        gripper_closed = min(max(0, action[-1]), 1)
        caging = reward_utils.hamacher_product(y_caging, x_z_caging)

        gripping = gripper_closed if caging > 0.97 else 0.
        caging_and_gripping = reward_utils.hamacher_product(caging,
                                                            gripping)
        caging_and_gripping = (caging_and_gripping + caging) / 2
        return caging_and_gripping
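This two-argument caging helper is shown without its caller. Presumably it plugs into a `compute_reward` that follows the same grasp-then-place pattern as the surrounding examples, roughly like the hypothetical sketch below (`self.TARGET_RADIUS`, `self._target_pos`, and `self.obj_init_pos` are assumed to exist as in the other examples; `np` and `reward_utils` are assumed in scope as elsewhere in this listing):

    def compute_reward(self, action, obs):
        # Hypothetical caller for the caging helper above, mirroring the
        # grasp-then-place pattern used throughout these examples.
        obj = obs[4:7]
        obj_to_target = np.linalg.norm(obj - self._target_pos)
        in_place = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, self.TARGET_RADIUS),
            margin=np.linalg.norm(self.obj_init_pos - self._target_pos),
            sigmoid='long_tail',
        )
        object_grasped = self._gripper_caging_reward(action, obj)
        reward = reward_utils.hamacher_product(object_grasped, in_place)
        if obj_to_target < self.TARGET_RADIUS:
            reward = 10.
        return reward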
Example #4
    def compute_reward(self, actions, obs):
        reward_grab = SawyerBoxCloseEnvV2._reward_grab_effort(actions)
        reward_quat = SawyerBoxCloseEnvV2._reward_quat(obs)
        reward_steps = SawyerBoxCloseEnvV2._reward_pos(obs, self._target_pos)

        reward = sum((
            2.0 * reward_utils.hamacher_product(reward_grab, reward_steps[0]),
            8.0 * reward_steps[1],
        ))

        # Override reward on success
        success = np.linalg.norm(obs[4:7] - self._target_pos) < 0.08
        if success:
            reward = 10.0

        # STRONG emphasis on proper lid orientation to prevent reward hacking
        # (otherwise agent learns to kick-flip the lid onto the box)
        reward *= reward_quat

        return (
            reward,
            reward_grab,
            *reward_steps,
            success,
        )
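Example #4 composes three helpers that are not shown alongside it: `_reward_grab_effort`, `_reward_quat`, and `_reward_pos` (what looks like the matching `_reward_pos` appears below as Example #16). The grab-effort term is presumably just the gripper-effort component of the action rescaled to [0, 1], along these lines (an assumption, not taken from this listing):

    @staticmethod
    def _reward_grab_effort(actions):
        # Assumed sketch: map the gripper effort from [-1, 1] to [0, 1] so it
        # can be combined with other terms via hamacher_product.
        return (np.clip(actions[3], -1, 1) + 1.0) / 2.0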
Example #5
    def compute_reward(self, action, obs):
        _TARGET_RADIUS = 0.05
        tcp = self.tcp_center
        obj = obs[4:7]
        tcp_opened = obs[3]
        target = self._target_pos

        obj_to_target = np.linalg.norm(obj - target)
        tcp_to_obj = np.linalg.norm(obj - tcp)
        in_place_margin = (np.linalg.norm(self.obj_init_pos - target))

        in_place = reward_utils.tolerance(obj_to_target,
                                    bounds=(0, _TARGET_RADIUS),
                                    margin=in_place_margin,
                                    sigmoid='long_tail',)

        object_grasped = self._gripper_caging_reward(action, obj)
        in_place_and_object_grasped = reward_utils.hamacher_product(object_grasped,
                                                                    in_place)
        reward = in_place_and_object_grasped

        if tcp_to_obj < 0.02 and (tcp_opened > 0) and (obj[2] - 0.01 > self.obj_init_pos[2]):
            reward += 1. + 5. * in_place
        if obj_to_target < _TARGET_RADIUS:
            reward = 10.
        return [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped, in_place]
Example #6
    def compute_reward(self, action, obs):
        _TARGET_RADIUS = 0.05
        tcp = self.tcp_center
        obj = obs[4:7]
        tcp_opened = obs[3]
        target = np.array([self._target_pos[0], self._target_pos[1], obj[2]])

        obj_to_target = np.linalg.norm(obj - target)
        tcp_to_obj = np.linalg.norm(obj - tcp)
        in_place_margin = np.linalg.norm(self.obj_init_pos - target)

        in_place = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, _TARGET_RADIUS),
            margin=in_place_margin,
            sigmoid='long_tail',
        )

        object_grasped = self._gripper_caging_reward(action, obj,
                                                     self.OBJ_RADIUS)
        in_place_and_object_grasped = reward_utils.hamacher_product(
            object_grasped, in_place)

        reward = (2 * object_grasped) + (6 * in_place_and_object_grasped)

        if obj_to_target < _TARGET_RADIUS:
            reward = 10.
        return [
            reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place
        ]
Example #7
    def compute_reward(self, action, obs):
        obj = obs[4:7]
        tcp_opened = obs[3]
        tcp_to_obj = np.linalg.norm(obj - self.tcp_center)
        target_to_obj = np.linalg.norm(obj - self._target_pos)
        target_to_obj_init = np.linalg.norm(self.obj_init_pos -
                                            self._target_pos)

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, self.TARGET_RADIUS),
            margin=target_to_obj_init,
            sigmoid='long_tail',
        )
        object_grasped = self._gripper_caging_reward(action, obj,
                                                     self.OBJ_RADIUS)

        reward = reward_utils.hamacher_product(object_grasped, in_place)

        if (tcp_to_obj < 0.01) and (0 < tcp_opened < 0.55) and \
                (target_to_obj_init - target_to_obj > 0.01):
            reward += 1. + 5. * in_place
        if target_to_obj < self.TARGET_RADIUS:
            reward = 10.
        return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
                in_place)
Example #8
    def compute_reward(self, action, obs):
        del action
        obj = obs[4:7]
        tcp = self.tcp_center

        tcp_to_obj = np.linalg.norm(obj - tcp)
        tcp_to_obj_init = np.linalg.norm(obj - self.init_tcp)
        obj_to_target = abs(self._target_pos[2] - obj[2])

        tcp_closed = 1 - obs[3]
        near_button = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, 0.01),
            margin=tcp_to_obj_init,
            sigmoid='long_tail',
        )
        button_pressed = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, 0.005),
            margin=self._obj_to_target_init,
            sigmoid='long_tail',
        )

        reward = 5 * reward_utils.hamacher_product(tcp_closed, near_button)
        if tcp_to_obj <= 0.03:
            reward += 5 * button_pressed

        return (reward, tcp_to_obj, obs[3], obj_to_target, near_button,
                button_pressed)
Example #9
    def compute_reward(self, action, obs):
        gripper = obs[:3]
        lever = obs[4:7]

        # De-emphasize y error so that we get Sawyer's shoulder underneath the
        # lever prior to bumping up against it
        scale = np.array([4., 1., 4.])
        # Offset so that we get the Sawyer's shoulder underneath the lever,
        # rather than its fingers
        offset = np.array([.0, .055, .07])

        shoulder_to_lever = (gripper + offset - lever) * scale
        shoulder_to_lever_init = (self.init_tcp + offset -
                                  self._lever_pos_init) * scale

        # This `ready_to_lift` reward should be a *hint* for the agent, not an
        # end in itself. Make sure to devalue it compared to the value of
        # actually lifting the lever
        ready_to_lift = reward_utils.tolerance(
            np.linalg.norm(shoulder_to_lever),
            bounds=(0, 0.02),
            margin=np.linalg.norm(shoulder_to_lever_init),
            sigmoid='long_tail',
        )

        # The skill of the agent should be measured by its ability to get the
        # lever to point straight upward. This means we'll be measuring the
        # current angle of the lever's joint, and comparing with 90deg.
        lever_angle = -self.data.get_joint_qpos('LeverAxis')
        lever_angle_desired = np.pi / 2.0

        lever_error = abs(lever_angle - lever_angle_desired)

        # We'll set the margin to 15deg from horizontal. Angles below that will
        # receive some reward to incentivize exploration, but we don't want to
        # reward accidents too much. Past 15deg is probably intentional movement
        lever_engagement = reward_utils.tolerance(lever_error,
                                                  bounds=(0, np.pi / 48.0),
                                                  margin=(np.pi / 2.0) -
                                                  (np.pi / 12.0),
                                                  sigmoid='long_tail')

        target = self._target_pos
        obj_to_target = np.linalg.norm(lever - target)
        in_place_margin = (np.linalg.norm(self._lever_pos_init - target))

        in_place = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, 0.04),
            margin=in_place_margin,
            sigmoid='long_tail',
        )

        # reward = 2.0 * ready_to_lift + 8.0 * lever_engagement
        reward = 10.0 * reward_utils.hamacher_product(ready_to_lift, in_place)
        return (reward, np.linalg.norm(shoulder_to_lever), ready_to_lift,
                lever_error, lever_engagement)
Example #10
    def compute_reward(self, action, obs):
        _TARGET_RADIUS = 0.05
        tcp = self.tcp_center
        obj = obs[4:7]
        tcp_opened = obs[3]
        midpoint = np.array([self._target_pos[0], 0.77, 0.25])
        target = self._target_pos

        tcp_to_obj = np.linalg.norm(obj - tcp)

        in_place_scaling = np.array([1., 1., 3.])
        obj_to_midpoint = np.linalg.norm((obj - midpoint) * in_place_scaling)
        obj_to_midpoint_init = np.linalg.norm(
            (self.obj_init_pos - midpoint) * in_place_scaling)

        obj_to_target = np.linalg.norm(obj - target)
        obj_to_target_init = np.linalg.norm(self.obj_init_pos - target)

        in_place_part1 = reward_utils.tolerance(
            obj_to_midpoint,
            bounds=(0, _TARGET_RADIUS),
            margin=obj_to_midpoint_init,
            sigmoid='long_tail',
        )

        in_place_part2 = reward_utils.tolerance(obj_to_target,
                                                bounds=(0, _TARGET_RADIUS),
                                                margin=obj_to_target_init,
                                                sigmoid='long_tail')

        object_grasped = self._gripper_caging_reward(action=action,
                                                     obj_pos=obj,
                                                     obj_radius=0.015,
                                                     pad_success_thresh=0.05,
                                                     object_reach_radius=0.01,
                                                     xz_thresh=0.005,
                                                     high_density=False)

        in_place_and_object_grasped = reward_utils.hamacher_product(
            object_grasped, in_place_part1)
        reward = in_place_and_object_grasped

        if tcp_to_obj < 0.02 and (tcp_opened > 0) and (obj[2] - 0.015 >
                                                       self.obj_init_pos[2]):
            reward = in_place_and_object_grasped + 1. + 4. * in_place_part1
            if obj[1] > 0.75:
                reward = in_place_and_object_grasped + 1. + 4. + 3. * in_place_part2

        if obj_to_target < _TARGET_RADIUS:
            reward = 10.

        return [
            reward, tcp_to_obj, tcp_opened,
            np.linalg.norm(obj - target), object_grasped, in_place_part2
        ]
Example #11
    def compute_reward(self, action, obs):
        obj = self._get_pos_objects()
        dial_push_position = self._get_pos_objects() + np.array([0.05, 0.02, 0.09])
        tcp = self.tcp_center
        target = self._target_pos.copy()

        target_to_obj = (obj - target)
        target_to_obj = np.linalg.norm(target_to_obj)
        target_to_obj_init = (self.dial_push_position - target)
        target_to_obj_init = np.linalg.norm(target_to_obj_init)

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, self.TARGET_RADIUS),
            margin=abs(target_to_obj_init - self.TARGET_RADIUS),
            sigmoid='long_tail',
        )

        dial_reach_radius = 0.005
        tcp_to_obj = np.linalg.norm(dial_push_position - tcp)
        tcp_to_obj_init = np.linalg.norm(self.dial_push_position - self.init_tcp)
        reach = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, dial_reach_radius),
            margin=abs(tcp_to_obj_init-dial_reach_radius),
            sigmoid='gaussian',
        )
        gripper_closed = min(max(0, action[-1]), 1)

        reach = reward_utils.hamacher_product(reach, gripper_closed)
        tcp_opened = 0
        object_grasped = reach

        reward = 10 * reward_utils.hamacher_product(reach, in_place)

        return (reward,
               tcp_to_obj,
               tcp_opened,
               target_to_obj,
               object_grasped,
               in_place)
Example #12
    def compute_reward(self, action, obs):
        obj = obs[4:7]

        tcp = self.tcp_center
        target = self._target_pos.copy()

        target_to_obj = (obj - target)
        target_to_obj = np.linalg.norm(target_to_obj)
        target_to_obj_init = (self.obj_init_pos - target)
        target_to_obj_init = np.linalg.norm(target_to_obj_init)

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, self.TARGET_RADIUS),
            margin=abs(target_to_obj_init - self.TARGET_RADIUS),
            sigmoid='long_tail',
        )

        handle_reach_radius = 0.005
        tcp_to_obj = np.linalg.norm(obj - tcp)
        tcp_to_obj_init = np.linalg.norm(self.obj_init_pos - self.init_tcp)
        reach = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, handle_reach_radius),
            margin=abs(tcp_to_obj_init - handle_reach_radius),
            sigmoid='gaussian',
        )
        gripper_closed = min(max(0, action[-1]), 1)

        reach = reward_utils.hamacher_product(reach, gripper_closed)
        tcp_opened = 0
        object_grasped = reach

        reward = reward_utils.hamacher_product(reach, in_place)
        if target_to_obj <= self.TARGET_RADIUS + 0.015:
            reward = 1.

        reward *= 10

        return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
                in_place)
Example #13
    def compute_reward(self, action, obs):
        obj = obs[4:7]
        target = self._target_pos.copy()

        # Emphasize X and Y errors
        scale = np.array([2., 2., 1.])
        target_to_obj = (obj - target) * scale
        target_to_obj = np.linalg.norm(target_to_obj)
        target_to_obj_init = (self.obj_init_pos - target) * scale
        target_to_obj_init = np.linalg.norm(target_to_obj_init)

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, 0.05),
            margin=target_to_obj_init,
            sigmoid='long_tail',
        )
        tcp_opened = obs[3]
        tcp_to_obj = np.linalg.norm(obj - self.tcp_center)

        object_grasped = self._gripper_caging_reward(
            action,
            obj,
            object_reach_radius=0.04,
            obj_radius=0.02,
            pad_success_thresh=0.05,
            xz_thresh=0.05,
            desired_gripper_effort=0.7,
            medium_density=True
        )

        reward = reward_utils.hamacher_product(object_grasped, in_place)

        if tcp_to_obj < 0.04 and tcp_opened > 0:
            reward += 1. + 5. * in_place
        if target_to_obj < 0.05:
            reward = 10.
        return (
            reward,
            tcp_to_obj,
            tcp_opened,
            np.linalg.norm(obj - target),  # recompute to avoid `scale` above
            object_grasped,
            in_place
        )
Example #14
    def compute_reward(self, action, obs):
        obj = obs[4:7]
        # Force target to be slightly above basketball hoop
        target = self._target_pos.copy()

        target_to_obj = abs(target[2] - obj[2])
        target_to_obj_init = abs(target[2] - self.obj_init_pos[2])

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, self.TARGET_RADIUS),
            margin=target_to_obj_init,
            sigmoid='long_tail',
        )

        object_grasped = self._gripper_caging_reward(
            action,
            obj,
            pad_success_thresh=0.05,
            obj_radius=0.022,
            object_reach_radius=0.01,
            xz_thresh=0.01,
            high_density=True,
        )
        reward = reward_utils.hamacher_product(object_grasped, in_place)

        tcp_opened = obs[3]
        tcp_to_obj = np.linalg.norm(obj - self.tcp_center)

        if tcp_to_obj < 0.035 and tcp_opened > 0 and \
                obj[2] - 0.01 > self.obj_init_pos[2]:
            reward += 1. + 5. * in_place
        if target_to_obj < self.TARGET_RADIUS:
            reward = 10.
        return (
            reward,
            tcp_to_obj,
            tcp_opened,
            target_to_obj,
            object_grasped,
            in_place
        )
Example #15
    def compute_reward(self, actions, obs):
        theta = self.data.get_joint_qpos('doorjoint')

        reward_grab = SawyerDoorEnvV2._reward_grab_effort(actions)
        reward_steps = SawyerDoorEnvV2._reward_pos(obs, theta)

        reward = sum((
            2.0 * reward_utils.hamacher_product(reward_steps[0], reward_grab),
            8.0 * reward_steps[1],
        ))

        # Override reward on success
        if abs(obs[4] - self._target_pos[0]) <= 0.08:
            reward = 10.0

        return (
            reward,
            reward_grab,
            *reward_steps,
        )
Example #16
    def _reward_pos(obs, target_pos):
        hand = obs[:3]
        lid = obs[4:7] + np.array([.0, .0, .02])

        threshold = 0.02
        # floor is a 3D funnel centered on the lid's handle
        radius = np.linalg.norm(hand[:2] - lid[:2])
        if radius <= threshold:
            floor = 0.0
        else:
            floor = 0.04 * np.log(radius - threshold) + 0.4
        # prevent the hand from running into the handle prematurely by keeping
        # it above the "floor"
        above_floor = 1.0 if hand[2] >= floor else reward_utils.tolerance(
            floor - hand[2],
            bounds=(0.0, 0.01),
            margin=floor / 2.0,
            sigmoid='long_tail',
        )
        # grab the lid's handle
        in_place = reward_utils.tolerance(
            np.linalg.norm(hand - lid),
            bounds=(0, 0.02),
            margin=0.5,
            sigmoid='long_tail',
        )
        ready_to_lift = reward_utils.hamacher_product(above_floor, in_place)

        # now actually put the lid on the box
        pos_error = target_pos - lid
        error_scale = np.array([1., 1., 3.])  # Emphasize Z error
        a = 0.2  # Relative importance of just *trying* to lift the lid at all
        b = 0.8  # Relative importance of placing the lid on the box
        lifted = a * float(lid[2] > 0.04) + b * reward_utils.tolerance(
            np.linalg.norm(pos_error * error_scale),
            bounds=(0, 0.05),
            margin=0.25,
            sigmoid='long_tail',
        )

        return ready_to_lift, lifted
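The log-funnel floor above is easier to picture with a few numbers. A throwaway check using the constants from this example (threshold 0.02, floor = 0.04 * log(radius - threshold) + 0.4):

import numpy as np

threshold = 0.02
for radius in (0.03, 0.1, 0.3):
    floor = 0.04 * np.log(radius - threshold) + 0.4
    print(f'radius={radius:.2f}  floor={floor:.3f}')
# radius=0.03  floor=0.216
# radius=0.10  floor=0.299
# radius=0.30  floor=0.349

So the floor sits around 0.2-0.35 while the hand is far from the handle, and drops away entirely once the hand comes within the 2 cm threshold.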
Example #17
    def _reward_pos(obs, theta):
        hand = obs[:3]
        door = obs[4:7] + np.array([-0.05, 0, 0])

        threshold = 0.12
        # floor is a 3D funnel centered on the door handle
        radius = np.linalg.norm(hand[:2] - door[:2])
        if radius <= threshold:
            floor = 0.0
        else:
            floor = 0.04 * np.log(radius - threshold) + 0.4
        # prevent the hand from running into the handle prematurely by keeping
        # it above the "floor"
        above_floor = 1.0 if hand[2] >= floor else reward_utils.tolerance(
            floor - hand[2],
            bounds=(0.0, 0.01),
            margin=floor / 2.0,
            sigmoid='long_tail',
        )
        # move the hand to a position between the handle and the main door body
        in_place = reward_utils.tolerance(
            np.linalg.norm(hand - door - np.array([0.05, 0.03, -0.01])),
            bounds=(0, threshold / 2.0),
            margin=0.5,
            sigmoid='long_tail',
        )
        ready_to_open = reward_utils.hamacher_product(above_floor, in_place)

        # now actually open the door
        door_angle = -theta
        a = 0.2  # Relative importance of just *trying* to open the door at all
        b = 0.8  # Relative importance of fully opening the door
        opened = a * float(theta < -np.pi / 90.) + b * reward_utils.tolerance(
            np.pi / 2. + np.pi / 6 - door_angle,
            bounds=(0, 0.5),
            margin=np.pi / 3.,
            sigmoid='long_tail',
        )

        return ready_to_open, opened
Example #18
    def compute_reward(self, action, obs):
        del action
        obj = obs[4:7]
        tcp = self.tcp_center

        tcp_to_obj = np.linalg.norm(obj - tcp)
        tcp_to_obj_init = np.linalg.norm(obj - self.init_tcp)
        obj_to_target = abs(self._target_pos[1] - obj[1])

        near_button = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, 0.01),
            margin=tcp_to_obj_init,
            sigmoid='long_tail',
        )
        button_pressed = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, 0.005),
            margin=self._obj_to_target_init,
            sigmoid='long_tail',
        )

        reward = 0.0
        if tcp_to_obj > 0.07:
            tcp_status = (1 - obs[3]) / 2.0
            reward = 2 * reward_utils.hamacher_product(tcp_status, near_button)
        else:
            reward = 2
            reward += 2 * (1 + obs[3])
            reward += 4 * button_pressed ** 2

        return (
            reward,
            tcp_to_obj,
            obs[3],
            obj_to_target,
            near_button,
            button_pressed
        )
Example #19
    def compute_reward(self, action, obs):
        obj = obs[4:7]

        target_to_obj = np.linalg.norm(obj - self._target_pos)
        target_to_obj_init = np.linalg.norm(self.obj_init_pos - self._target_pos)

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, self.TARGET_RADIUS),
            margin=target_to_obj_init,
            sigmoid='long_tail',
        )

        object_grasped = self._gripper_caging_reward(
            action,
            obj,
            object_reach_radius=0.01,
            obj_radius=0.015,
            pad_success_thresh=0.05,
            xz_thresh=0.005,
            high_density=True
        )
        reward = reward_utils.hamacher_product(object_grasped, in_place)

        tcp_opened = obs[3]
        tcp_to_obj = np.linalg.norm(obj - self.tcp_center)

        if tcp_to_obj < 0.02 and tcp_opened > 0:
            reward += 1. + 7. * in_place
        if target_to_obj < self.TARGET_RADIUS:
            reward = 10.
        return (
            reward,
            tcp_to_obj,
            tcp_opened,
            target_to_obj,
            object_grasped,
            in_place
        )
Example #20
    def compute_reward(self, actions, obs):
        _TARGET_RADIUS = 0.05
        tcp = self.tcp_center
        obj = obs[4:7]
        tcp_opened = obs[3]
        target = self._target_pos

        obj_to_target = np.linalg.norm(obj - target)
        in_place_margin = np.linalg.norm(self.obj_init_pos - target)
        in_place = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, _TARGET_RADIUS),
            margin=in_place_margin - _TARGET_RADIUS,
            sigmoid='long_tail',
        )

        tcp_to_obj = np.linalg.norm(tcp - obj)
        obj_grasped_margin = np.linalg.norm(self.init_tcp - self.obj_init_pos)
        object_grasped = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, _TARGET_RADIUS),
            margin=obj_grasped_margin - _TARGET_RADIUS,
            sigmoid='long_tail',
        )

        in_place_and_object_grasped = reward_utils.hamacher_product(
            object_grasped, in_place)
        reward = 1.5 * object_grasped

        if tcp[2] <= 0.03 and tcp_to_obj < 0.07:
            reward = 2 + (7 * in_place)

        if obj_to_target < _TARGET_RADIUS:
            reward = 10.
        return [
            reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place
        ]
Example #21
    def compute_reward(self, actions, obs):
        del actions

        objPos = obs[4:7]
        obj = self._get_pos_objects()
        tcp = self.tcp_center
        target = self._target_pos.copy()

        target_to_obj = (obj[2] - target[2])
        target_to_obj = np.linalg.norm(target_to_obj)
        target_to_obj_init = (self._handle_init_pos[2] - target[2])
        target_to_obj_init = np.linalg.norm(target_to_obj_init)

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, self.TARGET_RADIUS),
            margin=abs(target_to_obj_init - self.TARGET_RADIUS),
            sigmoid='long_tail',
        )

        handle_radius = 0.02
        tcp_to_obj = np.linalg.norm(obj - tcp)
        tcp_to_obj_init = np.linalg.norm(self._handle_init_pos - self.init_tcp)
        reach = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, handle_radius),
            margin=abs(tcp_to_obj_init - handle_radius),
            sigmoid='long_tail',
        )
        tcp_opened = 0
        object_grasped = reach

        reward = reward_utils.hamacher_product(reach, in_place)
        reward = 1 if target_to_obj <= self.TARGET_RADIUS else reward
        reward *= 10
        return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
                in_place)
Example #22
    def compute_reward(self, action, obs):
        del action
        obj = obs[4:7]
        tcp = self.get_body_com('leftpad')

        scale = np.array([0.25, 1., 0.5])
        tcp_to_obj = np.linalg.norm((obj - tcp) * scale)
        tcp_to_obj_init = np.linalg.norm((obj - self.init_left_pad) * scale)

        obj_to_target = abs(self._target_pos[2] - obj[2])

        tcp_opened = max(obs[3], 0.0)
        near_lock = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, 0.01),
            margin=tcp_to_obj_init,
            sigmoid='long_tail',
        )
        lock_pressed = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, 0.005),
            margin=self._lock_length,
            sigmoid='long_tail',
        )

        reward = 2 * reward_utils.hamacher_product(tcp_opened, near_lock)
        reward += 8 * lock_pressed

        return (
            reward,
            tcp_to_obj,
            obs[3],
            obj_to_target,
            near_lock,
            lock_pressed
        )
Example #23
    def compute_reward(self, action, obs):
        hand = obs[:3]
        obj = obs[4:7]

        target_to_obj = np.linalg.norm(obj - self._target_pos)
        if self._target_to_obj_init is None:
            self._target_to_obj_init = target_to_obj

        in_place = reward_utils.tolerance(
            target_to_obj,
            bounds=(0, self.TARGET_RADIUS),
            margin=self._target_to_obj_init,
            sigmoid='long_tail',
        )

        threshold = 0.03
        radii = [
            np.linalg.norm(hand[:2] - self.obj_init_pos[:2]),
            np.linalg.norm(hand[:2] - self._target_pos[:2])
        ]
        # floor is a *pair* of 3D funnels centered on (1) the object's initial
        # position and (2) the desired final position
        floor = min([
            0.02 * np.log(radius - threshold) +
            0.2 if radius > threshold else 0.0 for radius in radii
        ])
        # prevent the hand from running into the edge of the bins by keeping
        # it above the "floor"
        above_floor = 1.0 if hand[2] >= floor else reward_utils.tolerance(
            max(floor - hand[2], 0.0),
            bounds=(0.0, 0.01),
            margin=0.05,
            sigmoid='long_tail',
        )

        object_grasped = self._gripper_caging_reward(
            action,
            obj,
            obj_radius=0.015,
            pad_success_thresh=0.05,
            object_reach_radius=0.01,
            xz_thresh=0.01,
            desired_gripper_effort=0.7,
            high_density=True,
        )
        reward = reward_utils.hamacher_product(object_grasped, in_place)

        near_object = np.linalg.norm(obj - hand) < 0.04
        pinched_without_obj = obs[3] < 0.43
        lifted = obj[2] - 0.02 > self.obj_init_pos[2]
        # Increase reward when the object is properly grasped
        grasp_success = near_object and lifted and not pinched_without_obj
        if grasp_success:
            reward += 1. + 5. * reward_utils.hamacher_product(
                above_floor, in_place)
        # Maximize reward on success
        if target_to_obj < self.TARGET_RADIUS:
            reward = 10.

        return (reward, near_object, grasp_success, target_to_obj,
                object_grasped, in_place)
Example #24
    def _gripper_caging_reward(self, action, obj_position, obj_radius):
        pad_success_margin = 0.05
        grip_success_margin = obj_radius + 0.005
        x_z_success_margin = 0.01

        tcp = self.tcp_center
        left_pad = self.get_body_com('leftpad')
        right_pad = self.get_body_com('rightpad')
        delta_object_y_left_pad = left_pad[1] - obj_position[1]
        delta_object_y_right_pad = obj_position[1] - right_pad[1]
        right_caging_margin = abs(
            abs(obj_position[1] - self.init_right_pad[1]) - pad_success_margin)
        left_caging_margin = abs(
            abs(obj_position[1] - self.init_left_pad[1]) - pad_success_margin)

        right_caging = reward_utils.tolerance(
            delta_object_y_right_pad,
            bounds=(obj_radius, pad_success_margin),
            margin=right_caging_margin,
            sigmoid='long_tail',
        )
        left_caging = reward_utils.tolerance(
            delta_object_y_left_pad,
            bounds=(obj_radius, pad_success_margin),
            margin=left_caging_margin,
            sigmoid='long_tail',
        )

        right_gripping = reward_utils.tolerance(
            delta_object_y_right_pad,
            bounds=(obj_radius, grip_success_margin),
            margin=right_caging_margin,
            sigmoid='long_tail',
        )
        left_gripping = reward_utils.tolerance(
            delta_object_y_left_pad,
            bounds=(obj_radius, grip_success_margin),
            margin=left_caging_margin,
            sigmoid='long_tail',
        )

        assert 0 <= right_caging <= 1
        assert 0 <= left_caging <= 1

        y_caging = reward_utils.hamacher_product(right_caging, left_caging)
        y_gripping = reward_utils.hamacher_product(right_gripping,
                                                   left_gripping)

        assert 0 <= y_caging <= 1

        tcp_xz = tcp + np.array([0., -tcp[1], 0.])
        obj_position_x_z = np.copy(obj_position) + np.array(
            [0., -obj_position[1], 0.])
        tcp_obj_norm_x_z = np.linalg.norm(tcp_xz - obj_position_x_z, ord=2)
        init_obj_x_z = self.obj_init_pos + np.array(
            [0., -self.obj_init_pos[1], 0.])
        init_tcp_x_z = self.init_tcp + np.array([0., -self.init_tcp[1], 0.])

        tcp_obj_x_z_margin = np.linalg.norm(init_obj_x_z - init_tcp_x_z,
                                            ord=2) - x_z_success_margin
        x_z_caging = reward_utils.tolerance(
            tcp_obj_norm_x_z,
            bounds=(0, x_z_success_margin),
            margin=tcp_obj_x_z_margin,
            sigmoid='long_tail',
        )

        assert 0 <= x_z_caging <= 1
        gripper_closed = min(max(0, action[-1]), 1)
        assert 0 <= gripper_closed <= 1
        caging = reward_utils.hamacher_product(y_caging, x_z_caging)
        assert 0 <= caging <= 1

        if caging > 0.95:
            gripping = y_gripping
        else:
            gripping = 0.
        assert 0 <= gripping <= 1

        caging_and_gripping = (caging + gripping) / 2
        assert 0 <= caging_and_gripping <= 1

        return caging_and_gripping
Example #25
    def compute_reward(self, action, obs):
        tcp = self.tcp_center
        obj = obs[4:7]
        obj_head = self._get_site_pos('pegHead')
        tcp_opened = obs[3]
        target = self._target_pos
        tcp_to_obj = np.linalg.norm(obj - tcp)
        scale = np.array([1., 2., 2.])
        # force the agent to pick up the object, then insert it
        obj_to_target = np.linalg.norm((obj_head - target) * scale)

        in_place_margin = np.linalg.norm(
            (self.peg_head_pos_init - target) * scale)
        in_place = reward_utils.tolerance(
            obj_to_target,
            bounds=(0, self.TARGET_RADIUS),
            margin=in_place_margin,
            sigmoid='long_tail',
        )
        ip_orig = in_place
        brc_col_box_1 = self._get_site_pos(
            'bottom_right_corner_collision_box_1')
        tlc_col_box_1 = self._get_site_pos('top_left_corner_collision_box_1')

        brc_col_box_2 = self._get_site_pos(
            'bottom_right_corner_collision_box_2')
        tlc_col_box_2 = self._get_site_pos('top_left_corner_collision_box_2')
        collision_box_bottom_1 = reward_utils.rect_prism_tolerance(
            curr=obj_head, one=tlc_col_box_1, zero=brc_col_box_1)
        collision_box_bottom_2 = reward_utils.rect_prism_tolerance(
            curr=obj_head, one=tlc_col_box_2, zero=brc_col_box_2)
        collision_boxes = reward_utils.hamacher_product(
            collision_box_bottom_2, collision_box_bottom_1)
        in_place = reward_utils.hamacher_product(in_place, collision_boxes)

        pad_success_margin = 0.03
        object_reach_radius = 0.01
        x_z_margin = 0.005
        obj_radius = 0.0075

        object_grasped = self._gripper_caging_reward(
            action,
            obj,
            object_reach_radius=object_reach_radius,
            obj_radius=obj_radius,
            pad_success_thresh=pad_success_margin,
            xz_thresh=x_z_margin,
            high_density=True)
        grasp_success = (tcp_to_obj < 0.08 and tcp_opened > 0
                         and obj[2] - 0.01 > self.obj_init_pos[2])
        if grasp_success:
            object_grasped = 1.
        in_place_and_object_grasped = reward_utils.hamacher_product(
            object_grasped, in_place)
        reward = in_place_and_object_grasped

        if grasp_success:
            reward += 1. + 5 * in_place

        if obj_to_target <= 0.07:
            reward = 10.

        return [
            reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place, collision_boxes, ip_orig
        ]
Example #26
    def _gripper_caging_reward(self,
                               action,
                               obj_pos,
                               obj_radius,
                               pad_success_thresh,
                               object_reach_radius,
                               xz_thresh,
                               desired_gripper_effort=1.0,
                               high_density=False,
                               medium_density=False):
        """Reward for agent grasping obj
            Args:
                action(np.ndarray): (4,) array representing the action
                    delta(x), delta(y), delta(z), gripper_effort
                obj_pos(np.ndarray): (3,) array representing the obj x,y,z
                obj_radius(float):radius of object's bounding sphere
                pad_success_thresh(float): successful distance of gripper_pad
                    to object
                object_reach_radius(float): successful distance of gripper center
                    to the object.
                xz_thresh(float): successful distance of gripper in x_z axis to the
                    object. Y axis not included since the caging function handles
                        successful grasping in the Y axis.
        """
        if high_density and medium_density:
            raise ValueError("Can only be either high_density or medium_density")
        # MARK: Left-right gripper information for caging reward----------------
        left_pad = self.get_body_com('leftpad')
        right_pad = self.get_body_com('rightpad')

        # get current positions of left and right pads (Y axis)
        pad_y_lr = np.hstack((left_pad[1], right_pad[1]))
        # compare *current* pad positions with *current* obj position (Y axis)
        pad_to_obj_lr = np.abs(pad_y_lr - obj_pos[1])
        # compare *current* pad positions with *initial* obj position (Y axis)
        pad_to_objinit_lr = np.abs(pad_y_lr - self.stick_init_pos[1])

        caging_lr_margin = np.abs(pad_to_objinit_lr - pad_success_thresh)
        caging_lr = [reward_utils.tolerance(
            pad_to_obj_lr[i],  # "x" in the description above
            bounds=(obj_radius, pad_success_thresh),
            margin=caging_lr_margin[i],  # "margin" in the description above
            sigmoid='long_tail',
        ) for i in range(2)]
        caging_y = reward_utils.hamacher_product(*caging_lr)

        # MARK: X-Z gripper information for caging reward-----------------------
        tcp = self.tcp_center
        xz = [0, 2]

        caging_xz_margin = np.linalg.norm(self.stick_init_pos[xz] - self.init_tcp[xz])
        caging_xz_margin -= xz_thresh
        caging_xz = reward_utils.tolerance(
            np.linalg.norm(tcp[xz] - obj_pos[xz]),  # "x" in the description above
            bounds=(0, xz_thresh),
            margin=caging_xz_margin,  # "margin" in the description above
            sigmoid='long_tail',
        )

        # MARK: Closed-extent gripper information for caging reward-------------
        gripper_closed = min(max(0, action[-1]), desired_gripper_effort) \
                         / desired_gripper_effort

        # MARK: Combine components----------------------------------------------
        caging = reward_utils.hamacher_product(caging_y, caging_xz)
        gripping = gripper_closed if caging > 0.97 else 0.
        caging_and_gripping = reward_utils.hamacher_product(caging, gripping)

        if high_density:
            caging_and_gripping = (caging_and_gripping + caging) / 2
        if medium_density:
            tcp = self.tcp_center
            tcp_to_obj = np.linalg.norm(obj_pos - tcp)
            tcp_to_obj_init = np.linalg.norm(self.stick_init_pos - self.init_tcp)
            reach_margin = abs(tcp_to_obj_init - object_reach_radius)
            reach = reward_utils.tolerance(
                tcp_to_obj,
                bounds=(0, object_reach_radius),
                margin=reach_margin,
                sigmoid='long_tail',
            )
            caging_and_gripping = (caging_and_gripping + reach) / 2

        return caging_and_gripping
Example #27
    def compute_reward(self, action, obs):
        _TARGET_RADIUS = 0.05
        tcp = self.tcp_center
        stick = obs[4:7]
        end_of_stick = self._get_site_pos('stick_end')
        container = obs[11:14] + np.array([0.05, 0., 0.])
        container_init_pos = self.obj_init_pos + np.array([0.05, 0., 0.])
        handle = obs[11:14]
        tcp_opened = obs[3]
        target = self._target_pos
        tcp_to_stick = np.linalg.norm(stick - tcp)
        handle_to_target = np.linalg.norm(handle - target)

        yz_scaling = np.array([1., 1., 2.])
        stick_to_container = np.linalg.norm((stick - container) * yz_scaling)
        stick_in_place_margin = (np.linalg.norm(
            (self.stick_init_pos - container_init_pos) * yz_scaling))
        stick_in_place = reward_utils.tolerance(
            stick_to_container,
            bounds=(0, _TARGET_RADIUS),
            margin=stick_in_place_margin,
            sigmoid='long_tail',
        )

        stick_to_target = np.linalg.norm(stick - target)
        stick_in_place_margin_2 = np.linalg.norm(self.stick_init_pos - target)
        stick_in_place_2 = reward_utils.tolerance(
            stick_to_target,
            bounds=(0, _TARGET_RADIUS),
            margin=stick_in_place_margin_2,
            sigmoid='long_tail',
        )

        container_to_target = np.linalg.norm(container - target)
        container_in_place_margin = np.linalg.norm(self.obj_init_pos - target)
        container_in_place = reward_utils.tolerance(
            container_to_target,
            bounds=(0, _TARGET_RADIUS),
            margin=container_in_place_margin,
            sigmoid='long_tail',
        )

        object_grasped = self._gripper_caging_reward(action=action,
                                                     obj_pos=stick,
                                                     obj_radius=0.014,
                                                     pad_success_thresh=0.05,
                                                     object_reach_radius=0.01,
                                                     xz_thresh=0.01,
                                                     high_density=True)

        grasp_success = (tcp_to_stick < 0.02 and (tcp_opened > 0)
                         and (stick[2] - 0.01 > self.stick_init_pos[2]))
        object_grasped = 1 if grasp_success else object_grasped

        in_place_and_object_grasped = reward_utils.hamacher_product(
            object_grasped, stick_in_place)
        reward = in_place_and_object_grasped

        if grasp_success:
            reward = 1. + in_place_and_object_grasped + 5. * stick_in_place

            if self._stick_is_inserted(handle, end_of_stick):
                reward = 1. + in_place_and_object_grasped + 5. + \
                         2. * stick_in_place_2 + 1. * container_in_place

                if handle_to_target <= 0.12:
                    reward = 10.

        return [
            reward, tcp_to_stick, tcp_opened, handle_to_target, object_grasped,
            stick_in_place
        ]
Example #28
    def _gripper_caging_reward(self,
                               action,
                               obj_pos,
                               obj_radius,
                               pad_success_thresh,
                               object_reach_radius,
                               xz_thresh,
                               desired_gripper_effort=1.0,
                               high_density=False,
                               medium_density=False):
        """Reward for agent grasping obj
            Args:
                action(np.ndarray): (4,) array representing the action
                    delta(x), delta(y), delta(z), gripper_effort
                obj_pos(np.ndarray): (3,) array representing the obj x,y,z
                obj_radius(float):radius of object's bounding sphere
                pad_success_thresh(float): successful distance of gripper_pad
                    to object
                object_reach_radius(float): successful distance of gripper center
                    to the object.
                xz_thresh(float): successful distance of gripper in x_z axis to the
                    object. Y axis not included since the caging function handles
                        successful grasping in the Y axis.
        """
        if high_density and medium_density:
            raise ValueError(
                "Can only be either high_density or medium_density")
        # MARK: Left-right gripper information for caging reward----------------
        left_pad = self.get_body_com('leftpad')
        right_pad = self.get_body_com('rightpad')

        # get current positions of left and right pads (Y axis)
        pad_y_lr = np.hstack((left_pad[1], right_pad[1]))
        # compare *current* pad positions with *current* obj position (Y axis)
        pad_to_obj_lr = np.abs(pad_y_lr - obj_pos[1])
        # compare *current* pad positions with *initial* obj position (Y axis)
        pad_to_objinit_lr = np.abs(pad_y_lr - self.obj_init_pos[1])

        # Compute the left/right caging rewards. This is crucial for success,
        # yet counterintuitive mathematically because we invented it
        # accidentally.
        #
        # Before touching the object, `pad_to_obj_lr` ("x") is always separated
        # from `caging_lr_margin` ("the margin") by some small number,
        # `pad_success_thresh`.
        #
        # When far away from the object:
        #       x = margin + pad_success_thresh
        #       --> Thus x is outside the margin, yielding very small reward.
        #           Here, any variation in the reward is due to the fact that
        #           the margin itself is shifting.
        # When near the object (within pad_success_thresh):
        #       x = pad_success_thresh - margin
        #       --> Thus x is well within the margin. As long as x > obj_radius,
        #           it will also be within the bounds, yielding maximum reward.
        #           Here, any variation in the reward is due to the gripper
        #           moving *too close* to the object (i.e, blowing past the
        #           obj_radius bound).
        #
        # Therefore, before touching the object, this is very nearly a binary
        # reward -- if the gripper is between obj_radius and pad_success_thresh,
        # it gets maximum reward. Otherwise, the reward very quickly falls off.
        #
        # After grasping the object and moving it away from initial position,
        # x remains (mostly) constant while the margin grows considerably. This
        # penalizes the agent if it moves *back* toward `obj_init_pos`, but
        # offers no encouragement for leaving that position in the first place.
        # That part is left to the reward functions of individual environments.
        caging_lr_margin = np.abs(pad_to_objinit_lr - pad_success_thresh)
        caging_lr = [
            reward_utils.tolerance(
                pad_to_obj_lr[i],  # "x" in the description above
                bounds=(obj_radius, pad_success_thresh),
                margin=caging_lr_margin[
                    i],  # "margin" in the description above
                sigmoid='long_tail',
            ) for i in range(2)
        ]
        caging_y = reward_utils.hamacher_product(*caging_lr)

        # MARK: X-Z gripper information for caging reward-----------------------
        tcp = self.tcp_center
        xz = [0, 2]

        # Compared to the caging_y reward, caging_xz is simple. The margin is
        # constant (something in the 0.3 to 0.5 range) and x shrinks as the
        # gripper moves towards the object. After picking up the object, the
        # reward is maximized and changes very little
        caging_xz_margin = np.linalg.norm(self.obj_init_pos[xz] -
                                          self.init_tcp[xz])
        caging_xz_margin -= xz_thresh
        caging_xz = reward_utils.tolerance(
            np.linalg.norm(tcp[xz] -
                           obj_pos[xz]),  # "x" in the description above
            bounds=(0, xz_thresh),
            margin=caging_xz_margin,  # "margin" in the description above
            sigmoid='long_tail',
        )

        # MARK: Closed-extent gripper information for caging reward-------------
        gripper_closed = min(max(0, action[-1]), desired_gripper_effort) \
                         / desired_gripper_effort

        # MARK: Combine components----------------------------------------------
        caging = reward_utils.hamacher_product(caging_y, caging_xz)
        gripping = gripper_closed if caging > 0.97 else 0.
        caging_and_gripping = reward_utils.hamacher_product(caging, gripping)

        if high_density:
            caging_and_gripping = (caging_and_gripping + caging) / 2
        if medium_density:
            tcp = self.tcp_center
            tcp_to_obj = np.linalg.norm(obj_pos - tcp)
            tcp_to_obj_init = np.linalg.norm(self.obj_init_pos - self.init_tcp)
            # Compute reach reward
            # - We subtract `object_reach_radius` from the margin so that the
            #   reward always starts with a value of 0.1
            reach_margin = abs(tcp_to_obj_init - object_reach_radius)
            reach = reward_utils.tolerance(
                tcp_to_obj,
                bounds=(0, object_reach_radius),
                margin=reach_margin,
                sigmoid='long_tail',
            )
            caging_and_gripping = (caging_and_gripping + reach) / 2

        return caging_and_gripping
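The long comment about `caging_lr` above reads more easily with concrete numbers. Using the `tolerance` sketch given after Example #1, `pad_success_thresh=0.05`, `obj_radius=0.015`, and an assumed initial pad-to-object gap of 0.10 (illustrative values only):

pad_success_thresh, obj_radius = 0.05, 0.015
init_gap = 0.10                                   # assumed initial |pad_y - obj_y|
margin = abs(init_gap - pad_success_thresh)       # 0.05

# Far from the object the gap is still ~init_gap, i.e. exactly `margin` past
# the upper bound, so the reward sits near value_at_margin (~0.1).
far = tolerance(init_gap, bounds=(obj_radius, pad_success_thresh),
                margin=margin, sigmoid='long_tail')        # ~0.1

# Once the pad is between obj_radius and pad_success_thresh of the object,
# the reward saturates at 1.0.
near = tolerance(0.03, bounds=(obj_radius, pad_success_thresh),
                 margin=margin, sigmoid='long_tail')       # 1.0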