Exemplo n.º 1
0
class PickAndPlaceEnv(MultitaskEnv, Serializable):
    """
    A simple env where a 'cursor' robot can push objects around.

    TODO: refactor to have base class shared with point2d.py code
    TODO: rather than recreating a drawer, just save the previous drawers
    """
    def __init__(
        self,
        num_objects=2,
        # Environment dynamics
        action_scale=1.0,
        ball_radius=.75,
        boundary_dist=4,
        object_radius=0.50,
        min_grab_distance=0.5,
        walls=None,
        # Rewards
        action_l2norm_penalty=0,
        reward_type="dense",
        success_threshold=0.60,
        # Reset settings
        fixed_goal=None,
        fixed_init_position=None,
        init_position_strategy='random',
        start_near_object=False,
        # Visualization settings
        images_are_rgb=True,
        render_dt_msec=0,
        render_onscreen=False,
        render_size=84,
        get_image_base_render_size=None,
        show_goal=True,
        # Goal sampling
        goal_samplers=None,
        goal_sampling_mode='random',
        num_presampled_goals=10000,
        object_reward_only=False,
    ):
        """
        :param get_image_base_render_size: If set, then the drawer will always
        generate images according to this size, and (smoothly) downsampled
        images will be returned by `get_image`
        """
        walls = walls or []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if action_scale <= 0:
            raise ValueError("Invalid action scale: {}".format(action_scale))
        if init_position_strategy not in {
                'random', 'on_random_object', 'fixed'
        }:
            raise ValueError('Invalid init position strategy: {}'.format(
                init_position_strategy))

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.success_threshold = success_threshold
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.init_position_strategy = init_position_strategy
        self.images_are_rgb = images_are_rgb
        self._show_goal = show_goal

        self._all_objects = [
            Object(
                position=np.zeros((2, )),
                color=Object.IDX_TO_COLOR[i],
                radius=ball_radius if i == 0 else object_radius,
                min_pos=-self.boundary_dist,
                max_pos=self.boundary_dist,
            ) for i in range(num_objects + 1)
        ]
        self.min_grab_distance = min_grab_distance

        u = np.ones(3)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2 * (num_objects + 1))
        self.num_objects = num_objects
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        if get_image_base_render_size:
            base_width, base_height = get_image_base_render_size
            self._drawer = PygameViewer(
                screen_width=base_width,
                screen_height=base_height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
            self._fixed_get_image_render_size = True
        else:
            self._drawer = None
            self._fixed_get_image_render_size = False
        self._render_drawer = None
        if fixed_init_position is None:
            fixed_init_position = np.zeros_like(self.obs_range.low)
        self._fixed_init_position = fixed_init_position

        self._presampled_goals = None
        goal_samplers = goal_samplers or {}
        goal_samplers['fixed'] = PickAndPlaceEnv._sample_fixed_goal
        goal_samplers['presampled'] = PickAndPlaceEnv._sample_presampled_goals
        goal_samplers['random'] = PickAndPlaceEnv._sample_random_feasible_goals
        self._custom_goal_sampler = goal_samplers
        self._num_presampled_goals = num_presampled_goals
        if goal_sampling_mode is None:
            if fixed_goal:
                goal_sampling_mode = 'fixed'
            else:
                goal_sampling_mode = 'presampled'
        self.goal_sampling_mode = goal_sampling_mode
        self.object_reward_only = object_reward_only

    @property
    def cursor(self):
        return self._all_objects[0]

    @property
    def objects(self):
        return self._all_objects[1:]

    def step(self, raw_action):
        velocities = raw_action[:2]
        grab = raw_action[2] > 0

        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        old_position = self.cursor.position.copy()
        new_position = old_position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(old_position, new_position)

        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(old_position,
                                                     new_position)

        if grab:
            grabbed_obj = self._grab_object()
            if grabbed_obj:
                grabbed_obj.move(velocities)

        self.cursor.move(new_position - old_position)

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = self._get_info()
        done = False
        return ob, reward, done, info

    def _get_info(self):
        distance_to_target = self.cursor.distance_to_target()
        is_success = distance_to_target < self.success_threshold
        info = {
            'distance_to_target_cursor': distance_to_target,
            'success_cursor': is_success,
        }
        for i, obj in enumerate(self.objects):
            obj_distance = obj.distance_to_target()
            success = obj_distance < self.success_threshold
            info['distance_to_target_obj_{}'.format(i)] = obj_distance
            info['success_obj_{}'.format(i)] = success
        return info

    def reset(self):
        goal = self.sample_goal()['state_desired_goal']
        self._set_target_positions(goal)

        if self.init_position_strategy == 'random':
            init_pos = (
                self.observation_space.spaces['state_observation'].sample())
        elif self.init_position_strategy == 'fixed':
            init_pos = self._fixed_init_position.copy()
        elif self.init_position_strategy == 'on_random_object':
            init_pos = (
                self.observation_space.spaces['state_observation'].sample())
            start_i = 2 + 2 * random.randint(0, len(self._all_objects) - 2)
            end_i = start_i + 2
            init_pos[:2] = init_pos[start_i:end_i].copy()
        else:
            raise ValueError(self.init_position_strategy)
        self._set_positions(init_pos)

        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        positions = self._get_positions()
        target_positions = self._get_target_positions()
        return dict(
            observation=positions.copy(),
            desired_goal=target_positions.copy(),
            achieved_goal=positions.copy(),
            state_observation=positions.copy(),
            state_desired_goal=target_positions.copy(),
            state_achieved_goal=positions.copy(),
        )

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        if self.object_reward_only:
            achieved_goals = achieved_goals[:, 2:]
            desired_goals = desired_goals[:, 2:]
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.success_threshold *
                     len(self._all_objects)).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == "dense_l1":
            return -np.linalg.norm(
                achieved_goals - desired_goals, axis=-1, ord=1)
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target',
                'is_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        for i, obj in enumerate(self.objects):
            stat_name = 'distance_to_target_obj_{}'.format(i)
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._get_target_positions(),
            'state_desired_goal': self._get_target_positions(),
        }

    def set_goal(self, goal):
        self._set_target_positions(goal['state_desired_goal'])

    def sample_goals(self, batch_size):
        goal_sampler = self._custom_goal_sampler[self.goal_sampling_mode]
        return goal_sampler(self, batch_size)

    def _sample_random_feasible_goals(self, batch_size):
        if len(self.walls) > 0:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                goals[b, :] = self._sample_position(
                    self.obs_range.low,
                    self.obs_range.high,
                )
        else:
            goals = np.random.uniform(
                self.obs_range.low,
                self.obs_range.high,
                size=(batch_size, self.obs_range.low.size),
            )
        state_goals = goals
        return {
            'desired_goal': goals,
            'state_desired_goal': state_goals,
        }

    def _sample_fixed_goal(self, batch_size):
        goals = state_goals = np.repeat(self.fixed_goal.copy()[None],
                                        batch_size, 0)
        return {
            'desired_goal': goals,
            'state_desired_goal': state_goals,
        }

    def _sample_presampled_goals(self, batch_size):
        if self._presampled_goals is None:
            self._presampled_goals = self._sample_random_feasible_goals(
                self._num_presampled_goals)
        random_idxs = np.random.choice(len(
            list(self._presampled_goals.values())[0]),
                                       size=batch_size)
        goals = self._presampled_goals['desired_goal'][random_idxs]
        state_goals = self._presampled_goals['state_desired_goal'][random_idxs]
        return {
            'desired_goal': goals,
            'state_desired_goal': state_goals,
        }

    def _set_positions(self, positions):
        for i, obj in enumerate(self._all_objects):
            start_i = i * 2
            end_i = i * 2 + 2
            obj.position = positions[start_i:end_i]

    def _set_target_positions(self, target_positions):
        for i, obj in enumerate(self._all_objects):
            start_i = i * 2
            end_i = i * 2 + 2
            obj.target_position = target_positions[start_i:end_i]

    def _get_positions(self):
        return np.concatenate([obj.position for obj in self._all_objects])

    def _get_target_positions(self):
        return np.concatenate(
            [obj.target_position for obj in self._all_objects])

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self._drawer is None or (not self._fixed_get_image_render_size and
                                    (self._drawer.width != width
                                     or self._drawer.height != height)):
            if width != height:
                raise NotImplementedError()
            self._drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self._draw(self._drawer)
        if width and height:
            wh_size = (width, height)
        else:
            wh_size = None
        img = self._drawer.get_image(wh_size)
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (r + g + b).transpose() / 3
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._set_positions(goal)

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        self._set_positions(state["state_observation"])
        self._set_target_positions(state["state_desired_goal"])

    def render(self, mode='human', close=False):
        if close:
            self._render_drawer = None
            return

        if self._render_drawer is None or self._render_drawer.terminated:
            self._render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=True,
            )
        self._draw(self._render_drawer)
        self._render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self._render_drawer.check_for_exit()

    def _draw(self, drawer):
        drawer.fill(Color('black'))
        for obj in self._all_objects:
            obj.draw(drawer, draw_target_position=self._show_goal)

        for wall in self.walls:
            draw_wall(drawer, wall)
        drawer.render()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target_obj_{}'.format(i)
                for i in range(len(self.objects))
        ] + ['success_obj_{}'.format(i) for i in range(len(self.objects))] + [
                'distance_to_target_cursor',
                'success_cursor',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def _grab_object(self):
        closest_object = None
        min_dis = None
        for obj in self.objects:
            distance = obj.distance(self.cursor.position)
            if ((distance <= self.min_grab_distance)
                    and (closest_object is None or distance < min_dis)):
                min_dis = distance
                closest_object = obj
        return closest_object

    def goal_conditioned_diagnostics(self, paths, goals):
        statistics = OrderedDict()
        stat_to_lists = defaultdict(list)
        for path, goal in zip(paths, goals):
            difference = path['observations'] - goal
            for i in range(len(self._all_objects)):
                distance = np.linalg.norm(difference[:, 2 * i:2 * i + 2],
                                          axis=-1)
                distance_key = 'distance_to_target_obj_{}'.format(i)
                stat_to_lists[distance_key].append(distance)
                success_key = 'success_obj_{}'.format(i)
                stat_to_lists[success_key].append(
                    distance < self.success_threshold)
        for stat_name, stat_list in stat_to_lists.items():
            statistics.update(
                create_stats_ordered_dict(
                    stat_name,
                    stat_list,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    '{}/final'.format(stat_name),
                    [s[-1:] for s in stat_list],
                    always_show_all_stats=True,
                    exclude_max_min=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    '{}/initial'.format(stat_name),
                    [s[:1] for s in stat_list],
                    always_show_all_stats=True,
                    exclude_max_min=True,
                ))
        return statistics
Exemplo n.º 2
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=256,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.5,
            boundary_dist=4,
            ball_radius=0.15,
            walls=None,
            init_pos_range=None,
            target_pos_range=None,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            n_bins=10,
            use_count_reward=False,
            show_discrete_grid=False,
            **kwargs):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        # if fixed_goal is not None:
        #     fixed_goal = np.array(fixed_goal)

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.n_bins = n_bins
        self.use_count_reward = use_count_reward
        self.show_discrete_grid = show_discrete_grid
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal

        self.x_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.y_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')

        if not init_pos_range:
            self.init_pos_range = self.obs_range
        else:
            assert np.all(np.abs(init_pos_range) < boundary_dist), (
                f"Init position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )
            low, high = init_pos_range
            self.init_pos_range = spaces.Box(np.array(low),
                                             np.array(high),
                                             dtype='float32')

        if not target_pos_range:
            self.target_pos_range = self.obs_range
        else:
            assert np.all(np.abs(target_pos_range) < boundary_dist), (
                f"Goal position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )

            low, high = target_pos_range
            self.target_pos_range = spaces.Box(np.array(low),
                                               np.array(high),
                                               dtype='float32')

        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('onehot_observation',
             spaces.Box(0,
                        1,
                        shape=(2 * (self.n_bins + 1), ),
                        dtype=np.float32)),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        self.drawer = None
        self.render_drawer = None

        self.reset()

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        x_d, y_d = ob['discrete_observation']
        self.bin_counts[x_d, y_d] += 1

        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            # TODO (kevintli): Make this work for general mazes
            # 'manhattan_dist_to_target': self._medium_maze_manhattan_distance(ob['state_achieved_goal'], ob['state_desired_goal']),
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
            'ext_reward': self._ext_reward,
        }
        if self.use_count_reward:
            info['count_bonus'] = self._count_bonus

        done = False
        return ob, reward, done, info

    def reset(self):
        # TODO: Make this more general
        self._target_position = self.sample_goal()['state_desired_goal']
        # if self.randomize_position_on_reset:
        self._position = self._sample_position(
            # self.obs_range.low,
            # self.obs_range.high,
            self.init_pos_range.low,
            self.init_pos_range.high,
        )
        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        if np.all(low == high):
            return low
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def clear_bin_counts(self):
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

    def _discretize_observation(self, obs):
        x, y = obs['state_observation'].copy()
        x_d, y_d = np.digitize(x, self.x_bins), np.digitize(y, self.y_bins)
        return np.array([x_d, y_d])

    def _get_obs(self):
        obs = collections.OrderedDict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=self._position.copy(),
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

        # Update with discretized state
        pos_discrete = self._discretize_observation(obs)
        pos_onehot = np.eye(self.n_bins + 1)[pos_discrete]
        obs['discrete_observation'] = pos_discrete
        obs['onehot_observation'] = pos_onehot

        return obs

    def _medium_maze_manhattan_distance(self, s1, s2):
        # Maze wall positions
        left_wall_x = -self.boundary_dist / 3
        left_wall_bottom = self.inner_wall_max_dist
        right_wall_x = self.boundary_dist / 3
        right_wall_top = -self.inner_wall_max_dist

        s1 = s1.copy()
        s2 = s2.copy()

        if len(s1.shape) == 1:
            s1, s2 = s1[None], s2[None]

        # Since maze distances are symmetric, redefine s1,s2 for convenience
        # so that points in s1 are to the left of those in s2
        combined = np.hstack((s1[:, None], s2[:, None]))
        indices = np.argmin((s1[:, 0], s2[:, 0]), axis=0)
        s1 = np.take_along_axis(combined, indices[:, None, None],
                                axis=1).squeeze(axis=1)
        s2 = np.take_along_axis(combined, 1 - indices[:, None, None],
                                axis=1).squeeze(axis=1)

        x1 = s1[:, 0]
        x2 = s2[:, 0]

        # Horizontal movement
        x_dist = np.abs(x2 - x1)

        # Vertical movement
        boundary_ys = [left_wall_bottom, right_wall_top, 0]
        boundary_xs = [left_wall_x, right_wall_x, self.boundary_dist, -4.0001]
        y_directions = [
            1, -1, 0
        ]  # +1 means need to get to bottom, -1 means need to get to top
        curr_y, goal_y = s1[:, 1], s2[:, 1]
        y_dist = np.zeros(len(s1))

        for i in range(3):
            # Get all points where s1 and s2 respectively are in the current vertical section of the maze
            curr_in_section = x1 <= boundary_xs[i]
            goal_in_section = np.logical_and(boundary_xs[i - 1] < x2,
                                             x2 <= boundary_xs[i])
            goal_after_section = x2 > boundary_xs[i]

            # Both in current section: move directly to goal
            mask = np.logical_and(curr_in_section, goal_in_section)
            y_dist += mask * np.abs(curr_y - goal_y)
            curr_y[mask] = goal_y[mask]

            # s2 is further in maze: move to next corner
            mask = np.logical_and(
                curr_in_section,
                np.logical_and(goal_after_section, y_directions[i] *
                               (boundary_ys[i] - curr_y) > 0))
            y_dist += mask * np.clip(
                y_directions[i] * (boundary_ys[i] - curr_y), 0, None)
            curr_y[mask] = boundary_ys[i]

        return x_dist + y_dist

    def compute_rewards(self, actions, obs, reward_type=None):
        reward_type = reward_type or self.reward_type

        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if reward_type == "sparse":
            r = -(d > self.target_radius).astype(np.float32)
        elif reward_type == "dense":
            r = -d
        elif reward_type == "vectorized_dense":
            r = -np.abs(achieved_goals - desired_goals)
        elif reward_type == "medium_maze_ground_truth_manhattan":
            """
            # Use maze distance from current position to goal as the negative reward.
            # This is specifically for the medium Maze-v0 env, which has two vertical walls.
            """
            r = -self._medium_maze_manhattan_distance(achieved_goals,
                                                      desired_goals)
        elif reward_type == "laplace_smoothing":
            # Laplace smoothing: 1 within the goal region, 1/(n+2) at all other states
            # (where n is the number of visitations)
            r = np.zeros(d.shape)
            pos_d = obs['discrete_observation']
            r = 1 / (self.bin_counts[pos_d[:, 0], pos_d[:, 1]] + 2)
            r[d <= self.target_radius] = 1
        elif reward_type == "none":
            r = np.zeros(d.shape)
        else:
            raise NotImplementedError(
                f"Unrecognized reward type: {reward_type}")

        if self.use_count_reward:
            # TODO: Add different count based strategies
            pos_d = obs['discrete_observation']
            self._ext_reward = r.copy()
            self._count_bonus = 1 / np.sqrt(self.bin_counts[pos_d[:, 0],
                                                            pos_d[:, 1]])
            r += self._count_bonus
        else:
            self._ext_reward = r.copy()

        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'radius',
                'target_position',
                'distance_to_target',
                'velocity',
                'speed',
                'is_success',
                'ext_reward',
                'count_bonus',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        # if self.fixed_goal:
        #     goals = np.repeat(
        #         self.fixed_goal.copy()[None],
        #         batch_size,
        #         0)
        # else:
        goals = np.zeros((batch_size, self.obs_range.low.size))
        for b in range(batch_size):
            if batch_size > 1:
                logging.warning("This is very slow!")
            goals[b, :] = self._sample_position(
                # self.obs_range.low,
                # self.obs_range.high,
                self.target_pos_range.low,
                self.target_pos_range.high,
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                # TODO(justinvyu): Action scale = 1 breaks rendering, why?
                # x_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                # y_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self.drawer)
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer):
        drawer.fill(Color('white'))

        if self.show_discrete_grid:
            for x in self.x_bins:
                drawer.draw_segment((x, -self.boundary_dist),
                                    (x, self.boundary_dist),
                                    Color(220, 220, 220, 25),
                                    aa=False)
            for y in self.y_bins:
                drawer.draw_segment((-self.boundary_dist, y),
                                    (self.boundary_dist, y),
                                    Color(220, 220, 220, 25),
                                    aa=False)

        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        try:
            drawer.draw_solid_circle(
                self._position,
                self.ball_radius,
                Color('blue'),
            )
        except ValueError as e:
            print('\n\n RENDER ERROR \n\n')

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )
        drawer.render()

    def render(self, mode='human', width=None, height=None, close=False):
        if close:
            self.render_drawer = None
            return
        if mode == 'rgb_array':
            return self.get_image(width=width, height=height)

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                # x_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                # y_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    # def get_diagnostics(self, paths, prefix=''):
    #     statistics = OrderedDict()
    #     for stat_name in ('distance_to_target', ):
    #         stat_name = stat_name
    #         stat = get_stat_in_paths(paths, 'env_infos', stat_name)
    #         statistics.update(create_stats_ordered_dict(
    #             '%s%s' % (prefix, stat_name),
    #             stat,
    #             always_show_all_stats=True,
    #             ))
    #         statistics.update(create_stats_ordered_dict(
    #             'Final %s%s' % (prefix, stat_name),
    #             [s[-1] for s in stat],
    #             always_show_all_stats=True,
    #             ))
    #     return statistics
    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Exemplo n.º 3
0
class Point2DEnv(MultitaskEnv, Serializable):
    """A little 2D point whose life goal is to reach a target.

    Attributes
    ----------
    render_dt_msec : float
        seconds before the next frame in the image is rendered
    action_l2norm_penalty : float
        penalty scale for the actions by the agent
    render_onscreen : bool
        whether to include the rendering visually (instead of simply using the
        image for the observation)
    render_size : int
        width/length number of pixels in the rendered image
    reward_type : str
        the reward type. Must be one of: "sparse", "dense", or
        "vectorized_dense"
    action_scale : float
        the multiple from action to velocity
    target_radius : float
        the radius of the targeted position when being rendered
    boundary_dist : float
        the distance from the center to the boundary
    ball_radius : float
        the radius of the agent when being rendered
    walls : list of object
        a list of wall objects describing the position of the walls and how to
        handle collisions with said walls
    fixed_goal : [float, float] or None
        the goal to use. If set to None, it is picked randomly.
    randomize_position_on_reset : bool
        whether to initialize the position of the agent randomly
    images_are_rgb : bool
        specifies whether the image is RGB. Otherwise, it's black and white
    show_goal : bool
        whether to render the goal(s)
    images_in_obs : bool
        whether to use the image in the observation
    image_size : int
        number of elements in the image. Set to 0 if images are not being used.
    action_space : gym.spaces.*
        the action space of the environment
    obs_range : gym.spaces.*
        the range of the initial position of the agent
    context_space  : gym.spaces.*
        the context space of the environment
    observation_space : gym.spaces.*
        the observation space of the environment
    drawer : multiworld.envs.pygame.pygame_viewer.PygameViewer or None
        The drawer for the images of the environment. Set to None if images are
        not being used.
    render_drawer : multiworld.envs.pygame.pygame_viewer.PygameViewer or None
        The drawer for the images of the environment if the environment is
        being rendered. Set to None if images are not being used.
    horizon : int
        environment time horizon
    t : int
        number of steps since the start of the most recent episode
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=32,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.60,
            boundary_dist=4,
            ball_radius=0.50,
            walls=None,
            fixed_goal=None,
            randomize_position_on_reset=True,
            images_are_rgb=True,  # else black and white
            show_goal=True,
            images_in_obs=True,
            **kwargs):
        """Instantiate the environment.

        Parameters
        ----------
        render_dt_msec : float
            seconds before the next frame in the image is rendered
        action_l2norm_penalty : float
            penalty scale for the actions by the agent
        render_onscreen : bool
            whether to include the rendering visually (instead of simply using
            the image for the observation)
        render_size : int
            width/length number of pixels in the rendered image
        reward_type : str
            the reward type. Must be one of: "sparse", "dense", or
            "vectorized_dense"
        action_scale : float
            the multiple from action to velocity
        target_radius : float
            the radius of the targeted position when being rendered
        boundary_dist : float
            the distance from the center to the boundary
        ball_radius : float
            the radius of the agent when being rendered
        walls : list of object
            a list of wall objects describing the position of the walls and how
            to handle collisions with said walls
        fixed_goal : [float, float] or None
            the goal to use. If set to None, it is picked randomly.
        randomize_position_on_reset : bool
            whether to initialize the position of the agent randomly
        images_are_rgb : bool
            specifies whether the image is RGB. Otherwise, it's black and white
        show_goal : bool
            whether to render the goal(s)
        images_in_obs : bool
            whether to use the image in the obsevation
        kwargs : dict
            additional arguments. Unused here.
        """
        if walls is None:
            walls = []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if len(kwargs) > 0:
            logger = logging.getLogger(__name__)
            logger.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.randomize_position_on_reset = randomize_position_on_reset
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.images_in_obs = images_in_obs
        self.image_size = 1024 * (3 if self.images_are_rgb else 1)
        if not self.images_in_obs:
            self.image_size = 0

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros(2)

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.context_space = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Box(
            np.concatenate([np.zeros([self.image_size]), -o], 0),
            np.concatenate([np.ones([self.image_size]), o], 0),
            dtype='float32')

        self.drawer = None
        self.render_drawer = None
        self.horizon = 200
        self.t = 0

    @property
    def current_context(self):
        """Return the current goal by the environment."""
        return self._target_position

    def step(self, velocities):
        """Advance the simulation by one step.

        Parameters
        ----------
        velocities : array_like
            the action by the agent, defined as its velocities in the x and y
            directions

        Returns
        -------
        array_like
            agent's observation of the current environment
        float
            amount of reward associated with the previous state/action pair
        bool
            indicates whether the episode has ended
        dict
            contains other diagnostic information from the previous action
        """
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            # Hack: sometimes you get caught on two walls at a time. If you
            # process the input in the other direction, you might only get
            # caught on one wall instead.
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self.t += 1

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        reward = self.compute_reward(velocities, {'ob': ob})
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
        }
        done = self.t >= self.horizon
        return ob, reward, done, info

    def reset(self):
        """Reset the environment."""
        self.t = 0
        self._target_position = self.sample_goal()['goals']
        if self.randomize_position_on_reset:
            self._position = self._sample_position(
                self.obs_range.low,
                self.obs_range.high,
            )

        return self._get_obs()

    def _position_inside_wall(self, pos):
        """Return True if the agent is in a wall."""
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        """Sample a starting position for the agent."""
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        """Return the observation of the agent.

        See States in the description of the environment for more.
        """
        if self.images_in_obs:
            img = self.get_image(32, 32).reshape([-1]).astype(
                np.float32) / 255.0
            return np.concatenate([img, self._position.copy()], 0)
        else:
            return self._position.copy()

    def compute_rewards(self, actions, obs):
        """See parent class.

        The rewards are described in the Rewards section of the description of
        the environment.
        """
        achieved_goals = obs['ob'][:, self.image_size:]
        desired_goals = self._target_position[np.newaxis, :]
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_goal(self):
        """See parent class."""
        return self._target_position.copy()

    def sample_goals(self, batch_size):
        """See parent class.

        The goal is the desired x,y coordinates.
        """
        if self.fixed_goal is not None:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                if batch_size > 1:
                    logging.warning("This is very slow!")
                goals[b, :] = self._sample_position(
                    self.obs_range.low,
                    self.obs_range.high,
                )
        return {'goals': goals}

    def set_position(self, pos):
        """Set the position of the agent."""
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    # ======================================================================= #
    #                     Functions for ImageEnv wrapper                      #
    # ======================================================================= #

    def get_image(self, width=None, height=None):
        """Return a black and white image."""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self.drawer)
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, b = img[:, :, 0], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def draw(self, drawer):
        """Create the image corresponding to the current state."""
        drawer.fill(Color('white'))
        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        drawer.draw_solid_circle(
            self._position,
            self.ball_radius,
            Color('blue'),
        )

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )
        drawer.render()

    def render(self, mode='human', close=False):
        """Render the environment state."""
        if mode == 'rgb_array':
            return self.get_image(self.render_size, self.render_size)

        if close:
            self.render_drawer = None
            return

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    # ======================================================================= #
    #                      Visualization/utility methods                      #
    # ======================================================================= #

    def true_model(self, state, action):
        """Return the next position by the agent.

        Parameters
        ----------
        state : array_like
            the state by the agent
        action : array_like
            the action by the agent

        Returns
        -------
        array_like
            the next position.
        """
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )

    def true_states(self, state, actions):
        """Return the next states given a set of states and actions.

        Parameters
        ----------
        state : array_like
            the states by the agent
        actions : array_like
            the actions by the agent

        Returns
        -------
        list of array_like
            the next states
        """
        real_states = [state]
        for action in actions:
            next_state = self.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    def plot_trajectory(self, ax, states, actions, goal=None):
        """Plot the trajectory of an agent.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
            the axis object to plot the figure on
        states : array_like
            the states by the agent
        actions : array_like
            the actions by the agent
        goal : [int, int]
            the x,y coordinates of the goal
        """
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -self.boundary_dist,
                -self.boundary_dist,
            ],
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            [
                self.boundary_dist,
                self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                self.boundary_dist,
                self.boundary_dist,
            ],
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            [
                -self.boundary_dist,
                -self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -self.boundary_dist - 1,
            self.boundary_dist + 1,
        )
        ax.set_xlim(
            -self.boundary_dist - 1,
            self.boundary_dist + 1,
        )
Exemplo n.º 4
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=84,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.60,
            boundary_dist=4,
            ball_radius=0.50,
            walls=None,
            fixed_goal=None,
            randomize_position_on_reset=True,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            **kwargs):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if len(kwargs) > 0:
            LOGGER = logging.getLogger(__name__)
            LOGGER.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)
        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.randomize_position_on_reset = randomize_position_on_reset
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        self.drawer = None
        self.render_drawer = None

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
        }
        done = False
        return ob, reward, done, info

    def reset(self):
        self._target_position = self.sample_goal()['state_desired_goal']
        if self.randomize_position_on_reset:
            self._position = self._sample_position(
                self.obs_range.low,
                self.obs_range.high,
            )

        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        return dict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=self._position.copy(),
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'radius',
                'target_position',
                'distance_to_target',
                'velocity',
                'speed',
                'is_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        if not self.fixed_goal is None:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                if batch_size > 1:
                    logging.warning("This is very slow!")
                goals[b, :] = self._sample_position(
                    self.obs_range.low,
                    self.obs_range.high,
                )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self.drawer)
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer):
        drawer.fill(Color('white'))
        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        drawer.draw_solid_circle(
            self._position,
            self.ball_radius,
            Color('blue'),
        )

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )
        drawer.render()

    def render(self, mode='human', close=False):
        if close:
            self.render_drawer = None
            return

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Exemplo n.º 5
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=256,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.5,
            boundary_dist=4,
            ball_radius=0.15,
            ball_pixel_radius=0,
            walls=None,
            init_pos_range=None,
            target_pos_range=None,
            sparse_goals=None,
            shuffle_states=False,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            n_bins=10,
            use_count_reward=False,
            show_discrete_grid=False,
            convert_obs_to_image=False,
            **kwargs):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        # if fixed_goal is not None:
        #     fixed_goal = np.array(fixed_goal)

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.ball_pixel_radius = ball_pixel_radius
        self.walls = walls
        self.n_bins = n_bins
        self.use_count_reward = use_count_reward
        self.show_discrete_grid = show_discrete_grid
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.sparse_goals = sparse_goals
        self.shuffle_states = shuffle_states
        self.convert_obs_to_image = convert_obs_to_image

        self.x_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.y_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

        self._random_mapping_x = np.random.permutation(range(N_SHUFFLE_BINS))
        self._random_mapping_y = np.random.permutation(range(N_SHUFFLE_BINS))

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')

        print(
            f"[Point2D] Using boundary dist: {self.boundary_dist}, obs space: ({self.obs_range.low}, {self.obs_range.high})"
        )

        if not init_pos_range:
            self.init_pos_range = self.obs_range
        else:
            assert np.all(np.abs(init_pos_range) < boundary_dist), (
                f"Init position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )
            low, high = init_pos_range
            self.init_pos_range = spaces.Box(np.array(low),
                                             np.array(high),
                                             dtype='float32')
            print(
                f"[Point2D] Using initial pos range of low={low}, high={high}")

        if not target_pos_range:
            self.target_pos_range = self.obs_range
        else:
            assert np.all(np.abs(target_pos_range) < boundary_dist), (
                f"Goal position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )

            low, high = target_pos_range
            self.target_pos_range = spaces.Box(np.array(low),
                                               np.array(high),
                                               dtype='float32')

        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('onehot_observation',
             spaces.Box(0,
                        1,
                        shape=(2 * (self.n_bins + 1), ),
                        dtype=np.float32)),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        self.drawer = None
        self.render_drawer = None

        self.reset()

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        x_d, y_d = ob['discrete_observation']
        self.bin_counts[x_d, y_d] += 1

        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
            'ext_reward': self._ext_reward,
        }

        if self.sparse_goals is not None:
            # Set reward for anything within threshold of goal(s) to 1
            # (ignore any classifier reward)
            goals, threshold = self.sparse_goals
            min_dist = self.compute_min_dist(goals)
            info['is_sparse_success'] = min_dist < threshold

        if hasattr(self, 'wall_shape'):
            if self.wall_shape == 'medium-maze':
                info[
                    'manhattan_dist_to_target'] = self._medium_maze_manhattan_distance(
                        ob['state_achieved_goal'], ob['state_desired_goal'])
            elif self.wall_shape == 'hard-maze':
                info[
                    'manhattan_dist_to_target'] = self._hard_maze_goal_distance(
                        ob['state_achieved_goal'], ob['state_desired_goal'])
            elif self.wall_shape == 'double-medium-maze':
                goals, threshold = self.sparse_goals
                info['manhattan_dist_to_sparse_goal'] = self.compute_min_dist(
                    goals, self._double_maze_manhattan_distance)
                info['manhattan_dist_to_target'] = self.compute_min_dist(
                    ob['state_desired_goal'],
                    self._double_maze_manhattan_distance)

        if self.use_count_reward:
            info['count_bonus'] = self._count_bonus

        done = False
        return ob, reward, done, info

    def compute_min_dist(
        self,
        goals,
        distance_fn=lambda s1, s2: np.linalg.norm(s1 - s2, axis=-1)):
        dists = []
        if len(goals.shape) == 1:
            goals = goals[None]
        for goal in goals:
            dist = distance_fn(self._position, goal)
            dists.append(dist)
        min_dist = np.array(dists).min(axis=0)
        return min_dist

    def reset(self):
        # TODO: Make this more general
        self._target_position = self.sample_goal()['state_desired_goal']
        # if self.randomize_position_on_reset:
        self._position = self._sample_position(
            # self.obs_range.low,
            # self.obs_range.high,
            self.init_pos_range.low,
            self.init_pos_range.high,
        )
        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        if np.all(low == high):
            return low
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def clear_bin_counts(self):
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

    def _discretize_observation(self, obs):
        x, y = obs['state_observation'].copy()
        x_d, y_d = np.digitize(x, self.x_bins), np.digitize(y, self.y_bins)
        return np.array([x_d, y_d])

    def _discretized_states(self, states, bins=16, low=-4, high=4):
        """
        Converts continuous to discrete states.
        
        Params
        - states: A shape (n, 2) batch of continuous observations
        - bins: Number of bins for both x and y coordinates
        - low: Lowest value (inclusive) for continuous x and y
        - high: Highest value (inclusive) for continuous x and y
        """
        bin_size = (high - low) / bins
        shifted_states = states - low
        return np.clip(shifted_states // bin_size, 0,
                       bins - 1).astype(np.int32)

    def _get_shuffled_states(self, states):
        if len(states.shape) == 1:
            states = states[None]
        states = self._discretized_states(states,
                                          bins=N_SHUFFLE_BINS,
                                          low=MAZE_LOW,
                                          high=MAZE_HIGH)
        results = (np.hstack([
            self._random_mapping_x[states[:, 0]][:, None],
            self._random_mapping_y[states[:, 1]][:, None]
        ]) - 8) / 2
        return results

    def _state_to_pixel_coords(self,
                               states,
                               state_min=-4,
                               state_max=4,
                               img_width=28):
        coords = (states - state_min) * (img_width / (state_max - state_min))

        # Need to subtract 1 because pygame rendering shifts everything to the left/up by 1 for some reason
        # (e.g. [-3, -3] is at 5, 5 instead of 6, 6). However this does not apply to the top/left edges.
        return np.clip(coords, 0, None).astype(np.int32)

    def _get_obs(self):
        state_obs = self._get_shuffled_states(self._position.copy()).squeeze() \
            if self.shuffle_states else self._position.copy()

        obs = collections.OrderedDict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=state_obs,
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

        # Update with discretized state
        pos_discrete = self._discretize_observation(obs)
        pos_onehot = np.eye(self.n_bins + 1)[pos_discrete]
        obs['discrete_observation'] = pos_discrete
        obs['onehot_observation'] = pos_onehot

        return obs

    def _hard_maze_goal_distance(self, s1, goal=None):
        s1 = s1.copy()
        if len(s1.shape) == 1:
            s1 = s1[None]

        x1, y1 = s1[:, 0], s1[:, 1]
        dist = np.zeros(len(s1))

        top_section = y1 <= -2
        dist[top_section] += np.abs(
            2 - x1[top_section])  # Move horizontally to corner
        dist[top_section] += -2 - y1[top_section]  # Move vertically to corner
        x1[top_section], y1[top_section] = 2, -2

        right_section = x1 >= 2
        dist[right_section] += x1[right_section] - 2  # Move horizontally
        dist[right_section] += np.abs(
            2 - y1[right_section])  # Move vertically to corner
        x1[right_section], y1[right_section] = 2, 2

        bottom_section = y1 >= 2
        dist[bottom_section] += np.abs(-2 -
                                       x1[bottom_section])  # Move horizontally
        dist[bottom_section] += y1[
            bottom_section] - 2  # Move vertically to corner
        x1[bottom_section], y1[bottom_section] = -2, 2

        left_section = x1 <= -2
        dist[left_section] += (-2 - x1[left_section])  # Move horizontally
        dist[left_section] += np.abs(
            y1[left_section])  # Move vertically to corner
        x1[left_section], y1[left_section] = -2, 0

        mid_section = np.logical_and(y1 <= 0, y1 >= -2)
        dist[mid_section] += np.abs(x1[mid_section])  # Move horizontally
        dist[mid_section] += np.abs(
            y1[mid_section])  # Move vertically to corner

        # Move to goal!!!
        dist += np.abs(x1 - (-0.5))
        dist += np.abs(y1 - 1.25)

        return dist

    def _medium_maze_manhattan_distance(self, s1, s2):
        # Maze wall positions
        left_wall_x = -self.boundary_dist / 3
        left_wall_bottom = self.inner_wall_max_dist
        right_wall_x = self.boundary_dist / 3
        right_wall_top = -self.inner_wall_max_dist

        s1 = s1.copy()
        s2 = s2.copy()

        if len(s1.shape) == 1:
            s1, s2 = s1[None], s2[None]

        # Since maze distances are symmetric, redefine s1,s2 for convenience
        # so that points in s1 are to the left of those in s2
        combined = np.hstack((s1[:, None], s2[:, None]))
        indices = np.argmin((s1[:, 0], s2[:, 0]), axis=0)
        s1 = np.take_along_axis(combined, indices[:, None, None],
                                axis=1).squeeze(axis=1)
        s2 = np.take_along_axis(combined, 1 - indices[:, None, None],
                                axis=1).squeeze(axis=1)

        x1 = s1[:, 0]
        x2 = s2[:, 0]

        # Horizontal movement
        x_dist = np.abs(x2 - x1)

        # Vertical movement
        boundary_ys = [left_wall_bottom, right_wall_top, 0]
        boundary_xs = [left_wall_x, right_wall_x, self.boundary_dist, -4.0001]
        y_directions = [
            1, -1, 0
        ]  # +1 means need to get to bottom, -1 means need to get to top
        curr_y, goal_y = s1[:, 1], s2[:, 1]
        y_dist = np.zeros(len(s1))

        for i in range(3):
            # Get all points where s1 and s2 respectively are in the current vertical section of the maze
            curr_in_section = x1 <= boundary_xs[i]
            goal_in_section = np.logical_and(boundary_xs[i - 1] < x2,
                                             x2 <= boundary_xs[i])
            goal_after_section = x2 > boundary_xs[i]

            # Both in current section: move directly to goal
            mask = np.logical_and(curr_in_section, goal_in_section)
            y_dist += mask * np.abs(curr_y - goal_y)
            curr_y[mask] = goal_y[mask]

            # s2 is further in maze: move to next corner
            mask = np.logical_and(
                curr_in_section,
                np.logical_and(goal_after_section, y_directions[i] *
                               (boundary_ys[i] - curr_y) > 0))
            y_dist += mask * np.clip(
                y_directions[i] * (boundary_ys[i] - curr_y), 0, None)
            curr_y[mask] = boundary_ys[i]

        return x_dist + y_dist

    def _double_maze_manhattan_distance(self, s1, s2):
        # Maze wall positions
        left_wall_bottom = self.inner_wall_max_dist + 1
        right_wall_top = -self.inner_wall_max_dist - 1

        s1 = s1.copy()
        s2 = s2.copy()

        if len(s1.shape) == 1:
            s1 = s1[None]
        if len(s2.shape) == 1:
            s2 = s2[None]

        # Since maze distances are symmetric, redefine s1,s2 for convenience
        # so that points in s1 are to the left of those in s2
        combined = np.hstack((s1[:, None], s2[:, None]))
        indices = np.argmin((s1[:, 0], s2[:, 0]), axis=0)
        s1 = np.take_along_axis(combined, indices[:, None, None],
                                axis=1).squeeze(axis=1)
        s2 = np.take_along_axis(combined, 1 - indices[:, None, None],
                                axis=1).squeeze(axis=1)

        x1 = s1[:, 0]
        x2 = s2[:, 0]

        # Horizontal movement
        x_dist = np.abs(x2 - x1)

        # Vertical movement
        boundary_ys = [
            right_wall_top, left_wall_bottom, right_wall_top, left_wall_bottom,
            0
        ]
        boundary_xs = [
            -3 / 5 * self.boundary_dist, -1 / 5 * self.boundary_dist,
            1 / 5 * self.boundary_dist, 3 / 5 * self.boundary_dist,
            self.boundary_dist, -4.0001
        ]
        y_directions = [
            -1, +1, -1, +1, 0
        ]  # +1 means need to get to bottom, -1 means need to get to top
        curr_y, goal_y = s1[:, 1], s2[:, 1]
        y_dist = np.zeros(len(s1))

        for i in range(5):
            # Get all points where s1 and s2 respectively are in the current vertical section of the maze
            curr_in_section = x1 <= boundary_xs[i]
            #         print(x1)
            #         print(boundary_xs[i])
            goal_in_section = np.logical_and(boundary_xs[i - 1] < x2,
                                             x2 <= boundary_xs[i])
            goal_after_section = x2 > boundary_xs[i]

            #         print("x1:", x1)
            #         print("x2:", x2)
            #         print(boundary_xs[i])
            #         print(boundary_xs[i-1])
            #         print(curr_in_section, goal_in_section)

            # Both in current section: move directly to goal
            mask = np.logical_and(curr_in_section, goal_in_section)
            y_dist += mask * np.abs(curr_y - goal_y)
            curr_y[mask] = goal_y[mask]

            # s2 is further in maze: move to next corner
            mask = np.logical_and(
                curr_in_section,
                np.logical_and(goal_after_section, y_directions[i] *
                               (boundary_ys[i] - curr_y) > 0))
            y_dist += mask * np.clip(
                y_directions[i] * (boundary_ys[i] - curr_y), 0, None)
            curr_y[mask] = boundary_ys[i]

    #     print(x_dist, y_dist)
        return x_dist + y_dist

    def compute_rewards(self, actions, obs, reward_type=None):
        reward_type = reward_type or self.reward_type

        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if reward_type == "sparse":
            r = -(d > self.target_radius).astype(np.float32)
        elif reward_type == "multi-sparse":
            goals, threshold = self.sparse_goals
            min_dist = np.array([self.compute_min_dist(goals)])
            r = -(min_dist > self.target_radius).astype(np.float32)
        elif reward_type == "dense":
            r = -d
        elif reward_type == "vectorized_dense":
            r = -np.abs(achieved_goals - desired_goals)
        elif reward_type == "medium_maze_ground_truth_manhattan":
            """
            # Use maze distance from current position to goal as the negative reward.
            # This is specifically for the medium Maze-v0 env, which has two vertical walls.
            """
            r = -self._medium_maze_manhattan_distance(achieved_goals,
                                                      desired_goals)
        elif reward_type == "laplace_smoothing":
            # Laplace smoothing: 1 within the goal region, 1/(n+2) at all other states
            # (where n is the number of visitations)
            r = np.zeros(d.shape)
            pos_d = obs['discrete_observation']
            r = 1 / (self.bin_counts[pos_d[:, 0], pos_d[:, 1]] + 2)
            r[d <= self.target_radius] = 1
        elif reward_type == "none":
            r = np.zeros(d.shape)
        else:
            raise NotImplementedError(
                f"Unrecognized reward type: {reward_type}")

        if self.use_count_reward:
            # TODO: Add different count based strategies
            pos_d = obs['discrete_observation']
            self._ext_reward = r.copy()
            self._count_bonus = 1 / np.sqrt(self.bin_counts[pos_d[:, 0],
                                                            pos_d[:, 1]])
            r += self._count_bonus
        else:
            self._ext_reward = r.copy()

        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'radius',
                'target_position',
                'distance_to_target',
                'velocity',
                'speed',
                'is_success',
                'ext_reward',
                'count_bonus',
        ] + (['is_sparse_success'] if self.sparse_goals is not None else []):
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        # if self.fixed_goal:
        #     goals = np.repeat(
        #         self.fixed_goal.copy()[None],
        #         batch_size,
        #         0)
        # else:
        goals = np.zeros((batch_size, self.obs_range.low.size))
        for b in range(batch_size):
            if batch_size > 1:
                logging.warning("This is very slow!")
            goals[b, :] = self._sample_position(
                # self.obs_range.low,
                # self.obs_range.high,
                self.target_pos_range.low,
                self.target_pos_range.high,
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None, invert_colors=False):
        """Returns a black and white image"""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                # TODO: Action scale = 1 breaks rendering, why?
                # x_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                # y_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=self.render_onscreen,
            )

        old_position = self._position
        self._position = np.array([-4, -4])

        self.draw(self.drawer)
        img = self.drawer.get_image()
        # img = skimage.transform.resize(img, (width, height), anti_aliasing=True, preserve_range=True)
        self._position = old_position

        if self.images_are_rgb:
            im = img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            im = (-r + b).transpose().flatten()

        im[0:2, 0:2] = np.array([255, 255, 255])
        im = 255 - im  # Invert image - make background black, walls white
        im = im[None]

        # Set pixel at ball position to blue for each individual observation
        pixel_coords = self._state_to_pixel_coords(self._position,
                                                   img_width=width)[None]
        for i in range(-self.ball_pixel_radius, self.ball_pixel_radius + 1):
            for j in range(-self.ball_pixel_radius,
                           self.ball_pixel_radius + 1):
                im[range(len(pixel_coords)),
                   np.clip(pixel_coords[:, 1] + i, 0, 27),
                   np.clip(pixel_coords[:, 0] + j, 0, 27)] = np.array(
                       [0, 0, 255])

        self._position = old_position
        return im[0] / 255

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer):
        drawer.fill(Color('white'))

        if self.show_discrete_grid:
            for x in self.x_bins:
                drawer.draw_segment((x, -self.boundary_dist),
                                    (x, self.boundary_dist),
                                    Color(220, 220, 220, 25),
                                    aa=False)
            for y in self.y_bins:
                drawer.draw_segment((-self.boundary_dist, y),
                                    (self.boundary_dist, y),
                                    Color(220, 220, 220, 25),
                                    aa=False)

        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        try:
            drawer.draw_solid_circle(
                self._position,
                self.ball_radius,
                Color('blue'),
            )
        except ValueError as e:
            print('\n\n RENDER ERROR \n\n')

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )
        drawer.render()

    def render(self,
               mode='human',
               width=None,
               height=None,
               invert_colors=False,
               close=False):
        if close:
            self.render_drawer = None
            return
        if mode == 'rgb_array':
            return self.get_image(width=width,
                                  height=height,
                                  invert_colors=invert_colors)

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                # x_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                # y_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    # def get_diagnostics(self, paths, prefix=''):
    #     statistics = OrderedDict()
    #     for stat_name in ('distance_to_target', ):
    #         stat_name = stat_name
    #         stat = get_stat_in_paths(paths, 'env_infos', stat_name)
    #         statistics.update(create_stats_ordered_dict(
    #             '%s%s' % (prefix, stat_name),
    #             stat,
    #             always_show_all_stats=True,
    #             ))
    #         statistics.update(create_stats_ordered_dict(
    #             'Final %s%s' % (prefix, stat_name),
    #             [s[-1] for s in stat],
    #             always_show_all_stats=True,
    #             ))
    #     return statistics
    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass