def draw_agent(self,
                map,
                agent_center_coord,
                agent_rotation,
                agent_radius_px=5):
     maps.draw_agent(map,
                     agent_center_coord,
                     agent_rotation,
                     agent_radius_px=agent_radius_px)
     return map
Exemplo n.º 2
0
def draw_top_down_map(info, heading, output_size):
    """Generates a map that displays the state of the agent in the given environment,
    for the current frame.

    Args:
        info: environment info for current frame.
        heading: where the agent heads toward.
        output_size: height of output map.
    Returns:
        the output_size x width x 1 map
    """
    top_down_map = maps.colorize_topdown_map(
        info["top_down_map"]["map"], info["top_down_map"]["fog_of_war_mask"])
    original_map_size = top_down_map.shape[:2]
    map_scale = np.array(
        (1, original_map_size[1] * 1.0 / original_map_size[0]))
    new_map_size = np.round(output_size * map_scale).astype(np.int32)
    # OpenCV expects w, h but map size is in h, w
    top_down_map = cv2.resize(top_down_map, (new_map_size[1], new_map_size[0]))

    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    map_agent_pos = np.round(map_agent_pos * new_map_size /
                             original_map_size).astype(np.int32)
    top_down_map = maps.draw_agent(
        top_down_map,
        map_agent_pos,
        heading - np.pi / 2,
        agent_radius_px=top_down_map.shape[0] / 40,
    )
    return top_down_map
Exemplo n.º 3
0
def generate_map_frame(env, obs_dim):
    """Generates a map that displays the state of the agent in the given environment,
    for the current frame.

    Args:
        env: sim environment.
        obs_dim: dimension of observations.
    Returns:
        the height x width x 1 map
    """
    # draw map
    top_down_map = maps.get_topdown_map(sim=env.sim,
                                        map_resolution=(1250, 1250, 1),
                                        num_samples=200)
    # draw agent
    # TODO: figure out how to convert quaternion to angles to get agent_rotation
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=(1, 1),
        agent_rotation=0.0,
        agent_radius_px=8,
    )
    # scale top down map to align with rgb view
    old_h, old_w, _ = top_down_map.shape
    top_down_height = obs_dim[0]
    top_down_width = int(float(top_down_height) / old_h * old_w)
    # cv2 resize (dsize is width first)
    top_down_map = cv2.resize(
        top_down_map,
        (top_down_width, top_down_height),
        interpolation=cv2.INTER_CUBIC,
    )
    top_down_map = np.expand_dims(top_down_map, axis=2)

    return top_down_map
Exemplo n.º 4
0
def observations_to_image(observation: Dict, info: Dict) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().

    Returns:
        generated image of a single frame.
    """
    egocentric_view = []
    if "rgb" in observation:
        observation_size = observation["rgb"].shape[0]
        egocentric_view.append(observation["rgb"][:, :, :3])

    # draw depth map if observation has depth info
    if "depth" in observation:
        observation_size = observation["depth"].shape[0]
        depth_map = (observation["depth"].squeeze() * 255).astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)
        egocentric_view.append(depth_map)

    assert (len(egocentric_view) >
            0), "Expected at least one visual sensor enabled."
    egocentric_view = np.concatenate(egocentric_view, axis=1)

    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"])
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        old_h, old_w, _ = top_down_map.shape
        top_down_height = observation_size
        top_down_width = int(float(top_down_height) / old_h * old_w)
        # cv2 resize (dsize is width first)
        top_down_map = cv2.resize(
            top_down_map,
            (top_down_width, top_down_height),
            interpolation=cv2.INTER_CUBIC,
        )
        frame = np.concatenate((egocentric_view, top_down_map), axis=1)
    return frame
Exemplo n.º 5
0
def draw_top_down_map(info):
    top_down_map = info["top_down_map"]["map"]

    top_down_map = maps.colorize_topdown_map(top_down_map)
    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=info["top_down_map"]["agent_angle"],
        agent_radius_px=top_down_map.shape[0] // 25,
    )

    return top_down_map
Exemplo n.º 6
0
def plot_top_down_map(info, dataset='replica', pred=None):
    top_down_map = info["top_down_map"]["map"]
    top_down_map = maps.colorize_topdown_map(
        top_down_map, info["top_down_map"]["fog_of_war_mask"])
    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    if dataset == 'replica':
        agent_radius_px = top_down_map.shape[0] // 16
    else:
        agent_radius_px = top_down_map.shape[0] // 50
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=info["top_down_map"]["agent_angle"],
        agent_radius_px=agent_radius_px)
    if pred is not None:
        from habitat.utils.geometry_utils import quaternion_rotate_vector

        source_rotation = info["top_down_map"]["agent_rotation"]

        rounded_pred = np.round(pred[1])
        direction_vector_agent = np.array(
            [rounded_pred[1], 0, -rounded_pred[0]])
        direction_vector = quaternion_rotate_vector(source_rotation,
                                                    direction_vector_agent)

        grid_size = (
            (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
            (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
        )
        delta_x = int(-direction_vector[0] / grid_size[0])
        delta_y = int(direction_vector[2] / grid_size[1])

        x = np.clip(map_agent_pos[0] + delta_x,
                    a_min=0,
                    a_max=top_down_map.shape[0])
        y = np.clip(map_agent_pos[1] + delta_y,
                    a_min=0,
                    a_max=top_down_map.shape[1])
        point_padding = 20
        for m in range(x - point_padding, x + point_padding + 1):
            for n in range(y - point_padding, y + point_padding + 1):
                if np.linalg.norm(np.array([m - x, n - y])) <= point_padding and \
                        0 <= m < top_down_map.shape[0] and 0 <= n < top_down_map.shape[1]:
                    top_down_map[m, n] = (0, 255, 255)
        if np.linalg.norm(rounded_pred) < 1:
            assert delta_x == 0 and delta_y == 0

    if top_down_map.shape[0] > top_down_map.shape[1]:
        top_down_map = np.rot90(top_down_map, 1)
    return top_down_map
Exemplo n.º 7
0
def plot_top_down_map(info, dataset='replica'):
    top_down_map = info["top_down_map"]["map"]
    top_down_map = maps.colorize_topdown_map(
        top_down_map, info["top_down_map"]["fog_of_war_mask"])
    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    if dataset == 'replica':
        agent_radius_px = top_down_map.shape[0] // 16
    else:
        agent_radius_px = top_down_map.shape[0] // 50
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=info["top_down_map"]["agent_angle"],
        agent_radius_px=agent_radius_px)

    if top_down_map.shape[0] > top_down_map.shape[1]:
        top_down_map = np.rot90(top_down_map, 1)
    return top_down_map
def draw_top_down_map(info, heading, output_size):
    top_down_map = maps.colorize_topdown_map(info["top_down_map"]["map"])
    original_map_size = top_down_map.shape[:2]
    map_scale = np.array(
        (1, original_map_size[1] * 1.0 / original_map_size[0]))
    new_map_size = np.round(output_size * map_scale).astype(np.int32)
    # OpenCV expects w, h but map size is in h, w
    top_down_map = cv2.resize(top_down_map, (new_map_size[1], new_map_size[0]))

    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    map_agent_pos = np.round(map_agent_pos * new_map_size /
                             original_map_size).astype(np.int32)
    top_down_map = maps.draw_agent(
        top_down_map,
        map_agent_pos,
        heading - np.pi / 2,
        agent_radius_px=top_down_map.shape[0] / 40,
    )
    return top_down_map
Exemplo n.º 9
0
def topdown_to_image(topdown_info: np.ndarray) -> np.ndarray:
    r"""Convert topdown map to an RGB image.
    """
    top_down_map = topdown_info["map"]
    fog_of_war_mask = topdown_info["fog_of_war_mask"]
    top_down_map = maps.colorize_topdown_map(top_down_map, fog_of_war_mask)
    map_agent_pos = topdown_info["agent_map_coord"]

    # Add zero padding
    min_map_size = 200
    if top_down_map.shape[0] != top_down_map.shape[1]:
        H = top_down_map.shape[0]
        W = top_down_map.shape[1]
        if H > W:
            pad_value = (H - W) // 2
            padding = ((0, 0), (pad_value, pad_value), (0, 0))
            map_agent_pos = (map_agent_pos[0], map_agent_pos[1] + pad_value)
        else:
            pad_value = (W - H) // 2
            padding = ((pad_value, pad_value), (0, 0), (0, 0))
            map_agent_pos = (map_agent_pos[0] + pad_value, map_agent_pos[1])
        top_down_map = np.pad(top_down_map,
                              padding,
                              mode="constant",
                              constant_values=255)

    if top_down_map.shape[0] < min_map_size:
        H, W = top_down_map.shape[:2]
        top_down_map = cv2.resize(top_down_map, (min_map_size, min_map_size))
        map_agent_pos = (
            int(map_agent_pos[0] * min_map_size // H),
            int(map_agent_pos[1] * min_map_size // W),
        )
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=topdown_info["agent_angle"],
        agent_radius_px=top_down_map.shape[0] // 16,
    )

    return top_down_map
Exemplo n.º 10
0
                    path_point[2],
                    path_point[0],
                    grid_dimensions,
                    pathfinder=sim.pathfinder,
                )
                for path_point in path_points
            ]
            grid_tangent = mn.Vector2(
                trajectory[1][1] - trajectory[0][1], trajectory[1][0] - trajectory[0][0]
            )
            path_initial_tangent = grid_tangent / grid_tangent.length()
            initial_angle = math.atan2(path_initial_tangent[0], path_initial_tangent[1])
            # draw the agent and trajectory on the map
            maps.draw_path(top_down_map, trajectory)
            maps.draw_agent(
                top_down_map, trajectory[0], initial_angle, agent_radius_px=8
            )
            print("\nDisplay the map with agent and path overlay:")
            display_map(top_down_map)

        # @markdown 4. (optional) Place agent and render images at trajectory points (if found).
        display_path_agent_renders = True  # @param{type:"boolean"}
        if display_path_agent_renders:
            print("Rendering observations at path points:")
            tangent = path_points[1] - path_points[0]
            agent_state = habitat_sim.AgentState()
            for ix, point in enumerate(path_points):
                if ix < len(path_points) - 1:
                    tangent = path_points[ix + 1] - point
                    agent_state.position = point
                    tangent_orientation_matrix = mn.Matrix4.look_at(
Exemplo n.º 11
0
def observations_to_image(observation: Dict, info: Dict) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().

    Returns:
        generated image of a single frame.
    """
    observation_size = observation["rgb"].shape[0]
    egocentric_view = observation["rgb"][:, :, -3:]
    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    if "goal_coord_in_camera" in observation:
        _, _, _, xpx, ypx = observation["goal_coord_in_camera"]

        if xpx != -1 and ypx != -1:

            xpx = int(xpx * observation_size + observation_size / 2)
            ypx = int(ypx * observation_size + observation_size / 2)

            egocentric_view = cv2.circle(egocentric_view, (xpx, ypx), 15,
                                         (0, 0, 255), 5)

    # draw depth map if observation has depth info
    if "depth" in observation:
        depth_map = (observation["depth"][:, :, -1] * 255).astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)

        egocentric_view = np.concatenate((egocentric_view, depth_map), axis=1)

    if "goalclass" in observation:
        from habitat.tasks.nav.nav_task_multi_goal import CLASSES
        index = np.nonzero(observation["goalclass"])[0][0]
        classes = list(CLASSES.keys())
        class_name = classes[index]
        cv2.putText(egocentric_view, class_name, (15, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255))

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"])
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        old_h, old_w, _ = top_down_map.shape
        top_down_height = observation_size
        top_down_width = int(float(top_down_height) / old_h * old_w)
        # cv2 resize (dsize is width first)
        top_down_map = cv2.resize(
            top_down_map,
            (top_down_width, top_down_height),
            interpolation=cv2.INTER_CUBIC,
        )
        frame = np.concatenate((egocentric_view, top_down_map), axis=1)

    return frame
Exemplo n.º 12
0
def obs_to_images(obs):
    img = obs["rgb"].copy()
    images = [img.transpose(0, 2, 3, 1)]

    # Draw top down view
    if "visited_grid" in obs:
        top_down_map = obs["visited_grid"][0, ...]
    elif "top_down_map" in obs:
        top_down_map = maps.colorize_topdown_map(obs["top_down_map"]["map"])
        map_size = 1024
        original_map_size = top_down_map.shape[:2]
        if original_map_size[0] > original_map_size[1]:
            map_scale = np.array(
                (1, original_map_size[1] * 1.0 / original_map_size[0]))
        else:
            map_scale = np.array(
                (original_map_size[0] * 1.0 / original_map_size[1], 1))

        new_map_size = np.round(map_size * map_scale).astype(np.int32)
        # OpenCV expects w, h but map size is in h, w
        top_down_map = cv2.resize(top_down_map,
                                  (new_map_size[1], new_map_size[0]))

        map_agent_pos = obs["top_down_map"]["agent_map_coord"]
        map_agent_pos = np.round(map_agent_pos * new_map_size /
                                 original_map_size).astype(np.int32)
        top_down_map = maps.draw_agent(top_down_map,
                                       map_agent_pos,
                                       obs["heading"] - np.pi / 2,
                                       agent_radius_px=top_down_map.shape[0] /
                                       40)
    else:
        top_down_map = None

    normalize = [True]
    titles = [(
        ("Method: %s" % obs["method"].replace("_", " ")),
        ("Step: %03d Reward: %.3f" %
         (obs["step"][0], obs.get("reward", [0])[0])),
        ("Action: %s" %
         string.capwords(obs["action_taken_name"].replace("_", " "))),
    )]
    images.append(top_down_map)
    if "visited" in obs:
        titles.append((("Visited Cube Count:  %d" % obs["visited"][0]), ))
    elif "distance_from_start" in obs:
        titles.append("Geo Dist From Origin: %.3f" %
                      obs["distance_from_start"][0])
    elif "pointgoal" in obs:
        titles.append((("Euc Dist:  %.3f" % obs["pointgoal"][0, 0]),
                       ("Geo Dist: %.3f" % obs["goal_geodesic_distance"][0])))

    normalize.append(False)

    for key, val in obs.items():
        if key == "depth" or key == "output_depth":
            normalize.append(False)
            val = val[:, 0, ...]
            depth = np.clip(val, -0.5, 0.5)
            depth += 0.5
            depth *= 255
            titles.append(key)
            depth = depth.astype(np.uint8)
            depth = np.reshape(depth, (-1, depth.shape[-1]))
            images.append(depth)
        elif key == "surface_normals" or key == "output_surface_normals":
            titles.append(key)
            normalize.append(False)
            val = val.copy()
            if key == "output_surface_normals":
                # Still need to be normalized
                val /= np.sqrt(np.sum(val**2, axis=1, keepdims=True))
            surfnorm = (np.clip(
                (val + 1), 0, 2) * 127).astype(np.uint8).transpose(
                    (0, 2, 3, 1))
            images.append(surfnorm)
        elif key == "semantic":
            titles.append(key)
            normalize.append(False)
            seg = (val * 314.159 % 255).astype(np.uint8)
            seg = np.reshape(seg, (-1, seg.shape[-1]))
            images.append(seg)
        elif key == "output_reconstruction":
            titles.append(key)
            normalize.append(False)
            val = np.clip(val, -0.5, 0.5)
            val += 0.5
            val *= 255
            val = val.astype(np.uint8).transpose((0, 2, 3, 1))
            images.append(val)
        elif key in {
                "action_prob", "action_taken", "egomotion_pred",
                "best_next_action"
        }:
            if key == "action_prob":
                titles.append(("Output Distribution",
                               "p(Forward)     p(Left)     p(Right)"))
            else:
                titles.append(key)
            if val is not None:
                normalize.append(True)
                prob_hists = np.concatenate(
                    [draw_probability_hist(pi) for pi in val.copy()], axis=0)
                images.append(prob_hists)

            else:
                images.append(None)
                normalize.append(False)
    images.append(top_down_map)
    normalize.append(True)
    titles = [
        string.capwords(title.replace("_", " "))
        if isinstance(title, str) else title for title in titles
    ]
    return images, titles, normalize
Exemplo n.º 13
0
def observations_to_image(observation: Dict, info: Dict, reward, weights_output=None, aux_tasks=[]) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().
        reward: float to append
        weights_output: attention weights for viz

    Returns:
        generated image of a single frame.
    """
    egocentric_view = []
    if "rgb" in observation:
        observation_size = observation["rgb"].shape[0]
        rgb = observation["rgb"]
        if not isinstance(rgb, np.ndarray):
            rgb = rgb.cpu().numpy()

        egocentric_view.append(rgb)

    # draw depth map if observation has depth info
    if "depth" in observation:
        observation_size = observation["depth"].shape[0]
        depth_map = observation["depth"].squeeze() * 255.0
        if not isinstance(depth_map, np.ndarray):
            depth_map = depth_map.cpu().numpy()

        depth_map = depth_map.astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)
        egocentric_view.append(depth_map)

    assert (
        len(egocentric_view) > 0
    ), "Expected at least one visual sensor enabled."
    egocentric_view = np.concatenate(egocentric_view, axis=1)

    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"]
        )
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        old_h, old_w, _ = top_down_map.shape
        top_down_height = observation_size
        top_down_width = int(float(top_down_height) / old_h * old_w)
        # cv2 resize (dsize is width first)
        top_down_map = cv2.resize(
            top_down_map,
            (top_down_width, top_down_height),
            interpolation=cv2.INTER_CUBIC,
        )
        frame = np.concatenate((egocentric_view, top_down_map), axis=1)

    if weights_output is not None and len(aux_tasks) > 1:
        # add a strip to the right of the video
        strip_height = observation_size # ~256 -> we'll have 5-10 tasks, let's do 24 pixels each
        strip_gap = 24
        strip_width = strip_gap + 12
        strip = np.ones((strip_height, strip_width, 3), dtype=np.uint8) * 255 # white bg

        num_tasks = weights_output.size(0)
        total_height = num_tasks * strip_gap
        offset = int((strip_height - total_height)/2)
        assert offset > 0, "too many aux tasks to visualize"
        for i in range(num_tasks):
            start_height = i * strip_gap + offset
            strength = int(255 * weights_output[i])
            color = np.array([strength, 0, 0])
            if weights_output[i] > 1.001:
                raise Exception(f"weights is {weights_output}, that's too big")
            strip[start_height: start_height + strip_gap] = color

            task_name = AUX_ABBREV.get(aux_tasks[i], aux_tasks[i])
            task_abbrev = task_name[:3]
            cv2.putText(img=strip,
                text=f"{task_abbrev}", org=(2, int(start_height + strip_gap / 2)),
                fontFace=2, fontScale=.4, color=(256, 256, 256), thickness=1)
        frame = np.concatenate((frame, strip), axis=1)
    return frame
Exemplo n.º 14
0
def observations_to_image(observation: Dict,
                          info: Dict,
                          pred=None) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().

    Returns:
        generated image of a single frame.
    """
    egocentric_view = []
    if "rgb" in observation:
        observation_size = observation["rgb"].shape[0]
        rgb = observation["rgb"]
        if not isinstance(rgb, np.ndarray):
            rgb = rgb.cpu().numpy()

        egocentric_view.append(rgb)

    # draw depth map if observation has depth info
    if "depth" in observation:
        observation_size = observation["depth"].shape[0]
        depth_map = observation["depth"].squeeze() * 255.0
        if not isinstance(depth_map, np.ndarray):
            depth_map = depth_map.cpu().numpy()

        depth_map = depth_map.astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)
        egocentric_view.append(depth_map)

    assert (len(egocentric_view) >
            0), "Expected at least one visual sensor enabled."
    egocentric_view = np.concatenate(egocentric_view, axis=1)

    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"])
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )
        if pred is not None:
            from habitat.utils.geometry_utils import quaternion_rotate_vector

            # current_position = sim.get_agent_state().position
            # agent_state = sim.get_agent_state()
            source_rotation = info["top_down_map"]["agent_rotation"]

            rounded_pred = np.round(pred[1])
            direction_vector_agent = np.array(
                [rounded_pred[1], 0, -rounded_pred[0]])
            direction_vector = quaternion_rotate_vector(
                source_rotation, direction_vector_agent)
            # pred_goal_location = source_position + direction_vector.astype(np.float32)

            grid_size = (
                (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
                (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
            )
            delta_x = int(-direction_vector[0] / grid_size[0])
            delta_y = int(direction_vector[2] / grid_size[1])

            x = np.clip(map_agent_pos[0] + delta_x,
                        a_min=0,
                        a_max=top_down_map.shape[0])
            y = np.clip(map_agent_pos[1] + delta_y,
                        a_min=0,
                        a_max=top_down_map.shape[1])
            point_padding = 12
            for m in range(x - point_padding, x + point_padding + 1):
                for n in range(y - point_padding, y + point_padding + 1):
                    if np.linalg.norm(np.array([m - x, n - y])) <= point_padding and \
                            0 <= m < top_down_map.shape[0] and 0 <= n < top_down_map.shape[1]:
                        top_down_map[m, n] = (0, 255, 255)
            if np.linalg.norm(rounded_pred) < 1:
                assert delta_x == 0 and delta_y == 0

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        if pred is None:
            old_h, old_w, _ = top_down_map.shape
            top_down_height = observation_size
            top_down_width = int(float(top_down_height) / old_h * old_w)
            # cv2 resize (dsize is width first)
            top_down_map = cv2.resize(
                top_down_map.astype(np.float32),
                (top_down_width, top_down_height),
                interpolation=cv2.INTER_CUBIC,
            )
        else:
            # draw label
            CATEGORY_INDEX_MAPPING = {
                'chair': 0,
                'table': 1,
                'picture': 2,
                'cabinet': 3,
                'cushion': 4,
                'sofa': 5,
                'bed': 6,
                'chest_of_drawers': 7,
                'plant': 8,
                'sink': 9,
                'toilet': 10,
                'stool': 11,
                'towel': 12,
                'tv_monitor': 13,
                'shower': 14,
                'bathtub': 15,
                'counter': 16,
                'fireplace': 17,
                'gym_equipment': 18,
                'seating': 19,
                'clothes': 20
            }
            index2label = {v: k for k, v in CATEGORY_INDEX_MAPPING.items()}
            pred_label = index2label[pred[0]]
            text_height = int(observation_size * 0.1)

            old_h, old_w, _ = top_down_map.shape
            top_down_height = observation_size - text_height
            top_down_width = int(float(top_down_height) / old_h * old_w)
            # cv2 resize (dsize is width first)
            top_down_map = cv2.resize(
                top_down_map.astype(np.float32),
                (top_down_width, top_down_height),
                interpolation=cv2.INTER_CUBIC,
            )

            top_down_map = np.concatenate([
                np.ones([text_height, top_down_map.shape[1], 3],
                        dtype=np.int32) * 255, top_down_map
            ],
                                          axis=0)
            top_down_map = cv2.putText(top_down_map,
                                       'C_t: ' + pred_label.replace('_', ' '),
                                       (10, text_height - 10),
                                       cv2.FONT_HERSHEY_SIMPLEX, 1.4,
                                       (0, 0, 0), 2, cv2.LINE_AA)

        frame = np.concatenate((egocentric_view, top_down_map), axis=1)
    return frame