def _compute_relative_local_goals(self, agent_world_pose, M, s):
    """
    Converts a local goal (x, y) position in the map to egocentric
    coordinates relative to the agent's current pose.
    """
    local_map_goals = self.states["curr_local_goals"]
    local_world_goals = convert_map2world(local_map_goals, (M, M), s)  # (bs, 2)
    # Concatenate dummy directions to the goals
    local_world_goals = torch.cat(
        [
            local_world_goals,
            torch.zeros(self.nplanners, 1).to(agent_world_pose.device),
        ],
        dim=1,
    )
    relative_goals = subtract_pose(agent_world_pose, local_world_goals)[:, :2]

    return relative_goals
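# Illustrative sketch (not part of the codebase): the conversion above relies on
# subtract_pose() expressing the goal in the agent's egocentric frame. Assuming
# poses are (x, y, heading) rows with heading measured counter-clockwise, the
# underlying math is a translation followed by a rotation into the agent frame.
# The helper name below is hypothetical.
import torch


def egocentric_goal_sketch(agent_pose, goal_xy):
    # agent_pose: (bs, 3) tensor of (x, y, theta); goal_xy: (bs, 2) tensor
    dx = goal_xy[:, 0] - agent_pose[:, 0]
    dy = goal_xy[:, 1] - agent_pose[:, 1]
    theta = agent_pose[:, 2]
    # Rotate the world-frame offset by -theta to obtain egocentric coordinates
    x_rel = dx * torch.cos(theta) + dy * torch.sin(theta)
    y_rel = -dx * torch.sin(theta) + dy * torch.cos(theta)
    return torch.stack([x_rel, y_rel], dim=1)  # (bs, 2)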
def map_update_fn(ps_args):
    # Unpack args
    mapper = ps_args[0]
    mapper_rollouts = ps_args[1]
    optimizer = ps_args[2]
    num_update_batches = ps_args[3]
    batch_size = ps_args[4]
    freeze_projection_unit = ps_args[5]
    bias_factor = ps_args[6]
    occupancy_anticipator_type = ps_args[7]
    pose_loss_coef = ps_args[8]
    max_grad_norm = ps_args[9]
    label_id = ps_args[10]

    # Perform update
    losses = {
        "total_loss": 0,
        "mapping_loss": 0,
        "trans_loss": 0,
        "rot_loss": 0,
    }
    if isinstance(mapper, nn.DataParallel):
        mapper_config = mapper.module.config
    else:
        mapper_config = mapper.config

    img_mean = mapper_config.NORMALIZATION.img_mean
    img_std = mapper_config.NORMALIZATION.img_std

    start_time = time.time()
    # Debugging
    map_update_profile = {"data_sampling": 0.0, "pytorch_update": 0.0}
    for i in range(num_update_batches):
        start_time_sample = time.time()
        observations = mapper_rollouts.sample(batch_size)
        map_update_profile["data_sampling"] += time.time() - start_time_sample

        # Labels
        # Pose labels
        start_time_pyt = time.time()
        device = observations["pose_gt_at_t_1"].device
        pose_gt_at_t_1 = observations["pose_gt_at_t_1"]
        pose_gt_at_t = observations["pose_gt_at_t"]
        pose_at_t_1 = observations["pose_at_t_1"]
        pose_at_t = observations["pose_at_t"]
        dpose_gt = subtract_pose(pose_gt_at_t_1, pose_gt_at_t)  # (bs, 3)
        dpose_noisy = subtract_pose(pose_at_t_1, pose_at_t)  # (bs, 3)
        ddpose_gt = subtract_pose(dpose_noisy, dpose_gt)

        # Map labels
        pt_gt = observations[f"{label_id}_at_t"]  # (bs, V, V, 2)
        pt_gt = rearrange(pt_gt, "b h w c -> b c h w")  # (bs, 2, V, V)

        # Forward pass
        mapper_inputs = observations
        mapper_outputs = mapper(mapper_inputs, method_name="predict_deltas")
        pt_hat = mapper_outputs["pt"]

        # Compute losses
        # -------- mapping loss ---------
        mapping_loss = simple_mapping_loss_fn(pt_hat, pt_gt)
        if freeze_projection_unit:
            mapping_loss = mapping_loss.detach()

        if occupancy_anticipator_type == "rgb_model_v2":
            ego_map_gt = observations["ego_map_gt_at_t"]  # (bs, V, V, 2)
            ego_map_gt = rearrange(ego_map_gt, "b h w c -> b c h w")
            ego_map_hat = mapper_outputs["all_pu_outputs"]["depth_proj_estimate"]
            mapping_loss = mapping_loss + simple_mapping_loss_fn(
                ego_map_hat, ego_map_gt
            )

        all_pose_outputs = mapper_outputs["all_pose_outputs"]
        if all_pose_outputs is None:
            pose_estimation_loss = torch.zeros([0]).to(device).sum()
            trans_loss = torch.zeros([0]).to(device).sum()
            rot_loss = torch.zeros([0]).to(device).sum()
        else:
            pose_outputs = all_pose_outputs["pose_outputs"]
            pose_estimation_loss, trans_loss, rot_loss = 0, 0, 0
            n_outputs = len(list(pose_outputs.keys()))
            # The pose prediction outputs are computed for individual
            # modalities and then weighted-averaged according to an ensemble
            # MLP. Here, the loss is computed for each modality as well as the
            # ensemble. Finally, it is averaged across the modalities.
            pose_label = ddpose_gt
            for _, dpose_hat in pose_outputs.items():
                curr_pose_losses = pose_loss_fn(dpose_hat, pose_label)
                pose_estimation_loss = pose_estimation_loss + curr_pose_losses[0]
                trans_loss = trans_loss + curr_pose_losses[1]
                rot_loss = rot_loss + curr_pose_losses[2]
            pose_estimation_loss = pose_estimation_loss / n_outputs
            trans_loss = trans_loss / n_outputs
            rot_loss = rot_loss / n_outputs

        optimizer.zero_grad()
        total_loss = mapping_loss + pose_estimation_loss * pose_loss_coef

        # Backward pass
        total_loss.backward()

        # Update
        nn.utils.clip_grad_norm_(mapper.parameters(), max_grad_norm)
        optimizer.step()

        losses["total_loss"] += total_loss.item()
        losses["mapping_loss"] += mapping_loss.item()
        losses["trans_loss"] += trans_loss.item()
        losses["rot_loss"] += rot_loss.item()

        map_update_profile["pytorch_update"] += time.time() - start_time_pyt
        # Average minutes per batch so far (debugging only)
        time_per_step = (time.time() - start_time) / (60 * (i + 1))

    losses["pose_loss"] = losses["trans_loss"] + losses["rot_loss"]
    for k in losses.keys():
        losses[k] /= num_update_batches

    return losses
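# Usage sketch (hypothetical wrapper, not part of the codebase): map_update_fn()
# expects a single tuple packed in the positional order unpacked above. The
# numeric settings, the "gt_map" label_id, and the wrapper name below are
# illustrative assumptions; mapper, mapper_rollouts, and optimizer are assumed
# to be the mapping module, rollout buffer, and torch.optim optimizer used
# elsewhere in the training loop.
def run_map_updates_sketch(mapper, mapper_rollouts, optimizer):
    ps_args = (
        mapper,           # nn.Module (possibly wrapped in nn.DataParallel)
        mapper_rollouts,  # buffer exposing sample(batch_size) -> dict of tensors
        optimizer,        # optimizer over mapper.parameters()
        100,              # num_update_batches
        32,               # batch_size
        False,            # freeze_projection_unit
        0.0,              # bias_factor (unpacked but unused in the loop above)
        "rgb_model_v2",   # occupancy_anticipator_type
        1.0,              # pose_loss_coef
        0.5,              # max_grad_norm
        "gt_map",         # label_id -> reads observations["gt_map_at_t"]
    )
    return map_update_fn(ps_args)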
def predict_deltas(self, x, masks=None):
    # Transpose multichannel inputs
    st_1 = process_image(x["rgb_at_t_1"], self.img_mean_t, self.img_std_t)
    dt_1 = transpose_image(x["depth_at_t_1"])
    ego_map_gt_at_t_1 = transpose_image(x["ego_map_gt_at_t_1"])
    st = process_image(x["rgb_at_t"], self.img_mean_t, self.img_std_t)
    dt = transpose_image(x["depth_at_t"])
    ego_map_gt_at_t = transpose_image(x["ego_map_gt_at_t"])

    # This happens only for a baseline
    if (
        "ego_map_gt_anticipated_at_t_1" in x
        and x["ego_map_gt_anticipated_at_t_1"] is not None
    ):
        ego_map_gt_anticipated_at_t_1 = transpose_image(
            x["ego_map_gt_anticipated_at_t_1"]
        )
        ego_map_gt_anticipated_at_t = transpose_image(
            x["ego_map_gt_anticipated_at_t"]
        )
    else:
        ego_map_gt_anticipated_at_t_1 = None
        ego_map_gt_anticipated_at_t = None

    # Compute past and current egocentric maps
    bs = st_1.size(0)
    pu_inputs_t_1 = {
        "rgb": st_1,
        "depth": dt_1,
        "ego_map_gt": ego_map_gt_at_t_1,
        "ego_map_gt_anticipated": ego_map_gt_anticipated_at_t_1,
    }
    pu_inputs_t = {
        "rgb": st,
        "depth": dt,
        "ego_map_gt": ego_map_gt_at_t,
        "ego_map_gt_anticipated": ego_map_gt_anticipated_at_t,
    }
    pu_inputs = self._safe_cat(pu_inputs_t_1, pu_inputs_t)
    pu_outputs = self.projection_unit(pu_inputs)
    pu_outputs_t = {k: v[bs:] for k, v in pu_outputs.items()}
    pt_1, pt = pu_outputs["occ_estimate"][:bs], pu_outputs["occ_estimate"][bs:]

    # Compute relative pose
    dx = subtract_pose(x["pose_at_t_1"], x["pose_at_t"])

    # Estimate pose
    dx_hat = dx
    xt_hat = x["pose_at_t"]
    all_pose_outputs = None
    if not self.config.ignore_pose_estimator:
        all_pose_outputs = {}
        pose_inputs = {}
        if "rgb" in self.config.pose_predictor_inputs:
            pose_inputs["rgb_t_1"] = st_1
            pose_inputs["rgb_t"] = st
        if "depth" in self.config.pose_predictor_inputs:
            pose_inputs["depth_t_1"] = dt_1
            pose_inputs["depth_t"] = dt
        if "ego_map" in self.config.pose_predictor_inputs:
            pose_inputs["ego_map_t_1"] = pt_1
            pose_inputs["ego_map_t"] = pt
        if self.config.detach_map:
            for k in pose_inputs.keys():
                pose_inputs[k] = pose_inputs[k].detach()
        n_pose_inputs = self._transform_observations(pose_inputs, dx)
        pose_outputs = self.pose_estimator(n_pose_inputs)
        dx_hat = add_pose(dx, pose_outputs["pose"])
        all_pose_outputs["pose_outputs"] = pose_outputs

    # Estimate global pose
    xt_hat = add_pose(x["pose_hat_at_t_1"], dx_hat)

    # Zero out pose prediction based on the mask
    if masks is not None:
        xt_hat = xt_hat * masks
        dx_hat = dx_hat * masks

    outputs = {
        "pt": pt,
        "dx_hat": dx_hat,
        "xt_hat": xt_hat,
        "all_pu_outputs": pu_outputs_t,
        "all_pose_outputs": all_pose_outputs,
    }
    if "ego_map_hat" in pu_outputs_t:
        outputs["ego_map_hat_at_t"] = pu_outputs_t["ego_map_hat"]

    return outputs
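# Input sketch (not part of the codebase): the tensor shapes below are
# assumptions inferred from the surrounding code (channels-last images, since
# transpose_image() is applied; (bs, V, V, 2) egocentric maps; (x, y, heading)
# poses). It only illustrates the dictionary predict_deltas() consumes.
import torch

bs, H, W, V = 2, 128, 128, 101  # hypothetical sizes
dummy_inputs = {
    "rgb_at_t_1": torch.rand(bs, H, W, 3),
    "rgb_at_t": torch.rand(bs, H, W, 3),
    "depth_at_t_1": torch.rand(bs, H, W, 1),
    "depth_at_t": torch.rand(bs, H, W, 1),
    "ego_map_gt_at_t_1": torch.rand(bs, V, V, 2),
    "ego_map_gt_at_t": torch.rand(bs, V, V, 2),
    "pose_at_t_1": torch.zeros(bs, 3),      # (x, y, heading) at t-1
    "pose_at_t": torch.zeros(bs, 3),        # (x, y, heading) at t
    "pose_hat_at_t_1": torch.zeros(bs, 3),  # estimated global pose at t-1
}
# outputs = mapper.predict_deltas(dummy_inputs)
# outputs["pt"] would then be the (bs, 2, V, V) egocentric map estimate at t.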
def _get_wall_occupancy(self, episode, agent_state):
    episode_id = (episode.episode_id, episode.scene_id)
    # Load the episode-specific maps only if the episode has changed
    if self.current_wall_episode_id != episode_id:
        self.current_wall_episode_id = episode_id
        if self.config.GT_TYPE == "wall_occupancy":
            scene_id = episode.scene_id.split("/")[-1]
            self._scene_maps_info = self._all_maps_info[scene_id]
            # Load the maps per floor
            seen_maps, wall_maps = self._load_transformed_wall_maps(
                self._scene_maps_info, episode,
            )
            self._scene_maps = {}
            self._scene_maps["seen_maps"] = seen_maps
            self._scene_maps["wall_maps"] = wall_maps

    agent_state = self._sim.get_agent_state()
    current_height = agent_state.position[1]
    best_floor_idx = None
    best_floor_dist = math.inf
    for floor_idx, floor_data in enumerate(self._scene_maps_info):
        floor_height = floor_data["floor_height"]
        if abs(current_height - floor_height) < best_floor_dist:
            best_floor_idx = floor_idx
            best_floor_dist = abs(current_height - floor_height)
    assert best_floor_idx is not None
    current_wall_map = self._scene_maps["wall_maps"][best_floor_idx]
    # Take only channel 0 since both channels have the same values
    current_wall_map = current_wall_map[..., 0]

    # ========= Get local egocentric crop of the current wall map =========
    # Compute relative pose of agent from start location
    start_position = episode.start_position  # (X, Y, Z)
    start_rotation = quaternion_xyzw_to_wxyz(episode.start_rotation)
    start_heading = compute_heading_from_quaternion(start_rotation)
    start_pose = torch.Tensor(
        [[-start_position[2], start_position[0], start_heading]]
    )
    agent_position = agent_state.position
    agent_heading = compute_heading_from_quaternion(agent_state.rotation)
    agent_pose = torch.Tensor(
        [[-agent_position[2], agent_position[0], agent_heading]]
    )
    rel_pose = subtract_pose(start_pose, agent_pose)[0]  # (3,)

    # Compute agent position on the map image
    map_scale = self.config.MAP_SCALE
    H, W = current_wall_map.shape[:2]
    Hby2, Wby2 = (H + 1) // 2, (W + 1) // 2
    agent_map_x = int(rel_pose[1].item() / map_scale + Wby2)
    agent_map_y = int(-rel_pose[0].item() / map_scale + Hby2)

    # Crop the region around the agent.
    mrange = int(1.5 * self.map_size)

    # Add extra padding in case the crop range goes out of bounds
    y_start = agent_map_y - mrange
    y_end = agent_map_y + mrange
    x_start = agent_map_x - mrange
    x_end = agent_map_x + mrange
    x_l_pad, y_l_pad, x_r_pad, y_r_pad = 0, 0, 0, 0

    H, W = current_wall_map.shape
    if x_start < 0:
        x_l_pad = int(-x_start)
        x_start += x_l_pad
        x_end += x_l_pad
    if x_end >= W:
        x_r_pad = int(x_end - W + 1)
    if y_start < 0:
        y_l_pad = int(-y_start)
        y_start += y_l_pad
        y_end += y_l_pad
    if y_end >= H:
        y_r_pad = int(y_end - H + 1)

    ego_map = np.pad(current_wall_map, ((y_l_pad, y_r_pad), (x_l_pad, x_r_pad)))
    ego_map = ego_map[y_start : (y_end + 1), x_start : (x_end + 1)]

    agent_heading = rel_pose[2].item()
    agent_heading = math.degrees(agent_heading)

    half_size = ego_map.shape[0] // 2
    center = (half_size, half_size)
    M = cv2.getRotationMatrix2D(center, agent_heading, scale=1.0)

    ego_map = cv2.warpAffine(
        ego_map,
        M,
        (ego_map.shape[1], ego_map.shape[0]),
        flags=cv2.INTER_NEAREST,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(1,),
    )

    ego_map = ego_map.astype(np.float32)

    mrange = int(self.map_size)
    ego_map = ego_map[
        (half_size - mrange) : (half_size + mrange),
        (half_size - mrange) : (half_size + mrange),
    ]

    # Get the forward region in front of the agent
    half_size = ego_map.shape[0] // 2
    quarter_size = ego_map.shape[0] // 4
    center = (half_size, half_size)
    ego_map = ego_map[0:half_size, quarter_size : (quarter_size + half_size)]

    # Append explored status in the 2nd channel
    ego_map = np.stack([ego_map, ego_map], axis=2)

    return ego_map
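# Minimal sketch (hypothetical helper, not part of the codebase) of the
# pad-then-crop step used above: the map is zero-padded just enough that a
# (2 * mrange + 1)-sized window centred on the agent never indexes out of
# bounds, and the crop indices are shifted by the amount of top/left padding.
import numpy as np


def crop_around_agent_sketch(grid, cy, cx, mrange):
    # grid: (H, W) occupancy array; (cy, cx): agent cell; mrange: half window size
    H, W = grid.shape
    y0, y1 = cy - mrange, cy + mrange
    x0, x1 = cx - mrange, cx + mrange
    top, left = max(0, -y0), max(0, -x0)
    bottom, right = max(0, y1 - H + 1), max(0, x1 - W + 1)
    padded = np.pad(grid, ((top, bottom), (left, right)))
    # Shift the crop by the padding added on the top/left sides
    return padded[y0 + top : y1 + top + 1, x0 + left : x1 + left + 1]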