def _collect_rollout_step(self, rollouts, current_episode_reward,
                          episode_rewards, episode_counts):
    pth_time = 0.0
    env_time = 0.0

    t_sample_action = time.time()
    # sample actions
    with torch.no_grad():
        step_observation = {
            k: v[rollouts.step] for k, v in rollouts.observations.items()
        }

        (
            values,
            actions,
            actions_log_probs,
            recurrent_hidden_states,
        ) = self.actor_critic.act(
            step_observation,
            rollouts.recurrent_hidden_states[rollouts.step],
            rollouts.prev_actions[rollouts.step],
            rollouts.masks[rollouts.step],
        )

    pth_time += time.time() - t_sample_action

    t_step_env = time.time()

    outputs = self.envs.step([a[0].item() for a in actions])
    observations, rewards, dones, infos = [list(x) for x in zip(*outputs)]

    env_time += time.time() - t_step_env

    t_update_stats = time.time()
    batch = batch_obs(observations)
    rewards = torch.tensor(rewards, dtype=torch.float)
    rewards = rewards.unsqueeze(1)

    masks = torch.tensor(
        [[0.0] if done else [1.0] for done in dones], dtype=torch.float
    )

    current_episode_reward += rewards
    episode_rewards += (1 - masks) * current_episode_reward
    episode_counts += 1 - masks
    current_episode_reward *= masks

    rollouts.insert(
        batch,
        recurrent_hidden_states,
        actions,
        actions_log_probs,
        values,
        rewards,
        masks,
    )

    pth_time += time.time() - t_update_stats

    return pth_time, env_time, self.envs.num_envs
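# batch_obs(), used throughout the snippets in this file, collates a list of
# per-environment observation dicts into a single dict of batched tensors and
# optionally moves them to a device. Below is a minimal sketch of that
# collation, assuming every observation shares the same keys and holds
# array-like values; the real habitat-baselines utility also handles dtype
# and memory-pinning details not shown here.
from collections import defaultdict

import torch


def batch_obs_sketch(observations, device=None):
    batched = defaultdict(list)
    for obs in observations:
        for sensor, value in obs.items():
            batched[sensor].append(torch.as_tensor(value))
    return {
        sensor: torch.stack(values, dim=0).to(device=device)
        for sensor, values in batched.items()
    }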
def _prepare_batch(self, observations, i, device=None, actions=None):
    imH, imW = self.config.RL.ANS.image_scale_hw
    device = self.device if device is None else device
    batch = batch_obs(observations, device=device)
    if batch["rgb"].size(1) != imH or batch["rgb"].size(2) != imW:
        rgb = rearrange(batch["rgb"], "b h w c -> b c h w")
        rgb = F.interpolate(rgb, (imH, imW), mode="bilinear")
        batch["rgb"] = rearrange(rgb, "b c h w -> b h w c")
    if batch["depth"].size(1) != imH or batch["depth"].size(2) != imW:
        depth = rearrange(batch["depth"], "b h w c -> b c h w")
        depth = F.interpolate(depth, (imH, imW), mode="bilinear")
        batch["depth"] = rearrange(depth, "b c h w -> b h w c")
    # Compute ego_map_gt from depth
    ego_map_gt_b = self.depth_projection_net(
        rearrange(batch["depth"], "b h w c -> b c h w")
    )
    batch["ego_map_gt"] = rearrange(ego_map_gt_b, "b c h w -> b h w c")
    if actions is None:
        batch["prev_actions"] = torch.zeros(1, 1).to(self.device)
    else:
        batch["prev_actions"] = actions
    return batch
def _prepare_batch(self, observations, device=None, actions=None):
    imH, imW = self.config.RL.ANS.image_scale_hw
    device = self.device if device is None else device
    batch = batch_obs(observations, device=device)
    if batch["rgb"].size(1) != imH or batch["rgb"].size(2) != imW:
        rgb = rearrange(batch["rgb"], "b h w c -> b c h w")
        rgb = F.interpolate(rgb, (imH, imW), mode="bilinear")
        batch["rgb"] = rearrange(rgb, "b c h w -> b h w c")
    if batch["depth"].size(1) != imH or batch["depth"].size(2) != imW:
        depth = rearrange(batch["depth"], "b h w c -> b c h w")
        depth = F.interpolate(depth, (imH, imW), mode="nearest")
        batch["depth"] = rearrange(depth, "b c h w -> b h w c")
    # Compute ego_map_gt from depth
    ego_map_gt_b = self.depth_projection_net(
        rearrange(batch["depth"], "b h w c -> b c h w")
    )
    batch["ego_map_gt"] = rearrange(ego_map_gt_b, "b c h w -> b h w c")
    # Add previous action to batch as well
    batch["prev_actions"] = self.prev_actions
    # Add a rough pose estimate if GT pose is not available
    if "pose" not in batch:
        if self.prev_batch is None:
            # Set initial pose estimate to zero
            batch["pose"] = torch.zeros(self.envs.num_envs, 3).to(self.device)
        else:
            actions_delta = self._convert_actions_to_delta(self.prev_actions)
            batch["pose"] = add_pose(self.prev_batch["pose"], actions_delta)
    return batch
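# The _prepare_batch variants above resize RGB with bilinear filtering but
# (in the second variant) depth with nearest-neighbour sampling: bilinear
# interpolation of depth invents intermediate values across object
# boundaries, which project to phantom obstacles in the egocentric map.
# A hedged, standalone sketch of that resize step; image_scale_hw and the
# "b h w c" layout are taken from the snippets above.
import torch.nn.functional as F
from einops import rearrange


def resize_rgbd_sketch(rgb, depth, imH, imW):
    # rgb: (b, h, w, 3) float tensor; depth: (b, h, w, 1) float tensor
    rgb = rearrange(rgb, "b h w c -> b c h w")
    rgb = F.interpolate(rgb, (imH, imW), mode="bilinear", align_corners=False)
    depth = rearrange(depth, "b h w c -> b c h w")
    depth = F.interpolate(depth, (imH, imW), mode="nearest")
    return (
        rearrange(rgb, "b c h w -> b h w c"),
        rearrange(depth, "b c h w -> b h w c"),
    )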
def _collect_rollout_step(self, current_episode_reward, running_episode_stats):
    """
    obss: "rgb", "depth", "pointgoal_with_gps_compass"
    """
    ### PASS DATA
    with torch.no_grad():
        obs = {
            k: v[self.rollout.step]
            for k, v in self.rollout.observations.items()
        }
        hidden_states = self.rollout.recurrent_hidden_states[self.rollout.step]
        inverse_dones = self.rollout.masks[self.rollout.step]

        (
            actions_log_probs,
            actions,
            value,
            distributions,
            rnn_hidden_state,
        ) = self.actor_critic.act(obs, hidden_states, inverse_dones)

    ### PASS ENV
    res = self.envs.step([action.item() for action in actions])

    ### Process
    observations, rewards, dones, infos = [list(x) for x in zip(*res)]
    observations = batch_obs(observations, self.device)
    rewards = torch.tensor(
        rewards, dtype=torch.float, device=self.device
    ).unsqueeze(-1)
    dones = torch.tensor(dones, dtype=torch.float, device=self.device)
    inverse_dones = torch.tensor(
        [[0] if done else [1] for done in dones],
        dtype=torch.float,
        device=self.device,
    )

    self.rollout.insert(
        observations,
        rnn_hidden_state,
        actions,
        actions_log_probs,
        value,
        rewards,
        inverse_dones,
    )

    # print("rewards:", rewards)
    # print("dones:", dones)
    current_episode_reward += rewards
    running_episode_stats["reward"] += (1 - inverse_dones) * current_episode_reward
    running_episode_stats["count"] += 1 - inverse_dones
    for k, v in self._extract_scalars_from_infos(infos).items():
        v = torch.tensor(
            v, dtype=torch.float, device=current_episode_reward.device
        ).unsqueeze(1)
        if k not in running_episode_stats:
            running_episode_stats[k] = torch.zeros_like(
                running_episode_stats["count"]
            )
        running_episode_stats[k] += (1 - inverse_dones) * v
    current_episode_reward *= inverse_dones
    return
def step(self, actions):
    actions_list = [a[0].item() for a in actions]
    outputs = self._envs.step(actions_list)
    obs, rewards, done, info = [list(x) for x in zip(*outputs)]
    obs = batch_obs(obs)
    rewards = torch.tensor(rewards, dtype=torch.float)
    rewards = rewards.unsqueeze(1)
    return obs, rewards, done, info
def act(self, observations):
    batch = batch_obs([observations], device=self.device)

    with torch.no_grad():
        _, action, _, self.test_recurrent_hidden_states = self.actor_critic.act(
            batch,
            self.test_recurrent_hidden_states,
            self.prev_actions,
            self.not_done_masks,
            deterministic=False,
        )
        # Keep masks as "not done" until reset() is called at the end of the episode
        self.not_done_masks.fill_(1.0)
        self.prev_actions.copy_(action)

    return action.item()
def act(self, observations):
    batch = batch_obs([observations])
    for sensor in batch:
        batch[sensor] = batch[sensor].to(self.device)

    with torch.no_grad():
        _, actions, _, self.test_recurrent_hidden_states = self.actor_critic.act(
            batch,
            self.test_recurrent_hidden_states,
            self.not_done_masks,
            deterministic=False,
        )
        # Keep masks as "not done" until reset() is called at the end of the episode
        self.not_done_masks = torch.ones(1, 1, device=self.device)

    return actions[0][0].item()
def act(self, observations):
    batch = batch_obs([observations], device=self.device)

    with torch.no_grad():
        (
            _,
            actions,
            _,
            self.test_recurrent_hidden_states,
        ) = self.actor_critic.act(
            batch,
            self.test_recurrent_hidden_states,
            self.prev_actions,
            self.not_done_masks,
            deterministic=False,
        )
        # Keep masks as "not done" until reset() is called at the end of the episode
        self.not_done_masks = torch.ones(1, 1, device=self.device)
        self.prev_actions.copy_(actions)

    return {"action": actions[0][0].item()}
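# The act() variants above assume an episode-level reset() re-initialises the
# recurrent state, the previous action, and the not-done mask before the
# first step. A minimal sketch of that reset, assuming a single environment
# at evaluation time and the attribute names used above (hidden_size and
# num_recurrent_layers come from the policy config):
import torch


def reset_sketch(self):
    self.test_recurrent_hidden_states = torch.zeros(
        self.actor_critic.net.num_recurrent_layers,
        1,  # one environment during evaluation
        self.hidden_size,
        device=self.device,
    )
    self.prev_actions = torch.zeros(1, 1, dtype=torch.long, device=self.device)
    # 0.0 marks "episode just (re)started"; act() flips it to 1.0 afterwards
    self.not_done_masks = torch.zeros(1, 1, device=self.device)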
def _prepare_batch(self, observations, prev_batch=None, device=None, actions=None):
    imH, imW = self.config.RL.ANS.image_scale_hw
    device = self.device if device is None else device
    batch = batch_obs(observations, device=device)
    if batch["rgb"].size(1) != imH or batch["rgb"].size(2) != imW:
        rgb = rearrange(batch["rgb"], "b h w c -> b c h w")
        rgb = F.interpolate(rgb, (imH, imW), mode="bilinear")
        batch["rgb"] = rearrange(rgb, "b c h w -> b h w c")
    if batch["depth"].size(1) != imH or batch["depth"].size(2) != imW:
        depth = rearrange(batch["depth"], "b h w c -> b c h w")
        depth = F.interpolate(depth, (imH, imW), mode="nearest")
        batch["depth"] = rearrange(depth, "b c h w -> b h w c")
    # Compute ego_map_gt from depth
    ego_map_gt_b = self.depth_projection_net(
        rearrange(batch["depth"], "b h w c -> b c h w")
    )
    batch["ego_map_gt"] = rearrange(ego_map_gt_b, "b c h w -> b h w c")
    # Compute ego_map_dilation_gt from ego_map_gt
    batch["ego_map_dilation_gt"] = rearrange(
        dilate_tensor(ego_map_gt_b, 31), "b c h w -> b h w c"
    )
    if actions is None:
        # Initialization condition.
        # If pose estimates are not available, set the initial estimate to zero.
        if "pose" not in batch:
            batch["pose"] = torch.zeros(self.envs.num_envs, 3).to(self.device)
        batch["prev_actions"] = torch.zeros(self.envs.num_envs, 1).to(self.device)
    else:
        # Rollouts condition.
        # If pose estimates are not available, compute them from the action taken.
        if "pose" not in batch:
            assert prev_batch is not None
            actions_delta = self._convert_actions_to_delta(actions)
            batch["pose"] = add_pose(prev_batch["pose"], actions_delta)
        batch["prev_actions"] = actions
    return batch
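# add_pose() and _convert_actions_to_delta() are not shown in these snippets.
# Below is a heavily hedged sketch of the pose composition they are assumed
# to perform, taking poses to be (x, y, heading) in the episode start frame
# and the delta to be expressed in the agent's current frame; the real
# helpers may use a different convention.
import torch


def add_pose_sketch(pose, delta):
    # pose, delta: (b, 3) tensors of (x, y, heading)
    x, y, theta = pose[:, 0], pose[:, 1], pose[:, 2]
    dx, dy, dtheta = delta[:, 0], delta[:, 1], delta[:, 2]
    new_x = x + dx * torch.cos(theta) - dy * torch.sin(theta)
    new_y = y + dx * torch.sin(theta) + dy * torch.cos(theta)
    # wrap the heading back to (-pi, pi]
    new_theta = torch.atan2(torch.sin(theta + dtheta), torch.cos(theta + dtheta))
    return torch.stack([new_x, new_y, new_theta], dim=1)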
def act(self, observations):
    observations["pointgoal"] = self.convertPolarToCartesian(
        observations["pointgoal"]
    )
    observations["depth"] = self.convertMaxDepth(observations["depth"])

    batch = batch_obs([observations], device=self.device)
    batch["visual_features"] = self._encoder(batch)
    if self.prev_visual_features is None:
        batch["prev_visual_features"] = torch.zeros_like(
            batch["visual_features"]
        )
    else:
        batch["prev_visual_features"] = self.prev_visual_features

    with torch.no_grad():
        step_batch = batch
        _, action, _, self.test_recurrent_hidden_states = self.actor_critic.act(
            batch,
            None,
            self.test_recurrent_hidden_states,
            self.prev_actions,
            self.not_done_masks,
            deterministic=False,
        )
        # Keep masks as "not done" until reset() is called at the end of the episode
        self.not_done_masks.fill_(1.0)
        self.prev_actions.copy_(action)

    self.prev_visual_features = step_batch["visual_features"]

    # if self.final_action:
    #     return 0
    # if action.item() == 0:
    #     self.final_action = True
    #     return 1
    return action.item()
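# convertPolarToCartesian and convertMaxDepth are not defined in these
# snippets. Assuming the pointgoal sensor returns (rho, phi) while the policy
# expects cartesian (x, y) goals, and that depth beyond an assumed maximum
# range should be clamped and normalised, minimal sketches could look like:
import numpy as np


def convert_polar_to_cartesian_sketch(pointgoal):
    rho, phi = pointgoal
    return np.array([rho * np.cos(phi), rho * np.sin(phi)], dtype=np.float32)


def convert_max_depth_sketch(depth, max_depth=10.0):
    # clamp to the assumed maximum range, then normalise to [0, 1]
    return np.clip(depth, 0.0, max_depth) / max_depth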
def _collect_rollout_step(self, rollouts, current_episode_reward,
                          running_episode_stats):
    pth_time = 0.0
    env_time = 0.0

    t_sample_action = time.time()
    # sample actions
    with torch.no_grad():
        step_observation = {
            k: v[rollouts.step] for k, v in rollouts.observations.items()
        }

        (
            values,
            actions,
            actions_log_probs,
            recurrent_hidden_states,
        ) = self.actor_critic.act(
            step_observation,
            rollouts.recurrent_hidden_states[rollouts.step],
            rollouts.prev_actions[rollouts.step],
            rollouts.masks[rollouts.step],
        )

    pth_time += time.time() - t_sample_action

    t_step_env = time.time()

    outputs = self.envs.step([a[0].item() for a in actions])
    observations, rewards, dones, infos = [list(x) for x in zip(*outputs)]

    env_time += time.time() - t_step_env

    t_update_stats = time.time()
    batch = batch_obs(observations, device=self.device)

    rewards = torch.tensor(
        rewards, dtype=torch.float, device=current_episode_reward.device
    )
    rewards = rewards.unsqueeze(1)

    masks = torch.tensor(
        [[0.0] if done else [1.0] for done in dones],
        dtype=torch.float,
        device=current_episode_reward.device,
    )

    current_episode_reward += rewards
    running_episode_stats["reward"] += (1 - masks) * current_episode_reward
    running_episode_stats["count"] += 1 - masks
    for k, v in self._extract_scalars_from_infos(infos).items():
        v = torch.tensor(
            v, dtype=torch.float, device=current_episode_reward.device
        ).unsqueeze(1)
        if k not in running_episode_stats:
            running_episode_stats[k] = torch.zeros_like(
                running_episode_stats["count"]
            )
        running_episode_stats[k] += (1 - masks) * v

    current_episode_reward *= masks

    if self._static_encoder:
        with torch.no_grad():
            batch["prev_visual_features"] = step_observation["visual_features"]
            batch["visual_features"] = self._encoder(batch)

    rollouts.insert(
        batch,
        recurrent_hidden_states,
        actions,
        actions_log_probs,
        values,
        rewards,
        masks,
    )

    pth_time += time.time() - t_update_stats

    return pth_time, env_time, self.envs.num_envs
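# Both rollout-collection variants above fold per-step rewards into the
# running episode stats with the (1 - masks) trick: masks is 0.0 exactly at
# the step where an episode finishes, so (1 - masks) * current_episode_reward
# adds each episode's return exactly once, and (1 - masks) increments the
# episode count. A hedged sketch of turning windowed snapshots of those
# counters into an average episode reward (this mirrors the deltas
# computation in the train() methods later in this file):
def window_average_reward_sketch(window_reward, window_count):
    # window_*: deques of snapshots of the running counters, one per update
    reward_delta = (
        (window_reward[-1] - window_reward[0]).sum().item()
        if len(window_reward) > 1
        else window_reward[0].sum().item()
    )
    count_delta = (
        (window_count[-1] - window_count[0]).sum().item()
        if len(window_count) > 1
        else window_count[0].sum().item()
    )
    return reward_delta / max(count_delta, 1.0)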
def _eval_checkpoint( self, checkpoint_path: str, writer: TensorboardWriter, checkpoint_index: int = 0, ) -> None: r"""Evaluates a single checkpoint. Args: checkpoint_path: path of checkpoint writer: tensorboard writer object for logging to tensorboard checkpoint_index: index of cur checkpoint for logging Returns: None """ ckpt_dict = self.load_checkpoint(checkpoint_path, map_location=self.device) config = self._setup_eval_config(ckpt_dict["config"]) ppo_cfg = config.RL.PPO if len(self.config.VIDEO_OPTION) > 0: config.defrost() config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") config.freeze() logger.info(f"env config: {config}") self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) self._setup_actor_critic_agent(ppo_cfg) self.agent.load_state_dict(ckpt_dict["state_dict"]) self.actor_critic = self.agent.actor_critic # get name of performance metric, e.g. "spl" metric_name = self.config.TASK_CONFIG.TASK.MEASUREMENTS[0] metric_cfg = getattr(self.config.TASK_CONFIG.TASK, metric_name) measure_type = baseline_registry.get_measure(metric_cfg.TYPE) assert measure_type is not None, "invalid measurement type {}".format( metric_cfg.TYPE) self.metric_uuid = measure_type(None, None)._get_uuid() observations = self.envs.reset() batch = batch_obs(observations) for sensor in batch: batch[sensor] = batch[sensor].to(self.device) current_episode_reward = torch.zeros(self.envs.num_envs, 1, device=self.device) test_recurrent_hidden_states = torch.zeros( self.actor_critic.net.num_recurrent_layers, self.config.NUM_PROCESSES, ppo_cfg.hidden_size, device=self.device, ) prev_actions = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long) not_done_masks = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device) stats_episodes = dict() # dict of dicts that stores stats per episode rgb_frames = [ [] ] * self.config.NUM_PROCESSES # type: List[List[np.ndarray]] if len(self.config.VIDEO_OPTION) > 0: os.makedirs(self.config.VIDEO_DIR, exist_ok=True) while (len(stats_episodes) < self.config.TEST_EPISODE_COUNT and self.envs.num_envs > 0): current_episodes = self.envs.current_episodes() with torch.no_grad(): _, actions, _, test_recurrent_hidden_states = self.actor_critic.act( batch, test_recurrent_hidden_states, prev_actions, not_done_masks, deterministic=False, ) prev_actions.copy_(actions) outputs = self.envs.step([a[0].item() for a in actions]) observations, rewards, dones, infos = [ list(x) for x in zip(*outputs) ] batch = batch_obs(observations) for sensor in batch: batch[sensor] = batch[sensor].to(self.device) not_done_masks = torch.tensor( [[0.0] if done else [1.0] for done in dones], dtype=torch.float, device=self.device, ) rewards = torch.tensor(rewards, dtype=torch.float, device=self.device).unsqueeze(1) current_episode_reward += rewards next_episodes = self.envs.current_episodes() envs_to_pause = [] n_envs = self.envs.num_envs for i in range(n_envs): if ( next_episodes[i].scene_id, next_episodes[i].episode_id, ) in stats_episodes: envs_to_pause.append(i) # episode ended if not_done_masks[i].item() == 0: episode_stats = dict() episode_stats[self.metric_uuid] = infos[i][ self.metric_uuid] episode_stats["success"] = int( infos[i][self.metric_uuid] > 0) episode_stats["reward"] = current_episode_reward[i].item() current_episode_reward[i] = 0 # use scene_id + episode_id as unique id for storing stats stats_episodes[( current_episodes[i].scene_id, current_episodes[i].episode_id, )] = episode_stats if 
len(self.config.VIDEO_OPTION) > 0: generate_video( video_option=self.config.VIDEO_OPTION, video_dir=self.config.VIDEO_DIR, images=rgb_frames[i], episode_id=current_episodes[i].episode_id, checkpoint_idx=checkpoint_index, metric_name=self.metric_uuid, metric_value=infos[i][self.metric_uuid], tb_writer=writer, ) rgb_frames[i] = [] # episode continues elif len(self.config.VIDEO_OPTION) > 0: frame = observations_to_image(observations[i], infos[i]) rgb_frames[i].append(frame) # pausing self.envs with no new episode if len(envs_to_pause) > 0: state_index = list(range(self.envs.num_envs)) for idx in reversed(envs_to_pause): state_index.pop(idx) self.envs.pause_at(idx) # indexing along the batch dimensions test_recurrent_hidden_states = test_recurrent_hidden_states[ state_index] not_done_masks = not_done_masks[state_index] current_episode_reward = current_episode_reward[state_index] prev_actions = prev_actions[state_index] for k, v in batch.items(): batch[k] = v[state_index] if len(self.config.VIDEO_OPTION) > 0: rgb_frames = [rgb_frames[i] for i in state_index] aggregated_stats = dict() for stat_key in next(iter(stats_episodes.values())).keys(): aggregated_stats[stat_key] = sum( [v[stat_key] for v in stats_episodes.values()]) num_episodes = len(stats_episodes) episode_reward_mean = aggregated_stats["reward"] / num_episodes episode_metric_mean = aggregated_stats[self.metric_uuid] / num_episodes episode_success_mean = aggregated_stats["success"] / num_episodes logger.info(f"Average episode reward: {episode_reward_mean:.6f}") logger.info(f"Average episode success: {episode_success_mean:.6f}") logger.info( f"Average episode {self.metric_uuid}: {episode_metric_mean:.6f}") writer.add_scalars( "eval_reward", {"average reward": episode_reward_mean}, checkpoint_index, ) writer.add_scalars( f"eval_{self.metric_uuid}", {f"average {self.metric_uuid}": episode_metric_mean}, checkpoint_index, ) writer.add_scalars( "eval_success", {"average success": episode_success_mean}, checkpoint_index, ) self.envs.close()
def train(self) -> None: r"""Main method for training PPO. Returns: None """ self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) ppo_cfg = self.config.RL.PPO self.device = torch.device("cuda", self.config.TORCH_GPU_ID) if not os.path.isdir(self.config.CHECKPOINT_FOLDER): os.makedirs(self.config.CHECKPOINT_FOLDER) self._setup_actor_critic_agent(ppo_cfg) logger.info("agent number of parameters: {}".format( sum(param.numel() for param in self.agent.parameters()))) observations = self.envs.reset() batch = batch_obs(observations) rollouts = RolloutStorage( ppo_cfg.num_steps, self.envs.num_envs, self.envs.observation_spaces[0], self.envs.action_spaces[0], ppo_cfg.hidden_size, ) for sensor in rollouts.observations: rollouts.observations[sensor][0].copy_(batch[sensor]) rollouts.to(self.device) episode_rewards = torch.zeros(self.envs.num_envs, 1) episode_counts = torch.zeros(self.envs.num_envs, 1) current_episode_reward = torch.zeros(self.envs.num_envs, 1) window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size) window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size) t_start = time.time() env_time = 0 pth_time = 0 count_steps = 0 count_checkpoints = 0 lr_scheduler = LambdaLR( optimizer=self.agent.optimizer, lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), ) with TensorboardWriter(self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs) as writer: for update in range(self.config.NUM_UPDATES): if ppo_cfg.use_linear_lr_decay: lr_scheduler.step() if ppo_cfg.use_linear_clip_decay: self.agent.clip_param = ppo_cfg.clip_param * linear_decay( update, self.config.NUM_UPDATES) for step in range(ppo_cfg.num_steps): delta_pth_time, delta_env_time, delta_steps = self._collect_rollout_step( rollouts, current_episode_reward, episode_rewards, episode_counts, ) pth_time += delta_pth_time env_time += delta_env_time count_steps += delta_steps delta_pth_time, value_loss, action_loss, dist_entropy = self._update_agent( ppo_cfg, rollouts) pth_time += delta_pth_time window_episode_reward.append(episode_rewards.clone()) window_episode_counts.append(episode_counts.clone()) losses = [value_loss, action_loss] stats = zip( ["count", "reward"], [window_episode_counts, window_episode_reward], ) deltas = { k: ((v[-1] - v[0]).sum().item() if len(v) > 1 else v[0].sum().item()) for k, v in stats } deltas["count"] = max(deltas["count"], 1.0) writer.add_scalar("reward", deltas["reward"] / deltas["count"], count_steps) writer.add_scalars( "losses", {k: l for l, k in zip(losses, ["value", "policy"])}, count_steps, ) # log stats if update > 0 and update % self.config.LOG_INTERVAL == 0: logger.info("update: {}\tfps: {:.3f}\t".format( update, count_steps / (time.time() - t_start))) logger.info( "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" "frames: {}".format(update, env_time, pth_time, count_steps)) window_rewards = (window_episode_reward[-1] - window_episode_reward[0]).sum() window_counts = (window_episode_counts[-1] - window_episode_counts[0]).sum() if window_counts > 0: logger.info( "Average window size {} reward: {:3f}".format( len(window_episode_reward), (window_rewards / window_counts).item(), )) else: logger.info("No episodes finish in current window") # checkpoint model if update % self.config.CHECKPOINT_INTERVAL == 0: self.save_checkpoint(f"ckpt.{count_checkpoints}.pth") count_checkpoints += 1 self.envs.close()
def train(self): # Get environments for training self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) self.device = (torch.device("cuda", self.config.TORCH_GPU_ID) if torch.cuda.is_available() else torch.device("cpu")) if not os.path.isdir(self.config.CHECKPOINT_FOLDER): os.makedirs(self.config.CHECKPOINT_FOLDER) #logger.info( # "agent number of parameters: {}".format( # sum(param.numel() for param in self.agent.parameters()) # ) #) # Change for the actual value cfg = self.config.RL.PPO rollouts = RolloutStorage( cfg.num_steps, self.envs.num_envs, self.envs.observation_spaces[0], self.envs.action_spaces[0], cfg.hidden_size, ) rollouts.to(self.device) observations = self.envs.reset() batch = batch_obs(observations) for sensor in rollouts.observations: print(batch[sensor].shape) # Copy the information to the wrapper for sensor in rollouts.observations: rollouts.observations[sensor][0].copy_(batch[sensor]) # batch and observations may contain shared PyTorch CUDA # tensors. We must explicitly clear them here otherwise # they will be kept in memory for the entire duration of training! batch = None observations = None episode_rewards = torch.zeros(self.envs.num_envs, 1) episode_counts = torch.zeros(self.envs.num_envs, 1) #current_episode_reward = torch.zeros(self.envs.num_envs, 1) #window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size) #window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size) t_start = time.time() env_time = 0 pth_time = 0 count_steps = 0 count_checkpoints = 0 lr_scheduler = LambdaLR( optimizer=self.agent.optimizer, lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), ) '''
def reset(self):
    obs = self._envs.reset()
    obs = batch_obs(obs)
    return obs
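# A minimal sketch of how the reset()/step() wrapper above could be driven by
# a policy at evaluation time. actor_critic, hidden, prev_actions, and masks
# are placeholders for whatever policy state the caller maintains; device
# placement is left out for brevity.
import torch


def rollout_with_wrapper_sketch(wrapper, actor_critic, hidden, prev_actions,
                                masks, num_steps=100):
    obs = wrapper.reset()
    for _ in range(num_steps):
        with torch.no_grad():
            _, actions, _, hidden = actor_critic.act(
                obs, hidden, prev_actions, masks, deterministic=False
            )
        prev_actions.copy_(actions)
        obs, rewards, dones, infos = wrapper.step(actions)
        masks = torch.tensor(
            [[0.0] if done else [1.0] for done in dones], dtype=torch.float
        )
    return obs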
def _collect_rollout_step(self, rollouts, current_episode_reward, running_episode_stats): pth_time = 0.0 env_time = 0.0 t_sample_action = time.time() # sample actions with torch.no_grad(): # get observations from the rollout not directly from envs # step_observation = { # k: v[rollouts.step] for k, v in rollouts.observations.items() # } step_observation = rollouts.get_memory_at(rollouts.step) (values, actions, actions_log_probs, recurrent_hidden_states) = self.actor_critic.act( step_observation, rollouts.recurrent_hidden_states[rollouts.step], rollouts.prev_actions[rollouts.step], rollouts.masks[rollouts.step], ) pth_time += time.time() - t_sample_action t_step_env = time.time() #scenes = [curr_ep.scene_id.split('/')[-2] for curr_ep in self.envs.current_episodes()] outputs = self.envs.step([a[0].item() for a in actions]) #self.envs.render('human') #scenes_ = [curr_ep.scene_id.split('/')[-2] for curr_ep in self.envs.current_episodes()] #if not (scenes == scenes_): # print(scenes_) observations, rewards, dones, infos = [list(x) for x in zip(*outputs)] env_time += time.time() - t_step_env t_update_stats = time.time() batch = batch_obs(observations, device=self.device) rewards = torch.tensor(rewards, dtype=torch.float, device=current_episode_reward.device) rewards = rewards.unsqueeze(1) masks = torch.tensor( [[0.0] if done else [1.0] for done in dones], dtype=torch.float, device=current_episode_reward.device, ) current_episode_reward += rewards running_episode_stats["reward"] += (1 - masks) * current_episode_reward running_episode_stats["count"] += 1 - masks running_episode_stats['episode_num'] += masks for k, v in self._extract_scalars_from_infos(infos).items(): v = torch.tensor(v, dtype=torch.float, device=current_episode_reward.device).unsqueeze(1) if k not in running_episode_stats: running_episode_stats[k] = torch.zeros_like( running_episode_stats["count"]) if k == 'length': running_episode_stats[k] += masks * v running_episode_stats[k] += (1 - masks) * v current_episode_reward *= masks if self._static_encoder: with torch.no_grad(): batch["visual_features"] = self._encoder(batch, actions, masks) for b in range(len(actions)): batch['visual_features'][b][-2] = infos[b]['episode'] batch['visual_features'][b][-1] = infos[b]['step'] rollouts.insert( batch, recurrent_hidden_states, actions, actions_log_probs, values, rewards, masks, ) pth_time += time.time() - t_update_stats return pth_time, env_time, self.envs.num_envs
def _eval_checkpoint_bruce( self, checkpoint_path: str, checkpoint_index: int = 0, ): r"""Evaluates a single checkpoint. Args: checkpoint_path: path of checkpoint writer: tensorboard writer object for logging to tensorboard checkpoint_index: index of cur checkpoint for logging Returns: actor_critic batch not_done_masks test_recurrent_hidden_states """ # Map location CPU is almost always better than mapping to a CUDA device. ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu") print(ckpt_dict) if self.config.EVAL.USE_CKPT_CONFIG: config = self._setup_eval_config(ckpt_dict["config"]) else: config = self.config.clone() ppo_cfg = config.RL.PPO config.defrost() config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT config.freeze() if len(self.config.VIDEO_OPTION) > 0: config.defrost() config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") config.freeze() logger.info(f"env config: {config}") self.envs = construct_envs(config, get_env_class(config.ENV_NAME)) self._setup_actor_critic_agent(ppo_cfg) self.agent.load_state_dict(ckpt_dict["state_dict"]) self.actor_critic = self.agent.actor_critic observations = self.envs.reset() batch = batch_obs(observations, device=self.device) current_episode_reward = torch.zeros(self.envs.num_envs, 1, device=self.device) test_recurrent_hidden_states = torch.zeros( self.actor_critic.net.num_recurrent_layers, self.config.NUM_PROCESSES, ppo_cfg.hidden_size, device=self.device, ) prev_actions = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long) not_done_masks = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device) return (self.actor_critic, batch, not_done_masks, test_recurrent_hidden_states)
def _update_dataset(self, data_it): if torch.cuda.is_available(): with torch.cuda.device(self.device): torch.cuda.empty_cache() if self.envs is None: self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) recurrent_hidden_states = torch.zeros( self.actor_critic.net.num_recurrent_layers, self.config.NUM_PROCESSES, self.config.MODEL.STATE_ENCODER.hidden_size, device=self.device, ) prev_actions = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long) not_done_masks = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device) observations = self.envs.reset() observations = transform_obs( observations, self.config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID) batch = batch_obs(observations, self.device) episodes = [[] for _ in range(self.envs.num_envs)] skips = [False for _ in range(self.envs.num_envs)] # Populate dones with False initially dones = [False for _ in range(self.envs.num_envs)] # https://arxiv.org/pdf/1011.0686.pdf # Theoretically, any beta function is fine so long as it converges to # zero as data_it -> inf. The paper suggests starting with beta = 1 and # exponential decay. if self.config.DAGGER.P == 0.0: # in Python 0.0 ** 0.0 == 1.0, but we want 0.0 beta = 0.0 else: beta = self.config.DAGGER.P**data_it ensure_unique_episodes = beta == 1.0 def hook_builder(tgt_tensor): def hook(m, i, o): tgt_tensor.set_(o.cpu()) return hook rgb_features = None rgb_hook = None if self.config.MODEL.RGB_ENCODER.cnn_type == "TorchVisionResNet50": rgb_features = torch.zeros((1, ), device="cpu") rgb_hook = self.actor_critic.net.rgb_encoder.layer_extract.register_forward_hook( hook_builder(rgb_features)) depth_features = None depth_hook = None if self.config.MODEL.DEPTH_ENCODER.cnn_type == "VlnResnetDepthEncoder": depth_features = torch.zeros((1, ), device="cpu") depth_hook = self.actor_critic.net.depth_encoder.visual_encoder.register_forward_hook( hook_builder(depth_features)) collected_eps = 0 if ensure_unique_episodes: ep_ids_collected = set( [ep.episode_id for ep in self.envs.current_episodes()]) with tqdm.tqdm( total=self.config.DAGGER.UPDATE_SIZE) as pbar, lmdb.open( self.lmdb_features_dir, map_size=int(self.config.DAGGER.LMDB_MAP_SIZE )) as lmdb_env, torch.no_grad(): start_id = lmdb_env.stat()["entries"] txn = lmdb_env.begin(write=True) while collected_eps < self.config.DAGGER.UPDATE_SIZE: if ensure_unique_episodes: envs_to_pause = [] current_episodes = self.envs.current_episodes() for i in range(self.envs.num_envs): if dones[i] and not skips[i]: ep = episodes[i] traj_obs = batch_obs([step[0] for step in ep], device=torch.device("cpu")) del traj_obs["vln_oracle_action_sensor"] for k, v in traj_obs.items(): traj_obs[k] = v.numpy() transposed_ep = [ traj_obs, np.array([step[1] for step in ep], dtype=np.int64), np.array([step[2] for step in ep], dtype=np.int64), ] txn.put( str(start_id + collected_eps).encode(), msgpack_numpy.packb(transposed_ep, use_bin_type=True), ) pbar.update() collected_eps += 1 if (collected_eps % self.config.DAGGER.LMDB_COMMIT_FREQUENCY) == 0: txn.commit() txn = lmdb_env.begin(write=True) if ensure_unique_episodes: if current_episodes[ i].episode_id in ep_ids_collected: envs_to_pause.append(i) else: ep_ids_collected.add( current_episodes[i].episode_id) if dones[i]: episodes[i] = [] if ensure_unique_episodes: ( self.envs, recurrent_hidden_states, not_done_masks, prev_actions, batch, ) = self._pause_envs( envs_to_pause, self.envs, recurrent_hidden_states, not_done_masks, prev_actions, batch, ) if self.envs.num_envs == 0: break (_, 
actions, _, recurrent_hidden_states) = self.actor_critic.act( batch, recurrent_hidden_states, prev_actions, not_done_masks, deterministic=False, ) # print("action: ", actions) # print("batch[vln_oracle_action_sensor]: ", batch["vln_oracle_action_sensor"]) # print("torch.rand_like(actions, dtype=torch.float) < beta: ", torch.rand_like(actions, dtype=torch.float) < beta) actions = torch.where( torch.rand_like(actions, dtype=torch.float) < beta, batch["vln_oracle_action_sensor"].long(), actions, ) for i in range(self.envs.num_envs): if rgb_features is not None: observations[i]["rgb_features"] = rgb_features[i] del observations[i]["rgb"] if depth_features is not None: observations[i]["depth_features"] = depth_features[i] del observations[i]["depth"] episodes[i].append(( observations[i], prev_actions[i].item(), batch["vln_oracle_action_sensor"][i].item(), )) skips = batch["vln_oracle_action_sensor"].long() == -1 actions = torch.where(skips, torch.zeros_like(actions), actions) skips = skips.squeeze(-1).to(device="cpu", non_blocking=True) prev_actions.copy_(actions) outputs = self.envs.step([a[0].item() for a in actions]) observations, rewards, dones, _ = [ list(x) for x in zip(*outputs) ] not_done_masks = torch.tensor( [[0.0] if done else [1.0] for done in dones], dtype=torch.float, device=self.device, ) observations = transform_obs( observations, self.config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID) batch = batch_obs(observations, self.device) txn.commit() self.envs.close() self.envs = None if rgb_hook is not None: rgb_hook.remove() if depth_hook is not None: depth_hook.remove()
def get_eval_map(env:ExpRLEnv, trainer:SemAntExpTrainer, M:int,depth_projection_net:DepthProjectionNet, step:int, objectName:str): """Given the environment and the configuration, compute the global top-down wall and seen area maps by sampling maps for individual locations along a uniform grid in the environment, and registering them. step (m): the length between two measure points """ # Initialize a global map for the episode scene_name = env.habitat_env.current_episode.scene_id.split('/')[-1].split('.')[0] dirPath = './data/debug/data/' + objectName + "/" + scene_name + '_' + env.habitat_env.current_episode.episode_id + '/' + "record/" safe_mkdir(dirPath) mapper = trainer.mapper config = trainer.config.TASK_CONFIG device = trainer.device scene = env.habitat_env.sim.semantic_scene obj_pos = {} obj_pos = {obj.id : [obj.aabb.center,obj.aabb.sizes] for obj in scene.objects if objectName == obj.category.name()} with open(dirPath + 'obj_pos.pkl', 'wb') as f: pickle.dump(obj_pos, f) objectIndexes = [int(key.split('_')[-1]) for key in obj_pos.keys()] global_wall_map = torch.zeros(1, 2, M, M).to(device) global_seen_map = torch.zeros(1, 2, M, M).to(device) global_object_map = torch.zeros(1, 2, M, M).to(device) global_se_map = torch.zeros(1, 2, M, M).to(device) global_se_filtered_map = torch.zeros(1, 2, M, M).to(device) global_se_seen_map = torch.zeros(1, 2, M, M).to(device) global_se_map_mit = torch.zeros(1, 2, M, M).to(device) global_se_map_gt = torch.zeros(1, 2, M, M).to(device) grid_size = config.TASK.GT_EGO_MAP.MAP_SCALE coordinate_max = maps.COORDINATE_MAX coordinate_min = maps.COORDINATE_MIN resolution = (coordinate_max - coordinate_min) / grid_size grid_resolution = (int(resolution), int(resolution)) top_down_map = maps.get_topdown_map( env.habitat_env.sim, grid_resolution, 20000, draw_border=False, ) map_w, map_h = top_down_map.shape intervals = (max(int(step / grid_size), 1), max(int(step / grid_size), 1)) x_vals = np.arange(0, map_w, intervals[0], dtype=int) y_vals = np.arange(0, map_h, intervals[1], dtype=int) coors = np.stack(np.meshgrid(x_vals, y_vals), axis=2) # (H, W, 2) coors = coors.reshape(-1, 2) # (H*W, 2) map_vals = top_down_map[coors[:, 0], coors[:, 1]] valid_coors = coors[map_vals > 0] real_x_vals = coordinate_max - valid_coors[:, 0] * grid_size real_z_vals = coordinate_min + valid_coors[:, 1] * grid_size start_y = env.habitat_env.sim.get_agent_state().position[1] index = 0 records = [] totalOffsetX = np.random.uniform(-1.5, 1.5) totalOffsetY = np.random.uniform(-1.5, 1.5) for j in tqdm.tqdm(range(real_x_vals.shape[0]),desc='occupacy map', position=2): for theta in np.arange(-np.pi, np.pi, np.pi ): index +=1 randomAngle = np.random.uniform(-np.pi,np.pi) randomX = np.random.uniform(-0.5, 0.5) randomY = np.random.uniform(-0.5, 0.5) position = [ real_x_vals[j].item() + randomX + totalOffsetX, start_y.item(), real_z_vals[j].item() + randomY + totalOffsetY, ] rotation = [ 0.0, np.sin((theta + randomAngle) / 2).item(), 0.0, np.cos((theta + randomAngle) / 2).item(), ] sim_obs = env.habitat_env.sim.get_observations_at( position, rotation, keep_agent_at_new_pose=True, ) episode = env.habitat_env.current_episode obs = obs, _, _, _ = env.step(action={'action':1}) batch = batch_obs([obs], device=trainer.device) semantic = batch["semantic"] mask = torch.zeros_like(semantic,dtype=bool) for objectIndex in objectIndexes: mask ^= (semantic == objectIndex) mask = rearrange(mask, "b h w -> b h w ()") batch["object_mask_gt"] = mask ego_map_gt_b = depth_projection_net( rearrange(batch["depth"], "b 
h w c -> b c h w") ) object_depth = batch["depth"]*(mask > 0) + (mask == 0)*100 ego_map_gt_object = depth_projection_net(rearrange(object_depth, "b h w c -> b c h w")) batch["ego_map_gt_object"] = ego_map_gt_object #begin use MIT Seg img_data = pil_to_tensor(rearrange(batch["rgb"], "b h w c -> b c h w")[0] / 255) singleton_batch = {'img_data': img_data[None]} output_size = img_data.shape[1:] with torch.no_grad(): scores = segmentation_module(singleton_batch, segSize=output_size) _, pred = torch.max(scores, dim=1) # visualize_result(pred) #end use MIT Seg batch["mit"] = pred #mask for chair if objectName == "chair": mask = (pred == 19)^(pred == 30)^(pred == 75) elif objectName == "bed": mask = (pred == 7) elif objectName == "table": mask = (pred == 15) else: mask = pred > 0 mask = rearrange(mask, "b h w -> b h w ()") batch["object_mask_mit"] = mask object_depth2 = batch["depth"]*(mask > 0) + (mask == 0)*100 ego_map_gt_object2 = depth_projection_net(rearrange(object_depth2, "b h w c -> b c h w")) batch["ego_map_mit_object"] = ego_map_gt_object2 ego_map_gt_anti = transpose_image(batch["ego_map_gt_anticipated"]).to(trainer.device) pu_inputs_t = { "rgb": process_image(batch["rgb"], trainer.mapper.img_mean_t, trainer.mapper.img_std_t).to(trainer.device), "depth": transpose_image(batch["depth"]).to(trainer.device), "ego_map_gt": transpose_image( rearrange(ego_map_gt_b, "b c h w -> b h w c") ).to(trainer.device), "ego_map_gt_anticipated": ego_map_gt_anti, } pu_inputs = trainer.mapper._safe_cat(pu_inputs_t, pu_inputs_t) torch.save(batch, dirPath + 'pu_inputs_t_'+str(index)+'.pt') pu_output = trainer.mapper.projection_unit(pu_inputs) se_map = asnumpy(pu_output['sem_estimate'])[0,:,:,:] ego_map_gt_b_np = asnumpy(ego_map_gt_b[0,...]) se_seen = ego_map_gt_b_np*se_map dilation_mask = np.ones((100, 100)) current_mask = cv2.dilate( se_seen.astype(np.float32), dilation_mask, iterations=1, ).astype(np.float32) se_filtered_map = se_map se_seen_map = torch.Tensor(se_seen).to(device) se_seen_map = rearrange(se_seen_map, "c h w -> () c h w") se_seen_map = (se_seen_map[:,0])[None,:] se_map = torch.Tensor(se_map).to(device) se_map = rearrange(se_map, "c h w -> () c h w") se_filtered_map = torch.Tensor(se_filtered_map).to(device) se_filtered_map = rearrange(se_filtered_map, "c h w -> () c h w") torch.save(se_filtered_map, dirPath + 'se_filtered_map_'+str(index)+'.pt') ego_map_gt = torch.Tensor(obs["ego_map_gt"]).to(device) ego_map_gt = rearrange(ego_map_gt, "h w c -> () c h w") ego_wall_map_gt = torch.Tensor(obs["ego_wall_map_gt"]).to(device) ego_wall_map_gt = rearrange(ego_wall_map_gt, "h w c -> () c h w") pose_gt = torch.Tensor(obs["pose_gt"]).unsqueeze(0).to(device) global_se_seen_map = mapper.ext_register_map( global_se_seen_map, se_seen_map, pose_gt ) global_se_map_gt = mapper.ext_register_map( global_se_map_gt, ego_map_gt_anti, pose_gt ) global_seen_map = mapper.ext_register_map( global_seen_map, ego_map_gt_b, pose_gt ) global_object_map = mapper.ext_register_map( global_object_map, ego_map_gt_object, pose_gt ) global_wall_map = mapper.ext_register_map( global_wall_map, ego_wall_map_gt, pose_gt ) global_se_map = mapper.ext_register_map( global_se_map, se_map, pose_gt ) global_se_filtered_map = mapper.ext_register_map( global_se_filtered_map, se_filtered_map, pose_gt ) global_se_map_mit = mapper.ext_register_map( global_se_map_mit, ego_map_gt_object2, pose_gt ) se_filtered_map = None batch = None se_map = None ego_map_gt = None ego_wall_map_gt = None obs = None pu_inputs_t = None pu_output=None _ = None 
pu_inputs=None gc.collect() torch.cuda.empty_cache() mask = dilate_tensor(global_seen_map*(1 - global_wall_map > 0)*global_se_map,(51, 51)) global_se_filtered_map = global_se_filtered_map*mask global_se_seen_map = global_se_seen_map*mask mask = dilate_tensor(global_seen_map*(1 - global_wall_map > 0)*global_object_map,(51, 51)) global_se_map_gt = global_se_map_gt*mask global_wall_map_np = asnumpy(rearrange(global_wall_map, "b c h w -> b h w c")[0]) global_seen_map_np = asnumpy(rearrange(global_seen_map, "b c h w -> b h w c")[0]) global_se_map_np = asnumpy(rearrange(global_se_map, "b c h w -> b h w c")[0]) global_se_global_se_filtered_map_np= asnumpy(rearrange(global_se_filtered_map, "b c h w -> b h w c")[0]) global_se_map_mit_np= asnumpy(rearrange(global_se_map_mit, "b c h w -> b h w c")[0]) global_se_map_gt_np= asnumpy(rearrange(global_se_map_gt, "b c h w -> b h w c")[0]) global_se_seen_map_np = asnumpy(rearrange(global_se_seen_map, "b c h w -> b h w c")[0]) global_object_map_np = asnumpy(rearrange(global_object_map, "b c h w -> b h w c")[0]) return global_seen_map_np, global_wall_map_np, global_se_map_np,global_se_global_se_filtered_map_np, global_se_map_mit_np,global_se_map_gt_np,global_se_seen_map_np,global_object_map_np
def train(self) -> None: r"""Main method for training PPO. Returns: None """ self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) ppo_cfg = self.config.RL.PPO self.device = (torch.device("cuda", self.config.TORCH_GPU_ID) if torch.cuda.is_available() else torch.device("cpu")) if not os.path.isdir(self.config.CHECKPOINT_FOLDER): os.makedirs(self.config.CHECKPOINT_FOLDER) self._setup_actor_critic_agent(ppo_cfg) logger.info("agent number of parameters: {}".format( sum(param.numel() for param in self.agent.parameters()))) rollouts = RolloutStorage( ppo_cfg.num_steps, self.envs.num_envs, self.envs.observation_spaces[0], self.envs.action_spaces[0], ppo_cfg.hidden_size, ) rollouts.to(self.device) observations = self.envs.reset() batch = batch_obs(observations, device=self.device) for sensor in rollouts.observations: rollouts.observations[sensor][0].copy_(batch[sensor]) # batch and observations may contain shared PyTorch CUDA # tensors. We must explicitly clear them here otherwise # they will be kept in memory for the entire duration of training! batch = None observations = None current_episode_reward = torch.zeros(self.envs.num_envs, 1) running_episode_stats = dict( count=torch.zeros(self.envs.num_envs, 1), reward=torch.zeros(self.envs.num_envs, 1), ) window_episode_stats = defaultdict( lambda: deque(maxlen=ppo_cfg.reward_window_size)) t_start = time.time() env_time = 0 pth_time = 0 count_steps = 0 count_checkpoints = 0 lr_scheduler = LambdaLR( optimizer=self.agent.optimizer, lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), ) with TensorboardWriter(self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs) as writer: for update in range(self.config.NUM_UPDATES): if ppo_cfg.use_linear_lr_decay: lr_scheduler.step() if ppo_cfg.use_linear_clip_decay: self.agent.clip_param = ppo_cfg.clip_param * linear_decay( update, self.config.NUM_UPDATES) for step in range(ppo_cfg.num_steps): ( delta_pth_time, delta_env_time, delta_steps, ) = self._collect_rollout_step(rollouts, current_episode_reward, running_episode_stats) pth_time += delta_pth_time env_time += delta_env_time count_steps += delta_steps ( delta_pth_time, value_loss, action_loss, dist_entropy, ) = self._update_agent(ppo_cfg, rollouts) pth_time += delta_pth_time for k, v in running_episode_stats.items(): window_episode_stats[k].append(v.clone()) deltas = { k: ((v[-1] - v[0]).sum().item() if len(v) > 1 else v[0].sum().item()) for k, v in window_episode_stats.items() } deltas["count"] = max(deltas["count"], 1.0) writer.add_scalar("reward", deltas["reward"] / deltas["count"], count_steps) # Check to see if there are any metrics # that haven't been logged yet metrics = { k: v / deltas["count"] for k, v in deltas.items() if k not in {"reward", "count"} } if len(metrics) > 0: writer.add_scalars("metrics", metrics, count_steps) losses = [value_loss, action_loss] writer.add_scalars( "losses", {k: l for l, k in zip(losses, ["value", "policy"])}, count_steps, ) # log stats if update > 0 and update % self.config.LOG_INTERVAL == 0: logger.info("update: {}\tfps: {:.3f}\t".format( update, count_steps / (time.time() - t_start))) logger.info( "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" "frames: {}".format(update, env_time, pth_time, count_steps)) logger.info("Average window size: {} {}".format( len(window_episode_stats["count"]), " ".join("{}: {:.3f}".format(k, v / deltas["count"]) for k, v in deltas.items() if k != "count"), )) # checkpoint model if update % self.config.CHECKPOINT_INTERVAL == 0: 
self.save_checkpoint(f"ckpt.{count_checkpoints}.pth", dict(step=count_steps)) count_checkpoints += 1 self.envs.close()
def train(self) -> None: r"""Main method for DD-PPO SLAM. Returns: None """ ##################################################################### ## init distrib and configuration ##################################################################### self.local_rank, tcp_store = init_distrib_slurm( self.config.RL.DDPPO.distrib_backend ) # self.local_rank = 1 add_signal_handlers() # Stores the number of workers that have finished their rollout num_rollouts_done_store = distrib.PrefixStore( "rollout_tracker", tcp_store ) num_rollouts_done_store.set("num_done", "0") self.world_rank = distrib.get_rank() # server number self.world_size = distrib.get_world_size() self.config.defrost() self.config.TORCH_GPU_ID = self.local_rank # gpu number in one server self.config.SIMULATOR_GPU_ID = self.local_rank print("********************* TORCH_GPU_ID: ", self.config.TORCH_GPU_ID) print("********************* SIMULATOR_GPU_ID: ", self.config.SIMULATOR_GPU_ID) # Multiply by the number of simulators to make sure they also get unique seeds self.config.TASK_CONFIG.SEED += ( self.world_rank * self.config.NUM_PROCESSES ) self.config.freeze() random.seed(self.config.TASK_CONFIG.SEED) np.random.seed(self.config.TASK_CONFIG.SEED) torch.manual_seed(self.config.TASK_CONFIG.SEED) if torch.cuda.is_available(): self.device = torch.device("cuda", self.local_rank) torch.cuda.set_device(self.device) else: self.device = torch.device("cpu") ##################################################################### ## build distrib NavSLAMRLEnv environment ##################################################################### print("#############################################################") print("## build distrib NavSLAMRLEnv environment") print("#############################################################") self.envs = construct_envs( self.config, get_env_class(self.config.ENV_NAME) ) observations = self.envs.reset() print("*************************** observations len:", len(observations)) # semantic process for i in range(len(observations)): observations[i]["semantic"] = observations[i]["semantic"].astype(np.int32) se = list(set(observations[i]["semantic"].ravel())) print(se) # print("*************************** observations type:", observations) # print("*************************** observations type:", observations[0]["map_sum"].shape) # 480*480*23 # print("*************************** observations curr_pose:", observations[0]["curr_pose"]) # [] batch = batch_obs(observations, device=self.device) print("*************************** batch len:", len(batch)) # print("*************************** batch:", batch) # print("************************************* current_episodes:", (self.envs.current_episodes())) ##################################################################### ## init actor_critic agent ##################################################################### print("#############################################################") print("## init actor_critic agent") print("#############################################################") self.map_w = observations[0]["map_sum"].shape[0] self.map_h = observations[0]["map_sum"].shape[1] # print("map_: ", observations[0]["curr_pose"].shape) ppo_cfg = self.config.RL.PPO if ( not os.path.isdir(self.config.CHECKPOINT_FOLDER) and self.world_rank == 0 ): os.makedirs(self.config.CHECKPOINT_FOLDER) self._setup_actor_critic_agent(observations, ppo_cfg) self.agent.init_distributed(find_unused_params=True) if self.world_rank == 0: logger.info( "agent number of trainable parameters: 
{}".format( sum( param.numel() for param in self.agent.parameters() if param.requires_grad ) ) ) ##################################################################### ## init Global Rollout Storage ##################################################################### print("#############################################################") print("## init Global Rollout Storage") print("#############################################################") self.num_each_global_step = self.config.RL.SLAMDDPPO.num_each_global_step rollouts = GlobalRolloutStorage( ppo_cfg.num_steps, self.envs.num_envs, self.obs_space, self.g_action_space, ) rollouts.to(self.device) print('rollouts type:', type(rollouts)) print('--------------------------') # for k in rollouts.keys(): # print("rollouts: {0}".format(rollouts.observations)) for sensor in rollouts.observations: rollouts.observations[sensor][0].copy_(batch[sensor]) with torch.no_grad(): step_observation = { k: v[rollouts.step] for k, v in rollouts.observations.items() } _, actions, _, = self.actor_critic.act( step_observation, rollouts.prev_g_actions[0], rollouts.masks[0], ) self.global_goals = [[int(action[0].item() * self.map_w), int(action[1].item() * self.map_h)] for action in actions] # batch and observations may contain shared PyTorch CUDA # tensors. We must explicitly clear them here otherwise # they will be kept in memory for the entire duration of training! batch = None observations = None current_episode_reward = torch.zeros( self.envs.num_envs, 1, device=self.device ) running_episode_stats = dict( count=torch.zeros(self.envs.num_envs, 1, device=self.device), reward=torch.zeros(self.envs.num_envs, 1, device=self.device), ) window_episode_stats = defaultdict( lambda: deque(maxlen=ppo_cfg.reward_window_size) ) print("*************************** current_episode_reward:", current_episode_reward) print("*************************** running_episode_stats:", running_episode_stats) # print("*************************** window_episode_stats:", window_episode_stats) t_start = time.time() env_time = 0 pth_time = 0 count_steps = 0 count_checkpoints = 0 start_update = 0 prev_time = 0 lr_scheduler = LambdaLR( optimizer=self.agent.optimizer, lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), ) # interrupted_state = load_interrupted_state("/home/cirlab1/userdir/ybg/projects/habitat-api/data/interrup.pth") interrupted_state = load_interrupted_state() if interrupted_state is not None: self.agent.load_state_dict(interrupted_state["state_dict"]) self.agent.optimizer.load_state_dict( interrupted_state["optim_state"] ) lr_scheduler.load_state_dict(interrupted_state["lr_sched_state"]) requeue_stats = interrupted_state["requeue_stats"] env_time = requeue_stats["env_time"] pth_time = requeue_stats["pth_time"] count_steps = requeue_stats["count_steps"] count_checkpoints = requeue_stats["count_checkpoints"] start_update = requeue_stats["start_update"] prev_time = requeue_stats["prev_time"] deif = {} with ( TensorboardWriter( self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs ) if self.world_rank == 0 else contextlib.suppress() ) as writer: for update in range(start_update, self.config.NUM_UPDATES): if ppo_cfg.use_linear_lr_decay: lr_scheduler.step() if ppo_cfg.use_linear_clip_decay: self.agent.clip_param = ppo_cfg.clip_param * linear_decay( update, self.config.NUM_UPDATES ) # print("************************************* current_episodes:", type(self.envs.count_episodes())) # print(EXIT.is_set()) if EXIT.is_set(): self.envs.close() if REQUEUE.is_set() and 
self.world_rank == 0: requeue_stats = dict( env_time=env_time, pth_time=pth_time, count_steps=count_steps, count_checkpoints=count_checkpoints, start_update=update, prev_time=(time.time() - t_start) + prev_time, ) save_interrupted_state( dict( state_dict=self.agent.state_dict(), optim_state=self.agent.optimizer.state_dict(), lr_sched_state=lr_scheduler.state_dict(), config=self.config, requeue_stats=requeue_stats, ), "/home/cirlab1/userdir/ybg/projects/habitat-api/data/interrup.pth" ) print("********************EXIT*********************") requeue_job() return count_steps_delta = 0 self.agent.eval() for step in range(ppo_cfg.num_steps): ( delta_pth_time, delta_env_time, delta_steps, ) = self._collect_global_rollout_step( rollouts, current_episode_reward, running_episode_stats ) pth_time += delta_pth_time env_time += delta_env_time count_steps_delta += delta_steps # print("************************************* current_episodes:") for i in range(len(self.envs.current_episodes())): # print(" ", self.envs.current_episodes()[i].episode_id," ", self.envs.current_episodes()[i].scene_id," ", self.envs.current_episodes()[i].object_category) if self.envs.current_episodes()[i].scene_id not in deif: deif[self.envs.current_episodes()[i].scene_id]=[int(self.envs.current_episodes()[i].episode_id)] else: deif[self.envs.current_episodes()[i].scene_id].append(int(self.envs.current_episodes()[i].episode_id)) # This is where the preemption of workers happens. If a # worker detects it will be a straggler, it preempts itself! if ( step >= ppo_cfg.num_steps * self.SHORT_ROLLOUT_THRESHOLD ) and int(num_rollouts_done_store.get("num_done")) > ( self.config.RL.DDPPO.sync_frac * self.world_size ): break num_rollouts_done_store.add("num_done", 1) self.agent.train() if self._static_encoder: self._encoder.eval() ( delta_pth_time, value_loss, action_loss, dist_entropy, ) = self._update_agent(ppo_cfg, rollouts) pth_time += delta_pth_time stats_ordering = list(sorted(running_episode_stats.keys())) stats = torch.stack( [running_episode_stats[k] for k in stats_ordering], 0 ) distrib.all_reduce(stats) for i, k in enumerate(stats_ordering): window_episode_stats[k].append(stats[i].clone()) stats = torch.tensor( [value_loss, action_loss, count_steps_delta], device=self.device, ) distrib.all_reduce(stats) count_steps += stats[2].item() if self.world_rank == 0: num_rollouts_done_store.set("num_done", "0") losses = [ stats[0].item() / self.world_size, stats[1].item() / self.world_size, ] deltas = { k: ( (v[-1] - v[0]).sum().item() if len(v) > 1 else v[0].sum().item() ) for k, v in window_episode_stats.items() } deltas["count"] = max(deltas["count"], 1.0) writer.add_scalar( "reward", deltas["reward"] / deltas["count"], count_steps, ) # Check to see if there are any metrics # that haven't been logged yet metrics = { k: v / deltas["count"] for k, v in deltas.items() if k not in {"reward", "count"} } if len(metrics) > 0: writer.add_scalars("metrics", metrics, count_steps) writer.add_scalars( "losses", {k: l for l, k in zip(losses, ["value", "policy"])}, count_steps, ) # log stats if update > 0 and update % self.config.LOG_INTERVAL == 0: logger.info( "update: {}\tfps: {:.3f}\t".format( update, count_steps / ((time.time() - t_start) + prev_time), ) ) logger.info( "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" "frames: {}".format( update, env_time, pth_time, count_steps ) ) logger.info( "Average window size: {} {}".format( len(window_episode_stats["count"]), " ".join( "{}: {:.3f}".format(k, v / deltas["count"]) for k, v in 
deltas.items() if k != "count" ), ) ) # for k in deif: # deif[k] = list(set(deif[k])) # deif[k].sort() # print("deif: k", k, " : ", deif[k]) # checkpoint model if update % self.config.CHECKPOINT_INTERVAL == 0: self.save_checkpoint( f"ckpt.{count_checkpoints}.pth", dict(step=count_steps), ) print('=' * 20 + 'Save Model' + '=' * 20) logger.info( "Save Model : {}".format(count_checkpoints) ) count_checkpoints += 1 self.envs.close()
def _collect_global_rollout_step( self, rollouts, current_episode_reward, running_episode_stats ): pth_time = 0.0 env_time = 0.0 t_sample_action = time.time() # Run Global Policy (global_goals = Long-Term Goal) 16 if rollouts.step % (self.num_each_global_step) == 0: # print("rollouts.step: ", rollouts.step) with torch.no_grad(): step_observation = { k: v[rollouts.step] for k, v in rollouts.observations.items() } ( self.values, self.actions, self.actions_log_probs, ) = self.actor_critic.act( step_observation, rollouts.prev_g_actions[rollouts.step], rollouts.masks[rollouts.step], # deterministic=True, ) self.envs.update_full_map() self.global_goals = torch.Tensor( [[(action[0] * self.map_w), (action[1] * self.map_h)] for action in self.actions]) # print("global_goals: ", self.global_goals) # print("self.actions: ", self.actions) # print("actions_log_probs: ", self.actions_log_probs) # sample action l_actions = self.envs.get_local_actions(self.global_goals) # print("action: ", l_actions) pth_time += time.time() - t_sample_action t_step_env = time.time() outputs = self.envs.step(l_actions) observations, rewards, dones, infos = [list(x) for x in zip(*outputs)] # print("step:", rollouts.step, # "l_actions:", l_actions[0], # "global_goals:", self.global_goals[0], # "pose:", observations[0]["curr_pose"], # "object:", observations[0]["objectgoal"], # "dones:", dones[0]) # print("dones: ", dones) env_time += time.time() - t_step_env t_update_stats = time.time() batch = batch_obs(observations, device=self.device) # assert (np.any(np.isnan(rewards)) == False), "Reward has nan." rewards = torch.tensor( rewards, dtype=torch.float, device=current_episode_reward.device ) rewards = rewards.unsqueeze(1) rewards = torch.where(torch.isnan(rewards), torch.full_like(rewards, 0), rewards) masks = torch.tensor( [[0.0] if done else [1.0] for done in dones], dtype=torch.float, device=current_episode_reward.device, ) current_episode_reward += rewards running_episode_stats["reward"] += (1 - masks) * current_episode_reward running_episode_stats["count"] += 1 - masks for k, v in self._extract_scalars_from_infos(infos).items(): v = torch.tensor( v, dtype=torch.float, device=current_episode_reward.device ).unsqueeze(1) if k not in running_episode_stats: running_episode_stats[k] = torch.zeros_like( running_episode_stats["count"] ) running_episode_stats[k] += (1 - masks) * v current_episode_reward *= masks rollouts.insert( batch, self.actions, self.actions_log_probs, self.values, rewards, masks, ) pth_time += time.time() - t_update_stats return pth_time, env_time, self.envs.num_envs
def _collect_rollout_step(
    self, rollouts, current_episode_reward, episode_rewards, episode_counts
):
    pth_time = 0.0
    env_time = 0.0

    t_sample_action = time.time()
    # sample actions
    with torch.no_grad():
        step_observation = {
            k: v[rollouts.step]
            for k, v in rollouts.observations.items()
        }

        # Collect detector features: pop them so they are not fed to the
        # policy, but keep the popped tensor for reward shaping below.
        detections = step_observation.pop("detector_features")

        (
            values,
            actions,
            actions_log_probs,
            recurrent_hidden_states,
            aux_out,
        ) = self.actor_critic.act(
            step_observation,
            rollouts.recurrent_hidden_states[rollouts.step],
            rollouts.prev_actions[rollouts.step],
            rollouts.masks[rollouts.step],
        )

    # Add to rollout memory to do inference with the detector just once.
    rollouts.observations["detector_features"][rollouts.step].copy_(
        detections
    )

    pth_time += time.time() - t_sample_action

    t_step_env = time.time()
    outputs = self.envs.step([a[0].item() for a in actions])
    observations, rewards, dones, infos = [list(x) for x in zip(*outputs)]

    env_time += time.time() - t_step_env

    t_update_stats = time.time()
    batch = batch_obs(observations)
    rewards = torch.tensor(rewards, dtype=torch.float)
    rewards = rewards.unsqueeze(1)

    # Reward shaping from detections: average the predicted class scores for
    # the goal category. The bonus itself is currently disabled (see below).
    pred_conf = detections[:, 4]
    pred_cls = detections[:, 5:]
    detect_class = self.detector_class_select[
        batch["goalclass"].nonzero()[:, 1]
    ]
    class_weight = pred_cls[detect_class]
    reward_class = class_weight.view(class_weight.size(0), -1).mean(dim=1)
    # rewards += reward_class.unsqueeze(1).cpu()

    masks = torch.tensor(
        [[0.0] if done else [1.0] for done in dones], dtype=torch.float
    )

    current_episode_reward += rewards
    episode_rewards += (1 - masks) * current_episode_reward
    episode_counts += 1 - masks
    current_episode_reward *= masks

    rollouts.insert(
        batch,
        recurrent_hidden_states,
        actions,
        actions_log_probs,
        values,
        rewards,
        masks,
    )

    pth_time += time.time() - t_update_stats

    return pth_time, env_time, self.envs.num_envs
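# The rollout step above derives a per-step score for the goal category from
# YOLO-style detector rows ([x, y, w, h, objectness, class scores...]) but
# leaves the reward bonus commented out. A hedged sketch of how such a shaping
# term could be folded into the reward; `goal_class_idx` and `bonus_scale` are
# illustrative names, not part of the trainer's API:
def _example_detection_reward_bonus(rewards, detections, goal_class_idx, bonus_scale=0.01):
    import torch

    pred_conf = detections[:, 4]   # objectness per detection
    pred_cls = detections[:, 5:]   # per-class scores per detection
    # Confidence-weighted score of the goal class, averaged over detections.
    goal_score = (pred_conf * pred_cls[:, goal_class_idx]).mean()
    return rewards + bonus_scale * goal_score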
def transform_callback(data):
    nonlocal actor_critic
    nonlocal batch
    nonlocal not_done_masks
    nonlocal test_recurrent_hidden_states
    global flag
    global t_prev_update
    global observation

    # First message after a reset: just cache the observation.
    if flag == 2:
        observation["depth"] = np.reshape(data.data[0:-2], (256, 256, 1))
        observation["pointgoal_with_gps_compass"] = data.data[-2:]
        flag = 1
        return

    pointgoal_received = data.data[-2:]
    translate_amount = 0.25  # meters
    rotate_amount = 0.174533  # radians (~10 degrees)

    # Trigger a policy update once the commanded rotation has completed
    # (within 5%) or after a 4-second timeout.
    isrotated = (
        rotate_amount * 0.95
        <= abs(
            pointgoal_received[1]
            - observation["pointgoal_with_gps_compass"][1]
        )
        <= rotate_amount * 1.05
    )
    istimeup = (time.time() - t_prev_update) >= 4

    if isrotated or istimeup:
        # Stop the robot before running the policy.
        vel_msg = Twist()
        vel_msg.linear.x = 0
        vel_msg.linear.y = 0
        vel_msg.linear.z = 0
        vel_msg.angular.x = 0
        vel_msg.angular.y = 0
        vel_msg.angular.z = 0
        pub_vel.publish(vel_msg)
        time.sleep(0.2)
        print("entered update step")

        observation["depth"] = np.reshape(data.data[0:-2], (256, 256, 1))
        observation["pointgoal_with_gps_compass"] = data.data[-2:]

        batch = batch_obs([observation])
        for sensor in batch:
            batch[sensor] = batch[sensor].to(device)
        if flag == 1:
            not_done_masks = torch.tensor(
                [0.0], dtype=torch.float, device=device
            )
            flag = 0
        else:
            not_done_masks = torch.tensor(
                [1.0], dtype=torch.float, device=device
            )

        _, actions, _, test_recurrent_hidden_states = actor_critic.act(
            batch,
            test_recurrent_hidden_states,
            not_done_masks,
            deterministic=True,
        )
        action_id = actions.item()
        print(
            "observation received to produce action_id is "
            + str(observation["pointgoal_with_gps_compass"])
        )
        print("action_id from net is " + str(actions.item()))

        t_prev_update = time.time()
        vel_msg = Twist()
        vel_msg.linear.x = 0
        vel_msg.linear.y = 0
        vel_msg.linear.z = 0
        vel_msg.angular.x = 0
        vel_msg.angular.y = 0
        vel_msg.angular.z = 0

        if action_id == 0:
            # MOVE_FORWARD: a quarter of the simulator's 0.25 m step.
            vel_msg.linear.x = 0.25 / 4
            pub_vel.publish(vel_msg)
        elif action_id == 1:
            # TURN_LEFT at 10 degrees/s.
            vel_msg.angular.z = 10 / 180 * 3.1415926
            pub_vel.publish(vel_msg)
        elif action_id == 2:
            # TURN_RIGHT at 10 degrees/s.
            vel_msg.angular.z = -10 / 180 * 3.1415926
            pub_vel.publish(vel_msg)
        else:
            # STOP: publish zero velocity and end the navigation task.
            pub_vel.publish(vel_msg)
            sub.unregister()
            print("NN finished navigation task")
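# transform_callback above assumes a ROS node with a velocity publisher
# (`pub_vel`), a subscriber (`sub`) delivering a flattened depth image plus the
# pointgoal reading, and `flag` / `t_prev_update` / `observation` state shared
# with the callback. A minimal sketch of that wiring under those assumptions;
# the topic names and the sensor message type are guesses, not confirmed API:
def _example_ros_node_setup():
    import time

    import rospy
    from geometry_msgs.msg import Twist
    from std_msgs.msg import Float32MultiArray  # assumed sensor message type

    rospy.init_node("habitat_agent", anonymous=True)
    pub_vel = rospy.Publisher("/cmd_vel", Twist, queue_size=1)
    sub = rospy.Subscriber(
        "/habitat/sensors", Float32MultiArray, transform_callback  # assumed topic
    )
    t_prev_update = time.time()
    rospy.spin()  # hand control to the callback until shutdown
    return pub_vel, sub, t_prev_update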
def _eval_checkpoint( self, checkpoint_path: str, writer: TensorboardWriter, checkpoint_index: int = 0, ) -> None: r"""Evaluates a single checkpoint. Args: checkpoint_path: path of checkpoint writer: tensorboard writer object for logging to tensorboard checkpoint_index: index of cur checkpoint for logging Returns: None """ # Map location CPU is almost always better than mapping to a CUDA device. ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu") if self.config.EVAL.USE_CKPT_CONFIG: config = self._setup_eval_config(ckpt_dict["config"]) else: config = self.config.clone() ppo_cfg = config.RL.PPO config.defrost() config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT config.freeze() if len(self.config.VIDEO_OPTION) > 0: config.defrost() config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") config.freeze() logger.info(f"env config: {config}") self.envs = construct_envs(config, get_env_class(config.ENV_NAME)) self._setup_actor_critic_agent(ppo_cfg) self.actor_critic.eval() if self._static_encoder: self._encoder = self.agent.actor_critic.net.visual_encoder self.agent.load_state_dict(ckpt_dict["state_dict"]) self.actor_critic = self.agent.actor_critic observations = self.envs.reset() batch = batch_obs(observations, device=self.device) if self._static_encoder: batch["visual_features"] = self._encoder(batch) batch["prev_visual_features"] = torch.zeros_like( batch["visual_features"]) current_episode_reward = torch.zeros(self.envs.num_envs, 1, device=self.device) test_recurrent_hidden_states = torch.zeros( self.actor_critic.net.num_recurrent_layers, self.config.NUM_PROCESSES, ppo_cfg.hidden_size, device=self.device, ) prev_actions = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long) not_done_masks = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device) stats_episodes = dict() # dict of dicts that stores stats per episode rgb_frames = [[] for _ in range(self.config.NUM_PROCESSES) ] # type: List[List[np.ndarray]] if len(self.config.VIDEO_OPTION) > 0: os.makedirs(self.config.VIDEO_DIR, exist_ok=True) number_of_eval_episodes = self.config.TEST_EPISODE_COUNT if number_of_eval_episodes == -1: number_of_eval_episodes = sum(self.envs.number_of_episodes) else: total_num_eps = sum(self.envs.number_of_episodes) if total_num_eps < number_of_eval_episodes: logger.warn( f"Config specified {number_of_eval_episodes} eval episodes" ", dataset only has {total_num_eps}.") logger.warn(f"Evaluating with {total_num_eps} instead.") number_of_eval_episodes = total_num_eps pbar = tqdm.tqdm(total=number_of_eval_episodes) self.actor_critic.eval() while (len(stats_episodes) < number_of_eval_episodes and self.envs.num_envs > 0): current_episodes = self.envs.current_episodes() with torch.no_grad(): step_batch = batch ( _, actions, _, test_recurrent_hidden_states, ) = self.actor_critic.act( batch, test_recurrent_hidden_states, prev_actions, not_done_masks, deterministic=False, ) prev_actions.copy_(actions) outputs = self.envs.step([a[0].item() for a in actions]) observations, rewards, dones, infos = [ list(x) for x in zip(*outputs) ] batch = batch_obs(observations, device=self.device) if self._static_encoder: batch["prev_visual_features"] = step_batch["visual_features"] batch["visual_features"] = self._encoder(batch) not_done_masks = torch.tensor( [[0.0] if done else [1.0] for done in dones], dtype=torch.float, device=self.device, ) rewards = torch.tensor(rewards, dtype=torch.float, device=self.device).unsqueeze(1) 
current_episode_reward += rewards next_episodes = self.envs.current_episodes() envs_to_pause = [] n_envs = self.envs.num_envs for i in range(n_envs): if ( next_episodes[i].scene_id, next_episodes[i].episode_id, ) in stats_episodes: envs_to_pause.append(i) # episode ended if not_done_masks[i].item() == 0: pbar.update() episode_stats = dict() episode_stats["reward"] = current_episode_reward[i].item() episode_stats.update( self._extract_scalars_from_info(infos[i])) current_episode_reward[i] = 0 # use scene_id + episode_id as unique id for storing stats stats_episodes[( current_episodes[i].scene_id, current_episodes[i].episode_id, )] = episode_stats if len(self.config.VIDEO_OPTION) > 0: generate_video( video_option=self.config.VIDEO_OPTION, video_dir=self.config.VIDEO_DIR, images=rgb_frames[i], episode_id=current_episodes[i].episode_id, checkpoint_idx=checkpoint_index, metrics=self._extract_scalars_from_info(infos[i]), tb_writer=writer, ) rgb_frames[i] = [] # episode continues elif len(self.config.VIDEO_OPTION) > 0: frame = observations_to_image(observations[i], infos[i]) rgb_frames[i].append(frame) ( self.envs, test_recurrent_hidden_states, not_done_masks, current_episode_reward, prev_actions, batch, rgb_frames, ) = self._pause_envs( envs_to_pause, self.envs, test_recurrent_hidden_states, not_done_masks, current_episode_reward, prev_actions, batch, rgb_frames, ) num_episodes = len(stats_episodes) aggregated_stats = dict() for stat_key in next(iter(stats_episodes.values())).keys(): aggregated_stats[stat_key] = ( sum([v[stat_key] for v in stats_episodes.values()]) / num_episodes) for k, v in aggregated_stats.items(): logger.info(f"Average episode {k}: {v:.4f}") step_id = checkpoint_index if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]: step_id = ckpt_dict["extra_state"]["step"] writer.add_scalars( "eval_reward", {"average reward": aggregated_stats["reward"]}, step_id, ) metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"} if len(metrics) > 0: writer.add_scalars("eval_metrics", metrics, step_id) self.envs.close()
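# When an env's episodes are exhausted during evaluation it is paused and every
# per-env state tensor is sliced down to the surviving envs. A compact sketch of
# the index bookkeeping that _pause_envs is expected to perform (illustrative,
# not the library implementation):
def _example_pause_state(envs_to_pause, hidden_states, masks, rewards):
    import torch

    num_envs = masks.size(0)
    keep = [i for i in range(num_envs) if i not in set(envs_to_pause)]
    keep = torch.tensor(keep, dtype=torch.long, device=masks.device)

    # Hidden states are (num_layers, num_envs, hidden); the others are (num_envs, 1).
    hidden_states = hidden_states.index_select(1, keep)
    masks = masks.index_select(0, keep)
    rewards = rewards.index_select(0, keep)
    return hidden_states, masks, rewards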
def train(self, ckpt_path="", ckpt=-1, start_updates=0) -> None: r"""Main method for training PPO. Returns: None """ self.local_rank, tcp_store = init_distrib_slurm( self.config.RL.DDPPO.distrib_backend) add_signal_handlers() # Stores the number of workers that have finished their rollout num_rollouts_done_store = distrib.PrefixStore("rollout_tracker", tcp_store) num_rollouts_done_store.set("num_done", "0") self.world_rank = distrib.get_rank() self.world_size = distrib.get_world_size() random.seed(self.config.TASK_CONFIG.SEED + self.world_rank) np.random.seed(self.config.TASK_CONFIG.SEED + self.world_rank) self.config.defrost() self.config.TORCH_GPU_ID = self.local_rank self.config.SIMULATOR_GPU_ID = self.local_rank self.config.freeze() if torch.cuda.is_available(): self.device = torch.device("cuda", self.local_rank) torch.cuda.set_device(self.device) else: self.device = torch.device("cpu") self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) ppo_cfg = self.config.RL.PPO task_cfg = self.config.TASK_CONFIG.TASK observation_space = self.envs.observation_spaces[0] aux_cfg = self.config.RL.AUX_TASKS init_aux_tasks, num_recurrent_memories, aux_task_strings = self._setup_auxiliary_tasks( aux_cfg, ppo_cfg, task_cfg, observation_space) rollouts = RolloutStorage( ppo_cfg.num_steps, self.envs.num_envs, observation_space, self.envs.action_spaces[0], ppo_cfg.hidden_size, num_recurrent_memories=num_recurrent_memories) rollouts.to(self.device) observations = self.envs.reset() batch = batch_obs(observations, device=self.device) for sensor in rollouts.observations: rollouts.observations[sensor][0].copy_(batch[sensor]) # batch and observations may contain shared PyTorch CUDA # tensors. We must explicitly clear them here otherwise # they will be kept in memory for the entire duration of training! batch = None observations = None self._setup_actor_critic_agent(ppo_cfg, task_cfg, aux_cfg, init_aux_tasks) self.agent.init_distributed(find_unused_params=True) if self.world_rank == 0: logger.info("agent number of trainable parameters: {}".format( sum(param.numel() for param in self.agent.parameters() if param.requires_grad))) current_episode_reward = torch.zeros(self.envs.num_envs, 1) running_episode_stats = dict( count=torch.zeros(self.envs.num_envs, 1), reward=torch.zeros(self.envs.num_envs, 1), # including bonus ) window_episode_stats = defaultdict( lambda: deque(maxlen=ppo_cfg.reward_window_size)) t_start = time.time() env_time = 0 pth_time = 0 count_steps = 0 count_checkpoints = 0 prev_time = 0 if ckpt != -1: logger.info( f"Resuming runs at checkpoint {ckpt}. Timing statistics are not tracked properly." 
) assert ppo_cfg.use_linear_lr_decay is False and ppo_cfg.use_linear_clip_decay is False, "Resuming with decay not supported" # This is the checkpoint we start saving at count_checkpoints = ckpt + 1 count_steps = start_updates * ppo_cfg.num_steps * self.config.NUM_PROCESSES ckpt_dict = self.load_checkpoint(ckpt_path, map_location="cpu") self.agent.load_state_dict(ckpt_dict["state_dict"]) if "optim_state" in ckpt_dict: self.agent.optimizer.load_state_dict(ckpt_dict["optim_state"]) else: logger.warn("No optimizer state loaded, results may be funky") if "extra_state" in ckpt_dict and "step" in ckpt_dict[ "extra_state"]: count_steps = ckpt_dict["extra_state"]["step"] lr_scheduler = LambdaLR( optimizer=self.agent.optimizer, lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), ) interrupted_state = load_interrupted_state() if interrupted_state is not None: self.agent.load_state_dict(interrupted_state["state_dict"]) self.agent.optimizer.load_state_dict( interrupted_state["optim_state"]) lr_scheduler.load_state_dict(interrupted_state["lr_sched_state"]) requeue_stats = interrupted_state["requeue_stats"] env_time = requeue_stats["env_time"] pth_time = requeue_stats["pth_time"] count_steps = requeue_stats["count_steps"] count_checkpoints = requeue_stats["count_checkpoints"] start_updates = requeue_stats["start_update"] prev_time = requeue_stats["prev_time"] with (TensorboardWriter(self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs) if self.world_rank == 0 else contextlib.suppress()) as writer: for update in range(start_updates, self.config.NUM_UPDATES): if ppo_cfg.use_linear_lr_decay: lr_scheduler.step() if ppo_cfg.use_linear_clip_decay: self.agent.clip_param = ppo_cfg.clip_param * linear_decay( update, self.config.NUM_UPDATES) if EXIT.is_set(): self.envs.close() if REQUEUE.is_set() and self.world_rank == 0: requeue_stats = dict( env_time=env_time, pth_time=pth_time, count_steps=count_steps, count_checkpoints=count_checkpoints, start_update=update, prev_time=(time.time() - t_start) + prev_time, ) save_interrupted_state( dict( state_dict=self.agent.state_dict(), optim_state=self.agent.optimizer.state_dict(), lr_sched_state=lr_scheduler.state_dict(), config=self.config, requeue_stats=requeue_stats, )) requeue_job() return count_steps_delta = 0 self.agent.eval() for step in range(ppo_cfg.num_steps): ( delta_pth_time, delta_env_time, delta_steps, ) = self._collect_rollout_step(rollouts, current_episode_reward, running_episode_stats) pth_time += delta_pth_time env_time += delta_env_time count_steps += delta_steps # This is where the preemption of workers happens. If a # worker detects it will be a straggler, it preempts itself! 
if (step >= ppo_cfg.num_steps * self.SHORT_ROLLOUT_THRESHOLD ) and int(num_rollouts_done_store.get("num_done")) > ( self.config.RL.DDPPO.sync_frac * self.world_size): break num_rollouts_done_store.add("num_done", 1) self.agent.train() ( delta_pth_time, value_loss, action_loss, dist_entropy, aux_task_losses, aux_dist_entropy, aux_weights, ) = self._update_agent(ppo_cfg, rollouts) pth_time += delta_pth_time stats_ordering = list(sorted(running_episode_stats.keys())) stats = torch.stack( [running_episode_stats[k] for k in stats_ordering], 0).to(self.device) distrib.all_reduce(stats) for i, k in enumerate(stats_ordering): window_episode_stats[k].append(stats[i].clone()) stats = torch.tensor( [ dist_entropy, aux_dist_entropy, ] + [value_loss, action_loss] + aux_task_losses + [count_steps_delta], device=self.device, ) distrib.all_reduce(stats) if aux_weights is not None and len(aux_weights) > 0: distrib.all_reduce( torch.tensor(aux_weights, device=self.device)) count_steps += stats[-1].item() if self.world_rank == 0: num_rollouts_done_store.set("num_done", "0") avg_stats = [ stats[i].item() / self.world_size for i in range(len(stats) - 1) ] losses = avg_stats[2:] dist_entropy, aux_dist_entropy = avg_stats[:2] deltas = { k: ((v[-1] - v[0]).sum().item() if len(v) > 1 else v[0].sum().item()) for k, v in window_episode_stats.items() } deltas["count"] = max(deltas["count"], 1.0) writer.add_scalar( "reward", deltas["reward"] / deltas["count"], count_steps, ) writer.add_scalar( "entropy", dist_entropy, count_steps, ) writer.add_scalar("aux_entropy", aux_dist_entropy, count_steps) # Check to see if there are any metrics # that haven't been logged yet metrics = { k: v / deltas["count"] for k, v in deltas.items() if k not in {"reward", "count"} } if len(metrics) > 0: writer.add_scalars("metrics", metrics, count_steps) writer.add_scalars( "losses", { k: l for l, k in zip(losses, ["value", "policy"] + aux_task_strings) }, count_steps, ) writer.add_scalars( "aux_weights", {k: l for l, k in zip(aux_weights, aux_task_strings)}, count_steps, ) # Log stats formatted_aux_losses = [ "{:.3g}".format(l) for l in aux_task_losses ] if update > 0 and update % self.config.LOG_INTERVAL == 0: logger.info( "update: {}\tvalue_loss: {:.3g}\t action_loss: {:.3g}\taux_task_loss: {} \t aux_entropy {:.3g}\t" .format( update, value_loss, action_loss, formatted_aux_losses, aux_dist_entropy, )) logger.info("update: {}\tfps: {:.3f}\t".format( update, count_steps / ((time.time() - t_start) + prev_time), )) logger.info( "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" "frames: {}".format(update, env_time, pth_time, count_steps)) logger.info("Average window size: {} {}".format( len(window_episode_stats["count"]), " ".join( "{}: {:.3f}".format(k, v / deltas["count"]) for k, v in deltas.items() if k != "count"), )) # checkpoint model if update % self.config.CHECKPOINT_INTERVAL == 0: self.save_checkpoint( f"{self.checkpoint_prefix}.{count_checkpoints}.pth", dict(step=count_steps)) count_checkpoints += 1 self.envs.close()
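# After the PPO update each worker contributes its scalar losses; all_reduce
# sums them across workers and rank 0 divides by world_size before logging the
# mean. A toy, single-process stand-in for that reduction (distrib.all_reduce
# with the default SUM op leaves the same summed tensor on every rank):
def _example_cross_worker_loss_mean(per_worker_losses):
    import torch

    # per_worker_losses: list of [value_loss, action_loss] pairs, one per worker.
    stacked = torch.tensor(per_worker_losses, dtype=torch.float)
    summed = stacked.sum(dim=0)            # what all_reduce(SUM) produces
    world_size = stacked.size(0)
    return (summed / world_size).tolist()  # mean losses, as logged on rank 0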
def _eval_checkpoint(self, checkpoint_path: str, writer: TensorboardWriter, checkpoint_index: int = 0) -> None: r"""Evaluates a single checkpoint. Assumes episode IDs are unique. Args: checkpoint_path: path of checkpoint writer: tensorboard writer object for logging to tensorboard checkpoint_index: index of cur checkpoint for logging Returns: None """ logger.info(f"checkpoint_path: {checkpoint_path}") if self.config.EVAL.USE_CKPT_CONFIG: config = self._setup_eval_config( self.load_checkpoint(checkpoint_path, map_location="cpu")["config"]) else: config = self.config.clone() config.defrost() config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT config.TASK_CONFIG.TASK.NDTW.SPLIT = config.EVAL.SPLIT config.TASK_CONFIG.TASK.SDTW.SPLIT = config.EVAL.SPLIT config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS = -1 if len(config.VIDEO_OPTION) > 0: config.defrost() config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") config.freeze() # setup agent self.envs = construct_envs_auto_reset_false( config, get_env_class(config.ENV_NAME)) self.device = (torch.device("cuda", config.TORCH_GPU_ID) if torch.cuda.is_available() else torch.device("cpu")) self._setup_actor_critic_agent(config.MODEL, True, checkpoint_path) observations = self.envs.reset() observations = transform_obs( observations, config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID) batch = batch_obs(observations, self.device) eval_recurrent_hidden_states = torch.zeros( self.actor_critic.net.num_recurrent_layers, config.NUM_PROCESSES, config.MODEL.STATE_ENCODER.hidden_size, device=self.device, ) prev_actions = torch.zeros(config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long) not_done_masks = torch.zeros(config.NUM_PROCESSES, 1, device=self.device) stats_episodes = {} # dict of dicts that stores stats per episode if len(config.VIDEO_OPTION) > 0: os.makedirs(config.VIDEO_DIR, exist_ok=True) rgb_frames = [[] for _ in range(config.NUM_PROCESSES)] self.actor_critic.eval() while (self.envs.num_envs > 0 and len(stats_episodes) < config.EVAL.EPISODE_COUNT): current_episodes = self.envs.current_episodes() with torch.no_grad(): (_, actions, _, eval_recurrent_hidden_states) = self.actor_critic.act( batch, eval_recurrent_hidden_states, prev_actions, not_done_masks, deterministic=True, ) # actions = batch["vln_oracle_action_sensor"].long() prev_actions.copy_(actions) outputs = self.envs.step([a[0].item() for a in actions]) observations, _, dones, infos = [list(x) for x in zip(*outputs)] not_done_masks = torch.tensor( [[0.0] if done else [1.0] for done in dones], dtype=torch.float, device=self.device, ) # reset envs and observations if necessary for i in range(self.envs.num_envs): if len(config.VIDEO_OPTION) > 0: frame = observations_to_image(observations[i], infos[i]) frame = append_text_to_image( frame, current_episodes[i].instruction.instruction_text) rgb_frames[i].append(frame) if not dones[i]: continue stats_episodes[current_episodes[i].episode_id] = infos[i] observations[i] = self.envs.reset_at(i)[0] prev_actions[i] = torch.zeros(1, dtype=torch.long) if len(config.VIDEO_OPTION) > 0: generate_video( video_option=config.VIDEO_OPTION, video_dir=config.VIDEO_DIR, images=rgb_frames[i], episode_id=current_episodes[i].episode_id, checkpoint_idx=checkpoint_index, metrics={ "SPL": round( stats_episodes[current_episodes[i].episode_id] ["spl"], 6) }, tb_writer=writer, ) del stats_episodes[ 
current_episodes[i].episode_id]["top_down_map"] del stats_episodes[ current_episodes[i].episode_id]["collisions"] rgb_frames[i] = [] observations = transform_obs( observations, config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID) batch = batch_obs(observations, self.device) envs_to_pause = [] next_episodes = self.envs.current_episodes() for i in range(self.envs.num_envs): if next_episodes[i].episode_id in stats_episodes: envs_to_pause.append(i) ( self.envs, eval_recurrent_hidden_states, not_done_masks, prev_actions, batch, ) = self._pause_envs( envs_to_pause, self.envs, eval_recurrent_hidden_states, not_done_masks, prev_actions, batch, ) self.envs.close() aggregated_stats = {} num_episodes = len(stats_episodes) for stat_key in next(iter(stats_episodes.values())).keys(): aggregated_stats[stat_key] = ( sum([v[stat_key] for v in stats_episodes.values()]) / num_episodes) split = config.TASK_CONFIG.DATASET.SPLIT with open(f"stats_ckpt_{checkpoint_index}_{split}.json", "w") as f: json.dump(aggregated_stats, f, indent=4) logger.info(f"Episodes evaluated: {num_episodes}") checkpoint_num = checkpoint_index + 1 for k, v in aggregated_stats.items(): logger.info(f"Average episode {k}: {v:.6f}") writer.add_scalar(f"eval_{split}_{k}", v, checkpoint_num)
def train(self, ckpt_path="", ckpt=-1, start_updates=0) -> None: r"""Main method for training PPO. Returns: None """ self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) ppo_cfg = self.config.RL.PPO task_cfg = self.config.TASK_CONFIG.TASK self.device = (torch.device("cuda", self.config.TORCH_GPU_ID) if torch.cuda.is_available() else torch.device("cpu")) # Initialize auxiliary tasks observation_space = self.envs.observation_spaces[0] aux_cfg = self.config.RL.AUX_TASKS init_aux_tasks, num_recurrent_memories, aux_task_strings = \ self._setup_auxiliary_tasks(aux_cfg, ppo_cfg, task_cfg, observation_space) rollouts = RolloutStorage( ppo_cfg.num_steps, self.envs.num_envs, observation_space, self.envs.action_spaces[0], ppo_cfg.hidden_size, num_recurrent_memories=num_recurrent_memories) rollouts.to(self.device) observations = self.envs.reset() batch = batch_obs(observations, device=self.device) for sensor in rollouts.observations: rollouts.observations[sensor][0].copy_(batch[sensor]) # batch and observations may contain shared PyTorch CUDA # tensors. We must explicitly clear them here otherwise # they will be kept in memory for the entire duration of training! batch = None observations = None self._setup_actor_critic_agent(ppo_cfg, task_cfg, aux_cfg, init_aux_tasks) logger.info("agent number of parameters: {}".format( sum(param.numel() for param in self.agent.parameters()))) current_episode_reward = torch.zeros(self.envs.num_envs, 1) running_episode_stats = dict( count=torch.zeros(self.envs.num_envs, 1), reward=torch.zeros(self.envs.num_envs, 1), ) window_episode_stats = defaultdict( lambda: deque(maxlen=ppo_cfg.reward_window_size)) t_start = time.time() env_time = 0 pth_time = 0 count_steps = 0 count_checkpoints = 0 if ckpt != -1: logger.info( f"Resuming runs at checkpoint {ckpt}. Timing statistics are not tracked properly." 
) assert ppo_cfg.use_linear_lr_decay is False and ppo_cfg.use_linear_clip_decay is False, "Resuming with decay not supported" # This is the checkpoint we start saving at count_checkpoints = ckpt + 1 count_steps = start_updates * ppo_cfg.num_steps * self.config.NUM_PROCESSES ckpt_dict = self.load_checkpoint(ckpt_path, map_location="cpu") self.agent.load_state_dict(ckpt_dict["state_dict"]) if "optim_state" in ckpt_dict: self.agent.optimizer.load_state_dict(ckpt_dict["optim_state"]) else: logger.warn("No optimizer state loaded, results may be funky") if "extra_state" in ckpt_dict and "step" in ckpt_dict[ "extra_state"]: count_steps = ckpt_dict["extra_state"]["step"] lr_scheduler = LambdaLR( optimizer=self.agent.optimizer, lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), ) with TensorboardWriter(self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs) as writer: for update in range(start_updates, self.config.NUM_UPDATES): if ppo_cfg.use_linear_lr_decay: lr_scheduler.step() if ppo_cfg.use_linear_clip_decay: self.agent.clip_param = ppo_cfg.clip_param * linear_decay( update, self.config.NUM_UPDATES) for step in range(ppo_cfg.num_steps): ( delta_pth_time, delta_env_time, delta_steps, ) = self._collect_rollout_step(rollouts, current_episode_reward, running_episode_stats) pth_time += delta_pth_time env_time += delta_env_time count_steps += delta_steps delta_pth_time, value_loss, action_loss, dist_entropy, aux_task_losses, aux_dist_entropy, aux_weights = self._update_agent( ppo_cfg, rollouts) pth_time += delta_pth_time for k, v in running_episode_stats.items(): window_episode_stats[k].append(v.clone()) deltas = { k: ((v[-1] - v[0]).sum().item() if len(v) > 1 else v[0].sum().item()) for k, v in window_episode_stats.items() } deltas["count"] = max(deltas["count"], 1.0) writer.add_scalar( "entropy", dist_entropy, count_steps, ) writer.add_scalar("aux_entropy", aux_dist_entropy, count_steps) writer.add_scalar("reward", deltas["reward"] / deltas["count"], count_steps) # Check to see if there are any metrics # that haven't been logged yet metrics = { k: v / deltas["count"] for k, v in deltas.items() if k not in {"reward", "count"} } if len(metrics) > 0: writer.add_scalars("metrics", metrics, count_steps) losses = [value_loss, action_loss] + aux_task_losses writer.add_scalars( "losses", { k: l for l, k in zip(losses, ["value", "policy"] + aux_task_strings) }, count_steps, ) writer.add_scalars( "aux_weights", {k: l for l, k in zip(aux_weights, aux_task_strings)}, count_steps, ) writer.add_scalar( "success", deltas["success"] / deltas["count"], count_steps, ) # Log stats if update > 0 and update % self.config.LOG_INTERVAL == 0: logger.info( "update: {}\tvalue_loss: {}\t action_loss: {}\taux_task_loss: {} \t aux_entropy {}" .format(update, value_loss, action_loss, aux_task_losses, aux_dist_entropy)) logger.info("update: {}\tfps: {:.3f}\t".format( update, count_steps / (time.time() - t_start))) logger.info( "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" "frames: {}".format(update, env_time, pth_time, count_steps)) logger.info("Average window size: {} {}".format( len(window_episode_stats["count"]), " ".join("{}: {:.3f}".format(k, v / deltas["count"]) for k, v in deltas.items() if k != "count"), )) # checkpoint model if update % self.config.CHECKPOINT_INTERVAL == 0: self.save_checkpoint( f"{self.checkpoint_prefix}.{count_checkpoints}.pth", dict(step=count_steps)) count_checkpoints += 1 self.envs.close()
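# Both trainers drive the optimizer with LambdaLR and a linear_decay schedule,
# so the learning-rate multiplier falls from 1 toward 0 over NUM_UPDATES. A
# small sketch assuming linear_decay(update, total) == 1 - update / total (the
# usual definition; not verified against this codebase):
def _example_linear_lr_schedule(base_lr=2.5e-4, num_updates=4):
    import torch
    from torch.optim.lr_scheduler import LambdaLR

    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = torch.optim.Adam(params, lr=base_lr)
    scheduler = LambdaLR(
        optimizer, lr_lambda=lambda x: 1.0 - x / float(num_updates)
    )

    lrs = []
    for _ in range(num_updates):
        lrs.append(optimizer.param_groups[0]["lr"])  # lr used for this update
        optimizer.step()
        scheduler.step()
    return lrs  # e.g. [2.5e-4, 1.875e-4, 1.25e-4, 6.25e-5]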
def train(self) -> None: r"""Main method for DD-PPO. Returns: None """ self.local_rank, tcp_store = init_distrib_slurm( self.config.RL.DDPPO.distrib_backend) add_signal_handlers() # Stores the number of workers that have finished their rollout num_rollouts_done_store = distrib.PrefixStore("rollout_tracker", tcp_store) num_rollouts_done_store.set("num_done", "0") self.world_rank = distrib.get_rank() self.world_size = distrib.get_world_size() self.config.defrost() self.config.TORCH_GPU_ID = self.local_rank self.config.SIMULATOR_GPU_ID = self.local_rank # Multiply by the number of simulators to make sure they also get unique seeds self.config.TASK_CONFIG.SEED += (self.world_rank * self.config.NUM_PROCESSES) self.config.freeze() random.seed(self.config.TASK_CONFIG.SEED) np.random.seed(self.config.TASK_CONFIG.SEED) torch.manual_seed(self.config.TASK_CONFIG.SEED) if torch.cuda.is_available(): self.device = torch.device("cuda", self.local_rank) torch.cuda.set_device(self.device) else: self.device = torch.device("cpu") self.envs = construct_envs(self.config, get_env_class(self.config.ENV_NAME)) ppo_cfg = self.config.RL.PPO if (not os.path.isdir(self.config.CHECKPOINT_FOLDER) and self.world_rank == 0): os.makedirs(self.config.CHECKPOINT_FOLDER) self._setup_actor_critic_agent(ppo_cfg) self.agent.init_distributed(find_unused_params=True) if self.world_rank == 0: logger.info("agent number of trainable parameters: {}".format( sum(param.numel() for param in self.agent.parameters() if param.requires_grad))) observations = self.envs.reset() batch = batch_obs(observations, device=self.device) obs_space = self.envs.observation_spaces[0] if self._static_encoder: self._encoder = self.actor_critic.net.visual_encoder obs_space = SpaceDict({ "visual_features": spaces.Box( low=np.finfo(np.float32).min, high=np.finfo(np.float32).max, shape=self._encoder.output_shape, dtype=np.float32, ), **obs_space.spaces, }) with torch.no_grad(): batch["visual_features"] = self._encoder(batch) rollouts = RolloutStorage( ppo_cfg.num_steps, self.envs.num_envs, obs_space, self.envs.action_spaces[0], ppo_cfg.hidden_size, num_recurrent_layers=self.actor_critic.net.num_recurrent_layers, ) rollouts.to(self.device) for sensor in rollouts.observations: rollouts.observations[sensor][0].copy_(batch[sensor]) # batch and observations may contain shared PyTorch CUDA # tensors. We must explicitly clear them here otherwise # they will be kept in memory for the entire duration of training! 
batch = None observations = None current_episode_reward = torch.zeros(self.envs.num_envs, 1, device=self.device) running_episode_stats = dict( count=torch.zeros(self.envs.num_envs, 1, device=self.device), reward=torch.zeros(self.envs.num_envs, 1, device=self.device), ) window_episode_stats = defaultdict( lambda: deque(maxlen=ppo_cfg.reward_window_size)) t_start = time.time() env_time = 0 pth_time = 0 count_steps = 0 count_checkpoints = 0 start_update = 0 prev_time = 0 lr_scheduler = LambdaLR( optimizer=self.agent.optimizer, lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), ) interrupted_state = load_interrupted_state() if interrupted_state is not None: self.agent.load_state_dict(interrupted_state["state_dict"]) self.agent.optimizer.load_state_dict( interrupted_state["optim_state"]) lr_scheduler.load_state_dict(interrupted_state["lr_sched_state"]) requeue_stats = interrupted_state["requeue_stats"] env_time = requeue_stats["env_time"] pth_time = requeue_stats["pth_time"] count_steps = requeue_stats["count_steps"] count_checkpoints = requeue_stats["count_checkpoints"] start_update = requeue_stats["start_update"] prev_time = requeue_stats["prev_time"] with (TensorboardWriter(self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs) if self.world_rank == 0 else contextlib.suppress()) as writer: for update in range(start_update, self.config.NUM_UPDATES): if ppo_cfg.use_linear_lr_decay: lr_scheduler.step() if ppo_cfg.use_linear_clip_decay: self.agent.clip_param = ppo_cfg.clip_param * linear_decay( update, self.config.NUM_UPDATES) if EXIT.is_set(): self.envs.close() if REQUEUE.is_set() and self.world_rank == 0: requeue_stats = dict( env_time=env_time, pth_time=pth_time, count_steps=count_steps, count_checkpoints=count_checkpoints, start_update=update, prev_time=(time.time() - t_start) + prev_time, ) save_interrupted_state( dict( state_dict=self.agent.state_dict(), optim_state=self.agent.optimizer.state_dict(), lr_sched_state=lr_scheduler.state_dict(), config=self.config, requeue_stats=requeue_stats, )) requeue_job() return count_steps_delta = 0 self.agent.eval() for step in range(ppo_cfg.num_steps): ( delta_pth_time, delta_env_time, delta_steps, ) = self._collect_rollout_step(rollouts, current_episode_reward, running_episode_stats) pth_time += delta_pth_time env_time += delta_env_time count_steps_delta += delta_steps # This is where the preemption of workers happens. If a # worker detects it will be a straggler, it preempts itself! 
if (step >= ppo_cfg.num_steps * self.SHORT_ROLLOUT_THRESHOLD ) and int(num_rollouts_done_store.get("num_done")) > ( self.config.RL.DDPPO.sync_frac * self.world_size): break num_rollouts_done_store.add("num_done", 1) self.agent.train() if self._static_encoder: self._encoder.eval() ( delta_pth_time, value_loss, action_loss, dist_entropy, ) = self._update_agent(ppo_cfg, rollouts) pth_time += delta_pth_time stats_ordering = list(sorted(running_episode_stats.keys())) stats = torch.stack( [running_episode_stats[k] for k in stats_ordering], 0) distrib.all_reduce(stats) for i, k in enumerate(stats_ordering): window_episode_stats[k].append(stats[i].clone()) stats = torch.tensor( [value_loss, action_loss, count_steps_delta], device=self.device, ) distrib.all_reduce(stats) count_steps += stats[2].item() if self.world_rank == 0: num_rollouts_done_store.set("num_done", "0") losses = [ stats[0].item() / self.world_size, stats[1].item() / self.world_size, ] deltas = { k: ((v[-1] - v[0]).sum().item() if len(v) > 1 else v[0].sum().item()) for k, v in window_episode_stats.items() } deltas["count"] = max(deltas["count"], 1.0) writer.add_scalar( "reward", deltas["reward"] / deltas["count"], count_steps, ) # Check to see if there are any metrics # that haven't been logged yet metrics = { k: v / deltas["count"] for k, v in deltas.items() if k not in {"reward", "count"} } if len(metrics) > 0: writer.add_scalars("metrics", metrics, count_steps) writer.add_scalars( "losses", {k: l for l, k in zip(losses, ["value", "policy"])}, count_steps, ) # log stats if update > 0 and update % self.config.LOG_INTERVAL == 0: logger.info("update: {}\tfps: {:.3f}\t".format( update, count_steps / ((time.time() - t_start) + prev_time), )) logger.info( "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" "frames: {}".format(update, env_time, pth_time, count_steps)) logger.info("Average window size: {} {}".format( len(window_episode_stats["count"]), " ".join( "{}: {:.3f}".format(k, v / deltas["count"]) for k, v in deltas.items() if k != "count"), )) # checkpoint model if update % self.config.CHECKPOINT_INTERVAL == 0: self.save_checkpoint( f"ckpt.{count_checkpoints}.pth", dict(step=count_steps), ) count_checkpoints += 1 self.envs.close()
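# DD-PPO's straggler preemption, as used in the rollout loops above: once a
# worker finishes its rollout it bumps a shared counter, and any worker that is
# past SHORT_ROLLOUT_THRESHOLD of its own rollout stops early as soon as more
# than sync_frac of the world is already done. A pure-Python sketch of the
# stopping rule (the default values here are illustrative, not the config's):
def _example_should_preempt(
    step,
    num_steps,
    num_done,
    world_size,
    short_rollout_threshold=0.25,
    sync_frac=0.6,
):
    past_threshold = step >= num_steps * short_rollout_threshold
    enough_peers_done = num_done > sync_frac * world_size
    return past_threshold and enough_peers_done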
def _eval_checkpoint(self, checkpoint_path: str, writer: TensorboardWriter, checkpoint_index: int = 0, log_diagnostics=[], output_dir='.', label='.', num_eval_runs=1) -> None: r"""Evaluates a single checkpoint. Args: checkpoint_path: path of checkpoint writer: tensorboard writer object for logging to tensorboard checkpoint_index: index of cur checkpoint for logging Returns: None """ if checkpoint_index == -1: ckpt_file = checkpoint_path.split('/')[-1] split_info = ckpt_file.split('.') checkpoint_index = split_info[1] # Map location CPU is almost always better than mapping to a CUDA device. ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu") if self.config.EVAL.USE_CKPT_CONFIG: config = self._setup_eval_config(ckpt_dict["config"]) else: config = self.config.clone() ppo_cfg = config.RL.PPO task_cfg = config.TASK_CONFIG.TASK config.defrost() config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT config.freeze() if len(self.config.VIDEO_OPTION) > 0: config.defrost() config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") config.freeze() logger.info(f"env config: {config}") self.envs = construct_envs(config, get_env_class(config.ENV_NAME)) # pass in aux config if we're doing attention aux_cfg = self.config.RL.AUX_TASKS self._setup_actor_critic_agent(ppo_cfg, task_cfg, aux_cfg) # Check if we accidentally recorded `visual_resnet` in our checkpoint and drop it (it's redundant with `visual_encoder`) ckpt_dict['state_dict'] = { k: v for k, v in ckpt_dict['state_dict'].items() if 'visual_resnet' not in k } self.agent.load_state_dict(ckpt_dict["state_dict"]) logger.info("agent number of trainable parameters: {}".format( sum(param.numel() for param in self.agent.parameters() if param.requires_grad))) self.actor_critic = self.agent.actor_critic observations = self.envs.reset() batch = batch_obs(observations, device=self.device) current_episode_reward = torch.zeros(self.envs.num_envs, 1, device=self.device) test_recurrent_hidden_states = torch.zeros( self.actor_critic.net.num_recurrent_layers, self.config.NUM_PROCESSES, ppo_cfg.hidden_size, device=self.device, ) _, num_recurrent_memories, _ = self._setup_auxiliary_tasks( aux_cfg, ppo_cfg, task_cfg, is_eval=True) if self.config.RL.PPO.policy in MULTIPLE_BELIEF_CLASSES: aux_tasks = self.config.RL.AUX_TASKS.tasks num_recurrent_memories = len(self.config.RL.AUX_TASKS.tasks) test_recurrent_hidden_states = test_recurrent_hidden_states.unsqueeze( 2).repeat(1, 1, num_recurrent_memories, 1) prev_actions = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long) not_done_masks = torch.zeros(self.config.NUM_PROCESSES, 1, device=self.device) stats_episodes = dict() # dict of dicts that stores stats per episode rgb_frames = [[] for _ in range(self.config.NUM_PROCESSES) ] # type: List[List[np.ndarray]] if len(self.config.VIDEO_OPTION) > 0: os.makedirs(self.config.VIDEO_DIR, exist_ok=True) number_of_eval_episodes = self.config.TEST_EPISODE_COUNT if number_of_eval_episodes == -1: number_of_eval_episodes = sum(self.envs.number_of_episodes) else: total_num_eps = sum(self.envs.number_of_episodes) if total_num_eps < number_of_eval_episodes: logger.warn( f"Config specified {number_of_eval_episodes} eval episodes" ", dataset only has {total_num_eps}.") logger.warn(f"Evaluating with {total_num_eps} instead.") number_of_eval_episodes = total_num_eps videos_cap = 2 # number of videos to generate per checkpoint if len(log_diagnostics) > 0: videos_cap = 10 # video_indices = 
random.sample(range(self.config.TEST_EPISODE_COUNT), # min(videos_cap, self.config.TEST_EPISODE_COUNT)) video_indices = range(10) print(f"Videos: {video_indices}") total_stats = [] dones_per_ep = dict() # Logging more extensive evaluation stats for analysis if len(log_diagnostics) > 0: d_stats = {} for d in log_diagnostics: d_stats[d] = [ [] for _ in range(self.config.NUM_PROCESSES) ] # stored as nested list envs x timesteps x k (# tasks) pbar = tqdm.tqdm(total=number_of_eval_episodes * num_eval_runs) self.agent.eval() while (len(stats_episodes) < number_of_eval_episodes * num_eval_runs and self.envs.num_envs > 0): current_episodes = self.envs.current_episodes() with torch.no_grad(): weights_output = None if self.config.RL.PPO.policy in MULTIPLE_BELIEF_CLASSES: weights_output = torch.empty(self.envs.num_envs, len(aux_tasks)) ( _, actions, _, test_recurrent_hidden_states, ) = self.actor_critic.act(batch, test_recurrent_hidden_states, prev_actions, not_done_masks, deterministic=False, weights_output=weights_output) prev_actions.copy_(actions) for i in range(self.envs.num_envs): if Diagnostics.actions in log_diagnostics: d_stats[Diagnostics.actions][i].append( prev_actions[i].item()) if Diagnostics.weights in log_diagnostics: aux_weights = None if weights_output is None else weights_output[ i] if aux_weights is not None: d_stats[Diagnostics.weights][i].append( aux_weights.half().tolist()) outputs = self.envs.step([a[0].item() for a in actions]) observations, rewards, dones, infos = [ list(x) for x in zip(*outputs) ] batch = batch_obs(observations, device=self.device) not_done_masks = torch.tensor( [[0.0] if done else [1.0] for done in dones], dtype=torch.float, device=self.device, ) rewards = torch.tensor(rewards, dtype=torch.float, device=self.device).unsqueeze(1) current_episode_reward += rewards next_episodes = self.envs.current_episodes() envs_to_pause = [] n_envs = self.envs.num_envs for i in range(n_envs): next_k = ( next_episodes[i].scene_id, next_episodes[i].episode_id, ) if dones_per_ep.get(next_k, 0) == num_eval_runs: envs_to_pause.append(i) # wait for the rest if not_done_masks[i].item() == 0: episode_stats = dict() episode_stats["reward"] = current_episode_reward[i].item() episode_stats.update( self._extract_scalars_from_info(infos[i])) current_episode_reward[i] = 0 # use scene_id + episode_id as unique id for storing stats k = ( current_episodes[i].scene_id, current_episodes[i].episode_id, ) dones_per_ep[k] = dones_per_ep.get(k, 0) + 1 if dones_per_ep.get(k, 0) == 1 and len( self.config.VIDEO_OPTION) > 0 and len( stats_episodes) in video_indices: logger.info(f"Generating video {len(stats_episodes)}") category = getattr(current_episodes[i], "object_category", "") if category != "": category += "_" try: generate_video( video_option=self.config.VIDEO_OPTION, video_dir=self.config.VIDEO_DIR, images=rgb_frames[i], episode_id=current_episodes[i].episode_id, checkpoint_idx=checkpoint_index, metrics=self._extract_scalars_from_info( infos[i]), tag=f"{category}{label}", tb_writer=writer, ) except Exception as e: logger.warning(str(e)) rgb_frames[i] = [] stats_episodes[( current_episodes[i].scene_id, current_episodes[i].episode_id, dones_per_ep[k], )] = episode_stats if len(log_diagnostics) > 0: diagnostic_info = dict() for metric in log_diagnostics: diagnostic_info[metric] = d_stats[metric][i] d_stats[metric][i] = [] if Diagnostics.top_down_map in log_diagnostics: top_down_map = torch.tensor([]) if len(self.config.VIDEO_OPTION) > 0: top_down_map = infos[i]["top_down_map"]["map"] 
top_down_map = maps.colorize_topdown_map( top_down_map, fog_of_war_mask=None) diagnostic_info.update( dict(top_down_map=top_down_map)) total_stats.append( dict( stats=episode_stats, did_stop=bool(prev_actions[i] == 0), episode_info=attr.asdict(current_episodes[i]), info=diagnostic_info, )) pbar.update() # episode continues else: if len(self.config.VIDEO_OPTION) > 0: aux_weights = None if weights_output is None else weights_output[ i] frame = observations_to_image( observations[i], infos[i], current_episode_reward[i].item(), aux_weights, aux_tasks) rgb_frames[i].append(frame) if Diagnostics.gps in log_diagnostics: d_stats[Diagnostics.gps][i].append( observations[i]["gps"].tolist()) if Diagnostics.heading in log_diagnostics: d_stats[Diagnostics.heading][i].append( observations[i]["heading"].tolist()) ( self.envs, test_recurrent_hidden_states, not_done_masks, current_episode_reward, prev_actions, batch, rgb_frames, ) = self._pause_envs( envs_to_pause, self.envs, test_recurrent_hidden_states, not_done_masks, current_episode_reward, prev_actions, batch, rgb_frames, ) num_episodes = len(stats_episodes) aggregated_stats = dict() for stat_key in next(iter(stats_episodes.values())).keys(): aggregated_stats[stat_key] = ( sum([v[stat_key] for v in stats_episodes.values()]) / num_episodes) for k, v in aggregated_stats.items(): logger.info(f"Average episode {k}: {v:.4f}") step_id = checkpoint_index if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]: step_id = ckpt_dict["extra_state"]["step"] writer.add_scalars( "eval_reward", {"average reward": aggregated_stats["reward"]}, step_id, ) metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"} if len(metrics) > 0: writer.add_scalars("eval_metrics", metrics, step_id) logger.info("eval_metrics") logger.info(metrics) if len(log_diagnostics) > 0: os.makedirs(output_dir, exist_ok=True) eval_fn = f"{label}.json" with open(os.path.join(output_dir, eval_fn), 'w', encoding='utf-8') as f: json.dump(total_stats, f, ensure_ascii=False, indent=4) self.envs.close()
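# With num_eval_runs > 1 the evaluator above keys stats_episodes by
# (scene_id, episode_id, run_index) so repeated runs of the same episode do not
# overwrite each other, and an env is only paused once an episode has finished
# num_eval_runs times. A compact sketch of that bookkeeping:
def _example_multi_run_keys(
    scene_id, episode_id, dones_per_ep, num_eval_runs, stats_episodes, episode_stats
):
    k = (scene_id, episode_id)
    dones_per_ep[k] = dones_per_ep.get(k, 0) + 1
    # Unique key per (episode, run) pair.
    stats_episodes[(scene_id, episode_id, dones_per_ep[k])] = episode_stats
    should_pause = dones_per_ep[k] == num_eval_runs
    return should_pause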