def __init__(self, observation_space, output_size, device, spatial_output: bool = False):
    r"""Builds a frozen, pretrained ResNet50 encoder for RGB observations.

    Args:
        observation_space: observation space whose ``spaces`` mapping is
            checked for an "rgb" entry. Indices 0 and 1 of its shape are
            compared against 224, index 2 is stored as ``_n_input_rgb``.
        output_size: size of the linear projection used when
            ``spatial_output`` is False.
        device: stored on ``self.device``.
        spatial_output: if True, the ResNet's ``avgpool`` is replaced by a
            4x4 adaptive average pool, its ``fc`` by an empty
            ``nn.Sequential`` and a 16x64 embedding table is created;
            otherwise a ``Linear`` + ``ReLU`` head is created.
    """
    super().__init__()
    self.device = device
    self.resnet_layer_size = 2048
    linear_layer_input_size = 0

    if "rgb" in observation_space.spaces:
        self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
        obs_size_0 = observation_space.spaces["rgb"].shape[0]
        obs_size_1 = observation_space.spaces["rgb"].shape[1]
        if obs_size_0 != 224 or obs_size_1 != 224:
            # The message interpolates ``obs_size``; it has to be bound
            # here, otherwise the warning itself raises a NameError.
            obs_size = (obs_size_0, obs_size_1)
            logger.warn(
                f"WARNING: RGBEncoderResnet50: observation size {obs_size} is not conformant to expected ResNet input size [3x224x224]"
            )
        linear_layer_input_size += self.resnet_layer_size
    else:
        self._n_input_rgb = 0

    if self.is_blind:
        # Nothing to encode: install an empty module and skip the rest.
        self.cnn = nn.Sequential()
        return

    self.cnn = models.resnet50(pretrained=True)

    # disable gradients for resnet, params frozen
    for param in self.cnn.parameters():
        param.requires_grad = False
    self.cnn.eval()

    self.spatial_output = spatial_output

    if not self.spatial_output:
        self.output_shape = (output_size,)
        self.fc = nn.Linear(linear_layer_input_size, output_size)
        self.activation = nn.ReLU()
    else:

        class SpatialAvgPool(nn.Module):
            def forward(self, x):
                x = F.adaptive_avg_pool2d(x, (4, 4))
                return x

        self.cnn.avgpool = SpatialAvgPool()
        self.cnn.fc = nn.Sequential()

        self.spatial_embeddings = nn.Embedding(4 * 4, 64)

        self.output_shape = (
            self.resnet_layer_size + self.spatial_embeddings.embedding_dim,
            4,
            4,
        )

    # Handle to the (possibly replaced) average-pool module.
    self.layer_extract = self.cnn._modules.get("avgpool")
def save_interrupted_state(state: Any, filename: str = None):
    r"""Persists the state of an interrupted job with ``torch.save``.

    Intended for preemptable job partitions. When no filename is given
    and SLURM is not in use (``SLURM_JOBID`` is None) nothing is written
    and a warning is logged instead.

    :param state: The state to save
    :param filename: Destination file. When omitted, falls back to
        ``INTERRUPTED_STATE_FILE``
        ("${HOME}/.interrupted_states/${SLURM_JOBID}.pth").
    """
    if filename is None:
        if SLURM_JOBID is None:
            logger.warn("SLURM_JOBID is none, not saving interrupted state")
            return

        filename = INTERRUPTED_STATE_FILE

    torch.save(state, filename)
def _eval_checkpoint(
    self,
    checkpoint_path: str,
    writer: TensorboardWriter,
    checkpoint_index: int = 0,
) -> None:
    r"""Evaluates a single checkpoint.

    Loads the checkpoint, builds the environments and the agent from the
    evaluation config, rolls the policy out until the requested number of
    episodes has finished, and logs the per-metric averages to ``logger``
    and ``writer``.

    Args:
        checkpoint_path: path of checkpoint
        writer: tensorboard writer object for logging to tensorboard
        checkpoint_index: index of cur checkpoint for logging

    Returns:
        None

    Raises:
        RuntimeError: if the trainer is in distributed mode.
    """
    if self._is_distributed:
        raise RuntimeError("Evaluation does not support distributed mode")

    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    if len(self.config.VIDEO_OPTION) > 0:
        # Video rendering needs these two extra measurements.
        config.defrost()
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
        config.freeze()

    if config.VERBOSE:
        logger.info(f"env config: {config}")

    self._init_envs(config)
    self._setup_actor_critic_agent(ppo_cfg)

    self.agent.load_state_dict(ckpt_dict["state_dict"])
    self.actor_critic = self.agent.actor_critic

    observations = self.envs.reset()
    batch = batch_obs(
        observations, device=self.device, cache=self._obs_batching_cache
    )
    batch = apply_obs_transforms_batch(batch, self.obs_transforms)

    current_episode_reward = torch.zeros(
        self.envs.num_envs, 1, device="cpu"
    )

    test_recurrent_hidden_states = torch.zeros(
        self.config.NUM_ENVIRONMENTS,
        self.actor_critic.net.num_recurrent_layers,
        ppo_cfg.hidden_size,
        device=self.device,
    )
    prev_actions = torch.zeros(
        self.config.NUM_ENVIRONMENTS,
        1,
        device=self.device,
        dtype=torch.long,
    )
    not_done_masks = torch.zeros(
        self.config.NUM_ENVIRONMENTS,
        1,
        device=self.device,
        dtype=torch.bool,
    )
    stats_episodes: Dict[
        Any, Any
    ] = {}  # dict of dicts that stores stats per episode

    rgb_frames = [
        [] for _ in range(self.config.NUM_ENVIRONMENTS)
    ]  # type: List[List[np.ndarray]]
    if len(self.config.VIDEO_OPTION) > 0:
        os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

    number_of_eval_episodes = self.config.TEST_EPISODE_COUNT
    if number_of_eval_episodes == -1:
        number_of_eval_episodes = sum(self.envs.number_of_episodes)
    else:
        total_num_eps = sum(self.envs.number_of_episodes)
        if total_num_eps < number_of_eval_episodes:
            # Both fragments must be f-strings, otherwise the placeholder
            # in the second one is logged verbatim.
            logger.warn(
                f"Config specified {number_of_eval_episodes} eval episodes"
                f", dataset only has {total_num_eps}."
            )
            logger.warn(f"Evaluating with {total_num_eps} instead.")
            number_of_eval_episodes = total_num_eps

    pbar = tqdm.tqdm(total=number_of_eval_episodes)
    self.actor_critic.eval()
    while (
        len(stats_episodes) < number_of_eval_episodes
        and self.envs.num_envs > 0
    ):
        current_episodes = self.envs.current_episodes()

        with torch.no_grad():
            (
                _,
                actions,
                _,
                test_recurrent_hidden_states,
            ) = self.actor_critic.act(
                batch,
                test_recurrent_hidden_states,
                prev_actions,
                not_done_masks,
                deterministic=False,
            )

            prev_actions.copy_(actions)  # type: ignore

        # NB: Move actions to CPU.  If CUDA tensors are
        # sent in to env.step(), that will create CUDA contexts
        # in the subprocesses.
        # For backwards compatibility, we also call .item() to convert to
        # an int
        step_data = [a.item() for a in actions.to(device="cpu")]

        outputs = self.envs.step(step_data)

        observations, rewards_l, dones, infos = [
            list(x) for x in zip(*outputs)
        ]
        batch = batch_obs(
            observations,
            device=self.device,
            cache=self._obs_batching_cache,
        )
        batch = apply_obs_transforms_batch(batch, self.obs_transforms)

        not_done_masks = torch.tensor(
            [[not done] for done in dones],
            dtype=torch.bool,
            device="cpu",
        )

        rewards = torch.tensor(
            rewards_l, dtype=torch.float, device="cpu"
        ).unsqueeze(1)
        current_episode_reward += rewards
        next_episodes = self.envs.current_episodes()
        envs_to_pause = []
        n_envs = self.envs.num_envs
        for i in range(n_envs):
            # The env's next episode was already evaluated: pause it.
            if (
                next_episodes[i].scene_id,
                next_episodes[i].episode_id,
            ) in stats_episodes:
                envs_to_pause.append(i)

            # episode ended
            if not not_done_masks[i].item():
                pbar.update()
                episode_stats = {}
                episode_stats["reward"] = current_episode_reward[i].item()
                episode_stats.update(
                    self._extract_scalars_from_info(infos[i])
                )
                current_episode_reward[i] = 0
                # use scene_id + episode_id as unique id for storing stats
                stats_episodes[
                    (
                        current_episodes[i].scene_id,
                        current_episodes[i].episode_id,
                    )
                ] = episode_stats

                if len(self.config.VIDEO_OPTION) > 0:
                    generate_video(
                        video_option=self.config.VIDEO_OPTION,
                        video_dir=self.config.VIDEO_DIR,
                        images=rgb_frames[i],
                        episode_id=current_episodes[i].episode_id,
                        checkpoint_idx=checkpoint_index,
                        metrics=self._extract_scalars_from_info(infos[i]),
                        tb_writer=writer,
                    )

                    rgb_frames[i] = []

            # episode continues
            elif len(self.config.VIDEO_OPTION) > 0:
                # TODO move normalization / channel changing out of the policy and undo it here
                frame = observations_to_image(
                    {k: v[i] for k, v in batch.items()}, infos[i]
                )
                rgb_frames[i].append(frame)

        not_done_masks = not_done_masks.to(device=self.device)
        (
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        ) = self._pause_envs(
            envs_to_pause,
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        )

    num_episodes = len(stats_episodes)
    aggregated_stats = {}
    # The stat keys of the first recorded episode define what is averaged.
    for stat_key in next(iter(stats_episodes.values())).keys():
        aggregated_stats[stat_key] = (
            sum(v[stat_key] for v in stats_episodes.values())
            / num_episodes
        )

    for k, v in aggregated_stats.items():
        logger.info(f"Average episode {k}: {v:.4f}")

    # Prefer the training step stored in the checkpoint as the x-axis.
    step_id = checkpoint_index
    if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
        step_id = ckpt_dict["extra_state"]["step"]

    writer.add_scalars(
        "eval_reward",
        {"average reward": aggregated_stats["reward"]},
        step_id,
    )

    metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"}
    if len(metrics) > 0:
        writer.add_scalars("eval_metrics", metrics, step_id)

    self.envs.close()
def _eval_checkpoint(
    self,
    checkpoint_path: str,
    writer: TensorboardWriter,
    checkpoint_index: int = 0,
) -> None:
    r"""Evaluates a single checkpoint.

    Loads the checkpoint, constructs the environments and the agent,
    rolls the policy out until the requested number of episodes has
    finished, and logs the per-metric averages to ``logger`` and
    ``writer``. When ``self._static_encoder`` is set, visual features are
    computed with the agent's visual encoder and added to every batch.

    Args:
        checkpoint_path: path of checkpoint
        writer: tensorboard writer object for logging to tensorboard
        checkpoint_index: index of cur checkpoint for logging

    Returns:
        None
    """
    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    if len(self.config.VIDEO_OPTION) > 0:
        # Video rendering needs these two extra measurements.
        config.defrost()
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
        config.freeze()

    logger.info(f"env config: {config}")
    self.envs = construct_envs(config, get_env_class(config.ENV_NAME))
    self._setup_actor_critic_agent(ppo_cfg)
    self.actor_critic.eval()

    if self._static_encoder:
        self._encoder = self.agent.actor_critic.net.visual_encoder

    self.agent.load_state_dict(ckpt_dict["state_dict"])
    self.actor_critic = self.agent.actor_critic

    observations = self.envs.reset()
    batch = batch_obs(observations, device=self.device)

    if self._static_encoder:
        batch["visual_features"] = self._encoder(batch)
        # No previous frame exists at the first step.
        batch["prev_visual_features"] = torch.zeros_like(
            batch["visual_features"]
        )

    current_episode_reward = torch.zeros(
        self.envs.num_envs, 1, device=self.device
    )

    test_recurrent_hidden_states = torch.zeros(
        self.actor_critic.net.num_recurrent_layers,
        self.config.NUM_PROCESSES,
        ppo_cfg.hidden_size,
        device=self.device,
    )
    prev_actions = torch.zeros(
        self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long
    )
    not_done_masks = torch.zeros(
        self.config.NUM_PROCESSES, 1, device=self.device
    )
    stats_episodes = dict()  # dict of dicts that stores stats per episode

    rgb_frames = [
        [] for _ in range(self.config.NUM_PROCESSES)
    ]  # type: List[List[np.ndarray]]
    if len(self.config.VIDEO_OPTION) > 0:
        os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

    number_of_eval_episodes = self.config.TEST_EPISODE_COUNT
    if number_of_eval_episodes == -1:
        number_of_eval_episodes = sum(self.envs.number_of_episodes)
    else:
        total_num_eps = sum(self.envs.number_of_episodes)
        if total_num_eps < number_of_eval_episodes:
            # Both fragments must be f-strings, otherwise the placeholder
            # in the second one is logged verbatim.
            logger.warn(
                f"Config specified {number_of_eval_episodes} eval episodes"
                f", dataset only has {total_num_eps}."
            )
            logger.warn(f"Evaluating with {total_num_eps} instead.")
            number_of_eval_episodes = total_num_eps

    pbar = tqdm.tqdm(total=number_of_eval_episodes)
    self.actor_critic.eval()
    while (
        len(stats_episodes) < number_of_eval_episodes
        and self.envs.num_envs > 0
    ):
        current_episodes = self.envs.current_episodes()

        with torch.no_grad():
            # Keep the batch the action was computed from; its features
            # become the next step's "previous" features.
            step_batch = batch
            (
                _,
                actions,
                _,
                test_recurrent_hidden_states,
            ) = self.actor_critic.act(
                batch,
                test_recurrent_hidden_states,
                prev_actions,
                not_done_masks,
                deterministic=False,
            )

            prev_actions.copy_(actions)

        outputs = self.envs.step([a[0].item() for a in actions])

        observations, rewards, dones, infos = [
            list(x) for x in zip(*outputs)
        ]
        batch = batch_obs(observations, device=self.device)

        # NOTE(review): both assignments are taken to be guarded by
        # ``_static_encoder`` (``_encoder`` is only set in that case above).
        if self._static_encoder:
            batch["prev_visual_features"] = step_batch["visual_features"]
            batch["visual_features"] = self._encoder(batch)

        not_done_masks = torch.tensor(
            [[0.0] if done else [1.0] for done in dones],
            dtype=torch.float,
            device=self.device,
        )

        rewards = torch.tensor(
            rewards, dtype=torch.float, device=self.device
        ).unsqueeze(1)
        current_episode_reward += rewards
        next_episodes = self.envs.current_episodes()
        envs_to_pause = []
        n_envs = self.envs.num_envs
        for i in range(n_envs):
            # The env's next episode was already evaluated: pause it.
            if (
                next_episodes[i].scene_id,
                next_episodes[i].episode_id,
            ) in stats_episodes:
                envs_to_pause.append(i)

            # episode ended
            if not_done_masks[i].item() == 0:
                pbar.update()
                episode_stats = dict()
                episode_stats["reward"] = current_episode_reward[i].item()
                episode_stats.update(
                    self._extract_scalars_from_info(infos[i])
                )
                current_episode_reward[i] = 0
                # use scene_id + episode_id as unique id for storing stats
                stats_episodes[
                    (
                        current_episodes[i].scene_id,
                        current_episodes[i].episode_id,
                    )
                ] = episode_stats

                if len(self.config.VIDEO_OPTION) > 0:
                    generate_video(
                        video_option=self.config.VIDEO_OPTION,
                        video_dir=self.config.VIDEO_DIR,
                        images=rgb_frames[i],
                        episode_id=current_episodes[i].episode_id,
                        checkpoint_idx=checkpoint_index,
                        metrics=self._extract_scalars_from_info(infos[i]),
                        tb_writer=writer,
                    )

                    rgb_frames[i] = []

            # episode continues
            elif len(self.config.VIDEO_OPTION) > 0:
                frame = observations_to_image(observations[i], infos[i])
                rgb_frames[i].append(frame)

        (
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        ) = self._pause_envs(
            envs_to_pause,
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        )

    num_episodes = len(stats_episodes)
    aggregated_stats = dict()
    # The stat keys of the first recorded episode define what is averaged.
    for stat_key in next(iter(stats_episodes.values())).keys():
        aggregated_stats[stat_key] = (
            sum([v[stat_key] for v in stats_episodes.values()])
            / num_episodes
        )

    for k, v in aggregated_stats.items():
        logger.info(f"Average episode {k}: {v:.4f}")

    # Prefer the training step stored in the checkpoint as the x-axis.
    step_id = checkpoint_index
    if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
        step_id = ckpt_dict["extra_state"]["step"]

    writer.add_scalars(
        "eval_reward",
        {"average reward": aggregated_stats["reward"]},
        step_id,
    )

    metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"}
    if len(metrics) > 0:
        writer.add_scalars("eval_metrics", metrics, step_id)

    self.envs.close()
def _eval_checkpoint(self,
                     checkpoint_path: str,
                     writer: TensorboardWriter,
                     checkpoint_index: int = 0,
                     log_diagnostics=None,
                     output_dir='.',
                     label='.',
                     num_eval_runs=1) -> None:
    r"""Evaluates a single checkpoint.

    Loads the checkpoint, constructs the environments and the agent
    (including auxiliary-task configuration), rolls the policy out until
    every evaluation episode has finished ``num_eval_runs`` times, and
    logs the per-metric averages. When ``log_diagnostics`` is non-empty,
    per-episode diagnostics are also dumped as JSON.

    Args:
        checkpoint_path: path of checkpoint
        writer: tensorboard writer object for logging to tensorboard
        checkpoint_index: index of cur checkpoint for logging. ``-1``
            means "take the second dot-separated field of the
            checkpoint's file name".
        log_diagnostics: collection of ``Diagnostics`` entries to record
            per episode; ``None`` (the default) or empty disables
            diagnostics.
        output_dir: directory the diagnostics JSON is written to.
        label: base name of the diagnostics JSON file and part of the
            video tag.
        num_eval_runs: number of times each episode is evaluated.

    Returns:
        None
    """
    # Avoid a shared mutable default argument.
    if log_diagnostics is None:
        log_diagnostics = []

    if checkpoint_index == -1:
        ckpt_file = checkpoint_path.split('/')[-1]
        split_info = ckpt_file.split('.')
        # NOTE(review): this is a string, not an int - confirm that the
        # consumers (video naming, writer step) accept it.
        checkpoint_index = split_info[1]

    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO
    task_cfg = config.TASK_CONFIG.TASK

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    if len(self.config.VIDEO_OPTION) > 0:
        # Video rendering needs these two extra measurements.
        config.defrost()
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
        config.freeze()

    logger.info(f"env config: {config}")
    self.envs = construct_envs(config, get_env_class(config.ENV_NAME))

    # pass in aux config if we're doing attention
    aux_cfg = self.config.RL.AUX_TASKS
    self._setup_actor_critic_agent(ppo_cfg, task_cfg, aux_cfg)

    # Check if we accidentally recorded `visual_resnet` in our checkpoint
    # and drop it (it's redundant with `visual_encoder`)
    ckpt_dict['state_dict'] = {
        k: v
        for k, v in ckpt_dict['state_dict'].items()
        if 'visual_resnet' not in k
    }
    self.agent.load_state_dict(ckpt_dict["state_dict"])

    logger.info("agent number of trainable parameters: {}".format(
        sum(param.numel() for param in self.agent.parameters()
            if param.requires_grad)))

    self.actor_critic = self.agent.actor_critic

    observations = self.envs.reset()
    batch = batch_obs(observations, device=self.device)

    current_episode_reward = torch.zeros(self.envs.num_envs,
                                         1,
                                         device=self.device)

    test_recurrent_hidden_states = torch.zeros(
        self.actor_critic.net.num_recurrent_layers,
        self.config.NUM_PROCESSES,
        ppo_cfg.hidden_size,
        device=self.device,
    )

    _, num_recurrent_memories, _ = self._setup_auxiliary_tasks(
        aux_cfg, ppo_cfg, task_cfg, is_eval=True)

    # ``aux_tasks`` is passed to ``observations_to_image`` whenever videos
    # are rendered, so it must be bound for every policy, not only for
    # the multi-belief ones.
    aux_tasks = []
    if self.config.RL.PPO.policy in MULTIPLE_BELIEF_CLASSES:
        aux_tasks = self.config.RL.AUX_TASKS.tasks
        num_recurrent_memories = len(self.config.RL.AUX_TASKS.tasks)
        # NOTE(review): the extra per-task memory axis is assumed to be
        # added only for multi-belief policies - confirm.
        test_recurrent_hidden_states = test_recurrent_hidden_states.unsqueeze(
            2).repeat(1, 1, num_recurrent_memories, 1)

    prev_actions = torch.zeros(self.config.NUM_PROCESSES,
                               1,
                               device=self.device,
                               dtype=torch.long)

    not_done_masks = torch.zeros(self.config.NUM_PROCESSES,
                                 1,
                                 device=self.device)

    stats_episodes = dict()  # dict of dicts that stores stats per episode

    rgb_frames = [[] for _ in range(self.config.NUM_PROCESSES)
                  ]  # type: List[List[np.ndarray]]

    if len(self.config.VIDEO_OPTION) > 0:
        os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

    number_of_eval_episodes = self.config.TEST_EPISODE_COUNT
    if number_of_eval_episodes == -1:
        number_of_eval_episodes = sum(self.envs.number_of_episodes)
    else:
        total_num_eps = sum(self.envs.number_of_episodes)
        if total_num_eps < number_of_eval_episodes:
            # Both fragments must be f-strings, otherwise the placeholder
            # in the second one is logged verbatim.
            logger.warn(
                f"Config specified {number_of_eval_episodes} eval episodes"
                f", dataset only has {total_num_eps}.")
            logger.warn(f"Evaluating with {total_num_eps} instead.")
            number_of_eval_episodes = total_num_eps

    videos_cap = 2  # number of videos to generate per checkpoint
    if len(log_diagnostics) > 0:
        videos_cap = 10
    # video_indices = random.sample(range(self.config.TEST_EPISODE_COUNT),
    #     min(videos_cap, self.config.TEST_EPISODE_COUNT))
    video_indices = range(10)
    print(f"Videos: {video_indices}")

    total_stats = []
    dones_per_ep = dict()

    # Logging more extensive evaluation stats for analysis
    if len(log_diagnostics) > 0:
        d_stats = {}
        for d in log_diagnostics:
            d_stats[d] = [
                [] for _ in range(self.config.NUM_PROCESSES)
            ]  # stored as nested list envs x timesteps x k (# tasks)

    pbar = tqdm.tqdm(total=number_of_eval_episodes * num_eval_runs)
    self.agent.eval()
    while (len(stats_episodes) < number_of_eval_episodes * num_eval_runs
           and self.envs.num_envs > 0):
        current_episodes = self.envs.current_episodes()
        with torch.no_grad():
            weights_output = None
            if self.config.RL.PPO.policy in MULTIPLE_BELIEF_CLASSES:
                # Buffer handed to ``act`` via ``weights_output``; rows
                # are read back per env below.
                weights_output = torch.empty(self.envs.num_envs,
                                             len(aux_tasks))
            (
                _,
                actions,
                _,
                test_recurrent_hidden_states,
            ) = self.actor_critic.act(batch,
                                      test_recurrent_hidden_states,
                                      prev_actions,
                                      not_done_masks,
                                      deterministic=False,
                                      weights_output=weights_output)
            prev_actions.copy_(actions)

        for i in range(self.envs.num_envs):
            if Diagnostics.actions in log_diagnostics:
                d_stats[Diagnostics.actions][i].append(
                    prev_actions[i].item())
            if Diagnostics.weights in log_diagnostics:
                aux_weights = None if weights_output is None else weights_output[
                    i]
                if aux_weights is not None:
                    d_stats[Diagnostics.weights][i].append(
                        aux_weights.half().tolist())

        outputs = self.envs.step([a[0].item() for a in actions])

        observations, rewards, dones, infos = [
            list(x) for x in zip(*outputs)
        ]
        batch = batch_obs(observations, device=self.device)

        not_done_masks = torch.tensor(
            [[0.0] if done else [1.0] for done in dones],
            dtype=torch.float,
            device=self.device,
        )

        rewards = torch.tensor(rewards,
                               dtype=torch.float,
                               device=self.device).unsqueeze(1)
        current_episode_reward += rewards
        next_episodes = self.envs.current_episodes()
        envs_to_pause = []
        n_envs = self.envs.num_envs
        for i in range(n_envs):
            next_k = (
                next_episodes[i].scene_id,
                next_episodes[i].episode_id,
            )
            if dones_per_ep.get(next_k, 0) == num_eval_runs:
                envs_to_pause.append(i)  # wait for the rest

            if not_done_masks[i].item() == 0:
                episode_stats = dict()
                episode_stats["reward"] = current_episode_reward[i].item()
                episode_stats.update(
                    self._extract_scalars_from_info(infos[i]))
                current_episode_reward[i] = 0
                # use scene_id + episode_id as unique id for storing stats
                k = (
                    current_episodes[i].scene_id,
                    current_episodes[i].episode_id,
                )
                dones_per_ep[k] = dones_per_ep.get(k, 0) + 1

                # Only the first run of an episode is rendered, and only
                # for the selected episode indices.
                if dones_per_ep.get(k, 0) == 1 and len(
                        self.config.VIDEO_OPTION) > 0 and len(
                            stats_episodes) in video_indices:
                    logger.info(f"Generating video {len(stats_episodes)}")
                    category = getattr(current_episodes[i],
                                       "object_category", "")
                    if category != "":
                        category += "_"
                    try:
                        generate_video(
                            video_option=self.config.VIDEO_OPTION,
                            video_dir=self.config.VIDEO_DIR,
                            images=rgb_frames[i],
                            episode_id=current_episodes[i].episode_id,
                            checkpoint_idx=checkpoint_index,
                            metrics=self._extract_scalars_from_info(
                                infos[i]),
                            tag=f"{category}{label}",
                            tb_writer=writer,
                        )
                    except Exception as e:
                        # Best effort: a failed video must not abort eval.
                        logger.warning(str(e))
                # NOTE(review): frames are taken to be reset at every
                # episode end, so unrendered episodes do not leak frames
                # into the next video - confirm against the original layout.
                rgb_frames[i] = []

                stats_episodes[(
                    current_episodes[i].scene_id,
                    current_episodes[i].episode_id,
                    dones_per_ep[k],
                )] = episode_stats

                if len(log_diagnostics) > 0:
                    diagnostic_info = dict()
                    for metric in log_diagnostics:
                        diagnostic_info[metric] = d_stats[metric][i]
                        d_stats[metric][i] = []
                    if Diagnostics.top_down_map in log_diagnostics:
                        top_down_map = torch.tensor([])
                        if len(self.config.VIDEO_OPTION) > 0:
                            top_down_map = infos[i]["top_down_map"]["map"]
                            top_down_map = maps.colorize_topdown_map(
                                top_down_map, fog_of_war_mask=None)
                        diagnostic_info.update(
                            dict(top_down_map=top_down_map))
                    total_stats.append(
                        dict(
                            stats=episode_stats,
                            did_stop=bool(prev_actions[i] == 0),
                            episode_info=attr.asdict(current_episodes[i]),
                            info=diagnostic_info,
                        ))
                pbar.update()

            # episode continues
            else:
                if len(self.config.VIDEO_OPTION) > 0:
                    aux_weights = None if weights_output is None else weights_output[
                        i]
                    frame = observations_to_image(
                        observations[i], infos[i],
                        current_episode_reward[i].item(), aux_weights,
                        aux_tasks)
                    rgb_frames[i].append(frame)
                if Diagnostics.gps in log_diagnostics:
                    d_stats[Diagnostics.gps][i].append(
                        observations[i]["gps"].tolist())
                if Diagnostics.heading in log_diagnostics:
                    d_stats[Diagnostics.heading][i].append(
                        observations[i]["heading"].tolist())

        (
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        ) = self._pause_envs(
            envs_to_pause,
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        )

    num_episodes = len(stats_episodes)
    aggregated_stats = dict()
    # The stat keys of the first recorded episode define what is averaged.
    for stat_key in next(iter(stats_episodes.values())).keys():
        aggregated_stats[stat_key] = (
            sum([v[stat_key] for v in stats_episodes.values()]) /
            num_episodes)

    for k, v in aggregated_stats.items():
        logger.info(f"Average episode {k}: {v:.4f}")

    # Prefer the training step stored in the checkpoint as the x-axis.
    step_id = checkpoint_index
    if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
        step_id = ckpt_dict["extra_state"]["step"]

    writer.add_scalars(
        "eval_reward",
        {"average reward": aggregated_stats["reward"]},
        step_id,
    )

    metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"}
    if len(metrics) > 0:
        writer.add_scalars("eval_metrics", metrics, step_id)
        logger.info("eval_metrics")
        logger.info(metrics)
    if len(log_diagnostics) > 0:
        os.makedirs(output_dir, exist_ok=True)
        eval_fn = f"{label}.json"
        with open(os.path.join(output_dir, eval_fn), 'w',
                  encoding='utf-8') as f:
            json.dump(total_stats, f, ensure_ascii=False, indent=4)
    self.envs.close()
def train(self, ckpt_path="", ckpt=-1, start_updates=0) -> None:
    r"""Main method for training PPO.

    Builds the environments, the auxiliary tasks, rollout storage and the
    agent, optionally restores agent/optimizer state from a checkpoint,
    then alternates rollout collection and agent updates while logging to
    tensorboard and periodically saving checkpoints.

    Args:
        ckpt_path: path of the checkpoint to resume from; only read when
            ``ckpt != -1``.
        ckpt: index of the checkpoint being resumed, ``-1`` to start
            fresh. New checkpoints are numbered from ``ckpt + 1``.
        start_updates: first update index of the training loop.

    Returns:
        None
    """
    self.envs = construct_envs(self.config,
                               get_env_class(self.config.ENV_NAME))

    ppo_cfg = self.config.RL.PPO
    task_cfg = self.config.TASK_CONFIG.TASK
    self.device = (torch.device("cuda", self.config.TORCH_GPU_ID)
                   if torch.cuda.is_available() else torch.device("cpu"))

    # Initialize auxiliary tasks
    observation_space = self.envs.observation_spaces[0]
    aux_cfg = self.config.RL.AUX_TASKS
    init_aux_tasks, num_recurrent_memories, aux_task_strings = \
        self._setup_auxiliary_tasks(aux_cfg, ppo_cfg, task_cfg, observation_space)

    rollouts = RolloutStorage(
        ppo_cfg.num_steps,
        self.envs.num_envs,
        observation_space,
        self.envs.action_spaces[0],
        ppo_cfg.hidden_size,
        num_recurrent_memories=num_recurrent_memories)
    rollouts.to(self.device)

    observations = self.envs.reset()
    batch = batch_obs(observations, device=self.device)

    for sensor in rollouts.observations:
        rollouts.observations[sensor][0].copy_(batch[sensor])

    # batch and observations may contain shared PyTorch CUDA
    # tensors.  We must explicitly clear them here otherwise
    # they will be kept in memory for the entire duration of training!
    batch = None
    observations = None

    self._setup_actor_critic_agent(ppo_cfg, task_cfg, aux_cfg,
                                   init_aux_tasks)
    logger.info("agent number of parameters: {}".format(
        sum(param.numel() for param in self.agent.parameters())))

    current_episode_reward = torch.zeros(self.envs.num_envs, 1)
    running_episode_stats = dict(
        count=torch.zeros(self.envs.num_envs, 1),
        reward=torch.zeros(self.envs.num_envs, 1),
    )
    window_episode_stats = defaultdict(
        lambda: deque(maxlen=ppo_cfg.reward_window_size))

    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0
    if ckpt != -1:
        logger.info(
            f"Resuming runs at checkpoint {ckpt}. Timing statistics are not tracked properly."
        )
        assert ppo_cfg.use_linear_lr_decay is False and ppo_cfg.use_linear_clip_decay is False, "Resuming with decay not supported"
        # This is the checkpoint we start saving at
        count_checkpoints = ckpt + 1
        count_steps = start_updates * ppo_cfg.num_steps * self.config.NUM_PROCESSES
        ckpt_dict = self.load_checkpoint(ckpt_path, map_location="cpu")
        self.agent.load_state_dict(ckpt_dict["state_dict"])
        if "optim_state" in ckpt_dict:
            self.agent.optimizer.load_state_dict(ckpt_dict["optim_state"])
        else:
            logger.warn("No optimizer state loaded, results may be funky")
        # A step count stored in the checkpoint overrides the estimate.
        if "extra_state" in ckpt_dict and "step" in ckpt_dict[
                "extra_state"]:
            count_steps = ckpt_dict["extra_state"]["step"]

    lr_scheduler = LambdaLR(
        optimizer=self.agent.optimizer,
        lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
    )

    with TensorboardWriter(self.config.TENSORBOARD_DIR,
                           flush_secs=self.flush_secs) as writer:
        for update in range(start_updates, self.config.NUM_UPDATES):
            if ppo_cfg.use_linear_lr_decay:
                lr_scheduler.step()

            if ppo_cfg.use_linear_clip_decay:
                self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
                    update, self.config.NUM_UPDATES)

            for step in range(ppo_cfg.num_steps):
                (
                    delta_pth_time,
                    delta_env_time,
                    delta_steps,
                ) = self._collect_rollout_step(rollouts,
                                               current_episode_reward,
                                               running_episode_stats)
                pth_time += delta_pth_time
                env_time += delta_env_time
                count_steps += delta_steps

            (
                delta_pth_time,
                value_loss,
                action_loss,
                dist_entropy,
                aux_task_losses,
                aux_dist_entropy,
                aux_weights,
            ) = self._update_agent(ppo_cfg, rollouts)
            pth_time += delta_pth_time

            for k, v in running_episode_stats.items():
                window_episode_stats[k].append(v.clone())

            # Change of every running stat across the window (or the raw
            # value while the window holds a single entry).
            deltas = {
                k: ((v[-1] - v[0]).sum().item()
                    if len(v) > 1 else v[0].sum().item())
                for k, v in window_episode_stats.items()
            }
            # Guards the divisions below against a zero episode count.
            deltas["count"] = max(deltas["count"], 1.0)

            writer.add_scalar(
                "entropy",
                dist_entropy,
                count_steps,
            )

            writer.add_scalar("aux_entropy", aux_dist_entropy, count_steps)

            writer.add_scalar("reward", deltas["reward"] / deltas["count"],
                              count_steps)

            # Check to see if there are any metrics
            # that haven't been logged yet
            metrics = {
                k: v / deltas["count"]
                for k, v in deltas.items() if k not in {"reward", "count"}
            }
            if len(metrics) > 0:
                writer.add_scalars("metrics", metrics, count_steps)

            losses = [value_loss, action_loss] + aux_task_losses
            writer.add_scalars(
                "losses",
                dict(zip(["value", "policy"] + aux_task_strings, losses)),
                count_steps,
            )

            # ``aux_weights`` may be None (the distributed trainer guards
            # for it as well); zipping None would raise.
            if aux_weights is not None and len(aux_weights) > 0:
                writer.add_scalars(
                    "aux_weights",
                    dict(zip(aux_task_strings, aux_weights)),
                    count_steps,
                )

            # Only tracked when the rollout step reports a "success"
            # stat; an unconditional lookup would raise KeyError.
            if "success" in deltas:
                writer.add_scalar(
                    "success",
                    deltas["success"] / deltas["count"],
                    count_steps,
                )

            # Log stats
            if update > 0 and update % self.config.LOG_INTERVAL == 0:
                logger.info(
                    "update: {}\tvalue_loss: {}\t action_loss: {}\taux_task_loss: {} \t aux_entropy {}"
                    .format(update, value_loss, action_loss,
                            aux_task_losses, aux_dist_entropy))
                logger.info("update: {}\tfps: {:.3f}\t".format(
                    update, count_steps / (time.time() - t_start)))

                logger.info(
                    "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                    "frames: {}".format(update, env_time, pth_time,
                                        count_steps))

                logger.info("Average window size: {}  {}".format(
                    len(window_episode_stats["count"]),
                    "  ".join("{}: {:.3f}".format(k, v / deltas["count"])
                              for k, v in deltas.items() if k != "count"),
                ))

            # checkpoint model
            if update % self.config.CHECKPOINT_INTERVAL == 0:
                self.save_checkpoint(
                    f"{self.checkpoint_prefix}.{count_checkpoints}.pth",
                    dict(step=count_steps))
                count_checkpoints += 1

        self.envs.close()
def train(self, ckpt_path="", ckpt=-1, start_updates=0) -> None:
    r"""Main method for training PPO.

    Distributed variant: initialises the process group, seeds each worker
    by rank, builds environments, auxiliary tasks, rollout storage and
    the agent, restores state from a checkpoint and/or a saved
    interrupted state, then alternates rollout collection and agent
    updates. Statistics are all-reduced across workers; only world rank 0
    logs and saves checkpoints.

    Args:
        ckpt_path: path of the checkpoint to resume from; only read when
            ``ckpt != -1``.
        ckpt: index of the checkpoint being resumed, ``-1`` to start
            fresh. New checkpoints are numbered from ``ckpt + 1``.
        start_updates: first update index of the training loop
            (overridden by a saved interrupted state).

    Returns:
        None
    """
    self.local_rank, tcp_store = init_distrib_slurm(
        self.config.RL.DDPPO.distrib_backend)
    add_signal_handlers()

    # Stores the number of workers that have finished their rollout
    num_rollouts_done_store = distrib.PrefixStore("rollout_tracker",
                                                  tcp_store)
    num_rollouts_done_store.set("num_done", "0")

    self.world_rank = distrib.get_rank()
    self.world_size = distrib.get_world_size()

    # Offset the seed by rank so workers do not share a random stream.
    random.seed(self.config.TASK_CONFIG.SEED + self.world_rank)
    np.random.seed(self.config.TASK_CONFIG.SEED + self.world_rank)

    self.config.defrost()
    self.config.TORCH_GPU_ID = self.local_rank
    self.config.SIMULATOR_GPU_ID = self.local_rank
    self.config.freeze()

    if torch.cuda.is_available():
        self.device = torch.device("cuda", self.local_rank)
        torch.cuda.set_device(self.device)
    else:
        self.device = torch.device("cpu")

    self.envs = construct_envs(self.config,
                               get_env_class(self.config.ENV_NAME))

    ppo_cfg = self.config.RL.PPO
    task_cfg = self.config.TASK_CONFIG.TASK

    observation_space = self.envs.observation_spaces[0]
    aux_cfg = self.config.RL.AUX_TASKS
    init_aux_tasks, num_recurrent_memories, aux_task_strings = self._setup_auxiliary_tasks(
        aux_cfg, ppo_cfg, task_cfg, observation_space)

    rollouts = RolloutStorage(
        ppo_cfg.num_steps,
        self.envs.num_envs,
        observation_space,
        self.envs.action_spaces[0],
        ppo_cfg.hidden_size,
        num_recurrent_memories=num_recurrent_memories)
    rollouts.to(self.device)

    observations = self.envs.reset()
    batch = batch_obs(observations, device=self.device)

    for sensor in rollouts.observations:
        rollouts.observations[sensor][0].copy_(batch[sensor])

    # batch and observations may contain shared PyTorch CUDA
    # tensors.  We must explicitly clear them here otherwise
    # they will be kept in memory for the entire duration of training!
    batch = None
    observations = None

    self._setup_actor_critic_agent(ppo_cfg, task_cfg, aux_cfg,
                                   init_aux_tasks)
    self.agent.init_distributed(find_unused_params=True)

    if self.world_rank == 0:
        logger.info("agent number of trainable parameters: {}".format(
            sum(param.numel() for param in self.agent.parameters()
                if param.requires_grad)))

    current_episode_reward = torch.zeros(self.envs.num_envs, 1)
    running_episode_stats = dict(
        count=torch.zeros(self.envs.num_envs, 1),
        reward=torch.zeros(self.envs.num_envs, 1),  # including bonus
    )
    window_episode_stats = defaultdict(
        lambda: deque(maxlen=ppo_cfg.reward_window_size))

    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0
    prev_time = 0

    if ckpt != -1:
        logger.info(
            f"Resuming runs at checkpoint {ckpt}. Timing statistics are not tracked properly."
        )
        assert ppo_cfg.use_linear_lr_decay is False and ppo_cfg.use_linear_clip_decay is False, "Resuming with decay not supported"
        # This is the checkpoint we start saving at
        count_checkpoints = ckpt + 1
        count_steps = start_updates * ppo_cfg.num_steps * self.config.NUM_PROCESSES
        ckpt_dict = self.load_checkpoint(ckpt_path, map_location="cpu")
        self.agent.load_state_dict(ckpt_dict["state_dict"])
        if "optim_state" in ckpt_dict:
            self.agent.optimizer.load_state_dict(ckpt_dict["optim_state"])
        else:
            logger.warn("No optimizer state loaded, results may be funky")
        # A step count stored in the checkpoint overrides the estimate.
        if "extra_state" in ckpt_dict and "step" in ckpt_dict[
                "extra_state"]:
            count_steps = ckpt_dict["extra_state"]["step"]

    lr_scheduler = LambdaLR(
        optimizer=self.agent.optimizer,
        lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
    )

    # A saved interrupted state (preempted job) takes precedence over
    # everything restored above.
    interrupted_state = load_interrupted_state()
    if interrupted_state is not None:
        self.agent.load_state_dict(interrupted_state["state_dict"])
        self.agent.optimizer.load_state_dict(
            interrupted_state["optim_state"])
        lr_scheduler.load_state_dict(interrupted_state["lr_sched_state"])

        requeue_stats = interrupted_state["requeue_stats"]
        env_time = requeue_stats["env_time"]
        pth_time = requeue_stats["pth_time"]
        count_steps = requeue_stats["count_steps"]
        count_checkpoints = requeue_stats["count_checkpoints"]
        start_updates = requeue_stats["start_update"]
        prev_time = requeue_stats["prev_time"]

    # Only rank 0 gets a real writer; other ranks get ``None``.
    with (TensorboardWriter(self.config.TENSORBOARD_DIR,
                            flush_secs=self.flush_secs)
          if self.world_rank == 0 else contextlib.suppress()) as writer:

        for update in range(start_updates, self.config.NUM_UPDATES):
            if ppo_cfg.use_linear_lr_decay:
                lr_scheduler.step()

            if ppo_cfg.use_linear_clip_decay:
                self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
                    update, self.config.NUM_UPDATES)

            if EXIT.is_set():
                self.envs.close()

                if REQUEUE.is_set() and self.world_rank == 0:
                    requeue_stats = dict(
                        env_time=env_time,
                        pth_time=pth_time,
                        count_steps=count_steps,
                        count_checkpoints=count_checkpoints,
                        start_update=update,
                        prev_time=(time.time() - t_start) + prev_time,
                    )
                    save_interrupted_state(
                        dict(
                            state_dict=self.agent.state_dict(),
                            optim_state=self.agent.optimizer.state_dict(),
                            lr_sched_state=lr_scheduler.state_dict(),
                            config=self.config,
                            requeue_stats=requeue_stats,
                        ))

                # NOTE(review): assumed to run on every rank, not only
                # rank 0 - confirm against ``requeue_job``.
                requeue_job()
                return

            count_steps_delta = 0
            self.agent.eval()
            for step in range(ppo_cfg.num_steps):
                (
                    delta_pth_time,
                    delta_env_time,
                    delta_steps,
                ) = self._collect_rollout_step(rollouts,
                                               current_episode_reward,
                                               running_episode_stats)
                pth_time += delta_pth_time
                env_time += delta_env_time
                # Accumulate locally; the total over all workers is added
                # to ``count_steps`` after the all-reduce below. Adding to
                # ``count_steps`` here would leave the reduced delta at 0
                # and count only this worker's steps.
                count_steps_delta += delta_steps

                # This is where the preemption of workers happens.  If a
                # worker detects it will be a straggler, it preempts itself!
                if (step >=
                        ppo_cfg.num_steps * self.SHORT_ROLLOUT_THRESHOLD
                    ) and int(num_rollouts_done_store.get("num_done")) > (
                        self.config.RL.DDPPO.sync_frac * self.world_size):
                    break

            num_rollouts_done_store.add("num_done", 1)

            self.agent.train()

            (
                delta_pth_time,
                value_loss,
                action_loss,
                dist_entropy,
                aux_task_losses,
                aux_dist_entropy,
                aux_weights,
            ) = self._update_agent(ppo_cfg, rollouts)

            pth_time += delta_pth_time

            # Fixed key order so every rank stacks stats identically.
            stats_ordering = list(sorted(running_episode_stats.keys()))
            stats = torch.stack(
                [running_episode_stats[k] for k in stats_ordering],
                0).to(self.device)
            distrib.all_reduce(stats)

            for i, k in enumerate(stats_ordering):
                window_episode_stats[k].append(stats[i].clone())

            stats = torch.tensor(
                [
                    dist_entropy,
                    aux_dist_entropy,
                ] + [value_loss, action_loss] + aux_task_losses +
                [count_steps_delta],
                device=self.device,
            )
            distrib.all_reduce(stats)
            if aux_weights is not None and len(aux_weights) > 0:
                # NOTE(review): the reduced tensor is discarded, so the
                # weights logged below are this rank's local values. The
                # call is kept because every rank takes part in it.
                distrib.all_reduce(
                    torch.tensor(aux_weights, device=self.device))
            count_steps += int(stats[-1].item())

            if self.world_rank == 0:
                num_rollouts_done_store.set("num_done", "0")

                # Average everything except the trailing step count.
                avg_stats = [
                    stats[i].item() / self.world_size
                    for i in range(len(stats) - 1)
                ]
                losses = avg_stats[2:]
                dist_entropy, aux_dist_entropy = avg_stats[:2]
                deltas = {
                    k: ((v[-1] - v[0]).sum().item()
                        if len(v) > 1 else v[0].sum().item())
                    for k, v in window_episode_stats.items()
                }
                # Guards the divisions below against a zero episode count.
                deltas["count"] = max(deltas["count"], 1.0)

                writer.add_scalar(
                    "reward",
                    deltas["reward"] / deltas["count"],
                    count_steps,
                )

                writer.add_scalar(
                    "entropy",
                    dist_entropy,
                    count_steps,
                )

                writer.add_scalar("aux_entropy", aux_dist_entropy,
                                  count_steps)

                # Check to see if there are any metrics
                # that haven't been logged yet
                metrics = {
                    k: v / deltas["count"]
                    for k, v in deltas.items()
                    if k not in {"reward", "count"}
                }
                if len(metrics) > 0:
                    writer.add_scalars("metrics", metrics, count_steps)

                writer.add_scalars(
                    "losses",
                    dict(zip(["value", "policy"] + aux_task_strings,
                             losses)),
                    count_steps,
                )

                # Same guard as for the all-reduce above: zipping None
                # would raise.
                if aux_weights is not None and len(aux_weights) > 0:
                    writer.add_scalars(
                        "aux_weights",
                        dict(zip(aux_task_strings, aux_weights)),
                        count_steps,
                    )

                # Log stats
                formatted_aux_losses = [
                    "{:.3g}".format(l) for l in aux_task_losses
                ]
                if update > 0 and update % self.config.LOG_INTERVAL == 0:
                    logger.info(
                        "update: {}\tvalue_loss: {:.3g}\t action_loss: {:.3g}\taux_task_loss: {} \t aux_entropy {:.3g}\t"
                        .format(
                            update,
                            value_loss,
                            action_loss,
                            formatted_aux_losses,
                            aux_dist_entropy,
                        ))
                    logger.info("update: {}\tfps: {:.3f}\t".format(
                        update,
                        count_steps /
                        ((time.time() - t_start) + prev_time),
                    ))

                    logger.info(
                        "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                        "frames: {}".format(update, env_time, pth_time,
                                            count_steps))
                    logger.info("Average window size: {}  {}".format(
                        len(window_episode_stats["count"]),
                        "  ".join(
                            "{}: {:.3f}".format(k, v / deltas["count"])
                            for k, v in deltas.items() if k != "count"),
                    ))

                # checkpoint model
                if update % self.config.CHECKPOINT_INTERVAL == 0:
                    self.save_checkpoint(
                        f"{self.checkpoint_prefix}.{count_checkpoints}.pth",
                        dict(step=count_steps))
                    count_checkpoints += 1

        self.envs.close()
def _eval_checkpoint(
    self,
    checkpoint_path: str,
    writer: TensorboardWriter,
    checkpoint_index: int = 0,
) -> None:
    r"""Evaluates a single checkpoint on navigation episodes.

    Loads the mapper and local-policy weights stored in the checkpoint,
    runs the agent for at most ``self.config.T_MAX`` steps per episode in a
    single environment, and accumulates navigation metrics both overall and
    per difficulty bucket (easy / medium / hard, by geodesic distance).
    The summary is logged, mirrored to a results file under
    ``self.config.TENSORBOARD_DIR`` and written to tensorboard.

    Args:
        checkpoint_path: path of checkpoint
        writer: tensorboard writer object for logging to tensorboard
        checkpoint_index: index of cur checkpoint for logging

    Returns:
        None

    Raises:
        AssertionError: if the number of constructed environments is not 1.
    """
    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO
    ans_cfg = config.RL.ANS

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    self.envs = construct_envs(config, get_env_class(config.ENV_NAME))
    self._setup_actor_critic_agent(ppo_cfg, ans_cfg)

    # Convert the state_dict of mapper_agent to mapper
    mapper_dict = {
        k.replace("mapper.", ""): v
        for k, v in ckpt_dict["mapper_state_dict"].items()
    }
    # Converting the state_dict of local_agent to just the local_policy.
    local_dict = {
        k.replace("actor_critic.", ""): v
        for k, v in ckpt_dict["local_state_dict"].items()
    }
    # strict=False handles the case where pose_estimator is not required.
    self.mapper.load_state_dict(mapper_dict, strict=False)
    self.local_actor_critic.load_state_dict(local_dict)

    # Set models to evaluation
    self.mapper.eval()
    self.local_actor_critic.eval()

    # Clamp the requested episode count to what the dataset provides;
    # -1 means "evaluate on every available episode".
    total_num_eps = sum(self.envs.number_of_episodes)
    number_of_eval_episodes = self.config.TEST_EPISODE_COUNT
    if number_of_eval_episodes == -1:
        number_of_eval_episodes = total_num_eps
    elif total_num_eps < number_of_eval_episodes:
        logger.warn(
            f"Config specified {number_of_eval_episodes} eval episodes"
            f", dataset only has {total_num_eps}."
        )
        logger.warn(f"Evaluating with {total_num_eps} instead.")
        number_of_eval_episodes = total_num_eps

    overall_map_size = ans_cfg.overall_map_size

    assert (
        self.envs.num_envs == 1
    ), "Number of environments needs to be 1 for evaluation"

    # Define metric accumulators (overall and per difficulty bucket).
    metric_names = ("success_rate", "spl", "distance_to_goal", "time", "softspl")
    navigation_metrics = {name: Metric() for name in metric_names}
    per_difficulty_navigation_metrics = {
        difficulty: {name: Metric() for name in metric_names}
        for difficulty in ("easy", "medium", "hard")
    }

    times_per_episode = deque()
    times_per_step = deque()

    # Define a simple function to return episode difficulty based on
    # the geodesic distance
    def classify_difficulty(gd):
        if gd < 5.0:
            return "easy"
        elif gd < 10.0:
            return "medium"
        else:
            return "hard"

    eval_start_time = time.time()
    # Reset environments only for the very first batch
    observations = self.envs.reset()
    for ep in range(number_of_eval_episodes):
        # ============================== Reset agent ==========================
        # Reset agent states
        state_estimates = {
            "pose_estimates": torch.zeros(
                self.envs.num_envs, 3, device=self.device
            ),
            "map_states": torch.zeros(
                self.envs.num_envs,
                2,
                overall_map_size,
                overall_map_size,
                device=self.device,
            ),
            "recurrent_hidden_states": torch.zeros(
                1,
                self.envs.num_envs,
                ans_cfg.LOCAL_POLICY.hidden_size,
                device=self.device,
            ),
        }
        # Reset ANS states
        self.ans_net.reset()
        self.not_done_masks = torch.zeros(
            self.envs.num_envs, 1, device=self.device
        )
        self.prev_actions = torch.zeros(
            self.envs.num_envs, 1, device=self.device
        )
        self.prev_batch = None
        self.ep_time = torch.zeros(self.envs.num_envs, 1, device=self.device)

        # =========================== Episode loop ============================
        ep_start_time = time.time()
        current_episodes = self.envs.current_episodes()
        # NOTE: metrics are only recorded when the environment reports done;
        # an episode that is still running after T_MAX steps contributes
        # nothing to the accumulators.
        for ep_step in range(self.config.T_MAX):
            step_start_time = time.time()
            # ============================ Action step ========================
            batch = self._prepare_batch(observations)
            if self.prev_batch is None:
                self.prev_batch = copy.deepcopy(batch)

            prev_pose_estimates = state_estimates["pose_estimates"]
            with torch.no_grad():
                (
                    _,
                    _,
                    mapper_outputs,
                    local_policy_outputs,
                    state_estimates,
                ) = self.ans_net.act(
                    batch,
                    self.prev_batch,
                    state_estimates,
                    self.ep_time,
                    self.not_done_masks,
                    deterministic=ans_cfg.LOCAL_POLICY.deterministic_flag,
                )
                actions = local_policy_outputs["actions"]
                # Make masks not done till reset (end of episode)
                self.not_done_masks = torch.ones(
                    self.envs.num_envs, 1, device=self.device
                )
                self.prev_actions.copy_(actions)

            # On the first step, restore the pose estimate held before act().
            if ep_step == 0:
                state_estimates["pose_estimates"].copy_(prev_pose_estimates)

            self.ep_time += 1
            # Update prev batch
            for k, v in batch.items():
                self.prev_batch[k].copy_(v)

            # Remap actions from exploration to navigation agent.
            actions_rmp = self._remap_actions(actions)

            # =========================== Environment step ====================
            outputs = self.envs.step([a[0].item() for a in actions_rmp])

            observations, _, dones, infos = [list(x) for x in zip(*outputs)]

            times_per_step.append(time.time() - step_start_time)
            # ============================ Process metrics ====================
            if dones[0]:
                times_per_episode.append(time.time() - ep_start_time)
                mins_per_episode = np.mean(times_per_episode).item() / 60.0
                eta_completion = mins_per_episode * (
                    number_of_eval_episodes - ep - 1
                )
                secs_per_step = np.mean(times_per_step).item()
                for i in range(self.envs.num_envs):
                    episode_id = int(current_episodes[i].episode_id)
                    curr_metrics = {
                        "spl": infos[i]["spl"],
                        "softspl": infos[i]["softspl"],
                        "success_rate": infos[i]["success"],
                        "time": ep_step + 1,
                        "distance_to_goal": infos[i]["distance_to_goal"],
                    }
                    # Estimate difficulty of episode
                    episode_difficulty = classify_difficulty(
                        current_episodes[i].info["geodesic_distance"]
                    )
                    for k, v in curr_metrics.items():
                        navigation_metrics[k].update(v, 1.0)
                        per_difficulty_navigation_metrics[episode_difficulty][
                            k
                        ].update(v, 1.0)

                # num_envs == 1 (asserted above), so curr_metrics and
                # episode_id refer to the single finished episode.
                logger.info(f"====> {ep}/{number_of_eval_episodes} done")
                for k, v in curr_metrics.items():
                    logger.info(f"{k:25s} : {v:10.3f}")
                logger.info("{:25s} : {:10d}".format("episode_id", episode_id))
                logger.info(f"Time per episode: {mins_per_episode:.3f} mins")
                logger.info(f"Time per step: {secs_per_step:.3f} secs")
                logger.info(f"ETA: {eta_completion:.3f} mins")

                # For navigation, terminate episode loop when dones is called
                break
        # done-for

    # Choose the results file. For checkpoint index 0, try to name the file
    # after the index embedded in the checkpoint file name
    # ("<prefix>.<idx>.<ext>"); fall back to the generic name otherwise.
    results_dir = self.config.TENSORBOARD_DIR
    results_file = f"{results_dir}/navigation_results_ckpt_{checkpoint_index}.txt"
    if checkpoint_index == 0:
        try:
            eval_ckpt_idx = self.config.EVAL_CKPT_PATH_DIR.split("/")[-1].split(
                "."
            )[1]
        except (AttributeError, IndexError):
            # Missing/non-string config entry, or no "." in the file name.
            pass
        else:
            results_file = (
                f"{results_dir}/navigation_results_ckpt_final_{eval_ckpt_idx}.txt"
            )
    logger.add_filehandler(results_file)

    logger.info(
        f"======= Evaluating over {number_of_eval_episodes} episodes ============="
    )

    logger.info("=======> Navigation metrics")
    for k, v in navigation_metrics.items():
        logger.info(f"{k}: {v.get_metric():.3f}")
        writer.add_scalar(f"navigation/{k}", v.get_metric(), checkpoint_index)

    for diff, diff_metrics in per_difficulty_navigation_metrics.items():
        logger.info(f"=============== {diff:^10s} metrics ==============")
        for k, v in diff_metrics.items():
            logger.info(f"{k}: {v.get_metric():.3f}")
            writer.add_scalar(
                f"{diff}_navigation/{k}", v.get_metric(), checkpoint_index
            )

    total_eval_time = (time.time() - eval_start_time) / 60.0
    logger.info(f"Total evaluation time: {total_eval_time:.3f} mins")

    self.envs.close()
def train(self, ckpt_path="", ckpt=-1, start_updates=0) -> None:
    r"""Trains the Q-network offline on batches from ``RolloutDataset``.

    Environments are constructed before the agent is set up and closed
    again before the training loop; all training data comes from
    ``self.dataloader``. Each update picks one random time step of the
    batch, feeds that RGB frame through ``self.q_network`` and minimises
    the mean squared error between the output and the frame scaled to
    [0, 1].

    Args:
        ckpt_path: path of the checkpoint to resume from (used when
            ``ckpt != -1``).
        ckpt: index of the checkpoint being resumed; ``-1`` starts fresh.
        start_updates: number of updates already performed, used to seed
            the step counter when resuming.

    Returns:
        None

    Raises:
        AssertionError: when resuming with linear LR or clip decay enabled.
    """
    self.envs = construct_envs(
        self.config, get_env_class(self.config.ENV_NAME)
    )

    ppo_cfg = self.config.RL.PPO
    task_cfg = self.config.TASK_CONFIG.TASK
    aux_cfg = self.config.RL.AUX_TASKS

    self.device = (
        torch.device("cuda", self.config.TORCH_GPU_ID)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )

    self._setup_dqn_agent(ppo_cfg, task_cfg, aux_cfg, [])
    self.dataset = RolloutDataset()
    self.dataloader = DataLoader(self.dataset, batch_size=16, num_workers=0)

    # The environments are not used past this point; training reads from
    # the dataset instead.
    self.envs.close()

    if self.config.RESUME_CURIOUS:
        # Load onto CPU first so weights saved on a GPU can be restored on
        # any host; load_state_dict copies them into the existing params.
        weights = torch.load(
            self.config.RESUME_CURIOUS, map_location="cpu"
        )["state_dict"]
        state_dict = self.q_network.state_dict()

        # Transfer only the encoder weights, renamed to this network's
        # key layout, and only where a matching key exists.
        weights_new = {}
        for k, v in weights.items():
            if "model_encoder" in k:
                k = k.replace("model_encoder", "visual_resnet").replace(
                    "actor_critic.", ""
                )
                if k in state_dict:
                    weights_new[k] = v

        state_dict.update(weights_new)
        self.q_network.load_state_dict(state_dict)

    logger.info(
        "agent number of parameters: {}".format(
            sum(param.numel() for param in self.q_network.parameters())
        )
    )

    count_steps = 0
    count_checkpoints = 0

    if ckpt != -1:
        logger.info(
            f"Resuming runs at checkpoint {ckpt}. Timing statistics are not tracked properly."
        )
        assert ppo_cfg.use_linear_lr_decay is False and ppo_cfg.use_linear_clip_decay is False, "Resuming with decay not supported"
        # This is the checkpoint we start saving at
        count_checkpoints = ckpt + 1
        count_steps = start_updates * ppo_cfg.num_steps * self.config.NUM_PROCESSES
        ckpt_dict = self.load_checkpoint(ckpt_path, map_location="cpu")
        self.q_network.load_state_dict(ckpt_dict["state_dict"])
        self.q_network_target.load_state_dict(ckpt_dict["target_state_dict"])
        if "optim_state" in ckpt_dict:
            # Restore into the optimizer that this trainer actually steps.
            self.optimizer.load_state_dict(ckpt_dict["optim_state"])
        else:
            logger.warn("No optimizer state loaded, results may be funky")
        if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
            count_steps = ckpt_dict["extra_state"]["step"]

    lr_scheduler = LambdaLR(
        optimizer=self.optimizer,
        lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
    )

    with TensorboardWriter(
        self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs
    ) as writer:
        update = 0
        for _ in range(self.config.NUM_EPOCHS):
            for im, pointgoal, action, mask, reward in self.dataloader:
                if ppo_cfg.use_linear_lr_decay:
                    lr_scheduler.step()

                im, pointgoal, action, mask, reward = (
                    collate(im),
                    collate(pointgoal),
                    collate(action),
                    collate(mask),
                    collate(reward),
                )

                # NOTE(review): only `im` feeds the active loss. The other
                # fields belong to a Q-learning objective (TD target from
                # the target network over the whole sequence) that is
                # currently disabled; they are still prepared so the data
                # path stays unchanged.
                im = im.to(self.device).float()
                pointgoal = pointgoal.to(self.device).float()
                mask = mask.to(self.device).float()
                reward = reward.to(self.device).float()
                action = action.to(self.device).long()

                # Train on a single randomly chosen time step of the batch.
                nstep = im.size(1)
                step = random.randint(0, nstep - 1)
                output = self.q_network({'rgb': im[:, step]}, None, None)
                mse_loss = torch.pow(output - im[:, step] / 255., 2).mean()
                mse_loss.backward()

                self.optimizer.step()
                self.optimizer.zero_grad()

                writer.add_scalar(
                    "loss",
                    mse_loss,
                    update,
                )

                if update % 10 == 0:
                    print("Update: {}, loss: {}".format(update, mse_loss))

                if update % 100 == 0:
                    self.sync_model()

                # checkpoint model
                if update % self.config.CHECKPOINT_INTERVAL == 0:
                    self.save_checkpoint(
                        f"{self.checkpoint_prefix}.{count_checkpoints}.pth",
                        dict(step=count_steps),
                    )
                    count_checkpoints += 1

                update = update + 1