def from_json(self, json_str: str, scenes_dir: Optional[str] = None) -> None:
    """Populate this dataset from a serialized JSON split.

    The raw dict is loaded onto ``self.__dict__`` and every nested plain
    dict is then rebuilt into its typed counterpart (``VocabDict``,
    ``EQAEpisode``, ``QuestionData``, ``ObjectGoal``, ``AgentState``,
    ``ShortestPathPoint``).

    Args:
        json_str: JSON document produced by the matching serializer.
        scenes_dir: optional root to prepend to each episode's scene id
            (the default scene-path prefix is stripped first).
    """
    data = json.loads(json_str)
    self.__dict__.update(data)

    # The vocabularies arrive as plain dicts; rebuild them as VocabDict.
    self.answer_vocab = VocabDict(word_list=self.answer_vocab["word_list"])
    self.question_vocab = VocabDict(
        word_list=self.question_vocab["word_list"]
    )

    for index, raw_episode in enumerate(data["episodes"]):
        ep = EQAEpisode(**raw_episode)

        if scenes_dir is not None:
            scene_id = ep.scene_id
            if scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX):
                scene_id = scene_id[len(DEFAULT_SCENE_PATH_PREFIX):]
            ep.scene_id = os.path.join(scenes_dir, scene_id)

        ep.question = QuestionData(**ep.question)

        for goal_idx, raw_goal in enumerate(ep.goals):
            goal = ObjectGoal(**raw_goal)
            ep.goals[goal_idx] = goal
            if goal.view_points is not None:
                goal.view_points = [
                    AgentState(**view_point)
                    for view_point in goal.view_points
                ]

        if ep.shortest_paths is not None:
            ep.shortest_paths = [
                [ShortestPathPoint(**point) for point in path]
                for path in ep.shortest_paths
            ]

        self.episodes[index] = ep
def from_json(self, json_str: str, scenes_dir: Optional[str] = None) -> None:
    """Deserialize an R2R-style JSON split into VLN episodes.

    Rebuilds the vocabularies, loads scene connectivity, and converts each
    raw episode dict into a ``VLNEpisode`` with typed viewpoints, an
    ``InstructionData``, and a precomputed distance to the target.

    Args:
        json_str: JSON document for this split.
        scenes_dir: optional root to prepend to each episode's scene id
            (the default scene-path prefix is stripped first).
    """
    data = json.loads(json_str)
    # Identity quaternion used for every goal viewpoint's rotation.
    identity_rotation = [0, 0, 0, 1]

    self.train_vocab = VocabDict(
        word_list=data["train_vocab"]["word_list"]
    )
    self.trainval_vocab = VocabDict(
        word_list=data["trainval_vocab"]["word_list"]
    )
    self.action_tokens = data["BERT_vocab"]["action_tokens"]
    self.mini_alignments = data["mini_alignments"]
    self.scenes = data["scenes"]
    self.connectivity = load_connectivity(
        self.config.CONNECTIVITY_PATH, self.scenes
    )

    for raw in data["episodes"]:
        # The current viewpoint starts at the first goal, positioned at
        # the episode's start pose.
        raw["curr_viewpoint"] = ViewpointData(
            image_id=raw["goals"][0],
            view_point=AgentState(
                position=raw["start_position"],
                rotation=raw["start_rotation"],
            ),
        )
        # Pull these out before constructing the episode: they are not
        # VLNEpisode fields.
        tokens = raw.pop("instruction_encoding")
        mask = raw.pop("mask")
        episode = VLNEpisode(**raw)

        if scenes_dir is not None:
            scene_id = episode.scene_id
            if scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX):
                scene_id = scene_id[len(DEFAULT_SCENE_PATH_PREFIX):]
            episode.scene_id = os.path.join(scenes_dir, scene_id)

        episode.instruction = InstructionData(
            instruction=raw["instruction"],
            tokens=tokens,
            tokens_length=sum(mask),
            mask=mask,
        )

        scan = episode.scan
        # Resolve each goal index into a positioned ViewpointData via the
        # scan's connectivity graph.
        typed_goals = []
        for viewpoint in episode.goals:
            viewpoint_id = self.connectivity[scan]["idxtoid"][viewpoint]
            position = self.connectivity[scan]["viewpoints"][viewpoint_id]
            typed_goals.append(
                ViewpointData(
                    image_id=viewpoint,
                    view_point=AgentState(
                        position=position, rotation=identity_rotation
                    ),
                )
            )
        episode.goals = typed_goals

        episode.distance = self.get_distance_to_target(
            scan, episode.goals[0].image_id, episode.goals[-1].image_id
        )
        self.episodes.append(episode)
def _save_vqa_results(
    self,
    ckpt_idx: int,
    episode_ids: torch.Tensor,
    questions: torch.Tensor,
    images: torch.Tensor,
    pred_scores: torch.Tensor,
    gt_answers: torch.Tensor,
    q_vocab_dict: VocabDict,
    ans_vocab_dict: VocabDict,
) -> None:
    r"""Log and save VQA results for the first example in the batch.

    Decodes the question and the predicted/ground-truth answers for the
    batch's first sample, logs them, and writes an annotated image to the
    configured results directory.

    Args:
        ckpt_idx: idx of checkpoint being evaluated
        episode_ids: episode ids of batch
        questions: input questions to model
        images: images' tensor containing input frames
        pred_scores: model prediction scores
        gt_answers: ground truth answers
        q_vocab_dict: Question VocabDict
        ans_vocab_dict: Answer VocabDict

    Returns:
        None
    """
    # Only the first sample of the batch is reported.
    episode_id = episode_ids[0].item()
    question = questions[0]
    images = images[0]
    gt_answer = gt_answers[0]
    scores = pred_scores[0]

    q_string = q_vocab_dict.token_idx_2_string(question)

    # Build the index->answer mapping once; it was previously sorted
    # twice (once for the prediction, once for the ground truth).
    sorted_answers = sorted(ans_vocab_dict.word2idx_dict.keys())

    _, index = scores.max(0)
    pred_answer = sorted_answers[index]
    gt_answer = sorted_answers[gt_answer]

    logger.info("Question: {}".format(q_string))
    logger.info("Predicted answer: {}".format(pred_answer))
    logger.info("Ground-truth answer: {}".format(gt_answer))

    result_path = self.config.RESULTS_DIR.format(
        split=self.config.TASK_CONFIG.DATASET.SPLIT
    )
    result_path = os.path.join(
        result_path, "ckpt_{}_{}_image.jpg".format(ckpt_idx, episode_id)
    )

    save_vqa_image_results(
        images, q_string, pred_answer, gt_answer, result_path
    )
def from_json(self, json_str: str, scenes_dir: Optional[str] = None) -> None:
    """Load VLN episodes from JSON, rebuilding nested typed objects.

    Reconstructs the instruction vocabulary and converts every raw
    episode dict into a ``VLNEpisode`` with an ``InstructionData`` and
    ``NavigationGoal`` goals, appending each to ``self.episodes``.

    Args:
        json_str: JSON document for this split.
        scenes_dir: optional root to prepend to each episode's scene id
            (the default scene-path prefix is stripped first).
    """
    data = json.loads(json_str)
    self.instruction_vocab = VocabDict(
        word_list=data["instruction_vocab"]["word_list"]
    )

    for raw_episode in data["episodes"]:
        ep = VLNEpisode(**raw_episode)

        if scenes_dir is not None:
            scene_id = ep.scene_id
            if scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX):
                scene_id = scene_id[len(DEFAULT_SCENE_PATH_PREFIX):]
            ep.scene_id = os.path.join(scenes_dir, scene_id)

        ep.instruction = InstructionData(**ep.instruction)
        ep.goals = [NavigationGoal(**goal) for goal in ep.goals]
        self.episodes.append(ep)