class DrawStartPosOnGlobalMap(MapTransformerBase):
    """
    Adds a one-channel layer that marks the instruction-segment start position to each map.

    The start pose of the current instruction segment is tracked across calls: a new segment
    is assumed whenever the sentence embedding differs from the previously seen one. The start
    pose is converted to image coordinates with ``poses_as_to_img`` and drawn as a single pixel
    of value 10.0 on an extra layer that is concatenated in front of the input map channels.
    """

    def __init__(self, source_map_size, world_in_map_size, lamda=0.2):
        """
        :param source_map_size: map size, forwarded to MapTransformerBase
        :param world_in_map_size: world size inside the map, forwarded to MapTransformerBase and
            used when converting poses to image coordinates
        :param lamda: not used by this module; kept so that existing callers keep working
        """
        super(DrawStartPosOnGlobalMap, self).__init__(source_map_size, world_in_map_size)
        self.map_size = source_map_size
        self.world_size = world_in_map_size
        self.child_transformer = MapTransformerBase(source_map_size, world_in_map_size)
        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        # Tracking state: start pose of the current segment and the embedding that identifies it
        self.start_pose = None
        self.last_emb = None
        self.dbg_t = None
        self.seq = 0

    def init_weights(self):
        """This module has no weights of its own to initialize."""
        pass

    def reset(self):
        """Forget the tracked start pose / embedding and reset the child transformer."""
        super(DrawStartPosOnGlobalMap, self).reset()
        self.start_pose = None
        self.last_emb = None
        self.child_transformer.reset()
        self.seq = 0

    def cuda(self, device=None):
        """
        Move this module and its child transformer to the GPU.

        :param device: device forwarded to the ``cuda`` calls
        :return: self
        """
        MapTransformerBase.cuda(self, device)
        self.child_transformer.cuda(device)
        return self

    def get_start_poses(self, cam_poses_w, sentence_embeddings):
        """
        For each timestep, get the pose corresponding to the start of the instruction segment.

        State is carried over between calls through ``self.last_emb`` / ``self.start_pose``.

        :param cam_poses_w: indexable collection of camera poses, one per timestep
        :param sentence_embeddings: indexable collection of sentence embeddings, one per timestep
        :return: list with one start pose per timestep
        """
        start_poses = []
        for i in range(len(sentence_embeddings)):
            emb = sentence_embeddings[i].data
            same_segment = self.last_emb is not None and (emb == self.last_emb).all()
            if not same_segment:
                # The embedding changed (or this is the first timestep): a new segment starts here
                self.last_emb = emb
                self.start_pose = cam_poses_w[i]
            start_poses.append(self.start_pose)
        return start_poses

    def forward(self, maps_w, sentence_embeddings, map_poses_w, cam_poses_w, show=False):
        """
        :param maps_w: batch of maps; indexed as [batch, channel, x, y]
        :param sentence_embeddings: per-timestep sentence embeddings
        :param map_poses_w: poses of the maps; returned unchanged
        :param cam_poses_w: per-timestep camera poses
        :param show: window name for a debug visualisation of the first drawn layer; any falsy
            value ("", False, None) disables the visualisation
        :return: tuple (maps with the start-position layer prepended along dim 1, map_poses_w)
        """
        self.prof.tick(".")
        batch_size = len(maps_w)

        # Initialize the layers of the same size as the maps, but with only one channel
        # NOTE(review): assumes empty_float_tensor returns a zero-filled tensor - confirm
        new_layer_size = list(maps_w.size())
        new_layer_size[1] = 1
        all_maps_out_w = empty_float_tensor(new_layer_size, self.is_cuda, self.cuda_device)

        start_poses = self.get_start_poses(cam_poses_w, sentence_embeddings)
        poses_img = [poses_as_to_img(as_pose, self.world_size) for as_pose in start_poses]

        # Clamp every coordinate to the extent of the axis that it actually indexes
        max_x = new_layer_size[2] - 1
        max_y = new_layer_size[3] - 1
        for i in range(batch_size):
            position = poses_img[i].position.data
            x = min(max(int(position[0]), 0), max_x)
            y = min(max(int(position[1]), 0), max_y)
            all_maps_out_w[i, 0, x, y] = 10.0

        # Only visualise when a window name was given; the default (False) must stay silent
        if show:
            Presenter().show_image(all_maps_out_w[0], show, torch=True, waitkey=1)

        self.prof.tick("draw")

        # Prepend the start-position layer to the input maps along the channel dimension
        maps_out = torch.cat([Variable(all_maps_out_w), maps_w], dim=1)

        self.prof.loop()
        self.prof.print_stats(10)

        return maps_out, map_poses_w
def __init__(self, run_name="", domain="sim"):
    """
    Build all submodules, auxiliary objectives and bookkeeping state of the Stage 1 model.

    Hyperparameters are read from ``get_current_parameters()["ModelPVN"]``; the "Stage1"
    sub-dictionary configures the submodules, "UseAux" selects the auxiliary objectives and
    "AuxWeights" (optionally merged with a per-domain override) holds their weights.

    :param run_name: name of the run; used in the summary-writer log directory
    :param domain: domain identifier; "real" selects "AuxWeightsRealOverride", any other value
        selects "AuxWeightsSimOverride" when the "weight_override" parameter is set
    """
    super(PVN_Stage1_Bidomain_Original, self).__init__()
    self.model_name = "pvn_stage1"
    self.run_name = run_name
    self.domain = domain
    self.writer = LoggingSummaryWriter(
        log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.domain}")

    self.root_params = get_current_parameters()["ModelPVN"]
    self.params = self.root_params["Stage1"]
    self.use_aux = self.root_params["UseAux"]
    self.aux_weights = self.root_params["AuxWeights"]

    # Optionally merge the domain-specific auxiliary weight overrides into the base weights
    if self.params.get("weight_override"):
        if self.domain == "real":
            aux_weights_override_name = "AuxWeightsRealOverride"
        else:
            aux_weights_override_name = "AuxWeightsSimOverride"
        aux_weights_override = self.root_params.get(aux_weights_override_name)
        if aux_weights_override:
            print(f"Overriding auxiliary weights for domain: {self.domain}")
            self.aux_weights = dict_merge(self.aux_weights, aux_weights_override)

    self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
    # Iteration counter stored as a non-trainable parameter
    self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

    self.tensor_store = KeyTensorStore()
    self.losses = AuxiliaryLosses()

    # Auxiliary Objectives
    self.do_perturb_maps = self.params["perturb_maps"]
    print("Perturbing maps: ", self.do_perturb_maps)

    # Short-hands for values that are passed to many of the submodules below
    p = self.params
    world = dict(world_size_px=p["world_size_px"], world_size_m=p["world_size_m"])
    local_size = p["local_map_size"]
    global_size = p["global_map_size"]

    # Path-pred FPV model definition
    # --------------------------------------------------------------------------------------------------------------
    self.num_feature_channels = p["feature_channels"]
    self.num_map_channels = p["pathpred_in_channels"]

    self.img_to_features_w = FPVToGlobalMap(
        source_map_size=global_size,
        res_channels=p["resnet_channels"],
        map_channels=p["feature_channels"],
        img_w=p["img_w"],
        img_h=p["img_h"],
        cam_h_fov=p["cam_h_fov"],
        domain=domain,
        img_dbg=IMG_DBG,
        **world)

    self.map_accumulator_w = LeakyIntegratorGlobalMap(source_map_size=global_size, **world)

    self.add_init_pos_to_coverage = AddDroneInitPosToCoverage(map_size_px=local_size, **world)

    # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
    self.map_processor_grounding = LangFilterMapProcessor(
        embed_size=p["emb_size"],
        in_channels=p["feature_channels"],
        out_channels=p["relevance_channels"],
        spatial=False,
        cat_out=False)

    # Process the global accumulated map
    self.path_predictor_lingunet = RatioPathPredictor(
        p["lingunet"],
        prior_channels_in=p["feature_channels"],
        posterior_channels_in=p["pathpred_in_channels"],
        dual_head=p["predict_confidence"],
        compute_prior=p["compute_prior"],
        use_prior=p["use_prior_only"],
        oob=p["clip_observability"])

    print(f"UNet Channels: {self.num_map_channels}")
    print(f"Feature Channels: {self.num_feature_channels}")

    # TODO:O Verify that config has the same randomization parameters (yaw, pos, etc)
    self.second_transform = self.do_perturb_maps or p["predict_in_start_frame"]

    # Sentence Embedding
    self.sentence_embedding = SentenceEmbeddingSimple(
        p["word_emb_size"], p["emb_size"], p["emb_layers"], p["emb_dropout"])

    self.map_transform_local_to_local = MapTransformer(
        source_map_size=local_size, dest_map_size=local_size, **world)
    self.map_transform_global_to_local = MapTransformer(
        source_map_size=global_size, dest_map_size=local_size, **world)
    self.map_transform_local_to_global = MapTransformer(
        source_map_size=local_size, dest_map_size=global_size, **world)

    # Frame-to-frame aliases (w, s, r, p) onto the three transformers created above
    self.map_transform_s_to_p = self.map_transform_local_to_local
    self.map_transform_w_to_s = self.map_transform_global_to_local
    self.map_transform_w_to_r = self.map_transform_global_to_local
    self.map_transform_r_to_s = self.map_transform_local_to_local
    self.map_transform_r_to_w = self.map_transform_local_to_global
    self.map_transform_p_to_w = self.map_transform_local_to_global
    self.map_transform_p_to_r = self.map_transform_local_to_local

    # Batch select is used to drop and forget semantic maps at those timesteps that we're not planning in
    self.batch_select = MapBatchSelect()
    # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
    # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
    self.map_batch_fill_missing = MapBatchFillMissing(
        local_size, p["world_size_px"], p["world_size_m"])

    self.spatialsoftmax = SpatialSoftmax2d()
    self.visitation_softmax = VisitationSoftmax()

    # TODO:O Use CroppedMapToActionTriplet in Wrapper as Stage2

    # Auxiliary Objectives
    # --------------------------------------------------------------------------------------------------------------
    # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
    # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs
    # that have been saved with keep_auxiliary_input() during execution
    if self.use_aux["class_features"]:
        self.losses.add_auxiliary(
            ClassAuxiliary2D("class_features", p["feature_channels"], p["num_landmarks"], 0,
                             "fpv_features", "lm_pos_fpv", "lm_indices"))
    if self.use_aux["grounding_features"]:
        self.losses.add_auxiliary(
            ClassAuxiliary2D("grounding_features", p["relevance_channels"], 2, 0,
                             "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
    if self.use_aux["class_map"]:
        self.losses.add_auxiliary(
            ClassAuxiliary2D("class_map", p["feature_channels"], p["num_landmarks"], 0,
                             "S_W_select", "lm_pos_map_select", "lm_indices_select"))
    if self.use_aux["grounding_map"]:
        self.losses.add_auxiliary(
            ClassAuxiliary2D("grounding_map", p["relevance_channels"], 2, 0,
                             "R_W_select", "lm_pos_map_select", "lm_mentioned_select"))
    # CoRL model uses alignment-model groundings
    if self.use_aux["lang"]:
        # one output for each landmark, 2 classes per output. This is for finetuning, so use the embedding
        # that's going to be fine-tuned
        self.losses.add_auxiliary(
            ClassAuxiliary("lang", p["emb_size"], 2, p["num_landmarks"],
                           "sentence_embed", "lang_lm_mentioned"))

    if self.use_aux["regularize_map"]:
        self.losses.add_auxiliary(
            FeatureRegularizationAuxiliary2D("regularize_map", "l1", "S_W_select"))

    # The visitation-distribution objective is always registered. The clip_observability setting is
    # passed through as an argument, so no branching on it is needed here.
    lossfunc = p["path_loss_function"]
    self.losses.add_auxiliary(
        PathAuxiliary2D("visitation_dist", lossfunc, p["clip_observability"],
                        "log_v_dist_s_select", "v_dist_s_ground_truth_select", "SM_S_select"))

    # A goal prediction counts as good within 10% of the world size (in pixels)
    self.goal_good_criterion = GoalPredictionGoodCriterion(
        ok_distance=p["world_size_px"] * 0.1)
    self.goal_acc_meter = MovingAverageMeter(10)
    self.visible_goal_acc_meter = MovingAverageMeter(10)
    self.invisible_goal_acc_meter = MovingAverageMeter(10)
    self.visible_goal_frac_meter = MovingAverageMeter(10)

    self.losses.print_auxiliary_info()

    # Bookkeeping state
    self.total_goals = 0
    self.correct_goals = 0

    self.env_id = None
    self.env_img = None
    self.seg_idx = None
    self.prev_instruction = None
    self.seq_step = 0

    self.should_save_path_overlays = False
def train_epoch(self, train_data=None, train_envs=None, eval=False):
    """
    Run one pass over the supervised dataset, either training or evaluating the model.

    :param train_data: forwarded to ``self.model.get_dataset`` as ``data``
    :param train_envs: forwarded to ``self.model.get_dataset`` as ``envs``
    :param eval: if True, the model is put in eval mode and no backward pass / optimizer step is done
    :return: mean per-batch loss of the epoch as a float, or -1.0 if the dataset is empty
    """
    if eval:
        self.model.eval()
        inference_type = "eval"
        # NOTE(review): eval summaries are indexed with the *train* epoch counter - confirm this is intended
        epoch_num = self.train_epoch_num
        self.test_epoch_num += 1
    else:
        self.model.train()
        inference_type = "train"
        epoch_num = self.train_epoch_num
        self.train_epoch_num += 1

    dataset = self.model.get_dataset(data=train_data,
                                     envs=train_envs,
                                     dataset_name="supervised",
                                     eval=eval)
    print("code2 dataset type:", type(dataset))
    print("dataset:", dataset)
    print("dataset env_list:", dataset.env_list)

    # TODO: Get rid of this:
    if hasattr(dataset, "set_word2token"):
        dataset.set_word2token(self.token2word, self.word2token)

    # Bail out before building the DataLoader: constructing a shuffling loader over an empty
    # dataset can raise in some PyTorch releases, which would make this guard unreachable.
    num_samples = len(dataset)
    if num_samples == 0:
        print("DATASET HAS NO DATA!")
        return -1.0

    dataloader = DataLoader(dataset,
                            collate_fn=dataset.collate_fn,
                            batch_size=self.batch_size,
                            shuffle=True,
                            num_workers=0,
                            pin_memory=False,
                            timeout=0,
                            drop_last=False)

    # Ceiling division: the last batch may be smaller since drop_last is False
    num_batches = (num_samples + self.batch_size - 1) // self.batch_size

    epoch_loss = 0
    count = 0

    prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
    prof.tick("out")

    for batch in dataloader:
        # The collate function may yield None for batches that should be skipped
        if batch is None:
            continue

        prof.tick("batch_load")
        # Zero gradients before each segment and initialize zero segment loss
        self.optim.zero_grad()

        batch_loss = self.model.sup_loss_on_batch(batch, eval)

        if isinstance(batch_loss, int):
            print("Ding")

        prof.tick("forward")

        # Backprop and step
        if not eval:
            batch_loss.backward()
            prof.tick("backward")

            # Writing summaries is SLOW! Don't do it often
            # TODO: Get rid of tensorboard
            if self.batch_num % 20 == 1:
                self.logger.scalar_summary("loss", batch_loss.item(), self.batch_num)

            self.batch_num += 1
            self.optim.step()
            prof.tick("optim")

        # Get losses as floats. item() works for 0-dim and for single-element tensors, whereas
        # indexing with [0] fails on 0-dim tensors and otherwise yields a tensor, not a float.
        loss_value = batch_loss.item()
        epoch_loss += loss_value
        count += 1

        sys.stdout.write(f"\r Batch:{count} / {num_batches} loss: {loss_value}")
        sys.stdout.flush()

        self.train_segment += 0 if eval else 1
        self.test_segment += 1 if eval else 0

        prof.tick("rep")
        prof.loop()
        prof.print_stats(10)

    if hasattr(self.model, "write_eoe_summaries"):
        self.model.write_eoe_summaries(inference_type, epoch_num)

    print("")
    # The tiny epsilon avoids a division by zero when every batch was skipped
    epoch_loss /= (count + 1e-15)

    if hasattr(self.model, "writer"):
        self.model.writer.add_scalar(
            self.name + "/" + inference_type + "_epoch_loss", epoch_loss, epoch_num)

    return epoch_loss
class PVN_Stage1_Bidomain_Original(nn.Module): def __init__(self, run_name="", domain="sim"): super(PVN_Stage1_Bidomain_Original, self).__init__() self.model_name = "pvn_stage1" self.run_name = run_name self.domain = domain self.writer = LoggingSummaryWriter( log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.domain}") #self.writer = DummySummaryWriter() self.root_params = get_current_parameters()["ModelPVN"] self.params = self.root_params["Stage1"] self.use_aux = self.root_params["UseAux"] self.aux_weights = self.root_params["AuxWeights"] if self.params.get("weight_override"): aux_weights_override_name = "AuxWeightsRealOverride" if self.domain == "real" else "AuxWeightsSimOverride" aux_weights_override = self.root_params.get( aux_weights_override_name) if aux_weights_override: print( f"Overriding auxiliary weights for domain: {self.domain}") self.aux_weights = dict_merge(self.aux_weights, aux_weights_override) self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) self.tensor_store = KeyTensorStore() self.losses = AuxiliaryLosses() # Auxiliary Objectives self.do_perturb_maps = self.params["perturb_maps"] print("Perturbing maps: ", self.do_perturb_maps) # Path-pred FPV model definition # -------------------------------------------------------------------------------------------------------------- self.num_feature_channels = self.params[ "feature_channels"] # + params["relevance_channels"] self.num_map_channels = self.params["pathpred_in_channels"] self.img_to_features_w = FPVToGlobalMap( source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"], res_channels=self.params["resnet_channels"], map_channels=self.params["feature_channels"], img_w=self.params["img_w"], img_h=self.params["img_h"], cam_h_fov=self.params["cam_h_fov"], domain=domain, img_dbg=IMG_DBG) self.map_accumulator_w = LeakyIntegratorGlobalMap( 
source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"]) self.add_init_pos_to_coverage = AddDroneInitPosToCoverage( world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"], map_size_px=self.params["local_map_size"]) # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame self.map_processor_grounding = LangFilterMapProcessor( embed_size=self.params["emb_size"], in_channels=self.params["feature_channels"], out_channels=self.params["relevance_channels"], spatial=False, cat_out=False) ratio_prior_channels = self.params["feature_channels"] # Process the global accumulated map self.path_predictor_lingunet = RatioPathPredictor( self.params["lingunet"], prior_channels_in=self.params["feature_channels"], posterior_channels_in=self.params["pathpred_in_channels"], dual_head=self.params["predict_confidence"], compute_prior=self.params["compute_prior"], use_prior=self.params["use_prior_only"], oob=self.params["clip_observability"]) print("UNet Channels: " + str(self.num_map_channels)) print("Feature Channels: " + str(self.num_feature_channels)) # TODO:O Verify that config has the same randomization parameters (yaw, pos, etc) self.second_transform = self.do_perturb_maps or self.params[ "predict_in_start_frame"] # Sentence Embedding self.sentence_embedding = SentenceEmbeddingSimple( self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"], self.params["emb_dropout"]) self.map_transform_local_to_local = MapTransformer( source_map_size=self.params["local_map_size"], dest_map_size=self.params["local_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"]) self.map_transform_global_to_local = MapTransformer( source_map_size=self.params["global_map_size"], dest_map_size=self.params["local_map_size"], world_size_px=self.params["world_size_px"], 
world_size_m=self.params["world_size_m"]) self.map_transform_local_to_global = MapTransformer( source_map_size=self.params["local_map_size"], dest_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"]) self.map_transform_s_to_p = self.map_transform_local_to_local self.map_transform_w_to_s = self.map_transform_global_to_local self.map_transform_w_to_r = self.map_transform_global_to_local self.map_transform_r_to_s = self.map_transform_local_to_local self.map_transform_r_to_w = self.map_transform_local_to_global self.map_transform_p_to_w = self.map_transform_local_to_global self.map_transform_p_to_r = self.map_transform_local_to_local # Batch select is used to drop and forget semantic maps at those timestaps that we're not planning in self.batch_select = MapBatchSelect() # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep self.map_batch_fill_missing = MapBatchFillMissing( self.params["local_map_size"], self.params["world_size_px"], self.params["world_size_m"]) self.spatialsoftmax = SpatialSoftmax2d() self.visitation_softmax = VisitationSoftmax() #TODO:O Use CroppedMapToActionTriplet in Wrapper as Stage2 # Auxiliary Objectives # -------------------------------------------------------------------------------------------------------------- # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters, # followed by variable number of names of inputs. 
ModuleWithAuxiliaries will automatically collect these inputs # that have been saved with keep_auxiliary_input() during execution if self.use_aux["class_features"]: self.losses.add_auxiliary( ClassAuxiliary2D("class_features", self.params["feature_channels"], self.params["num_landmarks"], 0, "fpv_features", "lm_pos_fpv", "lm_indices")) if self.use_aux["grounding_features"]: self.losses.add_auxiliary( ClassAuxiliary2D("grounding_features", self.params["relevance_channels"], 2, 0, "fpv_features_g", "lm_pos_fpv", "lm_mentioned")) if self.use_aux["class_map"]: self.losses.add_auxiliary( ClassAuxiliary2D("class_map", self.params["feature_channels"], self.params["num_landmarks"], 0, "S_W_select", "lm_pos_map_select", "lm_indices_select")) if self.use_aux["grounding_map"]: self.losses.add_auxiliary( ClassAuxiliary2D("grounding_map", self.params["relevance_channels"], 2, 0, "R_W_select", "lm_pos_map_select", "lm_mentioned_select")) # CoRL model uses alignment-model groundings if self.use_aux["lang"]: # one output for each landmark, 2 classes per output. 
This is for finetuning, so use the embedding that's gonna be fine tuned self.losses.add_auxiliary( ClassAuxiliary("lang", self.params["emb_size"], 2, self.params["num_landmarks"], "sentence_embed", "lang_lm_mentioned")) if self.use_aux["regularize_map"]: self.losses.add_auxiliary( FeatureRegularizationAuxiliary2D("regularize_map", "l1", "S_W_select")) lossfunc = self.params["path_loss_function"] if self.params["clip_observability"]: self.losses.add_auxiliary( PathAuxiliary2D("visitation_dist", lossfunc, self.params["clip_observability"], "log_v_dist_s_select", "v_dist_s_ground_truth_select", "SM_S_select")) else: self.losses.add_auxiliary( PathAuxiliary2D("visitation_dist", lossfunc, self.params["clip_observability"], "log_v_dist_s_select", "v_dist_s_ground_truth_select", "SM_S_select")) self.goal_good_criterion = GoalPredictionGoodCriterion( ok_distance=self.params["world_size_px"] * 0.1) self.goal_acc_meter = MovingAverageMeter(10) self.visible_goal_acc_meter = MovingAverageMeter(10) self.invisible_goal_acc_meter = MovingAverageMeter(10) self.visible_goal_frac_meter = MovingAverageMeter(10) self.losses.print_auxiliary_info() self.total_goals = 0 self.correct_goals = 0 self.env_id = None self.env_img = None self.seg_idx = None self.prev_instruction = None self.seq_step = 0 self.should_save_path_overlays = False def make_picklable(self): self.writer = DummySummaryWriter() def steal_cross_domain_modules(self, other_self): self.iter = other_self.iter self.losses = other_self.losses self.sentence_embedding = other_self.sentence_embedding self.map_accumulator_w = other_self.map_accumulator_w self.map_processor_grounding = other_self.map_processor_grounding self.path_predictor_lingunet = other_self.path_predictor_lingunet #self.img_to_features_w = other_self.img_to_features_w def both_domain_parameters(self, other_self): # This function iterates and yields parameters from this module and the other module, but does not yield # shared parameters twice. 
# First yield all of the other module's parameters for p in other_self.parameters(): yield p # Then yield all the parameters from the this module that are not shared with the other one for p in self.img_to_features_w.parameters(): yield p return def get_iter(self): return int(self.iter.data[0]) def inc_iter(self): self.iter += 1 def load_state_dict(self, state_dict, strict=True): super(PVN_Stage1_Bidomain_Original, self).load_state_dict(state_dict, strict) def init_weights(self): self.img_to_features_w.init_weights() self.map_accumulator_w.init_weights() self.sentence_embedding.init_weights() self.map_processor_grounding.init_weights() self.path_predictor_lingunet.init_weights() def reset(self): # TODO: This is error prone. Create a class StatefulModule, iterate submodules and reset all stateful modules self.tensor_store.reset() self.sentence_embedding.reset() self.img_to_features_w.reset() self.map_accumulator_w.reset() self.map_batch_fill_missing.reset() self.prev_instruction = None def setEnvContext(self, context): print("Set env context to: " + str(context)) self.env_id = context["env_id"] self.env_img = env.load_env_img(self.env_id, 256, 256) self.env_img = self.env_img[:, :, [2, 1, 0]] def set_save_path_overlays(self, save_path_overlays): self.should_save_path_overlays = save_path_overlays #TODO:O Figure out what to do with save_ground_truth_overlays def print_metrics(self): print(f"Model {self.model_name}:{self.domain} metrics:") print( f" Goal accuracy: {float(self.correct_goals) / self.total_goals}" ) def goal_visible(self, masks, goal_pos): goal_mask = masks.detach()[0, 0, :, :] goal_pos = goal_pos[0].long().detach() visible = bool( (goal_mask[goal_pos[0], goal_pos[1]] > 0.5).detach().cpu().item()) return visible # This is called before beginning an execution sequence def start_sequence(self): self.seq_step = 0 self.reset() return def cam_poses_from_states(self, states): cam_pos = states[:, 9:12] cam_rot = states[:, 12:16] pose = Pose(cam_pos, cam_rot) 
return pose def forward(self, images, states, instructions, instr_lengths, plan=None, noisy_start_poses=None, start_poses=None, firstseg=None, select_only=True, halfway=False, grad_noise=False, rl=False): """ :param images: BxCxHxW batch of images (observations) :param states: BxK batch of drone states :param instructions: BxM LongTensor where M is the maximum length of any instruction :param instr_lengths: list of len B of integers, indicating length of each instruction :param plan: list of B booleans indicating True for timesteps where we do planning and False otherwise :param noisy_start_poses: list of noisy start poses (for data-augmentation). These define the path-prediction frame at training time :param start_poses: list of drone start poses (these should be equal in practice) :param firstseg: list of booleans indicating True if a new segment starts at that timestep :param select_only: boolean indicating whether to only compute visitation distributions for planning timesteps (default True) :param rl: boolean indicating if we're doing reinforcement learning. If yes, output more than the visitation distribution :return: """ cam_poses = self.cam_poses_from_states(states) g_poses = None # None pose is a placeholder for the canonical global pose. self.prof.tick("out") self.tensor_store.keep_inputs("fpv", images) # Calculate the instruction embedding if instructions is not None: # TODO: Take batch of instructions and their lengths, return batch of embeddings. 
Store the last one as internal state # TODO: There's an assumption here that there's only a single instruction in the batch and it doesn't change # UNCOMMENT THE BELOW LINE TO REVERT BACK TO GENERAL CASE OF SEPARATE INSTRUCTION PER STEP if self.params["ignore_instruction"]: # If we're ignoring instructions, just feed in an instruction that consists of a single zero-token sent_embeddings = self.sentence_embedding( torch.zeros_like(instructions[0:1, 0:1]), torch.ones_like(instr_lengths[0:1])) else: sent_embeddings = self.sentence_embedding( instructions[0:1], instr_lengths[0:1]) self.tensor_store.keep_inputs("sentence_embed", sent_embeddings) else: sent_embeddings = self.sentence_embedding.get() self.prof.tick("embed") # Extract and project features onto the egocentric frame for each image F_W, M_W = self.img_to_features_w(images, cam_poses, sent_embeddings, self.tensor_store, show="", halfway=halfway) # For training the critic, this is as far as we need to poceed with the computation. # self.img_to_features_w has stored computed feature maps inside the tensor store, which will then be retrieved by the critic if halfway == True: # Warning: halfway must be True not truthy return None, None self.tensor_store.keep_inputs("F_w", F_W) self.tensor_store.keep_inputs("M_w", M_W) self.prof.tick("img_to_map_frame") # Accumulate the egocentric features in a global map reset_mask = firstseg if self.params["clear_history"] else None # Consider the space very near the drone and right under it as observed - draw ones on the observability mask # If we treat that space as unobserved, then there's going to be a gap in the visitation distribution, which # makes training with RL more difficult, as there is no reward feedback if the drone doesn't cross that gap. 
if self.params.get("cover_init_pos", False): StartMasks_R = self.add_init_pos_to_coverage.get_init_pos_masks( M_W.shape[0], M_W.device) StartMasks_W, _ = self.map_transform_r_to_w( StartMasks_R, cam_poses, None) M_W = self.add_init_pos_to_coverage(M_W, StartMasks_W) S_W, SM_W = self.map_accumulator_w(F_W, M_W, reset_mask=reset_mask, show="acc" if IMG_DBG else "") S_W_poses = g_poses self.prof.tick("map_accumulate") # If we're training Stage 2 with imitation learning from ground truth visitation distributions, we want to # compute observability masks with the same code that's used in Stage 1 to avoid mistakes. if halfway == "observability": map_uncoverage_w = 1 - SM_W return map_uncoverage_w # Throw away those timesteps that don't correspond to planning timesteps S_W_select, SM_W_select, S_W_poses_select, cam_poses_select, noisy_start_poses_select, start_poses_select, sent_embeddings_select = \ self.batch_select(S_W, SM_W, S_W_poses, cam_poses, noisy_start_poses, start_poses, sent_embeddings, plan) #maps_m_prior_select, maps_m_posterior_select = None, None # Only process the maps on plannieng timesteps if len(S_W_select) == 0: return None self.tensor_store.keep_inputs("S_W_select", S_W_select) self.prof.tick("batch_select") # Process the map via the two map_procesors # Do grounding of objects in the map chosen to do so if self.use_aux["grounding_map"]: R_W_select, RS_W_poses_select = self.map_processor_grounding( S_W_select, sent_embeddings_select, S_W_poses_select, show="") self.tensor_store.keep_inputs("R_W_select", R_W_select) self.prof.tick("map_proc_gnd") # Concatenate grounding map and semantic map along channel dimension RS_W_select = torch.cat([S_W_select, R_W_select], 1) else: RS_W_select = S_W_select RS_W_poses_select = S_W_poses_select s_poses_select = start_poses_select if self.params[ "predict_in_start_frame"] else cam_poses_select RS_S_select, RS_S_poses_select = self.map_transform_w_to_s( RS_W_select, RS_W_poses_select, s_poses_select) SM_S_select, 
SM_S_poses_select = self.map_transform_w_to_s( SM_W_select, S_W_poses_select, s_poses_select) assert SM_S_poses_select == RS_S_poses_select, "Masks and maps should have the same pose in start frame" self.tensor_store.keep_inputs("RS_S_select", RS_S_select) self.tensor_store.keep_inputs("SM_S_select", SM_S_select) self.prof.tick("transform_w_to_s") # Data augmentation for trajectory prediction map_poses_clean_select = None # TODO: Figure out if we can just swap out start poses for noisy poses and get rid of separate noisy poses if self.do_perturb_maps: assert noisy_start_poses_select is not None, "Noisy poses must be provided if we're perturbing maps" RS_P_select, RS_P_poses_select = self.map_transform_s_to_p( RS_S_select, RS_S_poses_select, noisy_start_poses_select) else: RS_P_select, RS_P_poses_select = RS_S_select, RS_S_poses_select self.tensor_store.keep_inputs("RS_perturbed_select", RS_P_select) self.prof.tick("map_perturb") sent_embeddings_pp = sent_embeddings_select # Run lingunet on the map to predict visitation distribution scores (pre-softmax) # --------- log_v_dist_p_select, v_dist_p_poses_select = self.path_predictor_lingunet( RS_P_select, sent_embeddings_pp, RS_P_poses_select, tensor_store=self.tensor_store) # --------- self.prof.tick("pathpred") # TODO: Shouldn't we be transforming probability distributions instead of scores? 
Otherwise OOB space will have weird values # Transform distributions back to world reference frame and keep them (these are the model outputs) both_inner_w, v_dist_w_poses_select = self.map_transform_p_to_w( log_v_dist_p_select.inner_distribution, v_dist_p_poses_select, None) log_v_dist_w_select = Partial2DDistribution( both_inner_w, log_v_dist_p_select.outer_prob_mass) self.tensor_store.keep_inputs("log_v_dist_w_select", log_v_dist_w_select) # Transform distributions back to start reference frame and keep them (for auxiliary objective) both_inner_s, v_dist_s_poses_select = self.map_transform_p_to_r( log_v_dist_p_select.inner_distribution, v_dist_p_poses_select, start_poses_select) log_v_dist_s_select = Partial2DDistribution( both_inner_s, log_v_dist_p_select.outer_prob_mass) self.tensor_store.keep_inputs("log_v_dist_s_select", log_v_dist_s_select) # prime number will mean that it will alternate between sim and real if self.get_iter() % 23 == 0: lsfm = SpatialSoftmax2d() for i in range(S_W_select.shape[0]): Presenter().show_image(S_W_select.detach().cpu()[i, 0:3], f"{self.domain}_s_w_select", scale=4, waitkey=1) Presenter().show_image(lsfm( log_v_dist_s_select.inner_distribution).detach().cpu()[i], f"{self.domain}_v_dist_s_select", scale=4, waitkey=1) Presenter().show_image(lsfm( log_v_dist_p_select.inner_distribution).detach().cpu()[i], f"{self.domain}_v_dist_p_select", scale=4, waitkey=1) Presenter().show_image(RS_P_select.detach().cpu()[i, 0:3], f"{self.domain}_rs_p_select", scale=4, waitkey=1) break self.prof.tick("transform_back") # If we're predicting the trajectory only on some timesteps, then for each timestep k, use the map from # timestep k if predicting on timestep k. otherwise use the map from timestep j - the last timestep # that had a trajectory prediction, rotated in the frame of timestep k. 
if select_only: # If we're just pre-training the trajectory prediction, don't waste time on generating the missing maps log_v_dist_w = log_v_dist_w_select v_dist_w_poses = v_dist_w_poses_select else: raise NotImplementedError("select_only must be True") return_list = [log_v_dist_w, v_dist_w_poses] if rl: internals_for_rl = { "map_coverage_w": SM_W, "map_uncoverage_w": 1 - SM_W } return_list.append(internals_for_rl) return tuple(return_list) def maybe_cuda(self, tensor): return tensor.to(next(self.parameters()).device) def cuda_var(self, tensor): return tensor.to(next(self.parameters()).device) def unbatch(self, batch, halfway=False): # Inputs images = self.maybe_cuda(batch["images"][0]) seq_len = len(images) instructions = self.maybe_cuda(batch["instr"][0][:seq_len]) instr_lengths = batch["instr_len"][0][:seq_len] states = self.maybe_cuda(batch["states"][0]) if not halfway: plan_mask = batch["plan_mask"][ 0] # True for every timestep that we do visitation prediction firstseg_mask = batch["firstseg_mask"][ 0] # True for every timestep that is a new instruction segment # Labels (including for auxiliary losses) lm_pos_fpv = batch["lm_pos_fpv"][ 0] # All object 2D coordinates in the first-person image lm_pos_map_m = batch["lm_pos_map"][ 0] # All object 2D coordinates in the semantic map lm_indices = batch["lm_indices"][0] # All object class indices goal_pos_map_m = batch["goal_loc"][ 0] # Goal location in the world in meters_and_metrics lm_mentioned = batch["lm_mentioned"][ 0] # 1/0 labels whether object was mentioned/not mentioned in template instruction # TODO: We're taking the FIRST label here. 
SINGLE SEGMENT ASSUMPTION lang_lm_mentioned = batch["lang_lm_mentioned"][0][ 0] # integer labes as to which object was mentioned start_poses = batch["start_poses"][0] noisy_start_poses = get_noisy_poses_torch( start_poses.numpy(), self.params["pos_variance"], self.params["rot_variance"], cuda=False, cuda_device=None) # Ground truth visitation distributions (in start and global frames) v_dist_w_ground_truth_select = self.maybe_cuda( batch["traj_ground_truth"][0]) start_poses_select = self.batch_select.one( start_poses, plan_mask, v_dist_w_ground_truth_select.device) v_dist_s_ground_truth_select, poses_s = self.map_transform_w_to_s( v_dist_w_ground_truth_select, None, start_poses_select) #self.tensor_store.keep_inputs("v_dist_w_ground_truth_select", v_dist_w_ground_truth_select) self.tensor_store.keep_inputs("v_dist_s_ground_truth_select", v_dist_s_ground_truth_select) #Presenter().show_image(v_dist_s_ground_truth_select.detach().cpu()[0,0], "v_dist_s_ground_truth_select", waitkey=1, scale=4) #Presenter().show_image(v_dist_w_ground_truth_select.detach().cpu()[0,0], "v_dist_w_ground_truth_select", waitkey=1, scale=4) lm_pos_map_px = [ torch.from_numpy( transformations.pos_m_to_px(p.numpy(), self.params["global_map_size"], self.params["world_size_m"], self.params["world_size_px"])) if p is not None else None for p in lm_pos_map_m ] goal_pos_map_px = torch.from_numpy( transformations.pos_m_to_px(goal_pos_map_m.numpy(), self.params["global_map_size"], self.params["world_size_m"], self.params["world_size_px"])) resnet_factor = self.img_to_features_w.img_to_features.get_downscale_factor( ) lm_pos_fpv = [ self.cuda_var( (s / resnet_factor).long()) if s is not None else None for s in lm_pos_fpv ] lm_indices = [ self.cuda_var(s) if s is not None else None for s in lm_indices ] lm_mentioned = [ self.cuda_var(s) if s is not None else None for s in lm_mentioned ] lang_lm_mentioned = self.cuda_var(lang_lm_mentioned) lm_pos_map_px = [ self.cuda_var(s.long()) if s is not None else 
None for s in lm_pos_map_px ] goal_pos_map_px = self.cuda_var(goal_pos_map_px) self.tensor_store.keep_inputs("lm_pos_fpv", lm_pos_fpv) self.tensor_store.keep_inputs("lm_pos_map", lm_pos_map_px) self.tensor_store.keep_inputs("lm_indices", lm_indices) self.tensor_store.keep_inputs("lm_mentioned", lm_mentioned) self.tensor_store.keep_inputs("lang_lm_mentioned", lang_lm_mentioned) self.tensor_store.keep_inputs("goal_pos_map", goal_pos_map_px) lm_pos_map_select = [ lm_pos for i, lm_pos in enumerate(lm_pos_map_px) if plan_mask[i] ] lm_indices_select = [ lm_idx for i, lm_idx in enumerate(lm_indices) if plan_mask[i] ] lm_mentioned_select = [ lm_m for i, lm_m in enumerate(lm_mentioned) if plan_mask[i] ] goal_pos_map_select = [ pos for i, pos in enumerate(goal_pos_map_px) if plan_mask[i] ] self.tensor_store.keep_inputs("lm_pos_map_select", lm_pos_map_select) self.tensor_store.keep_inputs("lm_indices_select", lm_indices_select) self.tensor_store.keep_inputs("lm_mentioned_select", lm_mentioned_select) self.tensor_store.keep_inputs("goal_pos_map_select", goal_pos_map_select) # We won't need this extra information else: noisy_poses, start_poses, noisy_start_poses = None, None, None plan_mask, firstseg_mask = None, None metadata = batch["md"][0][0] env_id = metadata["env_id"] self.tensor_store.set_flag("env_id", env_id) return images, states, instructions, instr_lengths, plan_mask, firstseg_mask, start_poses, noisy_start_poses, metadata # Forward pass for training def sup_loss_on_batch(self, batch, eval, halfway=False, grad_noise=False, disable_losses=[]): self.prof.tick("out") self.reset() if batch is None: print("Skipping None Batch") zero = torch.zeros([1]).float().to(next(self.parameters()).device) return zero, self.tensor_store images, states, instructions, instr_len, plan_mask, firstseg_mask, \ start_poses, noisy_start_poses, metadata = self.unbatch(batch, halfway=halfway) self.prof.tick("unbatch_inputs") # 
---------------------------------------------------------------------------- _ = self(images, states, instructions, instr_len, plan=plan_mask, firstseg=firstseg_mask, noisy_start_poses=start_poses if eval else noisy_start_poses, start_poses=start_poses, select_only=True, halfway=halfway, grad_noise=grad_noise) # ---------------------------------------------------------------------------- if self.should_save_path_overlays: self.save_path_overlays(metadata) # If we run the model halfway, we only need to calculate features needed for the wasserstein loss # If we want to include more features in wasserstein critic, have to run the forward pass a bit further if halfway and not halfway == "v2": return None, self.tensor_store # The returned values are not used here - they're kept in the tensor store which is used as an input to a loss self.prof.tick("call") if not halfway: # Calculate goal-prediction accuracy: goal_pos = self.tensor_store.get_inputs_batch("goal_pos_map", cat_not_stack=True) success_goal = self.goal_good_criterion( self.tensor_store.get_inputs_batch("log_v_dist_w_select", cat_not_stack=True), goal_pos) acc = 1.0 if success_goal else 0.0 self.goal_acc_meter.put(acc) goal_visible = self.goal_visible( self.tensor_store.get_inputs_batch("M_w", cat_not_stack=True), goal_pos) if goal_visible: self.visible_goal_acc_meter.put(acc) else: self.invisible_goal_acc_meter.put(acc) self.visible_goal_frac_meter.put(1.0 if goal_visible else 0.0) self.correct_goals += acc self.total_goals += 1 self.prof.tick("goal_acc") if halfway == "v2": disable_losses = ["visitation_dist", "lang"] losses, metrics = self.losses.calculate_aux_loss( tensor_store=self.tensor_store, reduce_average=True, disable_losses=disable_losses) loss = self.losses.combine_losses(losses, self.aux_weights) self.prof.tick("calc_losses") prefix = self.model_name + ("/eval" if eval else "/train") iteration = self.get_iter() self.writer.add_dict(prefix, get_current_meters(), iteration) 
self.writer.add_dict(prefix, losses, iteration) self.writer.add_dict(prefix, metrics, iteration) if not halfway: self.writer.add_scalar(prefix + "/goal_accuracy", self.goal_acc_meter.get(), iteration) self.writer.add_scalar(prefix + "/visible_goal_accuracy", self.visible_goal_acc_meter.get(), iteration) self.writer.add_scalar(prefix + "/invisible_goal_accuracy", self.invisible_goal_acc_meter.get(), iteration) self.writer.add_scalar(prefix + "/visible_goal_fraction", self.visible_goal_frac_meter.get(), iteration) self.inc_iter() self.prof.tick("summaries") self.prof.loop() self.prof.print_stats(1) return loss, self.tensor_store def get_dataset(self, data=None, envs=None, domain=None, dataset_names=None, dataset_prefix=None, eval=False, halfway_only=False): # TODO: Maybe use eval here data_sources = [] # If we're running auxiliary objectives, we need to include the data sources for the auxiliary labels #if self.use_aux_class_features or self.use_aux_class_on_map or self.use_aux_grounding_features or self.use_aux_grounding_on_map: #if self.use_aux_goal_on_map: if not halfway_only: data_sources.append(aup.PROVIDER_LM_POS_DATA) data_sources.append(aup.PROVIDER_GOAL_POS) # Adding these in this order will compute poses with added noise and compute trajectory ground truth # in the reference frame of these noisy poses data_sources.append(aup.PROVIDER_START_POSES) if self.do_perturb_maps: print("PERTURBING MAPS!") # TODO: The noisy poses from the provider are not actually used!! Those should replace states instead! data_sources.append(aup.PROVIDER_NOISY_POSES) # TODO: Think this through. 
Perhaps we actually want dynamic ground truth given a noisy start position if self.params["predict_in_start_frame"]: data_sources.append( aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_STATIC) else: data_sources.append( aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_DYNAMIC_NOISY) else: print("NOT Perturbing Maps!") data_sources.append(aup.PROVIDER_NOISY_POSES) if self.params["predict_in_start_frame"]: data_sources.append( aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_STATIC) else: data_sources.append( aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_DYNAMIC) data_sources.append(aup.PROVIDER_LANDMARKS_MENTIONED) templates = get_current_parameters()["Environment"]["templates"] if templates: data_sources.append(aup.PROVIDER_LANG_TEMPLATE) return SegmentDataset(data=data, env_list=envs, domain=domain, dataset_names=dataset_names, dataset_prefix=dataset_prefix, aux_provider_names=data_sources, segment_level=True)
def __init__(self, run_name="", model_class=MODEL_RSS,
             aux_class_features=False, aux_grounding_features=False,
             aux_class_map=False, aux_grounding_map=False, aux_goal_map=False,
             aux_lang=False, aux_traj=False,
             rot_noise=False, pos_noise=False):
    """Build all sub-modules and register the auxiliary objectives selected by the flags.

    Hyper-parameters are read from ``get_current_parameters()["Model"]`` and auxiliary weights
    from ``get_current_parameters()["AuxWeights"]``.

    :param run_name: appended to "runs/" to form the summary-writer log directory.
    :param model_class: stored on the instance and appended to the model name.
    :param aux_class_features: register the "aux_class" auxiliary.
    :param aux_grounding_features: register the "aux_ground" auxiliary.
    :param aux_class_map: register the "aux_class_map" auxiliary.
    :param aux_grounding_map: register the "aux_grounding_map" auxiliary; together with
        ``aux_grounding_features`` it also selects the world-frame map processor.
    :param aux_goal_map: register the "aux_goal_map" auxiliary and use a language-filter
        processor on the local map.
    :param aux_lang: register the language auxiliaries (templated or natural-language variant,
        chosen by ``params["templates"]``).
    :param aux_traj: register the "aux_path" auxiliary.
    :param rot_noise: stored as ``use_rot_noise``.
    :param pos_noise: stored as ``use_pos_noise``.
    :raises ValueError: if ``params["map_to_action"]`` is neither "downsample2" nor "cropped".
    """
    super(ModelTrajectoryTopDown, self).__init__()
    self.model_name = "sm_trajectory" + str(model_class)
    self.model_class = model_class
    print("Init model of type: ", str(model_class))
    self.run_name = run_name
    self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

    self.params = get_current_parameters()["Model"]
    self.aux_weights = get_current_parameters()["AuxWeights"]
    params = self.params

    self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
    # Iteration counter stored as a non-trainable parameter.
    self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

    # Auxiliary Objectives
    self.use_aux_class_features = aux_class_features
    self.use_aux_grounding_features = aux_grounding_features
    self.use_aux_class_on_map = aux_class_map
    self.use_aux_grounding_on_map = aux_grounding_map
    self.use_aux_goal_on_map = aux_goal_map
    self.use_aux_lang = aux_lang
    self.use_aux_traj_on_map = aux_traj
    self.use_aux_reg_map = self.aux_weights["regularize_map"]

    self.use_rot_noise = rot_noise
    self.use_pos_noise = pos_noise

    # Path-pred FPV model definition
    # --------------------------------------------------------------------------------------------------------------
    self.img_to_features_w = FPVToGlobalMap(
        source_map_size=params["global_map_size"],
        world_size_px=params["world_size_px"],
        world_size=params["world_size_m"],
        res_channels=params["resnet_channels"],
        map_channels=params["feature_channels"],
        img_w=params["img_w"],
        img_h=params["img_h"],
        img_dbg=IMG_DBG)

    self.map_accumulator_w = LeakyIntegratorGlobalMap(
        source_map_size=params["global_map_size"],
        world_in_map_size=params["world_size_px"])

    # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
    if self.use_aux_grounding_on_map and not self.use_aux_grounding_features:
        self.map_processor_a_w = LangFilterMapProcessor(
            source_map_size=params["global_map_size"],
            world_size=params["world_size_px"],
            embed_size=params["emb_size"],
            in_channels=params["feature_channels"],
            out_channels=params["relevance_channels"],
            spatial=False, cat_out=True)
    else:
        self.map_processor_a_w = IdentityMapProcessor(
            source_map_size=params["global_map_size"],
            world_size=params["world_size_px"])

    if self.use_aux_goal_on_map:
        self.map_processor_b_r = LangFilterMapProcessor(
            source_map_size=params["local_map_size"],
            world_size=params["world_size_px"],
            embed_size=params["emb_size"],
            in_channels=params["relevance_channels"],
            out_channels=params["goal_channels"],
            spatial=True, cat_out=True)
    else:
        self.map_processor_b_r = IdentityMapProcessor(
            source_map_size=params["local_map_size"],
            world_size=params["world_size_px"])

    # Common
    # --------------------------------------------------------------------------------------------------------------
    # Sentence Embedding
    self.sentence_embedding = SentenceEmbeddingSimple(
        params["word_emb_size"], params["emb_size"], params["emb_layers"])

    self.map_transform_w_to_r = MapTransformerBase(
        source_map_size=params["global_map_size"],
        dest_map_size=params["local_map_size"],
        world_size=params["world_size_px"])
    self.map_transform_r_to_w = MapTransformerBase(
        source_map_size=params["local_map_size"],
        dest_map_size=params["global_map_size"],
        world_size=params["world_size_px"])

    # Batch select is used to drop and forget semantic maps at those timesteps that we're not planning in
    self.batch_select = MapBatchSelect()
    # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
    # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
    self.map_batch_fill_missing = MapBatchFillMissing(params["local_map_size"], params["world_size_px"])

    # Passing true to freeze will freeze these weights regardless of whether they've been explicitly reloaded or not
    enable_weight_saving(self.sentence_embedding, "sentence_embedding", alwaysfreeze=False)

    # Output an action given the global semantic map
    map_to_action_kind = params["map_to_action"]
    if map_to_action_kind == "downsample2":
        self.map_to_action = EgoMapToActionTriplet(
            map_channels=params["map_to_act_channels"],
            map_size=params["local_map_size"],
            other_features_size=params["emb_size"])
    elif map_to_action_kind == "cropped":
        self.map_to_action = CroppedMapToActionTriplet(
            map_channels=params["map_to_act_channels"],
            map_size=params["local_map_size"],
            other_features_size=params["emb_size"])
    else:
        # Without this branch self.map_to_action stays undefined and the call below fails
        # with an attribute error that hides the real cause (a bad config value).
        raise ValueError(
            "Unsupported map_to_action setting: " + repr(map_to_action_kind) +
            " (expected 'downsample2' or 'cropped')")

    # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training
    # and finetuned on all timesteps end-to-end
    enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True)

    # Auxiliary Objectives
    # --------------------------------------------------------------------------------------------------------------
    # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
    # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs
    # that have been saved with keep_auxiliary_input() during execution
    if aux_class_features:
        self.add_auxiliary(ClassAuxiliary2D(
            "aux_class", None, params["feature_channels"], params["num_landmarks"], params["dropout"],
            "fpv_features", "lm_pos_fpv", "lm_indices"))
    if aux_grounding_features:
        self.add_auxiliary(ClassAuxiliary2D(
            "aux_ground", None, params["relevance_channels"], 2, params["dropout"],
            "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
    if aux_class_map:
        self.add_auxiliary(ClassAuxiliary2D(
            "aux_class_map", params["world_size_px"], params["feature_channels"],
            params["num_landmarks"], params["dropout"],
            "map_s_w_select", "lm_pos_map_select", "lm_indices_select"))
    if aux_grounding_map:
        self.add_auxiliary(ClassAuxiliary2D(
            "aux_grounding_map", params["world_size_px"], params["relevance_channels"], 2, params["dropout"],
            "map_a_w_select", "lm_pos_map_select", "lm_mentioned_select"))
    if aux_goal_map:
        self.add_auxiliary(GoalAuxiliary2D(
            "aux_goal_map", params["goal_channels"], params["world_size_px"],
            "map_b_w", "goal_pos_map"))

    # RSS model uses templated data for landmark and side prediction
    if self.use_aux_lang and params["templates"]:
        self.add_auxiliary(ClassAuxiliary(
            "aux_lang_lm", params["emb_size"], params["num_landmarks"], 1,
            "sentence_embed", "lm_mentioned_tplt"))
        self.add_auxiliary(ClassAuxiliary(
            "aux_lang_side", params["emb_size"], params["num_sides"], 1,
            "sentence_embed", "side_mentioned_tplt"))
    # CoRL model uses alignment-model groundings
    elif self.use_aux_lang:
        # one output for each landmark, 2 classes per output. This is for finetuning, so use the embedding that's
        # gonna be fine tuned
        self.add_auxiliary(ClassAuxiliary(
            "aux_lang_lm_nl", params["emb_size"], 2, params["num_landmarks"],
            "sentence_embed", "lang_lm_mentioned"))

    if self.use_aux_traj_on_map:
        self.add_auxiliary(PathAuxiliary2D("aux_path", "map_b_r_select", "traj_gt_r_select"))

    if self.use_aux_reg_map:
        self.add_auxiliary(FeatureRegularizationAuxiliary2D(
            "aux_regularize_features", None, "l1",
            "map_s_w_select", "lm_pos_map_select"))

    self.goal_good_criterion = GoalPredictionGoodCriterion(ok_distance=3.2)
    self.goal_acc_meter = MovingAverageMeter(10)

    self.print_auxiliary_info()

    self.action_loss = ActionLoss()

    self.env_id = None
    self.prev_instruction = None
    self.seq_step = 0
class ModelTrajectoryTopDown(ModuleWithAuxiliaries): def __init__(self, run_name="", model_class=MODEL_RSS, aux_class_features=False, aux_grounding_features=False, aux_class_map=False, aux_grounding_map=False, aux_goal_map=False, aux_lang=False, aux_traj=False, rot_noise=False, pos_noise=False): super(ModelTrajectoryTopDown, self).__init__() self.model_name = "sm_trajectory" + str(model_class) self.model_class = model_class print("Init model of type: ", str(model_class)) self.run_name = run_name self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name) self.params = get_current_parameters()["Model"] self.aux_weights = get_current_parameters()["AuxWeights"] self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) # Auxiliary Objectives self.use_aux_class_features = aux_class_features self.use_aux_grounding_features = aux_grounding_features self.use_aux_class_on_map = aux_class_map self.use_aux_grounding_on_map = aux_grounding_map self.use_aux_goal_on_map = aux_goal_map self.use_aux_lang = aux_lang self.use_aux_traj_on_map = aux_traj self.use_aux_reg_map = self.aux_weights["regularize_map"] self.use_rot_noise = rot_noise self.use_pos_noise = pos_noise # Path-pred FPV model definition # -------------------------------------------------------------------------------------------------------------- self.img_to_features_w = FPVToGlobalMap( source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size=self.params["world_size_m"], res_channels=self.params["resnet_channels"], map_channels=self.params["feature_channels"], img_w=self.params["img_w"], img_h=self.params["img_h"], img_dbg=IMG_DBG) self.map_accumulator_w = LeakyIntegratorGlobalMap(source_map_size=self.params["global_map_size"], world_in_map_size=self.params["world_size_px"]) # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame if 
self.use_aux_grounding_on_map and not self.use_aux_grounding_features: self.map_processor_a_w = LangFilterMapProcessor( source_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"], embed_size=self.params["emb_size"], in_channels=self.params["feature_channels"], out_channels=self.params["relevance_channels"], spatial=False, cat_out=True) else: self.map_processor_a_w = IdentityMapProcessor(source_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"]) if self.use_aux_goal_on_map: self.map_processor_b_r = LangFilterMapProcessor(source_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"], embed_size=self.params["emb_size"], in_channels=self.params["relevance_channels"], out_channels=self.params["goal_channels"], spatial=True, cat_out=True) else: self.map_processor_b_r = IdentityMapProcessor(source_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"]) pred_channels = self.params["goal_channels"] + self.params["relevance_channels"] # Common # -------------------------------------------------------------------------------------------------------------- # Sentence Embedding self.sentence_embedding = SentenceEmbeddingSimple( self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"]) self.map_transform_w_to_r = MapTransformerBase(source_map_size=self.params["global_map_size"], dest_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"]) self.map_transform_r_to_w = MapTransformerBase(source_map_size=self.params["local_map_size"], dest_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"]) # Batch select is used to drop and forget semantic maps at those timestaps that we're not planning in self.batch_select = MapBatchSelect() # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill # in the missing pieces by reorienting the past trajectory 
prediction into the frame of the current timestep self.map_batch_fill_missing = MapBatchFillMissing(self.params["local_map_size"], self.params["world_size_px"]) # Passing true to freeze will freeze these weights regardless of whether they've been explicitly reloaded or not enable_weight_saving(self.sentence_embedding, "sentence_embedding", alwaysfreeze=False) # Output an action given the global semantic map if self.params["map_to_action"] == "downsample2": self.map_to_action = EgoMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], other_features_size=self.params["emb_size"]) elif self.params["map_to_action"] == "cropped": self.map_to_action = CroppedMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], other_features_size=self.params["emb_size"] ) # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training # and finetuned on all timesteps end-to-end enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True) # Auxiliary Objectives # -------------------------------------------------------------------------------------------------------------- # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters, # followed by variable number of names of inputs. 
ModuleWithAuxiliaries will automatically collect these inputs # that have been saved with keep_auxiliary_input() during execution if aux_class_features: self.add_auxiliary(ClassAuxiliary2D("aux_class", None, self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"], "fpv_features", "lm_pos_fpv", "lm_indices")) if aux_grounding_features: self.add_auxiliary(ClassAuxiliary2D("aux_ground", None, self.params["relevance_channels"], 2, self.params["dropout"], "fpv_features_g", "lm_pos_fpv", "lm_mentioned")) if aux_class_map: self.add_auxiliary(ClassAuxiliary2D("aux_class_map", self.params["world_size_px"], self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"], "map_s_w_select", "lm_pos_map_select", "lm_indices_select")) if aux_grounding_map: self.add_auxiliary(ClassAuxiliary2D("aux_grounding_map", self.params["world_size_px"], self.params["relevance_channels"], 2, self.params["dropout"], "map_a_w_select", "lm_pos_map_select", "lm_mentioned_select")) if aux_goal_map: self.add_auxiliary(GoalAuxiliary2D("aux_goal_map", self.params["goal_channels"], self.params["world_size_px"], "map_b_w", "goal_pos_map")) # RSS model uses templated data for landmark and side prediction if self.use_aux_lang and self.params["templates"]: self.add_auxiliary(ClassAuxiliary("aux_lang_lm", self.params["emb_size"], self.params["num_landmarks"], 1, "sentence_embed", "lm_mentioned_tplt")) self.add_auxiliary(ClassAuxiliary("aux_lang_side", self.params["emb_size"], self.params["num_sides"], 1, "sentence_embed", "side_mentioned_tplt")) # CoRL model uses alignment-model groundings elif self.use_aux_lang: # one output for each landmark, 2 classes per output. 
This is for finetuning, so use the embedding that's gonna be fine tuned self.add_auxiliary(ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2, self.params["num_landmarks"], "sentence_embed", "lang_lm_mentioned")) if self.use_aux_traj_on_map: self.add_auxiliary(PathAuxiliary2D("aux_path", "map_b_r_select", "traj_gt_r_select")) if self.use_aux_reg_map: self.add_auxiliary(FeatureRegularizationAuxiliary2D("aux_regularize_features", None, "l1", "map_s_w_select", "lm_pos_map_select")) self.goal_good_criterion = GoalPredictionGoodCriterion(ok_distance=3.2) self.goal_acc_meter = MovingAverageMeter(10) self.print_auxiliary_info() self.action_loss = ActionLoss() self.env_id = None self.prev_instruction = None self.seq_step = 0 # TODO: Try to hide these in a superclass or something. They take up a lot of space: def cuda(self, device=None): ModuleWithAuxiliaries.cuda(self, device) self.sentence_embedding.cuda(device) self.map_accumulator_w.cuda(device) self.map_processor_a_w.cuda(device) self.map_processor_b_r.cuda(device) self.img_to_features_w.cuda(device) self.map_to_action.cuda(device) self.action_loss.cuda(device) self.map_batch_fill_missing.cuda(device) self.map_transform_w_to_r.cuda(device) self.map_transform_r_to_w.cuda(device) self.batch_select.cuda(device) self.map_batch_fill_missing.cuda(device) return self def get_iter(self): return int(self.iter.data[0]) def inc_iter(self): self.iter += 1 def init_weights(self): self.img_to_features_w.init_weights() self.map_accumulator_w.init_weights() self.sentence_embedding.init_weights() self.map_to_action.init_weights() self.map_processor_a_w.init_weights() self.map_processor_b_r.init_weights() def reset(self): # TODO: This is error prone. 
Create a class StatefulModule, iterate submodules and reset all stateful modules super(ModelTrajectoryTopDown, self).reset() self.sentence_embedding.reset() self.img_to_features_w.reset() self.map_accumulator_w.reset() self.map_processor_a_w.reset() self.map_processor_b_r.reset() self.map_transform_w_to_r.reset() self.map_transform_r_to_w.reset() self.map_batch_fill_missing.reset() self.prev_instruction = None def setEnvContext(self, context): print("Set env context to: " + str(context)) self.env_id = context["env_id"] def save_viz(self, images_in): imsave(get_viz_dir() + "fpv_" + str(self.seq_step) + ".png", images_in) features_cam = self.get_inputs_batch("fpv_features")[-1, 0, 0:3] save_tensor_as_img(features_cam, "F_c", self.env_id) feature_map_torch = self.get_inputs_batch("f_w")[-1, 0, 0:3] save_tensor_as_img(feature_map_torch, "F_w", self.env_id) coverage_map_torch = self.get_inputs_batch("m_w")[-1, 0, 0:3] save_tensor_as_img(coverage_map_torch, "M_w", self.env_id) semantic_map_torch = self.get_inputs_batch("map_s_w_select")[-1, 0, 0:3] save_tensor_as_img(semantic_map_torch, "S_w", self.env_id) relmap_torch = self.get_inputs_batch("map_a_w_select")[-1, 0, 0:3] save_tensor_as_img(relmap_torch, "R_w", self.env_id) relmap_r_torch = self.get_inputs_batch("map_a_r_select")[-1, 0, 0:3] save_tensor_as_img(relmap_r_torch, "R_r", self.env_id) goalmap_torch = self.get_inputs_batch("map_b_w_select")[-1, 0, 0:3] save_tensor_as_img(goalmap_torch, "G_w", self.env_id) goalmap_r_torch = self.get_inputs_batch("map_b_r_select")[-1, 0, 0:3] save_tensor_as_img(goalmap_r_torch, "G_r", self.env_id) action = self.get_inputs_batch("action")[-1].data.cpu().squeeze().numpy() action_fname = self.get_viz_dir() + "action_" + str(self.seq_step) + ".png" Presenter().save_action(action, action_fname, "") def get_action(self, state, instruction): """ Given a DroneState (from PomdpInterface) and instruction, produce a numpy 4D action (x, y, theta, pstop) :param state: DroneState object with 
the raw image from the simulator :param instruction: Tokenized instruction given the corpus #TODO: Absorb corpus within model :return: """ # TODO: Simplify this self.eval() images_np_pure = state.image state_np = state.state #print("Act: " + debug_untokenize_instruction(instruction)) images_np = standardize_image(images_np_pure) image_fpv = Variable(none_padded_seq_to_tensor([images_np])) state = Variable(none_padded_seq_to_tensor([state_np])) # Add the batch dimension first_step = True if instruction == self.prev_instruction: first_step = False self.prev_instruction = instruction img_in_t = image_fpv img_in_t.volatile = True instr_len = [len(instruction)] if instruction is not None else None instruction = torch.LongTensor(instruction).unsqueeze(0) instruction = cuda_var(instruction, self.is_cuda, self.cuda_device) state.volatile = True if self.is_cuda: if img_in_t is not None: img_in_t = img_in_t.cuda(self.cuda_device) state = state.cuda(self.cuda_device) step_enc = None plan_now = None self.seq_step += 1 action = self(img_in_t, state, instruction, instr_len, plan=plan_now, pos_enc=step_enc) # Save materials for paper and presentation if False: self.save_viz(images_np_pure) output_action = action.squeeze().data.cpu().numpy() stop_prob = output_action[3] output_stop = 1 if stop_prob > 0.5 else 0 output_action[3] = output_stop return output_action def deterministic_action(self, action_mean, action_std, stop_prob): batch_size = action_mean.size(0) action = Variable(empty_float_tensor((batch_size, 4), self.is_cuda, self.cuda_device)) action[:, 0:3] = action_mean[:, 0:3] action[:, 3] = stop_prob return action def sample_action(self, action_mean, action_std, stop_prob): action = torch.normal(action_mean, action_std) stop = torch.bernoulli(stop_prob) return action, stop # This is called before beginning an execution sequence def start_sequence(self): self.seq_step = 0 self.reset() print("RESETTED!") return # TODO: Move this somewhere and standardize def 
cam_poses_from_states(self, states): cam_pos = states[:, 9:12] cam_rot = states[:, 12:16] pos_variance = 0 rot_variance = 0 if self.use_pos_noise: pos_variance = self.params["noisy_pos_variance"] if self.use_rot_noise: rot_variance = self.params["noisy_rot_variance"] pose = Pose(cam_pos, cam_rot) if self.use_pos_noise or self.use_rot_noise: pose = get_noisy_poses_torch(pose, pos_variance, rot_variance, cuda=self.is_cuda, cuda_device=self.cuda_device) return pose def forward(self, images, states, instructions, instr_lengths, has_obs=None, plan=None, save_maps_only=False, pos_enc=None, noisy_poses=None): """ :param images: BxCxHxW batch of images (observations) :param states: BxK batch of drone states :param instructions: BxM LongTensor where M is the maximum length of any instruction :param instr_lengths: list of len B of integers, indicating length of each instruction :param has_obs: list of booleans of length B indicating whether the given element in the sequence has an observation :param yield_semantic_maps: If true, will not compute actions (full model), but return the semantic maps that were built along the way in response to the images. This is ugly, but allows code reuse :return: """ cam_poses = self.cam_poses_from_states(states) g_poses = None#[None for pose in cam_poses] self.prof.tick("out") #print("Trn: " + debug_untokenize_instruction(instructions[0].data[:instr_lengths[0]])) # Calculate the instruction embedding if instructions is not None: # TODO: Take batch of instructions and their lengths, return batch of embeddings. 
Store the last one as internal state sent_embeddings = self.sentence_embedding(instructions, instr_lengths) self.keep_inputs("sentence_embed", sent_embeddings) else: sent_embeddings = self.sentence_embedding.get() self.prof.tick("embed") # Extract and project features onto the egocentric frame for each image features_w, coverages_w = self.img_to_features_w(images, cam_poses, sent_embeddings, self, show="") self.prof.tick("img_to_map_frame") self.keep_inputs("f_w", features_w) self.keep_inputs("m_w", coverages_w) # Accumulate the egocentric features in a global map maps_w = self.map_accumulator_w(features_w, coverages_w, add_mask=has_obs, show="acc" if IMG_DBG else "") map_poses_w = g_poses # TODO: Maybe keep maps_w if necessary #self.keep_inputs("map_sm_local", maps_m) self.prof.tick("map_accumulate") # Throw away those timesteps that don't correspond to planning timesteps maps_w_select, map_poses_w_select, cam_poses_select, noisy_poses_select, _, sent_embeddings_select, pos_enc = \ self.batch_select(maps_w, map_poses_w, cam_poses, noisy_poses, None, sent_embeddings, pos_enc, plan) # Only process the maps on planning timesteps if len(maps_w_select) > 0: self.keep_inputs("map_s_w_select", maps_w_select) self.prof.tick("batch_select") # Process the map via the two map_procesors # Do grounding of objects in the map chosen to do so maps_w_select, map_poses_w_select = self.map_processor_a_w(maps_w_select, sent_embeddings_select, map_poses_w_select, show="") self.keep_inputs("map_a_w_select", maps_w_select) self.prof.tick("map_proc_gnd") self.map_transform_w_to_r.set_maps(maps_w_select, map_poses_w_select) maps_m_select, map_poses_m_select = self.map_transform_w_to_r.get_maps(cam_poses_select) self.keep_inputs("map_a_r_select", maps_w_select) self.prof.tick("transform_w_to_r") self.keep_inputs("map_a_r_perturbed_select", maps_m_select) self.prof.tick("map_perturb") # Include positional encoding for path prediction if pos_enc is not None: sent_embeddings_pp = 
torch.cat([sent_embeddings_select, pos_enc.unsqueeze(1)], dim=1) else: sent_embeddings_pp = sent_embeddings_select # Process the map via the two map_procesors (e.g. predict the trajectory that we'll be taking) maps_m_select, map_poses_m_select = self.map_processor_b_r(maps_m_select, sent_embeddings_pp, map_poses_m_select) self.keep_inputs("map_b_r_select", maps_m_select) if True: self.map_transform_r_to_w.set_maps(maps_m_select, map_poses_m_select) maps_b_w_select, _ = self.map_transform_r_to_w.get_maps(None) self.keep_inputs("map_b_w_select", maps_b_w_select) self.prof.tick("map_proc_b") else: maps_m_select = None maps_m, map_poses_m = self.map_batch_fill_missing(maps_m_select, cam_poses, plan, show="") self.keep_inputs("map_b_r", maps_m) self.prof.tick("map_fill_missing") # Keep global maps for auxiliary objectives if necessary if self.input_required("map_b_w"): maps_b, _ = self.map_processor_b_r.get_maps(g_poses) self.keep_inputs("map_b_w", maps_b) self.prof.tick("keep_global_maps") if run_metadata.IS_ROLLOUT: pass #Presenter().show_image(maps_m.data[0, 0:3], "plan_map_now", torch=True, scale=4, waitkey=1) #Presenter().show_image(maps_w.data[0, 0:3], "sm_map_now", torch=True, scale=4, waitkey=1) self.prof.tick("viz") # Output the final action given the processed map action_pred = self.map_to_action(maps_m, sent_embeddings) out_action = self.deterministic_action(action_pred[:, 0:3], None, action_pred[:, 3]) self.keep_inputs("action", out_action) self.prof.tick("map_to_action") return out_action # TODO: The below two methods seem to do the same thing def maybe_cuda(self, tensor): if self.is_cuda: return tensor.cuda() else: return tensor def cuda_var(self, tensor): return cuda_var(tensor, self.is_cuda, self.cuda_device) # Forward pass for training (with batch optimizations def sup_loss_on_batch(self, batch, eval): self.prof.tick("out") action_loss_total = Variable(empty_float_tensor([1], self.is_cuda, self.cuda_device)) if batch is None: print("Skipping None 
Batch") return action_loss_total images = self.maybe_cuda(batch["images"]) instructions = self.maybe_cuda(batch["instr"]) instr_lengths = batch["instr_len"] states = self.maybe_cuda(batch["states"]) actions = self.maybe_cuda(batch["actions"]) # Auxiliary labels lm_pos_fpv = batch["lm_pos_fpv"] lm_pos_map = batch["lm_pos_map"] lm_indices = batch["lm_indices"] goal_pos_map = batch["goal_loc"] TEMPLATES = True if TEMPLATES: lm_mentioned_tplt = batch["lm_mentioned_tplt"] side_mentioned_tplt = batch["side_mentioned_tplt"] else: lm_mentioned = batch["lm_mentioned"] lang_lm_mentioned = batch["lang_lm_mentioned"] # stops = self.maybe_cuda(batch["stops"]) masks = self.maybe_cuda(batch["masks"]) # This is the first-timestep metadata metadata = batch["md"] seq_len = images.size(1) batch_size = images.size(0) count = 0 correct_goal_count = 0 goal_count = 0 # Loop thru batch for b in range(batch_size): seg_idx = -1 self.reset() self.prof.tick("out") b_seq_len = len_until_nones(metadata[b]) # TODO: Generalize this # Slice the data according to the sequence length b_metadata = metadata[b][:b_seq_len] b_images = images[b][:b_seq_len] b_instructions = instructions[b][:b_seq_len] b_instr_len = instr_lengths[b][:b_seq_len] b_states = states[b][:b_seq_len] b_actions = actions[b][:b_seq_len] b_lm_pos_fpv = lm_pos_fpv[b][:b_seq_len] b_lm_pos_map = lm_pos_map[b][:b_seq_len] b_lm_indices = lm_indices[b][:b_seq_len] b_goal_pos = goal_pos_map[b][:b_seq_len] if not TEMPLATES: b_lang_lm_mentioned = lang_lm_mentioned[b][:b_seq_len] b_lm_mentioned = lm_mentioned[b][:b_seq_len] b_lm_pos_map = [self.cuda_var(s.long()) if s is not None else None for s in b_lm_pos_map] b_lm_pos_fpv = [self.cuda_var((s / RESNET_FACTOR).long()) if s is not None else None for s in b_lm_pos_fpv] b_lm_indices = [self.cuda_var(s) if s is not None else None for s in b_lm_indices] b_goal_pos = self.cuda_var(b_goal_pos) if not TEMPLATES: b_lang_lm_mentioned = self.cuda_var(b_lang_lm_mentioned) b_lm_mentioned = 
[self.cuda_var(s) if s is not None else None for s in b_lm_mentioned] # TODO: Figure out how to keep these properly. Perhaps as a whole batch is best # TODO: Introduce a key-value store (encapsulate instead of inherit) self.keep_inputs("lm_pos_fpv", b_lm_pos_fpv) self.keep_inputs("lm_pos_map", b_lm_pos_map) self.keep_inputs("lm_indices", b_lm_indices) self.keep_inputs("goal_pos_map", b_goal_pos) if not TEMPLATES: self.keep_inputs("lang_lm_mentioned", b_lang_lm_mentioned) self.keep_inputs("lm_mentioned", b_lm_mentioned) # TODO: Abstract all of these if-elses in a modular way once we know which ones are necessary if TEMPLATES: b_lm_mentioned_tplt = lm_mentioned_tplt[b][:b_seq_len] b_side_mentioned_tplt = side_mentioned_tplt[b][:b_seq_len] b_side_mentioned_tplt = self.cuda_var(b_side_mentioned_tplt) b_lm_mentioned_tplt = self.cuda_var(b_lm_mentioned_tplt) self.keep_inputs("lm_mentioned_tplt", b_lm_mentioned_tplt) self.keep_inputs("side_mentioned_tplt", b_side_mentioned_tplt) b_lm_mentioned = b_lm_mentioned_tplt b_obs_mask = [True for _ in range(b_seq_len)] b_plan_mask = [True for _ in range(b_seq_len)] b_plan_mask_t_cpu = torch.Tensor(b_plan_mask) == True b_plan_mask_t = self.maybe_cuda(b_plan_mask_t_cpu) b_pos_enc = None # ---------------------------------------------------------------------------- # Optional Auxiliary Inputs # ---------------------------------------------------------------------------- if self.input_required("lm_pos_map_select"): b_lm_pos_map_select = [lm_pos for i,lm_pos in enumerate(b_lm_pos_map) if b_plan_mask[i]] self.keep_inputs("lm_pos_map_select", b_lm_pos_map_select) if self.input_required("lm_indices_select"): b_lm_indices_select = [lm_idx for i,lm_idx in enumerate(b_lm_indices) if b_plan_mask[i]] self.keep_inputs("lm_indices_select", b_lm_indices_select) if self.input_required("lm_mentioned_select"): b_lm_mentioned_select = [lm_m for i,lm_m in enumerate(b_lm_mentioned) if b_plan_mask[i]] self.keep_inputs("lm_mentioned_select", 
b_lm_mentioned_select) # ---------------------------------------------------------------------------- self.prof.tick("inputs") actions = self(b_images, b_states, b_instructions, b_instr_len, has_obs=b_obs_mask, plan=b_plan_mask, pos_enc=b_pos_enc) action_losses, _ = self.action_loss(b_actions, actions, batchreduce=False) self.prof.tick("call") action_losses = self.action_loss.batch_reduce_loss(action_losses) action_loss = self.action_loss.reduce_loss(action_losses) action_loss_total = action_loss count += b_seq_len self.prof.tick("loss") action_loss_avg = action_loss_total / (count + 1e-9) self.prof.tick("out") # Doing this in the end (outside of se aux_losses = self.calculate_aux_loss(reduce_average=True) aux_loss = self.combine_aux_losses(aux_losses, self.aux_weights) prefix = self.model_name + ("/eval" if eval else "/train") self.writer.add_dict(prefix, get_current_meters(), self.get_iter()) self.writer.add_dict(prefix, aux_losses, self.get_iter()) self.writer.add_scalar(prefix + "/action_loss", action_loss_avg.data.cpu()[0], self.get_iter()) # TODO: Log value here self.writer.add_scalar(prefix + "/goal_accuracy", self.goal_acc_meter.get(), self.get_iter()) self.prof.tick("auxiliaries") total_loss = action_loss_avg + aux_loss self.inc_iter() self.prof.tick("summaries") self.prof.loop() self.prof.print_stats(1) return total_loss def get_dataset(self, data=None, envs=None, dataset_name=None, eval=False): # TODO: Maybe use eval here #if self.fpv: data_sources = [] # If we're running auxiliary objectives, we need to include the data sources for the auxiliary labels #if self.use_aux_class_features or self.use_aux_class_on_map or self.use_aux_grounding_features or self.use_aux_grounding_on_map: #if self.use_aux_goal_on_map: data_sources.append(aup.PROVIDER_LM_POS_DATA) data_sources.append(aup.PROVIDER_GOAL_POS) #data_sources.append(aup.PROVIDER_LANDMARKS_MENTIONED) data_sources.append(aup.PROVIDER_LANG_TEMPLATE) #if self.use_rot_noise or self.use_pos_noise: # 
data_sources.append(aup.PROVIDER_POSE_NOISE) return SegmentDataset(data=data, env_list=envs, dataset_name=dataset_name, aux_provider_names=data_sources, segment_level=True)
class ModelChaplot(ModuleWithAuxiliaries):
    """Policy built from the Chaplot image, recurrence, text and multimodal modules.

    The model is stateful: `forward` processes timesteps one at a time and
    carries `self.model_state` and `self.seq_step` from step to step, so
    `reset()` must be called before every new sequence.
    """

    def __init__(self, run_name=""):
        """Build the sub-modules, the loss and the per-sequence bookkeeping state.

        :param run_name: Appended to "runs/" to form the summary-writer log directory.
        """
        super(ModelChaplot, self).__init__()
        self.model_name = "chaplot"
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        # Iteration counter stored as a non-trainable parameter.
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.trajectory_len = get_current_parameters()["Setup"]["trajectory_length"]

        # NOTE(review): image_height is fed from "img_w" and image_width from
        # "img_h". Harmless for square images; confirm this is intended.
        self.image_module = ChaplotImageModule(
            image_emb_size=self.params["image_emb_size"],
            input_num_channels=3,
            image_height=self.params["img_w"],
            image_width=self.params["img_h"],
            using_recurrence=True)

        self.image_recurrence_module = IncrementalRecurrenceChaplotModule(
            input_emb_dim=256,
            output_emb_dim=256)

        # TODO: check image width and height
        self.text_module = ChaplotTextModule(
            emb_dim=self.params["word_emb_size"],
            hidden_dim=self.params["emb_size"],
            vocab_size=self.params["vocab_size"],
            image_height=2,
            image_width=6)

        self.final_module = IncrementalMultimodalChaplotModule(
            image_module=self.image_module,
            image_recurrence_module=self.image_recurrence_module,
            text_module=self.text_module,
            max_episode_length=self.trajectory_len,
            final_image_height=2,
            final_image_width=6)

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0
        self.model_state = None
        self.image_emb_seq = None
        self.state_feature = None

    # TODO: Try to hide these in a superclass or something. They take up a lot of space:
    def cuda(self, device=None):
        """Move this module, its sub-modules and the loss to `device`; returns self."""
        ModuleWithAuxiliaries.cuda(self, device)
        self.image_module.cuda(device)
        self.image_recurrence_module.cuda(device)
        self.text_module.cuda(device)
        self.final_module.cuda(device)
        self.action_loss.cuda(device)
        return self

    def get_iter(self):
        """Return the current iteration counter as a Python int."""
        return int(self.iter.data[0])

    def inc_iter(self):
        """Advance the iteration counter by one."""
        self.iter += 1

    def init_weights(self):
        """Delegate weight initialization to each sub-module."""
        self.text_module.init_weights()
        self.image_recurrence_module.init_weights()
        self.image_module.init_weights()
        self.final_module.init_weights()

    def reset(self):
        """Clear all per-sequence state (step counter and recurrent state)."""
        # TODO: This is error prone. Create a class StatefulModule, iterate submodules and reset all stateful modules
        super(ModelChaplot, self).reset()
        self.seq_step = 0
        self.model_state = None
        self.image_emb_seq = None
        self.state_feature = None
        print("CHAPLOT RESET")

    def setEnvContext(self, context):
        """Remember the environment id found under context["env_id"]."""
        print("Set env context to: " + str(context))
        self.env_id = context["env_id"]

    def start_segment_rollout(self, *args):
        """Reset the model state at the start of a segment rollout; args are ignored."""
        self.reset()

    def get_action(self, state, instruction):
        """
        Given a DroneState (from PomdpInterface) and instruction, produce a numpy 4D action (x, y, theta, pstop)
        :param state: DroneState object with the raw image from the simulator
        :param instruction: Tokenized instruction given the corpus
        #TODO: Absorb corpus within model
        :return: numpy array with 4 entries; entry 3 is overwritten with a hard
            0/1 stop decision (1 when the stop probability exceeds 0.5 or the
            step counter is within 5 of the trajectory length).
        :raises Exception: if a token index lies outside [0, vocab_size).
        """
        # TODO: Simplify this
        self.eval()
        images_np_pure = state.image
        state_np = state.state

        images_np = standardize_image(images_np_pure)
        # Wrapping in a one-element list adds the batch dimension.
        image_fpv = Variable(none_padded_seq_to_tensor([images_np]))
        # The state tensor is built but not passed to forward() by this model.
        state = Variable(none_padded_seq_to_tensor([state_np]))

        self.prev_instruction = instruction

        img_in_t = image_fpv
        img_in_t.volatile = True

        instr_len = [len(instruction)] if instruction is not None else None
        # Fail early with a readable message instead of inside the embedding lookup.
        for tok in instruction:
            if tok >= self.params["vocab_size"] or tok < 0:
                raise Exception("Word embeddings out of bounds")
        instruction = torch.LongTensor(instruction).unsqueeze(0)
        instruction = cuda_var(instruction, self.is_cuda, self.cuda_device)

        state.volatile = True

        if self.is_cuda:
            img_in_t = img_in_t.cuda(self.cuda_device)

        # NOTE(review): forward() also increments seq_step once per processed
        # step, so during rollouts the counter advances by two per action
        # (it advances by one per step in sup_loss_on_batch). Confirm intended.
        self.seq_step += 1

        action = self(img_in_t, instruction, instr_len)

        output_action = action.squeeze().data.cpu().numpy()

        stop_prob = output_action[3]
        force_stop = self.seq_step >= self.trajectory_len - 5
        output_action[3] = 1 if (stop_prob > 0.5 or force_stop) else 0

        return output_action

    def deterministic_action(self, action_mean, action_std, stop_prob):
        """Pack the first three columns of `action_mean` and `stop_prob` into a (batch, 4) Variable.

        `action_std` is accepted for signature parity with `sample_action` and is unused.
        """
        batch_size = action_mean.size(0)
        action = Variable(
            empty_float_tensor((batch_size, 4), self.is_cuda, self.cuda_device))
        action[:, 0:3] = action_mean[:, 0:3]
        action[:, 3] = stop_prob
        return action

    def sample_action(self, action_mean, action_std, stop_prob):
        """Sample an action from a normal distribution and a stop flag from a Bernoulli.

        :return: tuple (action, stop)
        """
        action = torch.normal(action_mean, action_std)
        stop = torch.bernoulli(stop_prob)
        return action, stop

    # This is called before beginning an execution sequence
    def start_sequence(self):
        """Reset the step counter and all per-sequence state."""
        self.seq_step = 0
        self.reset()
        print("RESETTED!")

    # TODO: Move this somewhere and standardize
    def cam_poses_from_states(self, states):
        """Build a Pose from columns 9:12 (position) and 12:16 (rotation) of `states`."""
        cam_pos = states[:, 9:12]
        cam_rot = states[:, 12:16]
        pose = Pose(cam_pos, cam_rot)
        return pose

    def instructions_to_dipandrew(self, instructions, instr_lengths):
        """Split a padded 2D instruction tensor into per-row slices without padding.

        :param instructions: 2D tensor, one instruction per row
        :param instr_lengths: per-row instruction lengths
        :return: list whose i-th item is `instructions[i:i+1, :instr_lengths[i]]`
        """
        return [
            instructions[i:i + 1, 0:instr_lengths[i]]
            for i in range(len(instructions))
        ]

    def forward(self, images, instructions, instr_lengths):
        """Run the model over a sequence of timesteps, one step at a time.

        The leading dimension of `images` is treated as time. Each step feeds
        the current step index and the carried `self.model_state` into the
        multimodal module, then increments `self.seq_step`.

        :param images: per-timestep images (first dimension is the sequence)
        :param instructions: 2D tensor with one padded instruction per timestep
        :param instr_lengths: per-timestep instruction lengths
        :return: per-step actions concatenated along dim 0
        """
        seq_len = len(images)
        instr_dipandrew = self.instructions_to_dipandrew(
            instructions, instr_lengths)

        # Add sequence dimension, since we're treating batches as sequences
        images = images.unsqueeze(0)

        all_actions = []
        for i in range(seq_len):
            time_in = np.asarray([self.seq_step])
            time_in = Variable(
                self.maybe_cuda(torch.from_numpy(time_in).long()))
            action_i, self.model_state = self.final_module(
                images[0:1, i:i + 1], instr_dipandrew[i], time_in,
                self.model_state)

            self.seq_step += 1
            all_actions.append(action_i)

        actions = torch.cat(all_actions, dim=0)
        return actions

    def maybe_cuda(self, tensor):
        """Return `tensor` on this model's CUDA device when CUDA is enabled, else unchanged."""
        if self.is_cuda:
            # Target the model's own device, as cuda_var and get_action do,
            # instead of whichever CUDA device happens to be current.
            return tensor.cuda(self.cuda_device)
        return tensor

    def cuda_var(self, tensor):
        """Wrap `tensor` via the module-level cuda_var helper using this model's device settings."""
        return cuda_var(tensor, self.is_cuda, self.cuda_device)

    # Forward pass for training (with batch optimizations)
    def sup_loss_on_batch(self, batch, eval):
        """Compute the supervised action loss on a batch of sequences.

        Each sequence is run separately after a `reset()`. Per-sequence losses
        are summed and divided by the total number of timesteps.

        :param batch: dict with keys "images", "instr", "instr_len", "actions"
            and "md", or None to skip the batch
        :param eval: True to log under "<model_name>/eval", else "<model_name>/train"
        :return: loss averaged over timesteps (the initial accumulator when batch is None)
        """
        self.prof.tick("out")

        action_loss_total = Variable(
            empty_float_tensor([1], self.is_cuda, self.cuda_device))

        if batch is None:
            print("Skipping None Batch")
            return action_loss_total

        images = self.maybe_cuda(batch["images"])
        instructions = self.maybe_cuda(batch["instr"])
        instr_lengths = batch["instr_len"]
        actions = self.maybe_cuda(batch["actions"])

        metadata = batch["md"]

        batch_size = images.size(0)
        count = 0

        # Loop thru batch
        for b in range(batch_size):
            self.reset()
            self.prof.tick("out")
            b_seq_len = len_until_nones(metadata[b])

            # TODO: Generalize this
            # Slice the data according to the sequence length
            b_images = images[b][:b_seq_len]
            b_instructions = instructions[b][:b_seq_len]
            b_instr_len = instr_lengths[b][:b_seq_len]
            b_actions = actions[b][:b_seq_len]

            # ----------------------------------------------------------------------------

            self.prof.tick("inputs")

            # Keep predictions under their own name: rebinding `actions` here
            # would make the next iteration slice the previous prediction
            # instead of the ground-truth batch.
            pred_actions = self(b_images, b_instructions, b_instr_len)

            action_losses, _ = self.action_loss(
                b_actions, pred_actions, batchreduce=False)

            self.prof.tick("call")

            action_losses = self.action_loss.batch_reduce_loss(action_losses)
            action_loss = self.action_loss.reduce_loss(action_losses)

            # Sum losses over the batch; `count` sums the matching timesteps.
            # The first sequence replaces the initial accumulator, so a
            # single-sequence batch yields exactly the same value as before.
            if b == 0:
                action_loss_total = action_loss
            else:
                action_loss_total = action_loss_total + action_loss
            count += b_seq_len

            self.prof.tick("loss")

        # The small epsilon avoids division by zero for an empty batch.
        action_loss_avg = action_loss_total / (count + 1e-9)

        self.prof.tick("out")

        prefix = self.model_name + ("/eval" if eval else "/train")

        self.writer.add_dict(prefix, get_current_meters(), self.get_iter())
        self.writer.add_scalar(prefix + "/action_loss",
                               action_loss_avg.data.cpu()[0], self.get_iter())

        total_loss = action_loss_avg
        self.inc_iter()

        self.prof.loop()
        self.prof.print_stats(1)

        return total_loss

    def get_dataset(self, data=None, envs=None, dataset_name=None, eval=False):
        """Create a segment-level SegmentDataset with no auxiliary providers.

        `eval` is currently unused.
        """
        # TODO: Maybe use eval here
        return SegmentDataset(data=data,
                              env_list=envs,
                              dataset_name=dataset_name,
                              aux_provider_names=[],
                              segment_level=True)
class LeakyIntegratorGlobalMap(MapTransformerBase):
    """Accumulates per-step observation maps into a running map with a leaky integrator.

    The history of integrated maps and coverage masks is kept in
    `self.map_memory` / `self.coverage_memory` until `reset()` is called.
    """

    def __init__(self, source_map_size, world_size_px, world_size_m, lamda=0.2):
        """
        :param source_map_size: map size, forwarded to MapTransformerBase
        :param world_size_px: world size in pixels, forwarded to MapTransformerBase
        :param world_size_m: world size in meters, forwarded to MapTransformerBase
        :param lamda: integration rate; weight given to each new observation
        """
        super(LeakyIntegratorGlobalMap, self).__init__(
            source_map_size, world_size_px, world_size_m)
        self.map_size_px = source_map_size
        self.world_size_px = world_size_px
        self.world_size_m = world_size_m
        self.child_transformer = MapTransformerBase(
            source_map_size, world_size_px, world_size_m)
        self.lamda = lamda
        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.map_memory = []
        self.coverage_memory = []
        # Lazily created transformer, used only by dbg_write_extra.
        self.dbg_t = None
        self.seq = 0

    def init_weights(self):
        """No-op: this module has nothing to initialize here."""
        pass

    def reset(self):
        """Drop the accumulated map/coverage history and reset the child transformer."""
        super(LeakyIntegratorGlobalMap, self).reset()
        self.map_memory = []
        self.coverage_memory = []
        self.child_transformer.reset()
        self.seq = 0

    def cuda(self, device=None):
        """Move this module and its child transformer to `device`; returns self."""
        MapTransformerBase.cuda(self, device)
        self.child_transformer.cuda(device)
        return self

    def dbg_write_extra(self, map, pose):
        """Write the first three channels of the first map to the debug writer.

        Does nothing unless `DebugWriter().should_write()` is true. When `pose`
        is given, the map is first passed through a lazily created
        MapTransformerBase (set_map with `pose`, then get_map(None)).
        """
        if not DebugWriter().should_write():
            return

        map = map[0:1, 0:3]
        self.seq += 1

        if pose is not None:
            # Initialize a transformer module on first use
            if self.dbg_t is None:
                self.dbg_t = MapTransformerBase(
                    self.map_size_px, self.world_size_px,
                    self.world_size_m).to(map.device)

            # Transform the prediction to the global frame and write out to disk.
            self.dbg_t.set_map(map, pose)
            map_global, _ = self.dbg_t.get_map(None)
        else:
            map_global = map

        DebugWriter().write_img(map_global[0], "gif_overlaid", args={
            "world_size": self.world_size_px,
            "name": "sm"
        })

    def forward(self, images_w, coverages_w, add_mask=None, reset_mask=None, show=False):
        """Integrate a sequence of observations into the running map, step by step.

        The first dimension of `images_w` / `coverages_w` is iterated as a
        sequence. For every step one of three things happens:

        * memory is empty or `reset_mask[i]` is truthy: the map restarts from
          the current observation and coverage;
        * `add_mask` is None or `add_mask[i]` is truthy: the observation is
          blended into the previous map with rate `lamda`;
        * otherwise: the previous map and coverage are carried over unchanged.

        :param images_w: per-step observation maps
        :param coverages_w: per-step coverage masks, multiplied elementwise with `images_w`
        :param add_mask: optional per-step sequence; falsy entries skip integration
        :param reset_mask: optional per-step sequence; truthy entries restart the map
        :param show: window name passed to Presenter for displaying the coverage;
            any falsy value (the default False, "" or None) disables display
        :return: tuple (maps, coverages), per-step results concatenated along dim 0
        """
        self.prof.tick(".")
        batch_size = len(images_w)

        # NOTE(review): this only rejects a None entry; a False first entry passes.
        assert add_mask is None or add_mask[
            0] is not None, "The first observation in a sequence needs to be used!"

        # Pre-scale every observation by lamda and mask it by its coverage.
        masked_observations_w_add = self.lamda * images_w * coverages_w

        all_maps_out_w = []
        all_coverages_out_w = []

        self.prof.tick("maps_to_global")

        # TODO: Draw past trajectory on an extra channel of the semantic map
        # Step 2: Integrate serially in the global frame
        for i in range(batch_size):
            must_restart = len(self.map_memory) == 0 or (
                reset_mask is not None and reset_mask[i])

            if must_restart:
                new_map_w = images_w[i:i + 1]
                new_map_cov_w = coverages_w[i:i + 1]

            # Allow masking of observations
            elif add_mask is None or add_mask[i]:
                # Get the current global-frame map
                map_g = self.map_memory[-1]
                map_cov_g = self.coverage_memory[-1]
                cov_w = coverages_w[i:i + 1]
                obs_cov_g = masked_observations_w_add[i:i + 1]

                # Add the observation into the map using a leaky integrator rule
                # (TODO: Output lamda from model). Where cov_w == 1 this gives
                # (1 - lamda) * old + lamda * obs; where cov_w == 0 it gives old.
                new_map_cov_w = torch.clamp(map_cov_g + cov_w, 0, 1)
                new_map_w = ((1 - self.lamda) * map_g + obs_cov_g +
                             self.lamda * map_g * (1 - cov_w))
            else:
                new_map_w = self.map_memory[-1]
                new_map_cov_w = self.coverage_memory[-1]

            self.map_memory.append(new_map_w)
            self.coverage_memory.append(new_map_cov_w)
            all_maps_out_w.append(new_map_w)
            all_coverages_out_w.append(new_map_cov_w)

            # Truthiness test: the default show=False must NOT open a window
            # (the former `show != ""` check was true for False).
            if show:
                Presenter().show_image(new_map_cov_w.data[0, 0:3], show,
                                       torch=True, scale=8, waitkey=1)

        self.prof.tick("integrate")

        # Step 3: Concatenate the per-step maps and coverages along dim 0
        all_maps_w = torch.cat(all_maps_out_w, dim=0)
        all_coverages_out_w = torch.cat(all_coverages_out_w, dim=0)

        self.prof.tick("maps_to_local")
        self.prof.loop()
        self.prof.print_stats(10)

        return all_maps_w, all_coverages_out_w