def __init__(self, run_name="", ang_weight=0.33, fwd_weight=0.33, stop_weight=0.33): super(CrossEntropy2d, self).__init__() self.softmax = SpatialSoftmax2d() self.logsoftmax = SpatialSoftmax2d(log=True)
def __init__(self, run_name="", model_instance_name="only"): super(PVN_Stage2_Keyboard, self).__init__() self.model_name = "pvn_stage2" self.run_name = run_name self.instance_name = model_instance_name self.writer = LoggingSummaryWriter(log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.instance_name}") self.params_s1 = get_current_parameters()["ModelPVN"]["Stage1"] self.params = get_current_parameters()["ModelPVN"]["Stage2"] self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) self.spatialsoftmax = SpatialSoftmax2d() self.action_loss = ActionLoss() self.teleoper = KeyTeleop() self.env_id = None self.seg_idx = None self.prev_instruction = None self.seq_step = 0 self.get_act_start_pose = None self.gt_labels = None
def __init__(self, run_name=""): super(ModelTrajectoryToAction, self).__init__() self.model_name = "lsvd_action" self.run_name = run_name self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name) self.params = get_current_parameters()["ModelPVN"] self.aux_weights = get_current_parameters()["AuxWeights"] self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) # Common # -------------------------------------------------------------------------------------------------------------- self.map_transform_w_to_s = MapTransformerBase( source_map_size=self.params["global_map_size"], dest_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"]) self.map_transform_r_to_w = MapTransformerBase( source_map_size=self.params["local_map_size"], dest_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"]) # Output an action given the global semantic map if self.params["map_to_action"] == "downsample2": self.map_to_action = EgoMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], other_features_size=self.params["emb_size"]) elif self.params["map_to_action"] == "cropped": self.map_to_action = CroppedMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], manual=self.params["manual_rule"], path_only=self.params["action_in_path_only"], recurrence=self.params["action_recurrence"]) self.spatialsoftmax = SpatialSoftmax2d() self.gt_fill_missing = MapBatchFillMissing( self.params["local_map_size"], self.params["world_size_px"]) # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training # and finetuned on all timesteps end-to-end enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True) self.action_loss = ActionLoss() self.env_id = None self.seg_idx = None self.prev_instruction = None self.seq_step = 0 self.get_act_start_pose = None self.gt_labels = None
def __init__(self, lingunet_params, prior_channels_in, posterior_channels_in,
             dual_head=False, compute_prior=True, use_prior=False, oob=False):
    super(RatioPathPredictor, self).__init__()

    self.use_prior = use_prior
    self.prior_img_channels = prior_channels_in
    self.posterior_img_channels = posterior_channels_in
    self.dual_head = dual_head
    self.small_network = lingunet_params.get("small_network")
    self.oob = oob

    if use_prior:
        assert compute_prior, "If we want to use the prior distribution, we should compute it, right?"

    if self.oob:
        lingunet_params["in_channels"] = posterior_channels_in
        self.unet_posterior = Lingunet5OOB(deepcopy(lingunet_params))
        lingunet_params["in_channels"] = prior_channels_in
        self.unet_prior = Lingunet5OOB(deepcopy(lingunet_params))
    elif self.small_network:
        lingunet_params["in_channels"] = posterior_channels_in
        self.unet_posterior = Lingunet5S(deepcopy(lingunet_params))
        lingunet_params["in_channels"] = prior_channels_in
        self.unet_prior = Lingunet5S(deepcopy(lingunet_params))
    elif self.dual_head:
        lingunet_params["in_channels"] = posterior_channels_in
        self.unet_posterior = Lingunet5DualHead(deepcopy(lingunet_params))
        lingunet_params["in_channels"] = prior_channels_in
        self.unet_prior = Lingunet5DualHead(deepcopy(lingunet_params))
    else:
        lingunet_params["in_channels"] = posterior_channels_in
        self.unet_posterior = Lingunet5(deepcopy(lingunet_params))
        lingunet_params["in_channels"] = prior_channels_in
        self.unet_prior = Lingunet5(deepcopy(lingunet_params))

    self.softmax = SpatialSoftmax2d()
    self.norm = nn.InstanceNorm2d(2)
    self.compute_prior = compute_prior
    self.dbg_t = None
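# --- Illustrative sketch (not part of the original code) ---------------------------------------
# A hypothetical instantiation of the predictor above. The contents of lingunet_params are an
# assumption about what the LingUNet variants expect; only "in_channels" and "small_network" are
# known to be read here. Note that the constructor overwrites lingunet_params["in_channels"]
# before building each sub-network, which is why it deepcopies the dict for each one.
lingunet_params = {"in_channels": 32, "small_network": False}
predictor = RatioPathPredictor(lingunet_params,
                               prior_channels_in=32,
                               posterior_channels_in=35,
                               dual_head=True,
                               compute_prior=True,
                               use_prior=False,
                               oob=True)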
def __init__(self, prior_channels=32, posterior_channels=32, pred_channels=2, emb_size=120,
             source_map_size=32, world_size=32, compute_prior=True, use_prior=False, l2=False):
    super(RatioPathPredictor, self).__init__(source_map_size, world_size)

    self.prior_img_channels = prior_channels
    self.posterior_img_channels = posterior_channels
    self.emb_size = emb_size
    self.l2 = l2
    self.use_prior = use_prior

    if use_prior:
        assert compute_prior, "If we want to use the prior distribution, we should compute it, right?"

    self.unet_posterior = Unet5ContextualBneck(
        posterior_channels, pred_channels, emb_size, hc1=48, hb1=24, hc2=128)
    self.unet_prior = Unet5ContextualBneck(
        prior_channels, pred_channels, 1, hc1=48, hb1=24, hc2=128)

    self.softmax = SpatialSoftmax2d()
    self.norm = nn.InstanceNorm2d(2)
    self.compute_prior = compute_prior

    #self.map_filter = MapLangSemanticFilter(emb_size, feature_channels, 3)

    self.map_size = source_map_size
    self.world_size = world_size
    self.dbg_t = None
    self.seq = 0
def __init__(self, run_name, ignore_lang=False, class_loss=True, ground_loss=True):
    super(ModelTopDownPathGoalPredictor, self).__init__()
    self.run_name = run_name
    self.model_name = "top_down_path_pred_pretrain"
    self.writer = SummaryWriter(log_dir="runs/" + run_name)

    self.ignore_lang = ignore_lang
    self.class_loss = class_loss
    self.ground_loss = ground_loss

    # The feature net extracts the 2D feature map from the input image.
    # The label_pool down-sizes the ground-truth labels, which are input at the same size as the input image.
    # The output predicted labels are the size of the feature map.
    self.feature_net = ResNet13Light(32, down_pad=True)
    self.label_pool = nn.MaxPool2d(8)

    if self.ground_loss:
        self.lang_filter = MapLangSemanticFilter(sentence_embedding_size, 32, 3)
        self.aux_ground_linear = nn.Linear(3, 2)
        enable_weight_saving(self.lang_filter, "ground_filter")
        enable_weight_saving(self.aux_ground_linear, "ground_aux_linear")

    if RESNET:
        self.unet = ResNetConditional(sentence_embedding_size, 35, 2)
    else:
        unet_c_in = 35 if self.ground_loss else 32
        unet_hc1 = 48 if self.ground_loss else 48
        unet_hb1 = 24 if self.ground_loss else 24
        self.unet = Unet5ContextualBneck(
            unet_c_in, 2, sentence_embedding_size,
            hc1=unet_hc1, hb1=unet_hb1, hc2=128,
            split_embedding=splitemb)

    if attention:
        self.sentence_embedding = SentenceEmbeddingSelfAttention(
            word_embedding_size, lstm_size, sentence_embedding_layers, attention_heads=attention_heads)
    else:
        self.sentence_embedding = SentenceEmbeddingSimple(
            word_embedding_size, sentence_embedding_size, sentence_embedding_layers)

    self.gather2d = Gather2D()

    if self.class_loss:
        self.aux_class_linear = nn.Linear(32, 64)
        enable_weight_saving(self.aux_class_linear, "class_aux_linear")

    print("Sentence Embedding #Params: ", get_n_params(self.sentence_embedding))
    print("U-Net #Params: ", get_n_params(self.unet))
    print("Class auxiliary: ", self.class_loss)
    print("Ground auxiliary: ", self.ground_loss)

    # Enable saving of pre-trained weights
    enable_weight_saving(self.feature_net, "feature_resnet_light")
    enable_weight_saving(self.unet, "unet")
    enable_weight_saving(self.sentence_embedding, "sentence_embedding")

    if NLL:
        #self.mask_loss = nn.BCELoss()
        self.mask_loss = nn.NLLLoss2d()
    elif BCE:
        self.mask_loss = nn.BCEWithLogitsLoss()
    elif CE:
        self.spatialsoftmax = SpatialSoftmax2d()
        self.mask_loss = CrossEntropy2d()
    else:
        self.mask_loss = nn.MSELoss()

    self.aux_loss = nn.CrossEntropyLoss(reduce=True, size_average=True)

    self.epoch_numbers = {"train": 0, "eval": 0}
    self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

    self.dropout = nn.Dropout(0.5)
    self.dropout2d = nn.Dropout2d(0.5)
    self.dropout3d = nn.Dropout3d(0.5)

    self.viz_images = []
    self.instructions = []
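# --- Illustrative sketch (not part of the original code) ---------------------------------------
# In the CE branch above, CrossEntropy2d compares a predicted score map against a ground-truth
# distribution over map cells. A minimal sketch of that kind of 2D cross-entropy, assuming the
# target is a per-channel probability map that sums to 1 over HxW; this is not the repository's
# CrossEntropy2d implementation.
import torch.nn.functional as F

def cross_entropy_2d_sketch(pred_scores, target_probs):
    # pred_scores, target_probs: BxCxHxW
    b, c, h, w = pred_scores.shape
    logp = F.log_softmax(pred_scores.view(b, c, -1), dim=2)
    return -(target_probs.view(b, c, -1) * logp).sum(dim=2).mean()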
def __init__(self, run_name="", model_instance_name="only"): super(PVN_Stage2_ActorCritic, self).__init__() self.model_name = "pvn_stage2_ac" self.run_name = run_name self.instance_name = model_instance_name self.writer = LoggingSummaryWriter(log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.instance_name}") self.params_s1 = get_current_parameters()["ModelPVN"]["Stage1"] self.params_s2 = get_current_parameters()["ModelPVN"]["Stage2"] self.params = get_current_parameters()["ModelPVN"]["ActorCritic"] self.oob = self.params_s1.get("clip_observability") self.ignore_struct = self.params.get("ignore_structured_input", False) self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) self.tensor_store = KeyTensorStore() # Common # -------------------------------------------------------------------------------------------------------------- # Wrap the Stage1 model so that it becomes invisible to PyTorch, get_parameters etc, and doesn't get optimized self.map_transform_w_to_r = MapTransformer(source_map_size=self.params_s1["global_map_size"], dest_map_size=self.params_s1["local_map_size"], world_size_px=self.params_s1["world_size_px"], world_size_m=self.params_s1["world_size_m"]) self.action_base = PVN_Stage2_RLBase(map_channels=self.params_s2["map_to_act_channels"], map_struct_channels=self.params_s2["map_structure_channels"], crop_size=self.params_s2["crop_size"], map_size=self.params_s1["local_map_size"], h1=self.params["h1"], h2=self.params["h2"], structure_h1=self.params["structure_h1"], obs_dim=self.params["obs_dim"], name="action") self.value_base = PVN_Stage2_RLBase(map_channels=self.params_s2["map_to_act_channels"], map_struct_channels=self.params_s2["map_structure_channels"], crop_size=self.params_s2["crop_size"], map_size=self.params_s1["local_map_size"], h1=self.params["h1"], h2=self.params["h2"], structure_h1=self.params["structure_h1"], obs_dim=self.params["obs_dim"], name="value") self.action_head = PVN_Stage2_ActionHead(h2=self.params["h2"]) self.value_head = PVN_Stage2_ValueHead(h2=self.params["h2"]) self.spatialsoftmax = SpatialSoftmax2d() self.batch_select = MapBatchSelect() self.gt_fill_missing = MapBatchFillMissing( self.params_s1["local_map_size"], self.params_s1["world_size_px"], self.params_s1["world_size_m"]) self.env_id = None self.seg_idx = None self.prev_instruction = None self.seq_step = 0 self.get_act_start_pose = None self.gt_labels = None # PPO interface: self.is_recurrent = False
def __init__(self, run_name="", domain="sim"): super(PVN_Stage1_Bidomain_Original, self).__init__() self.model_name = "pvn_stage1" self.run_name = run_name self.domain = domain self.writer = LoggingSummaryWriter( log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.domain}") #self.writer = DummySummaryWriter() self.root_params = get_current_parameters()["ModelPVN"] self.params = self.root_params["Stage1"] self.use_aux = self.root_params["UseAux"] self.aux_weights = self.root_params["AuxWeights"] if self.params.get("weight_override"): aux_weights_override_name = "AuxWeightsRealOverride" if self.domain == "real" else "AuxWeightsSimOverride" aux_weights_override = self.root_params.get( aux_weights_override_name) if aux_weights_override: print( f"Overriding auxiliary weights for domain: {self.domain}") self.aux_weights = dict_merge(self.aux_weights, aux_weights_override) self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) self.tensor_store = KeyTensorStore() self.losses = AuxiliaryLosses() # Auxiliary Objectives self.do_perturb_maps = self.params["perturb_maps"] print("Perturbing maps: ", self.do_perturb_maps) # Path-pred FPV model definition # -------------------------------------------------------------------------------------------------------------- self.num_feature_channels = self.params[ "feature_channels"] # + params["relevance_channels"] self.num_map_channels = self.params["pathpred_in_channels"] self.img_to_features_w = FPVToGlobalMap( source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"], res_channels=self.params["resnet_channels"], map_channels=self.params["feature_channels"], img_w=self.params["img_w"], img_h=self.params["img_h"], cam_h_fov=self.params["cam_h_fov"], domain=domain, img_dbg=IMG_DBG) self.map_accumulator_w = LeakyIntegratorGlobalMap( source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"]) self.add_init_pos_to_coverage = AddDroneInitPosToCoverage( world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"], map_size_px=self.params["local_map_size"]) # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame self.map_processor_grounding = LangFilterMapProcessor( embed_size=self.params["emb_size"], in_channels=self.params["feature_channels"], out_channels=self.params["relevance_channels"], spatial=False, cat_out=False) ratio_prior_channels = self.params["feature_channels"] # Process the global accumulated map self.path_predictor_lingunet = RatioPathPredictor( self.params["lingunet"], prior_channels_in=self.params["feature_channels"], posterior_channels_in=self.params["pathpred_in_channels"], dual_head=self.params["predict_confidence"], compute_prior=self.params["compute_prior"], use_prior=self.params["use_prior_only"], oob=self.params["clip_observability"]) print("UNet Channels: " + str(self.num_map_channels)) print("Feature Channels: " + str(self.num_feature_channels)) # TODO:O Verify that config has the same randomization parameters (yaw, pos, etc) self.second_transform = self.do_perturb_maps or self.params[ "predict_in_start_frame"] # Sentence Embedding self.sentence_embedding = SentenceEmbeddingSimple( self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"], self.params["emb_dropout"]) self.map_transform_local_to_local = MapTransformer( 
source_map_size=self.params["local_map_size"], dest_map_size=self.params["local_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"]) self.map_transform_global_to_local = MapTransformer( source_map_size=self.params["global_map_size"], dest_map_size=self.params["local_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"]) self.map_transform_local_to_global = MapTransformer( source_map_size=self.params["local_map_size"], dest_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size_m=self.params["world_size_m"]) self.map_transform_s_to_p = self.map_transform_local_to_local self.map_transform_w_to_s = self.map_transform_global_to_local self.map_transform_w_to_r = self.map_transform_global_to_local self.map_transform_r_to_s = self.map_transform_local_to_local self.map_transform_r_to_w = self.map_transform_local_to_global self.map_transform_p_to_w = self.map_transform_local_to_global self.map_transform_p_to_r = self.map_transform_local_to_local # Batch select is used to drop and forget semantic maps at those timestaps that we're not planning in self.batch_select = MapBatchSelect() # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep self.map_batch_fill_missing = MapBatchFillMissing( self.params["local_map_size"], self.params["world_size_px"], self.params["world_size_m"]) self.spatialsoftmax = SpatialSoftmax2d() self.visitation_softmax = VisitationSoftmax() #TODO:O Use CroppedMapToActionTriplet in Wrapper as Stage2 # Auxiliary Objectives # -------------------------------------------------------------------------------------------------------------- # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters, # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs # that have been saved with keep_auxiliary_input() during execution if self.use_aux["class_features"]: self.losses.add_auxiliary( ClassAuxiliary2D("class_features", self.params["feature_channels"], self.params["num_landmarks"], 0, "fpv_features", "lm_pos_fpv", "lm_indices")) if self.use_aux["grounding_features"]: self.losses.add_auxiliary( ClassAuxiliary2D("grounding_features", self.params["relevance_channels"], 2, 0, "fpv_features_g", "lm_pos_fpv", "lm_mentioned")) if self.use_aux["class_map"]: self.losses.add_auxiliary( ClassAuxiliary2D("class_map", self.params["feature_channels"], self.params["num_landmarks"], 0, "S_W_select", "lm_pos_map_select", "lm_indices_select")) if self.use_aux["grounding_map"]: self.losses.add_auxiliary( ClassAuxiliary2D("grounding_map", self.params["relevance_channels"], 2, 0, "R_W_select", "lm_pos_map_select", "lm_mentioned_select")) # CoRL model uses alignment-model groundings if self.use_aux["lang"]: # one output for each landmark, 2 classes per output. 
This is for finetuning, so use the embedding that's gonna be fine tuned self.losses.add_auxiliary( ClassAuxiliary("lang", self.params["emb_size"], 2, self.params["num_landmarks"], "sentence_embed", "lang_lm_mentioned")) if self.use_aux["regularize_map"]: self.losses.add_auxiliary( FeatureRegularizationAuxiliary2D("regularize_map", "l1", "S_W_select")) lossfunc = self.params["path_loss_function"] if self.params["clip_observability"]: self.losses.add_auxiliary( PathAuxiliary2D("visitation_dist", lossfunc, self.params["clip_observability"], "log_v_dist_s_select", "v_dist_s_ground_truth_select", "SM_S_select")) else: self.losses.add_auxiliary( PathAuxiliary2D("visitation_dist", lossfunc, self.params["clip_observability"], "log_v_dist_s_select", "v_dist_s_ground_truth_select", "SM_S_select")) self.goal_good_criterion = GoalPredictionGoodCriterion( ok_distance=self.params["world_size_px"] * 0.1) self.goal_acc_meter = MovingAverageMeter(10) self.visible_goal_acc_meter = MovingAverageMeter(10) self.invisible_goal_acc_meter = MovingAverageMeter(10) self.visible_goal_frac_meter = MovingAverageMeter(10) self.losses.print_auxiliary_info() self.total_goals = 0 self.correct_goals = 0 self.env_id = None self.env_img = None self.seg_idx = None self.prev_instruction = None self.seq_step = 0 self.should_save_path_overlays = False
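# --- Illustrative sketch (not part of the original code) ---------------------------------------
# The ClassAuxiliary2D objectives registered above classify feature vectors sampled at landmark
# positions in a feature map. A minimal sketch of that idea, assuming integer pixel coordinates
# and a plain linear classifier; the repository's implementation may handle coordinate scaling,
# masking and missing landmarks differently.
import torch.nn as nn

class ClassAuxiliary2DSketch(nn.Module):
    def __init__(self, channels, num_classes):
        super().__init__()
        self.classifier = nn.Linear(channels, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, fmap, coords, labels):
        # fmap: BxCxHxW feature map
        # coords: Nx3 long tensor of (batch_idx, x, y) landmark positions
        # labels: N long tensor of class indices
        feats = fmap[coords[:, 0], :, coords[:, 2], coords[:, 1]]  # NxC
        logits = self.classifier(feats)
        return self.loss_fn(logits, labels)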
def forward(self, images, states, instructions, instr_lengths, plan=None, noisy_start_poses=None,
            start_poses=None, firstseg=None, select_only=True, halfway=False, grad_noise=False, rl=False):
    """
    :param images: BxCxHxW batch of images (observations)
    :param states: BxK batch of drone states
    :param instructions: BxM LongTensor where M is the maximum length of any instruction
    :param instr_lengths: list of len B of integers, indicating length of each instruction
    :param plan: list of B booleans indicating True for timesteps where we do planning and False otherwise
    :param noisy_start_poses: list of noisy start poses (for data augmentation). These define the path-prediction frame at training time
    :param start_poses: list of drone start poses (these should be equal in practice)
    :param firstseg: list of booleans indicating True if a new segment starts at that timestep
    :param select_only: boolean indicating whether to only compute visitation distributions for planning timesteps (default True)
    :param rl: boolean indicating if we're doing reinforcement learning. If yes, output more than the visitation distribution
    :return:
    """
    cam_poses = self.cam_poses_from_states(states)
    g_poses = None  # None pose is a placeholder for the canonical global pose.
    self.prof.tick("out")

    self.tensor_store.keep_inputs("fpv", images)

    # Calculate the instruction embedding
    if instructions is not None:
        # TODO: Take batch of instructions and their lengths, return batch of embeddings. Store the last one as internal state
        # TODO: There's an assumption here that there's only a single instruction in the batch and it doesn't change
        # UNCOMMENT THE BELOW LINE TO REVERT BACK TO GENERAL CASE OF SEPARATE INSTRUCTION PER STEP
        if self.params["ignore_instruction"]:
            # If we're ignoring instructions, just feed in an instruction that consists of a single zero-token
            sent_embeddings = self.sentence_embedding(
                torch.zeros_like(instructions[0:1, 0:1]),
                torch.ones_like(torch.tensor(instr_lengths[0:1])))
        else:
            sent_embeddings = self.sentence_embedding(instructions[0:1], instr_lengths[0:1])
        self.tensor_store.keep_inputs("sentence_embed", sent_embeddings)
    else:
        sent_embeddings = self.sentence_embedding.get()

    self.prof.tick("embed")

    # Extract and project features onto the egocentric frame for each image
    F_W, M_W = self.img_to_features_w(images, cam_poses, sent_embeddings, self.tensor_store,
                                      show="", halfway=halfway)

    # For training the critic, this is as far as we need to proceed with the computation.
    # self.img_to_features_w has stored the computed feature maps inside the tensor store,
    # from where they will be retrieved by the critic.
    if halfway == True:  # Note: explicit comparison with True - halfway may also be a string flag (e.g. "observability")
        return None, None

    self.tensor_store.keep_inputs("F_w", F_W)
    self.tensor_store.keep_inputs("M_w", M_W)
    self.prof.tick("img_to_map_frame")

    # Accumulate the egocentric features in a global map
    reset_mask = firstseg if self.params["clear_history"] else None

    # Consider the space very near the drone and right under it as observed - draw ones on the observability mask.
    # If we treat that space as unobserved, then there's going to be a gap in the visitation distribution, which
    # makes training with RL more difficult, as there is no reward feedback if the drone doesn't cross that gap.
    if self.params.get("cover_init_pos", False):
        StartMasks_R = self.add_init_pos_to_coverage.get_init_pos_masks(M_W.shape[0], M_W.device)
        StartMasks_W, _ = self.map_transform_r_to_w(StartMasks_R, cam_poses, None)
        M_W = self.add_init_pos_to_coverage(M_W, StartMasks_W)

    S_W, SM_W = self.map_accumulator_w(F_W, M_W, reset_mask=reset_mask, show="acc" if IMG_DBG else "")
    S_W_poses = g_poses
    self.prof.tick("map_accumulate")

    # If we're training Stage 2 with imitation learning from ground-truth visitation distributions, we want to
    # compute observability masks with the same code that's used in Stage 1 to avoid mistakes.
    if halfway == "observability":
        map_uncoverage_w = 1 - SM_W
        return map_uncoverage_w

    # Throw away those timesteps that don't correspond to planning timesteps
    S_W_select, SM_W_select, S_W_poses_select, cam_poses_select, noisy_start_poses_select, \
        start_poses_select, sent_embeddings_select = self.batch_select(
            S_W, SM_W, S_W_poses, cam_poses, noisy_start_poses, start_poses, sent_embeddings, plan)

    #maps_m_prior_select, maps_m_posterior_select = None, None

    # Only process the maps on planning timesteps
    if len(S_W_select) == 0:
        return None

    self.tensor_store.keep_inputs("S_W_select", S_W_select)
    self.prof.tick("batch_select")

    # Process the map via the two map processors
    # Do grounding of objects in the map chosen to do so
    if self.use_aux["grounding_map"]:
        R_W_select, RS_W_poses_select = self.map_processor_grounding(
            S_W_select, sent_embeddings_select, S_W_poses_select, show="")
        self.tensor_store.keep_inputs("R_W_select", R_W_select)
        self.prof.tick("map_proc_gnd")
        # Concatenate grounding map and semantic map along channel dimension
        RS_W_select = torch.cat([S_W_select, R_W_select], 1)
    else:
        RS_W_select = S_W_select
        RS_W_poses_select = S_W_poses_select

    s_poses_select = start_poses_select if self.params["predict_in_start_frame"] else cam_poses_select
    RS_S_select, RS_S_poses_select = self.map_transform_w_to_s(RS_W_select, RS_W_poses_select, s_poses_select)
    SM_S_select, SM_S_poses_select = self.map_transform_w_to_s(SM_W_select, S_W_poses_select, s_poses_select)
    assert SM_S_poses_select == RS_S_poses_select, "Masks and maps should have the same pose in start frame"

    self.tensor_store.keep_inputs("RS_S_select", RS_S_select)
    self.tensor_store.keep_inputs("SM_S_select", SM_S_select)
    self.prof.tick("transform_w_to_s")

    # Data augmentation for trajectory prediction
    map_poses_clean_select = None
    # TODO: Figure out if we can just swap out start poses for noisy poses and get rid of separate noisy poses
    if self.do_perturb_maps:
        assert noisy_start_poses_select is not None, "Noisy poses must be provided if we're perturbing maps"
        RS_P_select, RS_P_poses_select = self.map_transform_s_to_p(
            RS_S_select, RS_S_poses_select, noisy_start_poses_select)
    else:
        RS_P_select, RS_P_poses_select = RS_S_select, RS_S_poses_select

    self.tensor_store.keep_inputs("RS_perturbed_select", RS_P_select)
    self.prof.tick("map_perturb")

    sent_embeddings_pp = sent_embeddings_select

    # Run lingunet on the map to predict visitation distribution scores (pre-softmax)
    # ---------
    log_v_dist_p_select, v_dist_p_poses_select = self.path_predictor_lingunet(
        RS_P_select, sent_embeddings_pp, RS_P_poses_select, tensor_store=self.tensor_store)
    # ---------
    self.prof.tick("pathpred")

    # TODO: Shouldn't we be transforming probability distributions instead of scores? Otherwise
    # out-of-bounds space will have weird values.

    # Transform distributions back to world reference frame and keep them (these are the model outputs)
    both_inner_w, v_dist_w_poses_select = self.map_transform_p_to_w(
        log_v_dist_p_select.inner_distribution, v_dist_p_poses_select, None)
    log_v_dist_w_select = Partial2DDistribution(both_inner_w, log_v_dist_p_select.outer_prob_mass)
    self.tensor_store.keep_inputs("log_v_dist_w_select", log_v_dist_w_select)

    # Transform distributions back to start reference frame and keep them (for auxiliary objective)
    both_inner_s, v_dist_s_poses_select = self.map_transform_p_to_r(
        log_v_dist_p_select.inner_distribution, v_dist_p_poses_select, start_poses_select)
    log_v_dist_s_select = Partial2DDistribution(both_inner_s, log_v_dist_p_select.outer_prob_mass)
    self.tensor_store.keep_inputs("log_v_dist_s_select", log_v_dist_s_select)

    # Visualize every 23rd iteration. 23 is prime, so sim and real iterations alternate in the display.
    if self.get_iter() % 23 == 0:
        lsfm = SpatialSoftmax2d()
        for i in range(S_W_select.shape[0]):
            Presenter().show_image(S_W_select.detach().cpu()[i, 0:3],
                                   f"{self.domain}_s_w_select", scale=4, waitkey=1)
            Presenter().show_image(lsfm(log_v_dist_s_select.inner_distribution).detach().cpu()[i],
                                   f"{self.domain}_v_dist_s_select", scale=4, waitkey=1)
            Presenter().show_image(lsfm(log_v_dist_p_select.inner_distribution).detach().cpu()[i],
                                   f"{self.domain}_v_dist_p_select", scale=4, waitkey=1)
            Presenter().show_image(RS_P_select.detach().cpu()[i, 0:3],
                                   f"{self.domain}_rs_p_select", scale=4, waitkey=1)
            break

    self.prof.tick("transform_back")

    # If we're predicting the trajectory only on some timesteps, then for each timestep k, use the map from
    # timestep k if predicting on timestep k, otherwise use the map from timestep j - the last timestep
    # that had a trajectory prediction - rotated into the frame of timestep k.
    if select_only:
        # If we're just pre-training the trajectory prediction, don't waste time on generating the missing maps
        log_v_dist_w = log_v_dist_w_select
        v_dist_w_poses = v_dist_w_poses_select
    else:
        raise NotImplementedError("select_only must be True")

    return_list = [log_v_dist_w, v_dist_w_poses]
    if rl:
        internals_for_rl = {"map_coverage_w": SM_W, "map_uncoverage_w": 1 - SM_W}
        return_list.append(internals_for_rl)

    return tuple(return_list)
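# --- Illustrative sketch (not part of the original code) ---------------------------------------
# A hypothetical call to the forward pass above for a single planning timestep. The input tensors
# are assumed to be prepared elsewhere; with rl=False the model returns the 2-tuple constructed
# in return_list above.
log_v_dist_w, v_dist_w_poses = model(images, states, instructions, instr_lengths,
                                     plan=[True], firstseg=[True])
inner = log_v_dist_w.inner_distribution   # BxCxHxW visitation scores over in-bounds map cells
outer = log_v_dist_w.outer_prob_mass      # score mass assigned to out-of-bounds space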
def forward(self, images, states, instructions, instr_lengths, plan=None, noisy_start_poses=None,
            start_poses=None, firstseg=None, select_only=True, halfway=False, grad_noise=False,
            rl=False, noshow=False):
    """
    :param images: BxCxHxW batch of images (observations)
    :param states: BxK batch of drone states
    :param instructions: BxM LongTensor where M is the maximum length of any instruction
    :param instr_lengths: list of len B of integers, indicating length of each instruction
    :param plan: list of B booleans indicating True for timesteps where we do planning and False otherwise
    :param noisy_start_poses: list of noisy start poses (for data augmentation). These define the path-prediction frame at training time
    :param start_poses: list of drone start poses (these should be equal in practice)
    :param firstseg: list of booleans indicating True if a new segment starts at that timestep
    :param select_only: boolean indicating whether to only compute visitation distributions for planning timesteps (default True)
    :param rl: boolean indicating if we're doing reinforcement learning. If yes, output more than the visitation distribution
    :return:
    """
    cam_poses = self.cam_poses_from_states(states)
    g_poses = None  # None pose is a placeholder for the canonical global pose.
    self.prof.tick("out")

    # If we're running the blind baseline, simply replace the images with zeros. Keep all other computation.
    if self.params.get("blind"):
        images = torch.zeros_like(images)

    self.tensor_store.keep_inputs("fpv", images)

    # Calculate the instruction embedding
    if instructions is not None:
        # TODO: Take batch of instructions and their lengths, return batch of embeddings. Store the last one as internal state
        # TODO: handle this
        # ASSUMING IT'S THE SAME INSTRUCTION SEGMENT (PREDICT ONE SEGMENT AT A TIME).
        # UNCOMMENT THE BELOW LINE TO REVERT BACK TO GENERAL CASE OF SEPARATE INSTRUCTION PER STEP
        if self.params["ignore_instruction"]:
            # If we're ignoring instructions, just feed in an instruction that consists of a single zero-token
            sent_embeddings = self.sentence_embedding(
                torch.zeros_like(instructions[0:1, 0:1]),
                torch.ones_like(torch.tensor(instr_lengths[0:1])))
        else:
            sent_embeddings = self.sentence_embedding(instructions[0:1], instr_lengths[0:1])
        #sent_embeddings = self.sentence_embedding(instructions, instr_lengths)
        self.tensor_store.keep_inputs("sentence_embed", sent_embeddings)
    else:
        sent_embeddings = self.sentence_embedding.get()

    self.prof.tick("embed")

    # Extract and project features onto the egocentric frame for each image
    F_W, M_W = self.img_to_features_w(images, cam_poses, sent_embeddings, self.tensor_store,
                                      show="", halfway=halfway)

    if halfway == True and not halfway == "v2":
        return None, None

    self.tensor_store.keep_inputs("F_w", F_W)
    self.tensor_store.keep_inputs("M_w", M_W)
    self.prof.tick("img_to_map_frame")

    # Accumulate the egocentric features in a global map
    reset_mask = firstseg if self.params["clear_history"] else None

    # Consider the space very near the drone as observable - it is too hard to explore to consider it unobservable.
    if self.params.get("cover_init_pos", False):
        StartMasks_R = self.add_init_pos_to_coverage.get_init_pos_masks(M_W.shape[0], M_W.device)
        StartMasks_W, _ = self.map_transform_r_to_w(StartMasks_R, cam_poses, None)
        M_W = self.add_init_pos_to_coverage(M_W, StartMasks_W)

    S_W, SM_W = self.map_accumulator_w(F_W, M_W, reset_mask=reset_mask, show="acc" if IMG_DBG else "")
    S_W_poses = g_poses
    self.tensor_store.keep_inputs("SM_w", SM_W)
    self.prof.tick("map_accumulate")

    if halfway == "observability":
        map_uncoverage_w = 1 - SM_W
        return map_uncoverage_w

    # Throw away those timesteps that don't correspond to planning timesteps
    S_W_select, SM_W_select, S_W_poses_select, cam_poses_select, noisy_start_poses_select, \
        start_poses_select, sent_embeddings_select = self.batch_select(
            S_W, SM_W, S_W_poses, cam_poses, noisy_start_poses, start_poses, sent_embeddings, plan)

    #maps_m_prior_select, maps_m_posterior_select = None, None

    # Only process the maps on planning timesteps
    if len(S_W_select) == 0:
        return None

    self.tensor_store.keep_inputs("S_W_select", S_W_select)
    self.prof.tick("batch_select")

    # Create a figure where the drone is drawn on the map
    if self.params["write_figures"]:
        self.tensor_store.keep_inputs("drone_poses", Variable(draw_drone_poses(cam_poses_select)))

    # Process the map via the two map processors
    # Do grounding of objects in the map chosen to do so
    if self.use_aux["grounding_map"]:
        R_W_select, RS_W_poses_select = self.map_processor_grounding(
            S_W_select, sent_embeddings_select, S_W_poses_select, show="")
        self.tensor_store.keep_inputs("R_W_select", R_W_select)
        self.prof.tick("map_proc_gnd")
        # Concatenate grounding map and semantic map along channel dimension
        RS_W_select = torch.cat([S_W_select, R_W_select], 1)
    else:
        RS_W_select = S_W_select
        RS_W_poses_select = S_W_poses_select

    s_poses_select = start_poses_select if self.params["predict_in_start_frame"] else cam_poses_select
    RS_S_select, RS_S_poses_select = self.map_transform_w_to_s(RS_W_select, RS_W_poses_select, s_poses_select)
    SM_S_select, SM_S_poses_select = self.map_transform_w_to_s(SM_W_select, S_W_poses_select, s_poses_select)
    assert SM_S_poses_select == RS_S_poses_select, "Masks and maps should have the same pose in start frame"

    self.tensor_store.keep_inputs("RS_S_select", RS_S_select)
    self.tensor_store.keep_inputs("SM_S_select", SM_S_select)
    self.prof.tick("transform_w_to_s")

    # Data augmentation for trajectory prediction
    map_poses_clean_select = None
    # TODO: Figure out if we can just swap out start poses for noisy poses and get rid of separate noisy poses
    if self.do_perturb_maps:
        assert noisy_start_poses_select is not None, "Noisy poses must be provided if we're perturbing maps"
        RS_P_select, RS_P_poses_select = self.map_transform_s_to_p(
            RS_S_select, RS_S_poses_select, noisy_start_poses_select)
    else:
        RS_P_select, RS_P_poses_select = RS_S_select, RS_S_poses_select

    # TODO: Figure out why this is unused!
    self.tensor_store.keep_inputs("RS_perturbed_select", RS_P_select)
    self.prof.tick("map_perturb")

    sent_embeddings_pp = sent_embeddings_select

    # Run the path predictor on the map (i.e. predict the trajectory that we'll be taking)
    # ---------
    log_v_dist_p_select, v_dist_p_poses_select = self.path_predictor_lingunet(
        RS_P_select, sent_embeddings_pp, RS_P_poses_select, tensor_store=self.tensor_store)
    # ---------
    self.prof.tick("pathpred")

    # Transform distributions back to world reference frame and keep them (these are the model outputs)
    both_inner_w, v_dist_w_poses_select = self.map_transform_p_to_w(
        log_v_dist_p_select.inner_distribution, v_dist_p_poses_select, None)
    log_v_dist_w_select = Partial2DDistribution(both_inner_w, log_v_dist_p_select.outer_prob_mass)
    self.tensor_store.keep_inputs("log_v_dist_w_select", log_v_dist_w_select)

    # Transform distributions back to start reference frame and keep them (for auxiliary objective)
    both_inner_s, v_dist_s_poses_select = self.map_transform_p_to_r(
        log_v_dist_p_select.inner_distribution, v_dist_p_poses_select, start_poses_select)
    log_v_dist_s_select = Partial2DDistribution(both_inner_s, log_v_dist_p_select.outer_prob_mass)
    self.tensor_store.keep_inputs("log_v_dist_s_select", log_v_dist_s_select)

    lsfm = SpatialSoftmax2d()
    # Visualize every 23rd iteration. 23 is prime, so sim and real iterations alternate in the display.
    if self.get_iter() % 23 == 0 and not noshow:
        for i in range(S_W_select.shape[0]):
            Presenter().show_image(S_W_select.detach().cpu()[i, 0:3],
                                   f"{self.domain}_s_w_select", scale=4, waitkey=1)
            Presenter().show_image(lsfm(log_v_dist_s_select.inner_distribution).detach().cpu()[i],
                                   f"{self.domain}_v_dist_s_select", scale=4, waitkey=1)
            Presenter().show_image(lsfm(log_v_dist_p_select.inner_distribution).detach().cpu()[i],
                                   f"{self.domain}_v_dist_p_select", scale=4, waitkey=1)
            Presenter().show_image(RS_P_select.detach().cpu()[i, 0:3],
                                   f"{self.domain}_rs_p_select", scale=4, waitkey=1)
            break

    self.prof.tick("transform_back")

    # If we're predicting the trajectory only on some timesteps, then for each timestep k, use the map from
    # timestep k if predicting on timestep k, otherwise use the map from timestep j - the last timestep
    # that had a trajectory prediction - rotated into the frame of timestep k.
    if select_only:
        # If we're just pre-training the trajectory prediction, don't waste time on generating the missing maps
        log_v_dist_w = log_v_dist_w_select
        v_dist_w_poses = v_dist_w_poses_select
        cam_poses = cam_poses_select
        sent_embeddings = sent_embeddings_select
    else:
        log_v_dist_w, v_dist_w_poses, log_goal_oob_score = self.map_batch_fill_missing(
            log_v_dist_w_select, v_dist_w_poses_select, plan, show="")
        # TODO: Fix this
        self.tensor_store.keep_inputs("log_both_dists_w", log_v_dist_w)
        self.prof.tick("map_fill_missing")

    return_list = [log_v_dist_w, v_dist_w_poses]
    if rl:
        internals_for_rl = {"map_coverage_w": SM_W, "map_uncoverage_w": 1 - SM_W}
        return_list.append(internals_for_rl)

    return tuple(return_list)
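# --- Illustrative sketch (not part of the original code) ---------------------------------------
# To inspect the returned log-space visitation distribution, the same spatial softmax used for
# visualization above can be applied to obtain probabilities and, for example, the most likely
# cell. Which channel holds the goal distribution is an assumption here; the actual Stage 2
# policy consumes these distributions differently.
lsfm = SpatialSoftmax2d()
v_dist_w = lsfm(log_v_dist_w.inner_distribution)   # BxCxHxW probabilities over map cells
goal_channel = v_dist_w[0, 1]                      # assumption: channel 1 is the goal distribution
goal_idx = goal_channel.view(-1).argmax().item()
goal_y, goal_x = divmod(goal_idx, goal_channel.shape[1])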