Example No. 1
class DrawStartPosOnGlobalMap(MapTransformerBase):

    def __init__(self, source_map_size, world_in_map_size, lamda=0.2):
        super(DrawStartPosOnGlobalMap, self).__init__(source_map_size, world_in_map_size)
        self.map_size = source_map_size
        self.world_size = world_in_map_size
        self.child_transformer = MapTransformerBase(source_map_size, world_in_map_size)

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.start_pose = None
        self.last_emb = None

        self.dbg_t = None
        self.seq = 0

    def init_weights(self):
        pass

    def reset(self):
        super(DrawStartPosOnGlobalMap, self).reset()
        self.start_pose = None
        self.last_emb = None
        self.child_transformer.reset()
        self.seq = 0

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.child_transformer.cuda(device)
        return self

    def get_start_poses(self, cam_poses_w, sentence_embeddings):
        # For each timestep, get the pose corresponding to the start of the instruction segment
        seq_len = len(sentence_embeddings)
        start_poses = []
        for i in range(seq_len):
            if self.last_emb is not None and (sentence_embeddings[i].data == self.last_emb).all():
                pass # Keep the same start pose since we're on the same segment
            else:
                self.last_emb = sentence_embeddings[i].data
                self.start_pose = cam_poses_w[i]
            start_poses.append(self.start_pose)
        return start_poses

    def forward(self, maps_w, sentence_embeddings, map_poses_w, cam_poses_w, show=False):
        self.prof.tick(".")
        batch_size = len(maps_w)

        # Initialize a new layer with the same spatial size as the maps, but with only one channel
        new_layer_size = list(maps_w.size())
        new_layer_size[1] = 1
        all_maps_out_w = empty_float_tensor(new_layer_size, self.is_cuda, self.cuda_device)

        start_poses = self.get_start_poses(cam_poses_w, sentence_embeddings)

        poses_img = [poses_as_to_img(as_pose, self.world_size) for as_pose in start_poses]
        #poses_img = poses_as_to_img(start_poses, self.world_size, batch_dim=True)

        for i in range(batch_size):
            x = min(max(int(poses_img[i].position.data[0]), 0), new_layer_size[2] - 1)
            y = min(max(int(poses_img[i].position.data[1]), 0), new_layer_size[3] - 1)
            all_maps_out_w[i, 0, x, y] = 10.0

        if show != "":
            Presenter().show_image(all_maps_out_w[0], show, torch=True, waitkey=1)

        self.prof.tick("draw")

        # Concatenate the start-position layer with the input maps along the channel dimension
        maps_out = torch.cat([Variable(all_maps_out_w), maps_w], dim=1)
        #all_maps_w = torch.cat(all_maps_out_w, dim=0)

        self.prof.loop()
        self.prof.print_stats(10)

        return maps_out, map_poses_w
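
The forward pass above reduces to rasterizing one start position per batch element into a fresh single-channel layer and stacking that layer onto the incoming maps. Below is a minimal, self-contained sketch of just that idea in plain PyTorch; draw_start_pos_layer and its (x, y) pixel inputs are made-up names for illustration, and the real module additionally caches poses per instruction segment, profiles, and handles CUDA placement.

import torch

def draw_start_pos_layer(maps_w, start_px):
    # maps_w: BxCxHxW batch of maps; start_px: list of (x, y) pixel tuples, one per batch element
    b, _, h, w = maps_w.shape
    layer = torch.zeros(b, 1, h, w, dtype=maps_w.dtype, device=maps_w.device)
    for i, (x, y) in enumerate(start_px):
        # Clamp to the map bounds, mirroring the min/max clamping in forward()
        xi = min(max(int(x), 0), h - 1)
        yi = min(max(int(y), 0), w - 1)
        layer[i, 0, xi, yi] = 10.0
    # Concatenate along the channel dimension, as the original forward() does
    return torch.cat([layer, maps_w], dim=1)

maps = torch.zeros(2, 3, 32, 32)
out = draw_start_pos_layer(maps, [(5, 7), (40, -3)])  # the second position gets clamped into bounds
print(out.shape)  # torch.Size([2, 4, 32, 32])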
Example No. 2
    def __init__(self, run_name="", domain="sim"):

        super(PVN_Stage1_Bidomain_Original, self).__init__()
        self.model_name = "pvn_stage1"
        self.run_name = run_name
        self.domain = domain
        self.writer = LoggingSummaryWriter(
            log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.domain}")
        #self.writer = DummySummaryWriter()

        self.root_params = get_current_parameters()["ModelPVN"]
        self.params = self.root_params["Stage1"]
        self.use_aux = self.root_params["UseAux"]
        self.aux_weights = self.root_params["AuxWeights"]

        if self.params.get("weight_override"):
            aux_weights_override_name = "AuxWeightsRealOverride" if self.domain == "real" else "AuxWeightsSimOverride"
            aux_weights_override = self.root_params.get(
                aux_weights_override_name)
            if aux_weights_override:
                print(
                    f"Overriding auxiliary weights for domain: {self.domain}")
                self.aux_weights = dict_merge(self.aux_weights,
                                              aux_weights_override)

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.tensor_store = KeyTensorStore()
        self.losses = AuxiliaryLosses()

        # Auxiliary Objectives
        self.do_perturb_maps = self.params["perturb_maps"]
        print("Perturbing maps: ", self.do_perturb_maps)

        # Path-pred FPV model definition
        # --------------------------------------------------------------------------------------------------------------

        self.num_feature_channels = self.params[
            "feature_channels"]  # + params["relevance_channels"]
        self.num_map_channels = self.params["pathpred_in_channels"]

        self.img_to_features_w = FPVToGlobalMap(
            source_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"],
            res_channels=self.params["resnet_channels"],
            map_channels=self.params["feature_channels"],
            img_w=self.params["img_w"],
            img_h=self.params["img_h"],
            cam_h_fov=self.params["cam_h_fov"],
            domain=domain,
            img_dbg=IMG_DBG)

        self.map_accumulator_w = LeakyIntegratorGlobalMap(
            source_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.add_init_pos_to_coverage = AddDroneInitPosToCoverage(
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"],
            map_size_px=self.params["local_map_size"])

        # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
        self.map_processor_grounding = LangFilterMapProcessor(
            embed_size=self.params["emb_size"],
            in_channels=self.params["feature_channels"],
            out_channels=self.params["relevance_channels"],
            spatial=False,
            cat_out=False)

        ratio_prior_channels = self.params["feature_channels"]

        # Process the global accumulated map
        self.path_predictor_lingunet = RatioPathPredictor(
            self.params["lingunet"],
            prior_channels_in=self.params["feature_channels"],
            posterior_channels_in=self.params["pathpred_in_channels"],
            dual_head=self.params["predict_confidence"],
            compute_prior=self.params["compute_prior"],
            use_prior=self.params["use_prior_only"],
            oob=self.params["clip_observability"])

        print("UNet Channels: " + str(self.num_map_channels))
        print("Feature Channels: " + str(self.num_feature_channels))

        # TODO:O Verify that config has the same randomization parameters (yaw, pos, etc)
        self.second_transform = self.do_perturb_maps or self.params[
            "predict_in_start_frame"]

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"],
            self.params["emb_layers"], self.params["emb_dropout"])

        self.map_transform_local_to_local = MapTransformer(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_global_to_local = MapTransformer(
            source_map_size=self.params["global_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_local_to_global = MapTransformer(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_s_to_p = self.map_transform_local_to_local
        self.map_transform_w_to_s = self.map_transform_global_to_local
        self.map_transform_w_to_r = self.map_transform_global_to_local
        self.map_transform_r_to_s = self.map_transform_local_to_local
        self.map_transform_r_to_w = self.map_transform_local_to_global
        self.map_transform_p_to_w = self.map_transform_local_to_global
        self.map_transform_p_to_r = self.map_transform_local_to_local

        # Batch select is used to drop and forget semantic maps at those timesteps where we're not planning
        self.batch_select = MapBatchSelect()
        # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
        # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
        self.map_batch_fill_missing = MapBatchFillMissing(
            self.params["local_map_size"], self.params["world_size_px"],
            self.params["world_size_m"])

        self.spatialsoftmax = SpatialSoftmax2d()
        self.visitation_softmax = VisitationSoftmax()

        #TODO:O Use CroppedMapToActionTriplet in Wrapper as Stage2
        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
        # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs
        # that have been saved with keep_auxiliary_input() during execution
        if self.use_aux["class_features"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("class_features",
                                 self.params["feature_channels"],
                                 self.params["num_landmarks"], 0,
                                 "fpv_features", "lm_pos_fpv", "lm_indices"))
        if self.use_aux["grounding_features"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("grounding_features",
                                 self.params["relevance_channels"], 2, 0,
                                 "fpv_features_g", "lm_pos_fpv",
                                 "lm_mentioned"))
        if self.use_aux["class_map"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("class_map", self.params["feature_channels"],
                                 self.params["num_landmarks"], 0, "S_W_select",
                                 "lm_pos_map_select", "lm_indices_select"))
        if self.use_aux["grounding_map"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("grounding_map",
                                 self.params["relevance_channels"], 2, 0,
                                 "R_W_select", "lm_pos_map_select",
                                 "lm_mentioned_select"))
        # CoRL model uses alignment-model groundings
        if self.use_aux["lang"]:
            # one output for each landmark, 2 classes per output. This is for finetuning, so use the embedding that will be fine-tuned
            self.losses.add_auxiliary(
                ClassAuxiliary("lang", self.params["emb_size"], 2,
                               self.params["num_landmarks"], "sentence_embed",
                               "lang_lm_mentioned"))

        if self.use_aux["regularize_map"]:
            self.losses.add_auxiliary(
                FeatureRegularizationAuxiliary2D("regularize_map", "l1",
                                                 "S_W_select"))

        lossfunc = self.params["path_loss_function"]
        # Note: both branches of the original if/else on clip_observability were identical, so the conditional is redundant
        self.losses.add_auxiliary(
            PathAuxiliary2D("visitation_dist", lossfunc,
                            self.params["clip_observability"],
                            "log_v_dist_s_select",
                            "v_dist_s_ground_truth_select", "SM_S_select"))

        self.goal_good_criterion = GoalPredictionGoodCriterion(
            ok_distance=self.params["world_size_px"] * 0.1)
        self.goal_acc_meter = MovingAverageMeter(10)
        self.visible_goal_acc_meter = MovingAverageMeter(10)
        self.invisible_goal_acc_meter = MovingAverageMeter(10)
        self.visible_goal_frac_meter = MovingAverageMeter(10)

        self.losses.print_auxiliary_info()

        self.total_goals = 0
        self.correct_goals = 0

        self.env_id = None
        self.env_img = None
        self.seg_idx = None
        self.prev_instruction = None
        self.seq_step = 0

        self.should_save_path_overlays = False
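
The weight-override branch above leans on a project helper, dict_merge, to overlay domain-specific auxiliary weights onto the defaults. Its exact semantics are not visible here; the sketch below shows the behaviour the call site appears to expect (a recursive merge in which the override wins), with merge_aux_weights as a hypothetical stand-in rather than the repository's implementation.

def merge_aux_weights(base, override):
    # Recursively overlay `override` onto `base`, returning a new dict.
    # Keys present in `override` win; nested dicts are merged rather than replaced wholesale.
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_aux_weights(merged[key], value)
        else:
            merged[key] = value
    return merged

defaults = {"class_map": 1.0, "lang": 0.25, "visitation_dist": 1.0}
real_override = {"lang": 0.1}
print(merge_aux_weights(defaults, real_override))
# {'class_map': 1.0, 'lang': 0.1, 'visitation_dist': 1.0}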
Example No. 3
    def train_epoch(self, train_data=None, train_envs=None, eval=False):
        if eval:
            self.model.eval()
            inference_type = "eval"
            epoch_num = self.train_epoch_num
            self.test_epoch_num += 1
        else:
            self.model.train()
            inference_type = "train"
            epoch_num = self.train_epoch_num
            self.train_epoch_num += 1
        # import pdb; pdb.set_trace()
        # print("train_envs:", train_envs) - prints like 4200 environments
        dataset = self.model.get_dataset(data=train_data,
                                         envs=train_envs,
                                         dataset_name="supervised",
                                         eval=eval)
        print("code2 dataset type:", type(dataset))
        print("dataset:", dataset)
        print("dataset env_list:", dataset.env_list)
        # TODO: Get rid of this:
        if hasattr(dataset, "set_word2token"):
            dataset.set_word2token(self.token2word, self.word2token)

        dataloader = DataLoader(dataset,
                                collate_fn=dataset.collate_fn,
                                batch_size=self.batch_size,
                                shuffle=True,
                                num_workers=0,
                                pin_memory=False,
                                timeout=0,
                                drop_last=False)

        num_samples = len(dataset)
        if num_samples == 0:
            print("DATASET HAS NO DATA!")
            return -1.0

        num_batches = int(
            (num_samples + self.batch_size - 1) / self.batch_size)

        epoch_loss = 0
        count = 0

        prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        prof.tick("out")
        #import pdb;pdb.set_trace()
        #try:
        for batch in dataloader:

            # For debugging batch_size
            # import pdb; pdb.set_trace()
            if batch is None:
                #print("None batch!")
                continue

            prof.tick("batch_load")
            # Zero gradients before each segment and initialize zero segment loss
            self.optim.zero_grad()

            #try:
            if True:

                batch_loss = self.model.sup_loss_on_batch(batch, eval)

                if type(batch_loss) == int:
                    print("Ding")

                prof.tick("forward")

                # Backprop and step
                if not eval:
                    batch_loss.backward()

                    prof.tick("backward")

                    # Changed this slightly from original for testing
                    # This is SLOW! Don't do it often
                    # TODO: Get rid of tensorboard
                    if self.batch_num % 20 == 1:
                        params = self.model.named_parameters()
                        # self.write_grad_summaries(self.writer, params, self.batch_num)
                        self.logger.scalar_summary("loss", batch_loss.item(),
                                                   self.batch_num)
                    self.batch_num += 1
                    self.optim.step()

                    prof.tick("optim")

                # Get losses as floats
                epoch_loss += batch_loss.item()
                count += 1

                sys.stdout.write("\r Batch:" + str(count) + " / " +
                                 str(num_batches) + " loss: " +
                                 str(batch_loss.item()))
                sys.stdout.flush()

                self.train_segment += 0 if eval else 1
                self.test_segment += 1 if eval else 0

                prof.tick("rep")

            prof.loop()
            prof.print_stats(10)
            #except Exception as e:
            #    print("Exception encountered during batch update")
            #    print(e)

        #except Exception as e:
        #    print("Error during epoch training")
        #    print(e)
        #    return

        if hasattr(self.model, "write_eoe_summaries"):
            self.model.write_eoe_summaries(inference_type, epoch_num)

        print("")
        epoch_loss /= (count + 1e-15)

        if hasattr(self.model, "writer"):
            self.model.writer.add_scalar(
                self.name + "/" + inference_type + "_epoch_loss", epoch_loss,
                epoch_num)

        return epoch_loss
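
train_epoch above is meant to be called once per epoch, with eval switching between optimisation and pure evaluation passes. A hypothetical driver loop, assuming a trainer object that exposes this method and two environment splits (all names here are illustrative, not the repository's API), might look like this:

# Hypothetical training driver: `trainer`, `train_envs` and `dev_envs` are placeholders
# for whatever objects the surrounding codebase actually provides.
def run_training(trainer, train_envs, dev_envs, num_epochs=20):
    best_dev_loss = float("inf")
    for epoch in range(num_epochs):
        train_loss = trainer.train_epoch(train_envs=train_envs, eval=False)
        dev_loss = trainer.train_epoch(train_envs=dev_envs, eval=True)
        print(f"epoch {epoch}: train_loss={train_loss:.4f} dev_loss={dev_loss:.4f}")
        if dev_loss < best_dev_loss:
            best_dev_loss = dev_loss
            # A checkpoint would typically be saved here, e.g. torch.save(trainer.model.state_dict(), ...)
    return best_dev_loss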
Example No. 4
class PVN_Stage1_Bidomain_Original(nn.Module):
    def __init__(self, run_name="", domain="sim"):

        super(PVN_Stage1_Bidomain_Original, self).__init__()
        self.model_name = "pvn_stage1"
        self.run_name = run_name
        self.domain = domain
        self.writer = LoggingSummaryWriter(
            log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.domain}")
        #self.writer = DummySummaryWriter()

        self.root_params = get_current_parameters()["ModelPVN"]
        self.params = self.root_params["Stage1"]
        self.use_aux = self.root_params["UseAux"]
        self.aux_weights = self.root_params["AuxWeights"]

        if self.params.get("weight_override"):
            aux_weights_override_name = "AuxWeightsRealOverride" if self.domain == "real" else "AuxWeightsSimOverride"
            aux_weights_override = self.root_params.get(
                aux_weights_override_name)
            if aux_weights_override:
                print(
                    f"Overriding auxiliary weights for domain: {self.domain}")
                self.aux_weights = dict_merge(self.aux_weights,
                                              aux_weights_override)

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.tensor_store = KeyTensorStore()
        self.losses = AuxiliaryLosses()

        # Auxiliary Objectives
        self.do_perturb_maps = self.params["perturb_maps"]
        print("Perturbing maps: ", self.do_perturb_maps)

        # Path-pred FPV model definition
        # --------------------------------------------------------------------------------------------------------------

        self.num_feature_channels = self.params[
            "feature_channels"]  # + params["relevance_channels"]
        self.num_map_channels = self.params["pathpred_in_channels"]

        self.img_to_features_w = FPVToGlobalMap(
            source_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"],
            res_channels=self.params["resnet_channels"],
            map_channels=self.params["feature_channels"],
            img_w=self.params["img_w"],
            img_h=self.params["img_h"],
            cam_h_fov=self.params["cam_h_fov"],
            domain=domain,
            img_dbg=IMG_DBG)

        self.map_accumulator_w = LeakyIntegratorGlobalMap(
            source_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.add_init_pos_to_coverage = AddDroneInitPosToCoverage(
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"],
            map_size_px=self.params["local_map_size"])

        # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
        self.map_processor_grounding = LangFilterMapProcessor(
            embed_size=self.params["emb_size"],
            in_channels=self.params["feature_channels"],
            out_channels=self.params["relevance_channels"],
            spatial=False,
            cat_out=False)

        ratio_prior_channels = self.params["feature_channels"]

        # Process the global accumulated map
        self.path_predictor_lingunet = RatioPathPredictor(
            self.params["lingunet"],
            prior_channels_in=self.params["feature_channels"],
            posterior_channels_in=self.params["pathpred_in_channels"],
            dual_head=self.params["predict_confidence"],
            compute_prior=self.params["compute_prior"],
            use_prior=self.params["use_prior_only"],
            oob=self.params["clip_observability"])

        print("UNet Channels: " + str(self.num_map_channels))
        print("Feature Channels: " + str(self.num_feature_channels))

        # TODO:O Verify that config has the same randomization parameters (yaw, pos, etc)
        self.second_transform = self.do_perturb_maps or self.params[
            "predict_in_start_frame"]

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"],
            self.params["emb_layers"], self.params["emb_dropout"])

        self.map_transform_local_to_local = MapTransformer(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_global_to_local = MapTransformer(
            source_map_size=self.params["global_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_local_to_global = MapTransformer(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_s_to_p = self.map_transform_local_to_local
        self.map_transform_w_to_s = self.map_transform_global_to_local
        self.map_transform_w_to_r = self.map_transform_global_to_local
        self.map_transform_r_to_s = self.map_transform_local_to_local
        self.map_transform_r_to_w = self.map_transform_local_to_global
        self.map_transform_p_to_w = self.map_transform_local_to_global
        self.map_transform_p_to_r = self.map_transform_local_to_local

        # Batch select is used to drop and forget semantic maps at those timesteps where we're not planning
        self.batch_select = MapBatchSelect()
        # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
        # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
        self.map_batch_fill_missing = MapBatchFillMissing(
            self.params["local_map_size"], self.params["world_size_px"],
            self.params["world_size_m"])

        self.spatialsoftmax = SpatialSoftmax2d()
        self.visitation_softmax = VisitationSoftmax()

        #TODO:O Use CroppedMapToActionTriplet in Wrapper as Stage2
        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
        # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs
        # that have been saved with keep_auxiliary_input() during execution
        if self.use_aux["class_features"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("class_features",
                                 self.params["feature_channels"],
                                 self.params["num_landmarks"], 0,
                                 "fpv_features", "lm_pos_fpv", "lm_indices"))
        if self.use_aux["grounding_features"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("grounding_features",
                                 self.params["relevance_channels"], 2, 0,
                                 "fpv_features_g", "lm_pos_fpv",
                                 "lm_mentioned"))
        if self.use_aux["class_map"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("class_map", self.params["feature_channels"],
                                 self.params["num_landmarks"], 0, "S_W_select",
                                 "lm_pos_map_select", "lm_indices_select"))
        if self.use_aux["grounding_map"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("grounding_map",
                                 self.params["relevance_channels"], 2, 0,
                                 "R_W_select", "lm_pos_map_select",
                                 "lm_mentioned_select"))
        # CoRL model uses alignment-model groundings
        if self.use_aux["lang"]:
            # one output for each landmark, 2 classes per output. This is for finetuning, so use the embedding that will be fine-tuned
            self.losses.add_auxiliary(
                ClassAuxiliary("lang", self.params["emb_size"], 2,
                               self.params["num_landmarks"], "sentence_embed",
                               "lang_lm_mentioned"))

        if self.use_aux["regularize_map"]:
            self.losses.add_auxiliary(
                FeatureRegularizationAuxiliary2D("regularize_map", "l1",
                                                 "S_W_select"))

        lossfunc = self.params["path_loss_function"]
        # Note: both branches of the original if/else on clip_observability were identical, so the conditional is redundant
        self.losses.add_auxiliary(
            PathAuxiliary2D("visitation_dist", lossfunc,
                            self.params["clip_observability"],
                            "log_v_dist_s_select",
                            "v_dist_s_ground_truth_select", "SM_S_select"))

        self.goal_good_criterion = GoalPredictionGoodCriterion(
            ok_distance=self.params["world_size_px"] * 0.1)
        self.goal_acc_meter = MovingAverageMeter(10)
        self.visible_goal_acc_meter = MovingAverageMeter(10)
        self.invisible_goal_acc_meter = MovingAverageMeter(10)
        self.visible_goal_frac_meter = MovingAverageMeter(10)

        self.losses.print_auxiliary_info()

        self.total_goals = 0
        self.correct_goals = 0

        self.env_id = None
        self.env_img = None
        self.seg_idx = None
        self.prev_instruction = None
        self.seq_step = 0

        self.should_save_path_overlays = False

    def make_picklable(self):
        self.writer = DummySummaryWriter()

    def steal_cross_domain_modules(self, other_self):
        self.iter = other_self.iter
        self.losses = other_self.losses
        self.sentence_embedding = other_self.sentence_embedding
        self.map_accumulator_w = other_self.map_accumulator_w
        self.map_processor_grounding = other_self.map_processor_grounding
        self.path_predictor_lingunet = other_self.path_predictor_lingunet
        #self.img_to_features_w = other_self.img_to_features_w

    def both_domain_parameters(self, other_self):
        # This function iterates and yields parameters from this module and the other module, but does not yield
        # shared parameters twice.
        # First yield all of the other module's parameters
        for p in other_self.parameters():
            yield p
        # Then yield all the parameters from this module that are not shared with the other one
        for p in self.img_to_features_w.parameters():
            yield p
        return

    def get_iter(self):
        return int(self.iter.data[0])

    def inc_iter(self):
        self.iter += 1

    def load_state_dict(self, state_dict, strict=True):
        super(PVN_Stage1_Bidomain_Original,
              self).load_state_dict(state_dict, strict)

    def init_weights(self):
        self.img_to_features_w.init_weights()
        self.map_accumulator_w.init_weights()
        self.sentence_embedding.init_weights()
        self.map_processor_grounding.init_weights()
        self.path_predictor_lingunet.init_weights()

    def reset(self):
        # TODO: This is error prone. Create a class StatefulModule, iterate submodules and reset all stateful modules
        self.tensor_store.reset()
        self.sentence_embedding.reset()
        self.img_to_features_w.reset()
        self.map_accumulator_w.reset()
        self.map_batch_fill_missing.reset()
        self.prev_instruction = None

    def setEnvContext(self, context):
        print("Set env context to: " + str(context))
        self.env_id = context["env_id"]
        self.env_img = env.load_env_img(self.env_id, 256, 256)
        self.env_img = self.env_img[:, :, [2, 1, 0]]

    def set_save_path_overlays(self, save_path_overlays):
        self.should_save_path_overlays = save_path_overlays

    #TODO:O Figure out what to do with save_ground_truth_overlays

    def print_metrics(self):
        print(f"Model {self.model_name}:{self.domain} metrics:")
        print(
            f"   Goal accuracy: {float(self.correct_goals) / max(self.total_goals, 1)}"
        )

    def goal_visible(self, masks, goal_pos):
        goal_mask = masks.detach()[0, 0, :, :]
        goal_pos = goal_pos[0].long().detach()
        visible = bool(
            (goal_mask[goal_pos[0], goal_pos[1]] > 0.5).detach().cpu().item())
        return visible

    # This is called before beginning an execution sequence
    def start_sequence(self):
        self.seq_step = 0
        self.reset()
        return

    def cam_poses_from_states(self, states):
        cam_pos = states[:, 9:12]
        cam_rot = states[:, 12:16]
        pose = Pose(cam_pos, cam_rot)
        return pose

    def forward(self,
                images,
                states,
                instructions,
                instr_lengths,
                plan=None,
                noisy_start_poses=None,
                start_poses=None,
                firstseg=None,
                select_only=True,
                halfway=False,
                grad_noise=False,
                rl=False):
        """
        :param images: BxCxHxW batch of images (observations)
        :param states: BxK batch of drone states
        :param instructions: BxM LongTensor where M is the maximum length of any instruction
        :param instr_lengths: list of len B of integers, indicating length of each instruction
        :param plan: list of B booleans indicating True for timesteps where we do planning and False otherwise
        :param noisy_start_poses: list of noisy start poses (for data-augmentation). These define the path-prediction frame at training time
        :param start_poses: list of drone start poses (these should be equal in practice)
        :param firstseg: list of booleans indicating True if a new segment starts at that timestep
        :param select_only: boolean indicating whether to only compute visitation distributions for planning timesteps (default True)
        :param rl: boolean indicating if we're doing reinforcement learning. If yes, output more than the visitation distribution
        :return:
        """
        cam_poses = self.cam_poses_from_states(states)
        g_poses = None  # None pose is a placeholder for the canonical global pose.
        self.prof.tick("out")

        self.tensor_store.keep_inputs("fpv", images)

        # Calculate the instruction embedding
        if instructions is not None:
            # TODO: Take batch of instructions and their lengths, return batch of embeddings. Store the last one as internal state
            # TODO: There's an assumption here that there's only a single instruction in the batch and it doesn't change
            # UNCOMMENT THE BELOW LINE TO REVERT BACK TO GENERAL CASE OF SEPARATE INSTRUCTION PER STEP
            if self.params["ignore_instruction"]:
                # If we're ignoring instructions, just feed in an instruction that consists of a single zero-token
                sent_embeddings = self.sentence_embedding(
                    torch.zeros_like(instructions[0:1, 0:1]),
                    torch.ones_like(instr_lengths[0:1]))
            else:
                sent_embeddings = self.sentence_embedding(
                    instructions[0:1], instr_lengths[0:1])
            self.tensor_store.keep_inputs("sentence_embed", sent_embeddings)
        else:
            sent_embeddings = self.sentence_embedding.get()

        self.prof.tick("embed")

        # Extract and project features onto the egocentric frame for each image
        F_W, M_W = self.img_to_features_w(images,
                                          cam_poses,
                                          sent_embeddings,
                                          self.tensor_store,
                                          show="",
                                          halfway=halfway)

        # For training the critic, this is as far as we need to proceed with the computation.
        # self.img_to_features_w has stored computed feature maps inside the tensor store, which will then be retrieved by the critic
        if halfway == True:  # Note: deliberately compare with == True, since halfway may also be a string flag (e.g. "observability" or "v2")
            return None, None

        self.tensor_store.keep_inputs("F_w", F_W)
        self.tensor_store.keep_inputs("M_w", M_W)
        self.prof.tick("img_to_map_frame")

        # Accumulate the egocentric features in a global map
        reset_mask = firstseg if self.params["clear_history"] else None

        # Consider the space very near the drone and right under it as observed - draw ones on the observability mask
        # If we treat that space as unobserved, then there's going to be a gap in the visitation distribution, which
        # makes training with RL more difficult, as there is no reward feedback if the drone doesn't cross that gap.
        if self.params.get("cover_init_pos", False):
            StartMasks_R = self.add_init_pos_to_coverage.get_init_pos_masks(
                M_W.shape[0], M_W.device)
            StartMasks_W, _ = self.map_transform_r_to_w(
                StartMasks_R, cam_poses, None)
            M_W = self.add_init_pos_to_coverage(M_W, StartMasks_W)

        S_W, SM_W = self.map_accumulator_w(F_W,
                                           M_W,
                                           reset_mask=reset_mask,
                                           show="acc" if IMG_DBG else "")
        S_W_poses = g_poses
        self.prof.tick("map_accumulate")

        # If we're training Stage 2 with imitation learning from ground truth visitation distributions, we want to
        # compute observability masks with the same code that's used in Stage 1 to avoid mistakes.
        if halfway == "observability":
            map_uncoverage_w = 1 - SM_W
            return map_uncoverage_w

        # Throw away those timesteps that don't correspond to planning timesteps
        S_W_select, SM_W_select, S_W_poses_select, cam_poses_select, noisy_start_poses_select, start_poses_select, sent_embeddings_select = \
            self.batch_select(S_W, SM_W, S_W_poses, cam_poses, noisy_start_poses, start_poses, sent_embeddings, plan)

        #maps_m_prior_select, maps_m_posterior_select = None, None

        # Only process the maps on planning timesteps
        if len(S_W_select) == 0:
            return None

        self.tensor_store.keep_inputs("S_W_select", S_W_select)
        self.prof.tick("batch_select")

        # Process the map via the two map_processors
        # Do grounding of objects in the map chosen to do so
        if self.use_aux["grounding_map"]:
            R_W_select, RS_W_poses_select = self.map_processor_grounding(
                S_W_select, sent_embeddings_select, S_W_poses_select, show="")
            self.tensor_store.keep_inputs("R_W_select", R_W_select)
            self.prof.tick("map_proc_gnd")
            # Concatenate grounding map and semantic map along channel dimension
            RS_W_select = torch.cat([S_W_select, R_W_select], 1)

        else:
            RS_W_select = S_W_select
            RS_W_poses_select = S_W_poses_select

        s_poses_select = start_poses_select if self.params[
            "predict_in_start_frame"] else cam_poses_select
        RS_S_select, RS_S_poses_select = self.map_transform_w_to_s(
            RS_W_select, RS_W_poses_select, s_poses_select)
        SM_S_select, SM_S_poses_select = self.map_transform_w_to_s(
            SM_W_select, S_W_poses_select, s_poses_select)

        assert SM_S_poses_select == RS_S_poses_select, "Masks and maps should have the same pose in start frame"

        self.tensor_store.keep_inputs("RS_S_select", RS_S_select)
        self.tensor_store.keep_inputs("SM_S_select", SM_S_select)
        self.prof.tick("transform_w_to_s")

        # Data augmentation for trajectory prediction
        map_poses_clean_select = None
        # TODO: Figure out if we can just swap out start poses for noisy poses and get rid of separate noisy poses
        if self.do_perturb_maps:
            assert noisy_start_poses_select is not None, "Noisy poses must be provided if we're perturbing maps"
            RS_P_select, RS_P_poses_select = self.map_transform_s_to_p(
                RS_S_select, RS_S_poses_select, noisy_start_poses_select)
        else:
            RS_P_select, RS_P_poses_select = RS_S_select, RS_S_poses_select

        self.tensor_store.keep_inputs("RS_perturbed_select", RS_P_select)
        self.prof.tick("map_perturb")

        sent_embeddings_pp = sent_embeddings_select

        # Run lingunet on the map to predict visitation distribution scores (pre-softmax)
        # ---------
        log_v_dist_p_select, v_dist_p_poses_select = self.path_predictor_lingunet(
            RS_P_select,
            sent_embeddings_pp,
            RS_P_poses_select,
            tensor_store=self.tensor_store)
        # ---------

        self.prof.tick("pathpred")

        # TODO: Shouldn't we be transforming probability distributions instead of scores? Otherwise OOB space will have weird values
        # Transform distributions back to world reference frame and keep them (these are the model outputs)
        both_inner_w, v_dist_w_poses_select = self.map_transform_p_to_w(
            log_v_dist_p_select.inner_distribution, v_dist_p_poses_select,
            None)
        log_v_dist_w_select = Partial2DDistribution(
            both_inner_w, log_v_dist_p_select.outer_prob_mass)
        self.tensor_store.keep_inputs("log_v_dist_w_select",
                                      log_v_dist_w_select)

        # Transform distributions back to start reference frame and keep them (for auxiliary objective)
        both_inner_s, v_dist_s_poses_select = self.map_transform_p_to_r(
            log_v_dist_p_select.inner_distribution, v_dist_p_poses_select,
            start_poses_select)
        log_v_dist_s_select = Partial2DDistribution(
            both_inner_s, log_v_dist_p_select.outer_prob_mass)
        self.tensor_store.keep_inputs("log_v_dist_s_select",
                                      log_v_dist_s_select)

        # Visualize every 23rd iteration; a prime interval means the visualizations alternate between the sim and real domains
        if self.get_iter() % 23 == 0:
            lsfm = SpatialSoftmax2d()
            for i in range(S_W_select.shape[0]):
                Presenter().show_image(S_W_select.detach().cpu()[i, 0:3],
                                       f"{self.domain}_s_w_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(lsfm(
                    log_v_dist_s_select.inner_distribution).detach().cpu()[i],
                                       f"{self.domain}_v_dist_s_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(lsfm(
                    log_v_dist_p_select.inner_distribution).detach().cpu()[i],
                                       f"{self.domain}_v_dist_p_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(RS_P_select.detach().cpu()[i, 0:3],
                                       f"{self.domain}_rs_p_select",
                                       scale=4,
                                       waitkey=1)
                break

        self.prof.tick("transform_back")

        # If we're predicting the trajectory only on some timesteps, then for each timestep k, use the map from
        # timestep k if predicting on timestep k. Otherwise use the map from timestep j - the last timestep
        # that had a trajectory prediction - rotated into the frame of timestep k.
        if select_only:
            # If we're just pre-training the trajectory prediction, don't waste time on generating the missing maps
            log_v_dist_w = log_v_dist_w_select
            v_dist_w_poses = v_dist_w_poses_select
        else:
            raise NotImplementedError("select_only must be True")

        return_list = [log_v_dist_w, v_dist_w_poses]
        if rl:
            internals_for_rl = {
                "map_coverage_w": SM_W,
                "map_uncoverage_w": 1 - SM_W
            }
            return_list.append(internals_for_rl)

        return tuple(return_list)

    def maybe_cuda(self, tensor):
        return tensor.to(next(self.parameters()).device)

    def cuda_var(self, tensor):
        return tensor.to(next(self.parameters()).device)

    def unbatch(self, batch, halfway=False):
        # Inputs
        images = self.maybe_cuda(batch["images"][0])
        seq_len = len(images)
        instructions = self.maybe_cuda(batch["instr"][0][:seq_len])
        instr_lengths = batch["instr_len"][0][:seq_len]
        states = self.maybe_cuda(batch["states"][0])

        if not halfway:

            plan_mask = batch["plan_mask"][
                0]  # True for every timestep that we do visitation prediction
            firstseg_mask = batch["firstseg_mask"][
                0]  # True for every timestep that is a new instruction segment

            # Labels (including for auxiliary losses)
            lm_pos_fpv = batch["lm_pos_fpv"][
                0]  # All object 2D coordinates in the first-person image
            lm_pos_map_m = batch["lm_pos_map"][
                0]  # All object 2D coordinates in the semantic map
            lm_indices = batch["lm_indices"][0]  # All object class indices
            goal_pos_map_m = batch["goal_loc"][
                0]  # Goal location in the world, in meters
            lm_mentioned = batch["lm_mentioned"][
                0]  # 1/0 labels whether object was mentioned/not mentioned in template instruction
            # TODO: We're taking the FIRST label here. SINGLE SEGMENT ASSUMPTION
            lang_lm_mentioned = batch["lang_lm_mentioned"][0][
                0]  # integer label indicating which object was mentioned
            start_poses = batch["start_poses"][0]
            noisy_start_poses = get_noisy_poses_torch(
                start_poses.numpy(),
                self.params["pos_variance"],
                self.params["rot_variance"],
                cuda=False,
                cuda_device=None)

            # Ground truth visitation distributions (in start and global frames)
            v_dist_w_ground_truth_select = self.maybe_cuda(
                batch["traj_ground_truth"][0])
            start_poses_select = self.batch_select.one(
                start_poses, plan_mask, v_dist_w_ground_truth_select.device)
            v_dist_s_ground_truth_select, poses_s = self.map_transform_w_to_s(
                v_dist_w_ground_truth_select, None, start_poses_select)
            #self.tensor_store.keep_inputs("v_dist_w_ground_truth_select", v_dist_w_ground_truth_select)
            self.tensor_store.keep_inputs("v_dist_s_ground_truth_select",
                                          v_dist_s_ground_truth_select)
            #Presenter().show_image(v_dist_s_ground_truth_select.detach().cpu()[0,0], "v_dist_s_ground_truth_select", waitkey=1, scale=4)
            #Presenter().show_image(v_dist_w_ground_truth_select.detach().cpu()[0,0], "v_dist_w_ground_truth_select", waitkey=1, scale=4)

            lm_pos_map_px = [
                torch.from_numpy(
                    transformations.pos_m_to_px(p.numpy(),
                                                self.params["global_map_size"],
                                                self.params["world_size_m"],
                                                self.params["world_size_px"]))
                if p is not None else None for p in lm_pos_map_m
            ]
            goal_pos_map_px = torch.from_numpy(
                transformations.pos_m_to_px(goal_pos_map_m.numpy(),
                                            self.params["global_map_size"],
                                            self.params["world_size_m"],
                                            self.params["world_size_px"]))

            resnet_factor = self.img_to_features_w.img_to_features.get_downscale_factor(
            )
            lm_pos_fpv = [
                self.cuda_var(
                    (s / resnet_factor).long()) if s is not None else None
                for s in lm_pos_fpv
            ]
            lm_indices = [
                self.cuda_var(s) if s is not None else None for s in lm_indices
            ]
            lm_mentioned = [
                self.cuda_var(s) if s is not None else None
                for s in lm_mentioned
            ]
            lang_lm_mentioned = self.cuda_var(lang_lm_mentioned)
            lm_pos_map_px = [
                self.cuda_var(s.long()) if s is not None else None
                for s in lm_pos_map_px
            ]
            goal_pos_map_px = self.cuda_var(goal_pos_map_px)

            self.tensor_store.keep_inputs("lm_pos_fpv", lm_pos_fpv)
            self.tensor_store.keep_inputs("lm_pos_map", lm_pos_map_px)
            self.tensor_store.keep_inputs("lm_indices", lm_indices)
            self.tensor_store.keep_inputs("lm_mentioned", lm_mentioned)
            self.tensor_store.keep_inputs("lang_lm_mentioned",
                                          lang_lm_mentioned)
            self.tensor_store.keep_inputs("goal_pos_map", goal_pos_map_px)

            lm_pos_map_select = [
                lm_pos for i, lm_pos in enumerate(lm_pos_map_px)
                if plan_mask[i]
            ]
            lm_indices_select = [
                lm_idx for i, lm_idx in enumerate(lm_indices) if plan_mask[i]
            ]
            lm_mentioned_select = [
                lm_m for i, lm_m in enumerate(lm_mentioned) if plan_mask[i]
            ]
            goal_pos_map_select = [
                pos for i, pos in enumerate(goal_pos_map_px) if plan_mask[i]
            ]

            self.tensor_store.keep_inputs("lm_pos_map_select",
                                          lm_pos_map_select)
            self.tensor_store.keep_inputs("lm_indices_select",
                                          lm_indices_select)
            self.tensor_store.keep_inputs("lm_mentioned_select",
                                          lm_mentioned_select)
            self.tensor_store.keep_inputs("goal_pos_map_select",
                                          goal_pos_map_select)

        # We won't need this extra information
        else:
            noisy_poses, start_poses, noisy_start_poses = None, None, None
            plan_mask, firstseg_mask = None, None

        metadata = batch["md"][0][0]
        env_id = metadata["env_id"]
        self.tensor_store.set_flag("env_id", env_id)

        return images, states, instructions, instr_lengths, plan_mask, firstseg_mask, start_poses, noisy_start_poses, metadata

    # Forward pass for training
    def sup_loss_on_batch(self,
                          batch,
                          eval,
                          halfway=False,
                          grad_noise=False,
                          disable_losses=[]):
        self.prof.tick("out")
        self.reset()

        if batch is None:
            print("Skipping None Batch")
            zero = torch.zeros([1]).float().to(next(self.parameters()).device)
            return zero, self.tensor_store

        images, states, instructions, instr_len, plan_mask, firstseg_mask, \
         start_poses, noisy_start_poses, metadata = self.unbatch(batch, halfway=halfway)
        self.prof.tick("unbatch_inputs")

        # ----------------------------------------------------------------------------
        _ = self(images,
                 states,
                 instructions,
                 instr_len,
                 plan=plan_mask,
                 firstseg=firstseg_mask,
                 noisy_start_poses=start_poses if eval else noisy_start_poses,
                 start_poses=start_poses,
                 select_only=True,
                 halfway=halfway,
                 grad_noise=grad_noise)
        # ----------------------------------------------------------------------------

        if self.should_save_path_overlays:
            self.save_path_overlays(metadata)

        # If we run the model halfway, we only need to calculate the features needed for the Wasserstein loss
        # If we want to include more features in the Wasserstein critic, we have to run the forward pass a bit further
        if halfway and not halfway == "v2":
            return None, self.tensor_store

        # The returned values are not used here - they're kept in the tensor store which is used as an input to a loss
        self.prof.tick("call")

        if not halfway:
            # Calculate goal-prediction accuracy:
            goal_pos = self.tensor_store.get_inputs_batch("goal_pos_map",
                                                          cat_not_stack=True)
            success_goal = self.goal_good_criterion(
                self.tensor_store.get_inputs_batch("log_v_dist_w_select",
                                                   cat_not_stack=True),
                goal_pos)
            acc = 1.0 if success_goal else 0.0
            self.goal_acc_meter.put(acc)
            goal_visible = self.goal_visible(
                self.tensor_store.get_inputs_batch("M_w", cat_not_stack=True),
                goal_pos)
            if goal_visible:
                self.visible_goal_acc_meter.put(acc)
            else:
                self.invisible_goal_acc_meter.put(acc)
            self.visible_goal_frac_meter.put(1.0 if goal_visible else 0.0)

            self.correct_goals += acc
            self.total_goals += 1

            self.prof.tick("goal_acc")

        if halfway == "v2":
            disable_losses = ["visitation_dist", "lang"]

        losses, metrics = self.losses.calculate_aux_loss(
            tensor_store=self.tensor_store,
            reduce_average=True,
            disable_losses=disable_losses)
        loss = self.losses.combine_losses(losses, self.aux_weights)

        self.prof.tick("calc_losses")

        prefix = self.model_name + ("/eval" if eval else "/train")
        iteration = self.get_iter()
        self.writer.add_dict(prefix, get_current_meters(), iteration)
        self.writer.add_dict(prefix, losses, iteration)
        self.writer.add_dict(prefix, metrics, iteration)

        if not halfway:
            self.writer.add_scalar(prefix + "/goal_accuracy",
                                   self.goal_acc_meter.get(), iteration)
            self.writer.add_scalar(prefix + "/visible_goal_accuracy",
                                   self.visible_goal_acc_meter.get(),
                                   iteration)
            self.writer.add_scalar(prefix + "/invisible_goal_accuracy",
                                   self.invisible_goal_acc_meter.get(),
                                   iteration)
            self.writer.add_scalar(prefix + "/visible_goal_fraction",
                                   self.visible_goal_frac_meter.get(),
                                   iteration)

        self.inc_iter()

        self.prof.tick("summaries")
        self.prof.loop()
        self.prof.print_stats(1)

        return loss, self.tensor_store

    def get_dataset(self,
                    data=None,
                    envs=None,
                    domain=None,
                    dataset_names=None,
                    dataset_prefix=None,
                    eval=False,
                    halfway_only=False):
        # TODO: Maybe use eval here
        data_sources = []
        # If we're running auxiliary objectives, we need to include the data sources for the auxiliary labels
        #if self.use_aux_class_features or self.use_aux_class_on_map or self.use_aux_grounding_features or self.use_aux_grounding_on_map:
        #if self.use_aux_goal_on_map:
        if not halfway_only:
            data_sources.append(aup.PROVIDER_LM_POS_DATA)
            data_sources.append(aup.PROVIDER_GOAL_POS)

            # Adding these in this order will compute poses with added noise and compute trajectory ground truth
            # in the reference frame of these noisy poses
            data_sources.append(aup.PROVIDER_START_POSES)

            if self.do_perturb_maps:
                print("PERTURBING MAPS!")
                # TODO: The noisy poses from the provider are not actually used!! Those should replace states instead!
                data_sources.append(aup.PROVIDER_NOISY_POSES)
                # TODO: Think this through. Perhaps we actually want dynamic ground truth given a noisy start position
                if self.params["predict_in_start_frame"]:
                    data_sources.append(
                        aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_STATIC)
                else:
                    data_sources.append(
                        aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_DYNAMIC_NOISY)
            else:
                print("NOT Perturbing Maps!")
                data_sources.append(aup.PROVIDER_NOISY_POSES)
                if self.params["predict_in_start_frame"]:
                    data_sources.append(
                        aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_STATIC)
                else:
                    data_sources.append(
                        aup.PROVIDER_TRAJECTORY_GROUND_TRUTH_DYNAMIC)

            data_sources.append(aup.PROVIDER_LANDMARKS_MENTIONED)

            templates = get_current_parameters()["Environment"]["templates"]
            if templates:
                data_sources.append(aup.PROVIDER_LANG_TEMPLATE)

        return SegmentDataset(data=data,
                              env_list=envs,
                              domain=domain,
                              dataset_names=dataset_names,
                              dataset_prefix=dataset_prefix,
                              aux_provider_names=data_sources,
                              segment_level=True)
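
Several pieces above (MapBatchSelect, the plan_mask handling in unbatch, and the *_select tensors in forward) share one idea: keep only the timesteps flagged for planning and drop the rest. The toy snippet below demonstrates that selection with a boolean mask on a plain tensor; it is a conceptual illustration only, not the repository's MapBatchSelect.

import torch

def select_planning_timesteps(maps, plan_mask):
    # maps: TxCxHxW sequence of per-timestep maps; plan_mask: length-T sequence of booleans.
    # Returns only the maps at timesteps where planning happens.
    mask = torch.as_tensor(plan_mask, dtype=torch.bool)
    return maps[mask]

seq = torch.arange(4 * 2 * 3 * 3, dtype=torch.float32).view(4, 2, 3, 3)
plan_mask = [True, False, False, True]
selected = select_planning_timesteps(seq, plan_mask)
print(selected.shape)  # torch.Size([2, 2, 3, 3])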
Example No. 5
    def __init__(self, run_name="", model_class=MODEL_RSS,
                 aux_class_features=False, aux_grounding_features=False,
                 aux_class_map=False, aux_grounding_map=False, aux_goal_map=False,
                 aux_lang=False, aux_traj=False, rot_noise=False, pos_noise=False):

        super(ModelTrajectoryTopDown, self).__init__()
        self.model_name = "sm_trajectory" + str(model_class)
        self.model_class = model_class
        print("Init model of type: ", str(model_class))
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Auxiliary Objectives
        self.use_aux_class_features = aux_class_features
        self.use_aux_grounding_features = aux_grounding_features
        self.use_aux_class_on_map = aux_class_map
        self.use_aux_grounding_on_map = aux_grounding_map
        self.use_aux_goal_on_map = aux_goal_map
        self.use_aux_lang = aux_lang
        self.use_aux_traj_on_map = aux_traj
        self.use_aux_reg_map = self.aux_weights["regularize_map"]

        self.use_rot_noise = rot_noise
        self.use_pos_noise = pos_noise


        # Path-pred FPV model definition
        # --------------------------------------------------------------------------------------------------------------

        self.img_to_features_w = FPVToGlobalMap(
            source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size=self.params["world_size_m"],
            res_channels=self.params["resnet_channels"], map_channels=self.params["feature_channels"],
            img_w=self.params["img_w"], img_h=self.params["img_h"], img_dbg=IMG_DBG)

        self.map_accumulator_w = LeakyIntegratorGlobalMap(source_map_size=self.params["global_map_size"], world_in_map_size=self.params["world_size_px"])

        # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
        if self.use_aux_grounding_on_map and not self.use_aux_grounding_features:
            self.map_processor_a_w = LangFilterMapProcessor(
                source_map_size=self.params["global_map_size"],
                world_size=self.params["world_size_px"],
                embed_size=self.params["emb_size"],
                in_channels=self.params["feature_channels"],
                out_channels=self.params["relevance_channels"],
                spatial=False, cat_out=True)
        else:
            self.map_processor_a_w = IdentityMapProcessor(source_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"])

        if self.use_aux_goal_on_map:
            self.map_processor_b_r = LangFilterMapProcessor(source_map_size=self.params["local_map_size"],
                                                            world_size=self.params["world_size_px"],
                                                            embed_size=self.params["emb_size"],
                                                            in_channels=self.params["relevance_channels"],
                                                            out_channels=self.params["goal_channels"],
                                                            spatial=True, cat_out=True)
        else:
            self.map_processor_b_r = IdentityMapProcessor(source_map_size=self.params["local_map_size"],
                                                          world_size=self.params["world_size_px"])

        pred_channels = self.params["goal_channels"] + self.params["relevance_channels"]

        # Common
        # --------------------------------------------------------------------------------------------------------------

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"])

        self.map_transform_w_to_r = MapTransformerBase(source_map_size=self.params["global_map_size"],
                                                       dest_map_size=self.params["local_map_size"],
                                                       world_size=self.params["world_size_px"])
        self.map_transform_r_to_w = MapTransformerBase(source_map_size=self.params["local_map_size"],
                                                       dest_map_size=self.params["global_map_size"],
                                                       world_size=self.params["world_size_px"])

        # Batch select is used to drop and forget semantic maps at those timestaps that we're not planning in
        self.batch_select = MapBatchSelect()
        # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
        # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
        self.map_batch_fill_missing = MapBatchFillMissing(self.params["local_map_size"], self.params["world_size_px"])

        # Passing true to freeze will freeze these weights regardless of whether they've been explicitly reloaded or not
        enable_weight_saving(self.sentence_embedding, "sentence_embedding", alwaysfreeze=False)

        # Output an action given the global semantic map
        if self.params["map_to_action"] == "downsample2":
            self.map_to_action = EgoMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"])

        elif self.params["map_to_action"] == "cropped":
            self.map_to_action = CroppedMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"]
            )

        # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training
        # and finetuned on all timesteps end-to-end
        enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True)

        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
        # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs
        # that have been saved with keep_auxiliary_input() during execution
        if aux_class_features:
            self.add_auxiliary(ClassAuxiliary2D("aux_class", None,  self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"],
                                                "fpv_features", "lm_pos_fpv", "lm_indices"))
        if aux_grounding_features:
            self.add_auxiliary(ClassAuxiliary2D("aux_ground", None, self.params["relevance_channels"], 2, self.params["dropout"],
                                                "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
        if aux_class_map:
            self.add_auxiliary(ClassAuxiliary2D("aux_class_map", self.params["world_size_px"], self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"],
                                                "map_s_w_select", "lm_pos_map_select", "lm_indices_select"))
        if aux_grounding_map:
            self.add_auxiliary(ClassAuxiliary2D("aux_grounding_map", self.params["world_size_px"], self.params["relevance_channels"], 2, self.params["dropout"],
                                                "map_a_w_select", "lm_pos_map_select", "lm_mentioned_select"))
        if aux_goal_map:
            self.add_auxiliary(GoalAuxiliary2D("aux_goal_map", self.params["goal_channels"], self.params["world_size_px"],
                                               "map_b_w", "goal_pos_map"))
        # RSS model uses templated data for landmark and side prediction
        if self.use_aux_lang and self.params["templates"]:
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm", self.params["emb_size"], self.params["num_landmarks"], 1,
                                                "sentence_embed", "lm_mentioned_tplt"))
            self.add_auxiliary(ClassAuxiliary("aux_lang_side", self.params["emb_size"], self.params["num_sides"], 1,
                                                "sentence_embed", "side_mentioned_tplt"))
        # CoRL model uses alignment-model groundings
        elif self.use_aux_lang:
            # one output for each landmark, 2 classes per output. This is for finetuning, so use the embedding that's gonna be fine tuned
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2, self.params["num_landmarks"],
                                                "sentence_embed", "lang_lm_mentioned"))
        if self.use_aux_traj_on_map:
            self.add_auxiliary(PathAuxiliary2D("aux_path", "map_b_r_select", "traj_gt_r_select"))

        if self.use_aux_reg_map:
            self.add_auxiliary(FeatureRegularizationAuxiliary2D("aux_regularize_features", None, "l1",
                                                                "map_s_w_select", "lm_pos_map_select"))

        self.goal_good_criterion = GoalPredictionGoodCriterion(ok_distance=3.2)
        self.goal_acc_meter = MovingAverageMeter(10)

        self.print_auxiliary_info()

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0
Exemplo n.º 6
0
class ModelTrajectoryTopDown(ModuleWithAuxiliaries):

    def __init__(self, run_name="", model_class=MODEL_RSS,
                 aux_class_features=False, aux_grounding_features=False,
                 aux_class_map=False, aux_grounding_map=False, aux_goal_map=False,
                 aux_lang=False, aux_traj=False, rot_noise=False, pos_noise=False):

        super(ModelTrajectoryTopDown, self).__init__()
        self.model_name = "sm_trajectory" + str(model_class)
        self.model_class = model_class
        print("Init model of type: ", str(model_class))
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Auxiliary Objectives
        self.use_aux_class_features = aux_class_features
        self.use_aux_grounding_features = aux_grounding_features
        self.use_aux_class_on_map = aux_class_map
        self.use_aux_grounding_on_map = aux_grounding_map
        self.use_aux_goal_on_map = aux_goal_map
        self.use_aux_lang = aux_lang
        self.use_aux_traj_on_map = aux_traj
        self.use_aux_reg_map = self.aux_weights["regularize_map"]

        self.use_rot_noise = rot_noise
        self.use_pos_noise = pos_noise


        # Path-pred FPV model definition
        # --------------------------------------------------------------------------------------------------------------

        self.img_to_features_w = FPVToGlobalMap(
            source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size=self.params["world_size_m"],
            res_channels=self.params["resnet_channels"], map_channels=self.params["feature_channels"],
            img_w=self.params["img_w"], img_h=self.params["img_h"], img_dbg=IMG_DBG)

        self.map_accumulator_w = LeakyIntegratorGlobalMap(source_map_size=self.params["global_map_size"], world_in_map_size=self.params["world_size_px"])

        # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
        if self.use_aux_grounding_on_map and not self.use_aux_grounding_features:
            self.map_processor_a_w = LangFilterMapProcessor(
                source_map_size=self.params["global_map_size"],
                world_size=self.params["world_size_px"],
                embed_size=self.params["emb_size"],
                in_channels=self.params["feature_channels"],
                out_channels=self.params["relevance_channels"],
                spatial=False, cat_out=True)
        else:
            self.map_processor_a_w = IdentityMapProcessor(source_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"])

        if self.use_aux_goal_on_map:
            self.map_processor_b_r = LangFilterMapProcessor(source_map_size=self.params["local_map_size"],
                                                            world_size=self.params["world_size_px"],
                                                            embed_size=self.params["emb_size"],
                                                            in_channels=self.params["relevance_channels"],
                                                            out_channels=self.params["goal_channels"],
                                                            spatial=True, cat_out=True)
        else:
            self.map_processor_b_r = IdentityMapProcessor(source_map_size=self.params["local_map_size"],
                                                          world_size=self.params["world_size_px"])

        pred_channels = self.params["goal_channels"] + self.params["relevance_channels"]

        # Common
        # --------------------------------------------------------------------------------------------------------------

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"])

        self.map_transform_w_to_r = MapTransformerBase(source_map_size=self.params["global_map_size"],
                                                       dest_map_size=self.params["local_map_size"],
                                                       world_size=self.params["world_size_px"])
        self.map_transform_r_to_w = MapTransformerBase(source_map_size=self.params["local_map_size"],
                                                       dest_map_size=self.params["global_map_size"],
                                                       world_size=self.params["world_size_px"])

        # Batch select is used to drop and forget semantic maps at those timesteps that we're not planning in
        self.batch_select = MapBatchSelect()
        # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
        # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
        self.map_batch_fill_missing = MapBatchFillMissing(self.params["local_map_size"], self.params["world_size_px"])

        # Passing true to freeze will freeze these weights regardless of whether they've been explicitly reloaded or not
        enable_weight_saving(self.sentence_embedding, "sentence_embedding", alwaysfreeze=False)

        # Output an action given the global semantic map
        if self.params["map_to_action"] == "downsample2":
            self.map_to_action = EgoMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"])

        elif self.params["map_to_action"] == "cropped":
            self.map_to_action = CroppedMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"]
            )

        # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training
        # and finetuned on all timesteps end-to-end
        enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True)

        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
        # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs
        # that have been saved with keep_auxiliary_input() during execution
        if aux_class_features:
            self.add_auxiliary(ClassAuxiliary2D("aux_class", None,  self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"],
                                                "fpv_features", "lm_pos_fpv", "lm_indices"))
        if aux_grounding_features:
            self.add_auxiliary(ClassAuxiliary2D("aux_ground", None, self.params["relevance_channels"], 2, self.params["dropout"],
                                                "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
        if aux_class_map:
            self.add_auxiliary(ClassAuxiliary2D("aux_class_map", self.params["world_size_px"], self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"],
                                                "map_s_w_select", "lm_pos_map_select", "lm_indices_select"))
        if aux_grounding_map:
            self.add_auxiliary(ClassAuxiliary2D("aux_grounding_map", self.params["world_size_px"], self.params["relevance_channels"], 2, self.params["dropout"],
                                                "map_a_w_select", "lm_pos_map_select", "lm_mentioned_select"))
        if aux_goal_map:
            self.add_auxiliary(GoalAuxiliary2D("aux_goal_map", self.params["goal_channels"], self.params["world_size_px"],
                                               "map_b_w", "goal_pos_map"))
        # RSS model uses templated data for landmark and side prediction
        if self.use_aux_lang and self.params["templates"]:
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm", self.params["emb_size"], self.params["num_landmarks"], 1,
                                                "sentence_embed", "lm_mentioned_tplt"))
            self.add_auxiliary(ClassAuxiliary("aux_lang_side", self.params["emb_size"], self.params["num_sides"], 1,
                                                "sentence_embed", "side_mentioned_tplt"))
        # CoRL model uses alignment-model groundings
        elif self.use_aux_lang:
            # one output for each landmark, 2 classes per output. This is for finetuning, so use the embedding that will be fine-tuned
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2, self.params["num_landmarks"],
                                                "sentence_embed", "lang_lm_mentioned"))
        if self.use_aux_traj_on_map:
            self.add_auxiliary(PathAuxiliary2D("aux_path", "map_b_r_select", "traj_gt_r_select"))

        if self.use_aux_reg_map:
            self.add_auxiliary(FeatureRegularizationAuxiliary2D("aux_regularize_features", None, "l1",
                                                                "map_s_w_select", "lm_pos_map_select"))

        self.goal_good_criterion = GoalPredictionGoodCriterion(ok_distance=3.2)
        self.goal_acc_meter = MovingAverageMeter(10)

        self.print_auxiliary_info()

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0

    # TODO: Try to hide these in a superclass or something. They take up a lot of space:
    def cuda(self, device=None):
        ModuleWithAuxiliaries.cuda(self, device)
        self.sentence_embedding.cuda(device)
        self.map_accumulator_w.cuda(device)
        self.map_processor_a_w.cuda(device)
        self.map_processor_b_r.cuda(device)
        self.img_to_features_w.cuda(device)
        self.map_to_action.cuda(device)
        self.action_loss.cuda(device)
        self.map_batch_fill_missing.cuda(device)
        self.map_transform_w_to_r.cuda(device)
        self.map_transform_r_to_w.cuda(device)
        self.batch_select.cuda(device)
        self.map_batch_fill_missing.cuda(device)
        return self

    def get_iter(self):
        return int(self.iter.data[0])

    def inc_iter(self):
        self.iter += 1

    def init_weights(self):
        self.img_to_features_w.init_weights()
        self.map_accumulator_w.init_weights()
        self.sentence_embedding.init_weights()
        self.map_to_action.init_weights()
        self.map_processor_a_w.init_weights()
        self.map_processor_b_r.init_weights()

    def reset(self):
        # TODO: This is error prone. Create a class StatefulModule, iterate submodules and reset all stateful modules
        super(ModelTrajectoryTopDown, self).reset()
        self.sentence_embedding.reset()
        self.img_to_features_w.reset()
        self.map_accumulator_w.reset()
        self.map_processor_a_w.reset()
        self.map_processor_b_r.reset()
        self.map_transform_w_to_r.reset()
        self.map_transform_r_to_w.reset()
        self.map_batch_fill_missing.reset()
        self.prev_instruction = None

    def setEnvContext(self, context):
        print("Set env context to: " + str(context))
        self.env_id = context["env_id"]

    def save_viz(self, images_in):
        imsave(get_viz_dir() + "fpv_" + str(self.seq_step) + ".png", images_in)
        features_cam = self.get_inputs_batch("fpv_features")[-1, 0, 0:3]
        save_tensor_as_img(features_cam, "F_c", self.env_id)
        feature_map_torch = self.get_inputs_batch("f_w")[-1, 0, 0:3]
        save_tensor_as_img(feature_map_torch, "F_w", self.env_id)
        coverage_map_torch = self.get_inputs_batch("m_w")[-1, 0, 0:3]
        save_tensor_as_img(coverage_map_torch, "M_w", self.env_id)
        semantic_map_torch = self.get_inputs_batch("map_s_w_select")[-1, 0, 0:3]
        save_tensor_as_img(semantic_map_torch, "S_w", self.env_id)
        relmap_torch = self.get_inputs_batch("map_a_w_select")[-1, 0, 0:3]
        save_tensor_as_img(relmap_torch, "R_w", self.env_id)
        relmap_r_torch = self.get_inputs_batch("map_a_r_select")[-1, 0, 0:3]
        save_tensor_as_img(relmap_r_torch, "R_r", self.env_id)
        goalmap_torch = self.get_inputs_batch("map_b_w_select")[-1, 0, 0:3]
        save_tensor_as_img(goalmap_torch, "G_w", self.env_id)
        goalmap_r_torch = self.get_inputs_batch("map_b_r_select")[-1, 0, 0:3]
        save_tensor_as_img(goalmap_r_torch, "G_r", self.env_id)

        action = self.get_inputs_batch("action")[-1].data.cpu().squeeze().numpy()
        action_fname = get_viz_dir() + "action_" + str(self.seq_step) + ".png"
        Presenter().save_action(action, action_fname, "")

    def get_action(self, state, instruction):
        """
        Given a DroneState (from PomdpInterface) and instruction, produce a numpy 4D action (x, y, theta, pstop)
        :param state: DroneState object with the raw image from the simulator
        :param instruction: Instruction tokenized according to the corpus
        #TODO: Absorb corpus within model
        :return:
        """
        # TODO: Simplify this
        self.eval()
        images_np_pure = state.image
        state_np = state.state

        #print("Act: " + debug_untokenize_instruction(instruction))

        images_np = standardize_image(images_np_pure)
        image_fpv = Variable(none_padded_seq_to_tensor([images_np]))
        state = Variable(none_padded_seq_to_tensor([state_np]))
        # Add the batch dimension

        first_step = True
        if instruction == self.prev_instruction:
            first_step = False
        self.prev_instruction = instruction

        img_in_t = image_fpv
        img_in_t.volatile = True

        instr_len = [len(instruction)] if instruction is not None else None
        instruction = torch.LongTensor(instruction).unsqueeze(0)
        instruction = cuda_var(instruction, self.is_cuda, self.cuda_device)

        state.volatile = True

        if self.is_cuda:
            if img_in_t is not None:
                img_in_t = img_in_t.cuda(self.cuda_device)
            state = state.cuda(self.cuda_device)

        step_enc = None
        plan_now = None

        self.seq_step += 1

        action = self(img_in_t, state, instruction, instr_len, plan=plan_now, pos_enc=step_enc)

        # Save materials for paper and presentation
        if False:
            self.save_viz(images_np_pure)

        output_action = action.squeeze().data.cpu().numpy()
        stop_prob = output_action[3]
        output_stop = 1 if stop_prob > 0.5 else 0
        output_action[3] = output_stop

        return output_action

    def deterministic_action(self, action_mean, action_std, stop_prob):
        batch_size = action_mean.size(0)
        action = Variable(empty_float_tensor((batch_size, 4), self.is_cuda, self.cuda_device))
        action[:, 0:3] = action_mean[:, 0:3]
        action[:, 3] = stop_prob
        return action

    def sample_action(self, action_mean, action_std, stop_prob):
        action = torch.normal(action_mean, action_std)
        stop = torch.bernoulli(stop_prob)
        return action, stop

    # This is called before beginning an execution sequence
    def start_sequence(self):
        self.seq_step = 0
        self.reset()
        print("RESET!")
        return

    # TODO: Move this somewhere and standardize
    def cam_poses_from_states(self, states):
        cam_pos = states[:, 9:12]
        cam_rot = states[:, 12:16]
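        # The slices above assume the raw state vector stores the camera position (x, y, z) in columns 9:12
        # and its orientation quaternion in columns 12:16.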

        pos_variance = 0
        rot_variance = 0
        if self.use_pos_noise:
            pos_variance = self.params["noisy_pos_variance"]
        if self.use_rot_noise:
            rot_variance = self.params["noisy_rot_variance"]

        pose = Pose(cam_pos, cam_rot)
        if self.use_pos_noise or self.use_rot_noise:
            pose = get_noisy_poses_torch(pose, pos_variance, rot_variance, cuda=self.is_cuda, cuda_device=self.cuda_device)
        return pose

    def forward(self, images, states, instructions, instr_lengths, has_obs=None, plan=None, save_maps_only=False, pos_enc=None, noisy_poses=None):
        """
        :param images: BxCxHxW batch of images (observations)
        :param states: BxK batch of drone states
        :param instructions: BxM LongTensor where M is the maximum length of any instruction
        :param instr_lengths: list of len B of integers, indicating length of each instruction
        :param has_obs: list of booleans of length B indicating whether the given element in the sequence has an observation
        :param save_maps_only: If true, will not compute actions (full model), but return the semantic maps that
            were built along the way in response to the images. This is ugly, but allows code reuse
        :return:
        """
        cam_poses = self.cam_poses_from_states(states)
        g_poses = None#[None for pose in cam_poses]
        self.prof.tick("out")

        #print("Trn: " + debug_untokenize_instruction(instructions[0].data[:instr_lengths[0]]))

        # Calculate the instruction embedding
        if instructions is not None:
            # TODO: Take batch of instructions and their lengths, return batch of embeddings. Store the last one as internal state
            sent_embeddings = self.sentence_embedding(instructions, instr_lengths)
            self.keep_inputs("sentence_embed", sent_embeddings)
        else:
            sent_embeddings = self.sentence_embedding.get()

        self.prof.tick("embed")

        # Extract and project features onto the egocentric frame for each image
        features_w, coverages_w = self.img_to_features_w(images, cam_poses, sent_embeddings, self, show="")
        self.prof.tick("img_to_map_frame")
        self.keep_inputs("f_w", features_w)
        self.keep_inputs("m_w", coverages_w)

        # Accumulate the egocentric features in a global map
        maps_w = self.map_accumulator_w(features_w, coverages_w, add_mask=has_obs, show="acc" if IMG_DBG else "")
        map_poses_w = g_poses

        # TODO: Maybe keep maps_w if necessary
        #self.keep_inputs("map_sm_local", maps_m)
        self.prof.tick("map_accumulate")

        # Throw away those timesteps that don't correspond to planning timesteps
        maps_w_select, map_poses_w_select, cam_poses_select, noisy_poses_select, _, sent_embeddings_select, pos_enc = \
            self.batch_select(maps_w, map_poses_w, cam_poses, noisy_poses, None, sent_embeddings, pos_enc, plan)

        # Only process the maps on planning timesteps
        if len(maps_w_select) > 0:
            self.keep_inputs("map_s_w_select", maps_w_select)
            self.prof.tick("batch_select")

            # Process the map via the two map_processors
            # Do grounding of objects in the map chosen to do so
            maps_w_select, map_poses_w_select = self.map_processor_a_w(maps_w_select, sent_embeddings_select, map_poses_w_select, show="")
            self.keep_inputs("map_a_w_select", maps_w_select)

            self.prof.tick("map_proc_gnd")

            self.map_transform_w_to_r.set_maps(maps_w_select, map_poses_w_select)
            maps_m_select, map_poses_m_select = self.map_transform_w_to_r.get_maps(cam_poses_select)

            self.keep_inputs("map_a_r_select", maps_w_select)
            self.prof.tick("transform_w_to_r")

            self.keep_inputs("map_a_r_perturbed_select", maps_m_select)

            self.prof.tick("map_perturb")

            # Include positional encoding for path prediction
            if pos_enc is not None:
                sent_embeddings_pp = torch.cat([sent_embeddings_select, pos_enc.unsqueeze(1)], dim=1)
            else:
                sent_embeddings_pp = sent_embeddings_select

            # Process the map via the two map_processors (e.g. predict the trajectory that we'll be taking)
            maps_m_select, map_poses_m_select = self.map_processor_b_r(maps_m_select, sent_embeddings_pp, map_poses_m_select)

            self.keep_inputs("map_b_r_select", maps_m_select)

            if True:
                self.map_transform_r_to_w.set_maps(maps_m_select, map_poses_m_select)
                maps_b_w_select, _ = self.map_transform_r_to_w.get_maps(None)
                self.keep_inputs("map_b_w_select", maps_b_w_select)

            self.prof.tick("map_proc_b")

        else:
            maps_m_select = None

        maps_m, map_poses_m = self.map_batch_fill_missing(maps_m_select, cam_poses, plan, show="")
        self.keep_inputs("map_b_r", maps_m)
        self.prof.tick("map_fill_missing")

        # Keep global maps for auxiliary objectives if necessary
        if self.input_required("map_b_w"):
            maps_b, _ = self.map_processor_b_r.get_maps(g_poses)
            self.keep_inputs("map_b_w", maps_b)

        self.prof.tick("keep_global_maps")

        if run_metadata.IS_ROLLOUT:
            pass
            #Presenter().show_image(maps_m.data[0, 0:3], "plan_map_now", torch=True, scale=4, waitkey=1)
            #Presenter().show_image(maps_w.data[0, 0:3], "sm_map_now", torch=True, scale=4, waitkey=1)
        self.prof.tick("viz")

        # Output the final action given the processed map
        action_pred = self.map_to_action(maps_m, sent_embeddings)
        out_action = self.deterministic_action(action_pred[:, 0:3], None, action_pred[:, 3])
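        # action_pred[:, 0:3] above holds the continuous (x, y, theta) action components and
        # action_pred[:, 3] the stop probability (binarized later in get_action).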

        self.keep_inputs("action", out_action)
        self.prof.tick("map_to_action")

        return out_action

    # TODO: The below two methods seem to do the same thing
    def maybe_cuda(self, tensor):
        if self.is_cuda:
            return tensor.cuda()
        else:
            return tensor

    def cuda_var(self, tensor):
        return cuda_var(tensor, self.is_cuda, self.cuda_device)

    # Forward pass for training (with batch optimizations)
    def sup_loss_on_batch(self, batch, eval):
        self.prof.tick("out")

        action_loss_total = Variable(empty_float_tensor([1], self.is_cuda, self.cuda_device))

        if batch is None:
            print("Skipping None Batch")
            return action_loss_total

        images = self.maybe_cuda(batch["images"])

        instructions = self.maybe_cuda(batch["instr"])
        instr_lengths = batch["instr_len"]
        states = self.maybe_cuda(batch["states"])
        actions = self.maybe_cuda(batch["actions"])

        # Auxiliary labels
        lm_pos_fpv = batch["lm_pos_fpv"]
        lm_pos_map = batch["lm_pos_map"]
        lm_indices = batch["lm_indices"]
        goal_pos_map = batch["goal_loc"]

        TEMPLATES = True
        if TEMPLATES:
            lm_mentioned_tplt = batch["lm_mentioned_tplt"]
            side_mentioned_tplt = batch["side_mentioned_tplt"]
        else:
            lm_mentioned = batch["lm_mentioned"]
            lang_lm_mentioned = batch["lang_lm_mentioned"]

        # stops = self.maybe_cuda(batch["stops"])
        masks = self.maybe_cuda(batch["masks"])
        # This is the first-timestep metadata
        metadata = batch["md"]

        seq_len = images.size(1)
        batch_size = images.size(0)
        count = 0
        correct_goal_count = 0
        goal_count = 0

        # Loop thru batch
        for b in range(batch_size):
            seg_idx = -1

            self.reset()

            self.prof.tick("out")
            b_seq_len = len_until_nones(metadata[b])

            # TODO: Generalize this
            # Slice the data according to the sequence length
            b_metadata = metadata[b][:b_seq_len]
            b_images = images[b][:b_seq_len]
            b_instructions = instructions[b][:b_seq_len]
            b_instr_len = instr_lengths[b][:b_seq_len]
            b_states = states[b][:b_seq_len]
            b_actions = actions[b][:b_seq_len]
            b_lm_pos_fpv = lm_pos_fpv[b][:b_seq_len]
            b_lm_pos_map = lm_pos_map[b][:b_seq_len]
            b_lm_indices = lm_indices[b][:b_seq_len]
            b_goal_pos = goal_pos_map[b][:b_seq_len]
            if not TEMPLATES:
                b_lang_lm_mentioned = lang_lm_mentioned[b][:b_seq_len]
                b_lm_mentioned = lm_mentioned[b][:b_seq_len]

            b_lm_pos_map = [self.cuda_var(s.long()) if s is not None else None for s in b_lm_pos_map]
            b_lm_pos_fpv = [self.cuda_var((s / RESNET_FACTOR).long()) if s is not None else None for s in b_lm_pos_fpv]
            b_lm_indices = [self.cuda_var(s) if s is not None else None for s in b_lm_indices]
            b_goal_pos = self.cuda_var(b_goal_pos)
            if not TEMPLATES:
                b_lang_lm_mentioned = self.cuda_var(b_lang_lm_mentioned)
                b_lm_mentioned = [self.cuda_var(s) if s is not None else None for s in b_lm_mentioned]
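            # Note: the first-person landmark pixel coordinates are divided by RESNET_FACTOR above so that
            # they index into the downsampled ResNet feature map rather than the full-resolution image.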

            # TODO: Figure out how to keep these properly. Perhaps as a whole batch is best
            # TODO: Introduce a key-value store (encapsulate instead of inherit)
            self.keep_inputs("lm_pos_fpv", b_lm_pos_fpv)
            self.keep_inputs("lm_pos_map", b_lm_pos_map)
            self.keep_inputs("lm_indices", b_lm_indices)
            self.keep_inputs("goal_pos_map", b_goal_pos)
            if not TEMPLATES:
                self.keep_inputs("lang_lm_mentioned", b_lang_lm_mentioned)
                self.keep_inputs("lm_mentioned", b_lm_mentioned)

            # TODO: Abstract all of these if-elses in a modular way once we know which ones are necessary
            if TEMPLATES:
                b_lm_mentioned_tplt = lm_mentioned_tplt[b][:b_seq_len]
                b_side_mentioned_tplt = side_mentioned_tplt[b][:b_seq_len]
                b_side_mentioned_tplt = self.cuda_var(b_side_mentioned_tplt)
                b_lm_mentioned_tplt = self.cuda_var(b_lm_mentioned_tplt)
                self.keep_inputs("lm_mentioned_tplt", b_lm_mentioned_tplt)
                self.keep_inputs("side_mentioned_tplt", b_side_mentioned_tplt)

                b_lm_mentioned = b_lm_mentioned_tplt


            b_obs_mask = [True for _ in range(b_seq_len)]
            b_plan_mask = [True for _ in range(b_seq_len)]
            b_plan_mask_t_cpu = torch.Tensor(b_plan_mask) == True
            b_plan_mask_t = self.maybe_cuda(b_plan_mask_t_cpu)
            b_pos_enc = None

            # ----------------------------------------------------------------------------
            # Optional Auxiliary Inputs
            # ----------------------------------------------------------------------------
            if self.input_required("lm_pos_map_select"):
                b_lm_pos_map_select = [lm_pos for i,lm_pos in enumerate(b_lm_pos_map) if b_plan_mask[i]]
                self.keep_inputs("lm_pos_map_select", b_lm_pos_map_select)
            if self.input_required("lm_indices_select"):
                b_lm_indices_select = [lm_idx for i,lm_idx in enumerate(b_lm_indices) if b_plan_mask[i]]
                self.keep_inputs("lm_indices_select", b_lm_indices_select)
            if self.input_required("lm_mentioned_select"):
                b_lm_mentioned_select = [lm_m for i,lm_m in enumerate(b_lm_mentioned) if b_plan_mask[i]]
                self.keep_inputs("lm_mentioned_select", b_lm_mentioned_select)

            # ----------------------------------------------------------------------------

            self.prof.tick("inputs")

            # Use a distinct name for the predictions so the ground-truth "actions" tensor is not overwritten
            actions_pred = self(b_images, b_states, b_instructions, b_instr_len,
                                has_obs=b_obs_mask, plan=b_plan_mask, pos_enc=b_pos_enc)

            action_losses, _ = self.action_loss(b_actions, actions_pred, batchreduce=False)

            self.prof.tick("call")

            action_losses = self.action_loss.batch_reduce_loss(action_losses)
            action_loss = self.action_loss.reduce_loss(action_losses)

            action_loss_total = action_loss_total + action_loss  # accumulate over the batch
            count += b_seq_len

            self.prof.tick("loss")

        action_loss_avg = action_loss_total / (count + 1e-9)
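        # (The 1e-9 above guards against division by zero if every sequence in the batch turned out empty.)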

        self.prof.tick("out")

        # Doing this at the end (outside of the per-sequence loop)
        aux_losses = self.calculate_aux_loss(reduce_average=True)
        aux_loss = self.combine_aux_losses(aux_losses, self.aux_weights)

        prefix = self.model_name + ("/eval" if eval else "/train")

        self.writer.add_dict(prefix, get_current_meters(), self.get_iter())
        self.writer.add_dict(prefix, aux_losses, self.get_iter())
        self.writer.add_scalar(prefix + "/action_loss", action_loss_avg.data.cpu()[0], self.get_iter())
        # TODO: Log value here
        self.writer.add_scalar(prefix + "/goal_accuracy", self.goal_acc_meter.get(), self.get_iter())

        self.prof.tick("auxiliaries")

        total_loss = action_loss_avg + aux_loss

        self.inc_iter()

        self.prof.tick("summaries")
        self.prof.loop()
        self.prof.print_stats(1)

        return total_loss

    def get_dataset(self, data=None, envs=None, dataset_name=None, eval=False):
        # TODO: Maybe use eval here
        #if self.fpv:
        data_sources = []
        # If we're running auxiliary objectives, we need to include the data sources for the auxiliary labels
        #if self.use_aux_class_features or self.use_aux_class_on_map or self.use_aux_grounding_features or self.use_aux_grounding_on_map:
        #if self.use_aux_goal_on_map:
        data_sources.append(aup.PROVIDER_LM_POS_DATA)
        data_sources.append(aup.PROVIDER_GOAL_POS)
        #data_sources.append(aup.PROVIDER_LANDMARKS_MENTIONED)
        data_sources.append(aup.PROVIDER_LANG_TEMPLATE)

        #if self.use_rot_noise or self.use_pos_noise:
        #    data_sources.append(aup.PROVIDER_POSE_NOISE)

        return SegmentDataset(data=data, env_list=envs, dataset_name=dataset_name, aux_provider_names=data_sources, segment_level=True)
Exemplo n.º 7
0
class ModelChaplot(ModuleWithAuxiliaries):
    def __init__(self, run_name=""):

        super(ModelChaplot, self).__init__()
        self.model_name = "chaplot"
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)
        self.trajectory_len = get_current_parameters()["Setup"]["trajectory_length"]

        self.image_module = ChaplotImageModule(
            image_emb_size=self.params["image_emb_size"],
            input_num_channels=3,
            image_height=self.params["img_w"],
            image_width=self.params["img_h"],
            using_recurrence=True)

        self.image_recurrence_module = IncrementalRecurrenceChaplotModule(
            input_emb_dim=256, output_emb_dim=256)

        self.text_module = ChaplotTextModule(
            emb_dim=self.params["word_emb_size"],
            hidden_dim=self.params["emb_size"],
            vocab_size=self.params["vocab_size"],
            image_height=2,
            image_width=6)
        # TODO: check image width and height

        self.final_module = IncrementalMultimodalChaplotModule(
            image_module=self.image_module,
            image_recurrence_module=self.image_recurrence_module,
            text_module=self.text_module,
            max_episode_length=self.trajectory_len,
            final_image_height=2,
            final_image_width=6)

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0
        self.model_state = None
        self.image_emb_seq = None
        self.state_feature = None

    # TODO: Try to hide these in a superclass or something. They take up a lot of space:
    def cuda(self, device=None):
        ModuleWithAuxiliaries.cuda(self, device)
        self.image_module.cuda(device)
        self.image_recurrence_module.cuda(device)
        self.text_module.cuda(device)
        self.final_module.cuda(device)
        self.action_loss.cuda(device)
        return self

    def get_iter(self):
        return int(self.iter.data[0])

    def inc_iter(self):
        self.iter += 1

    def init_weights(self):
        self.text_module.init_weights()
        self.image_recurrence_module.init_weights()
        self.image_module.init_weights()
        self.final_module.init_weights()

    def reset(self):
        # TODO: This is error prone. Create a class StatefulModule, iterate submodules and reset all stateful modules
        super(ModelChaplot, self).reset()
        self.seq_step = 0
        self.model_state = None
        self.image_emb_seq = None
        self.state_feature = None
        print("CHAPLOT RESET")

    def setEnvContext(self, context):
        print("Set env context to: " + str(context))
        self.env_id = context["env_id"]

    def start_segment_rollout(self, *args):
        self.reset()

    def get_action(self, state, instruction):
        """
        Given a DroneState (from PomdpInterface) and instruction, produce a numpy 4D action (x, y, theta, pstop)
        :param state: DroneState object with the raw image from the simulator
        :param instruction: Instruction tokenized according to the corpus
        #TODO: Absorb corpus within model
        :return:
        """
        # TODO: Simplify this
        self.eval()
        images_np_pure = state.image
        state_np = state.state

        #print("Act: " + debug_untokenize_instruction(instruction))

        images_np = standardize_image(images_np_pure)
        image_fpv = Variable(none_padded_seq_to_tensor([images_np]))
        state = Variable(none_padded_seq_to_tensor([state_np]))
        # Add the batch dimension

        first_step = True
        if instruction == self.prev_instruction:
            first_step = False
        self.prev_instruction = instruction

        img_in_t = image_fpv
        img_in_t.volatile = True

        instr_len = [len(instruction)] if instruction is not None else None
        for tok in instruction:
            if tok >= self.params["vocab_size"] or tok < 0:
                raise Exception("Word embeddings out of bounds")
        instruction = torch.LongTensor(instruction).unsqueeze(0)
        instruction = cuda_var(instruction, self.is_cuda, self.cuda_device)

        state.volatile = True

        if self.is_cuda:
            img_in_t = img_in_t.cuda(self.cuda_device)

        self.seq_step += 1

        action = self(img_in_t, instruction, instr_len)

        output_action = action.squeeze().data.cpu().numpy()
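        # Command a stop when the predicted stop probability exceeds 0.5, or force one when we are within
        # 5 steps of the maximum trajectory length.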
        stop_prob = output_action[3]
        output_stop = 1 if (stop_prob > 0.5
                            or self.seq_step >= self.trajectory_len - 5) else 0
        output_action[3] = output_stop

        #print("action: ", output_action)

        return output_action

    def deterministic_action(self, action_mean, action_std, stop_prob):
        batch_size = action_mean.size(0)
        action = Variable(
            empty_float_tensor((batch_size, 4), self.is_cuda,
                               self.cuda_device))
        action[:, 0:3] = action_mean[:, 0:3]
        action[:, 3] = stop_prob
        return action

    def sample_action(self, action_mean, action_std, stop_prob):
        action = torch.normal(action_mean, action_std)
        stop = torch.bernoulli(stop_prob)
        return action, stop

    # This is called before beginning an execution sequence
    def start_sequence(self):
        self.seq_step = 0
        self.reset()
        print("RESET!")
        return

    # TODO: Move this somewhere and standardize
    def cam_poses_from_states(self, states):
        cam_pos = states[:, 9:12]
        cam_rot = states[:, 12:16]
        pose = Pose(cam_pos, cam_rot)
        return pose

    def instructions_to_dipandrew(self, instructions, instr_lengths):
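        # Split the padded instruction batch into a list of per-example tensors, each trimmed to its true
        # length; this is the per-timestep format consumed by final_module in forward() below.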
        out = []
        for i in range(len(instructions)):
            instr_i = instructions[i:i + 1, 0:instr_lengths[i]]
            out.append(instr_i)
        return out

    def forward(self, images, instructions, instr_lengths):

        seq_len = len(images)

        instr_dipandrew = self.instructions_to_dipandrew(
            instructions, instr_lengths)

        # Add sequence dimension, since we're treating batches as sequences
        images = images.unsqueeze(0)

        all_actions = []
        for i in range(seq_len):
            time_in = np.asarray([self.seq_step])
            time_in = Variable(
                self.maybe_cuda(torch.from_numpy(time_in).long()))
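            # The current step index is fed to the model alongside the image and instruction, presumably so
            # the policy can condition on how far into the episode it is.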
            action_i, self.model_state = self.final_module(
                images[0:1, i:i + 1], instr_dipandrew[i], time_in,
                self.model_state)

            self.seq_step += 1
            all_actions.append(action_i)

        actions = torch.cat(all_actions, dim=0)
        return actions

    def maybe_cuda(self, tensor):
        if self.is_cuda:
            return tensor.cuda()
        else:
            return tensor

    def cuda_var(self, tensor):
        return cuda_var(tensor, self.is_cuda, self.cuda_device)

    # Forward pass for training (with batch optimizations)
    def sup_loss_on_batch(self, batch, eval):
        self.prof.tick("out")

        action_loss_total = Variable(
            empty_float_tensor([1], self.is_cuda, self.cuda_device))

        if batch is None:
            print("Skipping None Batch")
            return action_loss_total

        images = self.maybe_cuda(batch["images"])
        instructions = self.maybe_cuda(batch["instr"])
        instr_lengths = batch["instr_len"]
        actions = self.maybe_cuda(batch["actions"])

        metadata = batch["md"]

        batch_size = images.size(0)
        count = 0

        # Loop thru batch
        for b in range(batch_size):
            self.reset()
            self.prof.tick("out")
            b_seq_len = len_until_nones(metadata[b])

            # TODO: Generalize this
            # Slice the data according to the sequence length
            b_metadata = metadata[b][:b_seq_len]
            b_images = images[b][:b_seq_len]
            b_instructions = instructions[b][:b_seq_len]
            b_instr_len = instr_lengths[b][:b_seq_len]
            b_actions = actions[b][:b_seq_len]

            # ----------------------------------------------------------------------------

            self.prof.tick("inputs")

            # Use a distinct name for the predictions so the ground-truth "actions" tensor is not overwritten
            actions_pred = self(b_images, b_instructions, b_instr_len)

            action_losses, _ = self.action_loss(b_actions,
                                                actions_pred,
                                                batchreduce=False)

            self.prof.tick("call")
            action_losses = self.action_loss.batch_reduce_loss(action_losses)
            action_loss = self.action_loss.reduce_loss(action_losses)
            action_loss_total = action_loss_total + action_loss  # accumulate over the batch
            count += b_seq_len

            self.prof.tick("loss")

        action_loss_avg = action_loss_total / (count + 1e-9)

        self.prof.tick("out")

        prefix = self.model_name + ("/eval" if eval else "/train")

        self.writer.add_dict(prefix, get_current_meters(), self.get_iter())
        self.writer.add_scalar(prefix + "/action_loss",
                               action_loss_avg.data.cpu()[0], self.get_iter())

        total_loss = action_loss_avg

        self.inc_iter()

        self.prof.loop()
        self.prof.print_stats(1)

        return total_loss

    def get_dataset(self, data=None, envs=None, dataset_name=None, eval=False):
        # TODO: Maybe use eval here
        #if self.fpv:
        return SegmentDataset(data=data,
                              env_list=envs,
                              dataset_name=dataset_name,
                              aux_provider_names=[],
                              segment_level=True)
Exemplo n.º 8
0
class LeakyIntegratorGlobalMap(MapTransformerBase):
    def __init__(self,
                 source_map_size,
                 world_size_px,
                 world_size_m,
                 lamda=0.2):
        super(LeakyIntegratorGlobalMap,
              self).__init__(source_map_size, world_size_px, world_size_m)
        self.map_size_px = source_map_size
        self.world_size_px = world_size_px
        self.world_size_m = world_size_m
        self.child_transformer = MapTransformerBase(source_map_size,
                                                    world_size_px,
                                                    world_size_m)
        self.lamda = lamda

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.map_memory = []
        self.coverage_memory = []

        self.dbg_t = None
        self.seq = 0

    def init_weights(self):
        pass

    def reset(self):
        super(LeakyIntegratorGlobalMap, self).reset()
        self.map_memory = []
        self.coverage_memory = []
        self.child_transformer.reset()
        self.seq = 0

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.child_transformer.cuda(device)
        return self

    def dbg_write_extra(self, map, pose):
        if DebugWriter().should_write():
            map = map[0:1, 0:3]
            self.seq += 1
            # Initialize a transformer module
            if pose is not None:
                if self.dbg_t is None:
                    self.dbg_t = MapTransformerBase(
                        self.map_size_px, self.world_size_px,
                        self.world_size_m).to(map.device)

                # Transform the prediction to the global frame and write out to disk.
                self.dbg_t.set_map(map, pose)
                map_global, _ = self.dbg_t.get_map(None)
            else:
                map_global = map
            DebugWriter().write_img(map_global[0],
                                    "gif_overlaid",
                                    args={
                                        "world_size": self.world_size_px,
                                        "name": "sm"
                                    })

    def forward(self,
                images_w,
                coverages_w,
                add_mask=None,
                reset_mask=None,
                show=False):
        #show="li"
        self.prof.tick(".")
        batch_size = len(images_w)

        assert add_mask is None or add_mask[0] is not None, \
            "The first observation in a sequence needs to be used!"

        masked_observations_w_add = self.lamda * images_w * coverages_w
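        # Pre-scale every observation by lamda and mask it with its coverage map, so that only observed
        # pixels contribute to the integration below.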

        all_maps_out_w = []
        all_coverages_out_w = []

        self.prof.tick("maps_to_global")

        # TODO: Draw past trajectory on an extra channel of the semantic map
        # Step 2: Integrate serially in the global frame
        for i in range(batch_size):
            if len(self.map_memory) == 0 or (reset_mask is not None
                                             and reset_mask[i]):
                new_map_w = images_w[i:i + 1]
                new_map_cov_w = coverages_w[i:i + 1]

            # Allow masking of observations
            elif add_mask is None or add_mask[i]:
                # Get the current global-frame map
                map_g = self.map_memory[-1]
                map_cov_g = self.coverage_memory[-1]
                cov_w = coverages_w[i:i + 1]
                obs_cov_g = masked_observations_w_add[i:i + 1]

                # Add the observation into the map using a leaky integrator rule (TODO: Output lamda from model)
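                # Note: the map update below simplifies to
                #   map_g * (1 - self.lamda * cov_w) + self.lamda * images_w[i:i + 1] * cov_w,
                # i.e. an exponential moving average applied only where the new observation has coverage.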
                new_map_cov_w = torch.clamp(map_cov_g + cov_w, 0, 1)
                new_map_w = (1 - self.lamda) * map_g + obs_cov_g + \
                            self.lamda * map_g * (1 - cov_w)
            else:
                new_map_w = self.map_memory[-1]
                new_map_cov_w = self.coverage_memory[-1]

            self.map_memory.append(new_map_w)
            self.coverage_memory.append(new_map_cov_w)
            all_maps_out_w.append(new_map_w)
            all_coverages_out_w.append(new_map_cov_w)

            #Presenter().show_image(new_map_cov_w.data[0, 0:3], "map_cov", torch=True, scale=8, waitkey=1)
            if show != "":
                Presenter().show_image(new_map_cov_w.data[0, 0:3],
                                       show,
                                       torch=True,
                                       scale=8,
                                       waitkey=1)

        self.prof.tick("integrate")

        # Step 3: Convert all maps to local frame
        all_maps_w = torch.cat(all_maps_out_w, dim=0)
        all_coverages_out_w = torch.cat(all_coverages_out_w, dim=0)

        # Write gifs for debugging
        #self.dbg_write_extra(all_maps_w, None)

        self.prof.tick("maps_to_local")
        self.prof.loop()
        self.prof.print_stats(10)

        return all_maps_w, all_coverages_out_w
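# --------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original example): the coverage-gated leaky-integrator update used in
# LeakyIntegratorGlobalMap.forward reduces to map = map * (1 - lamda * cov) + lamda * obs * cov. The snippet
# below assumes only PyTorch and toy tensor shapes, and shows that observed pixels converge towards the
# observation while unobserved pixels are left untouched.
import torch

lamda = 0.2
map_g = torch.zeros(1, 1, 4, 4)      # running global map
obs = torch.ones(1, 1, 4, 4)         # constant observation
cov = torch.zeros(1, 1, 4, 4)
cov[..., :2] = 1.0                   # only the left half of the map is observed

for _ in range(30):
    map_g = map_g * (1 - lamda * cov) + lamda * obs * cov

print(map_g[0, 0])                   # left columns approach 1.0, right columns stay at 0.0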