Example #1
    def __init__(self,
                 source_map_size, world_size_px,
                 world_size, img_w, img_h,
                 embed_size, map_channels, gnd_channels, res_channels=32,
                 lang_filter=False, img_dbg=False):
        super(FPVToEgoMap, self).__init__(source_map_size, world_size_px)

        self.image_debug = img_dbg
        self.use_lang_filter = lang_filter

        # Process images using a resnet to get a feature map
        if self.image_debug:
            self.img_to_features = nn.MaxPool2d(8)
        else:
            # Provide enough padding so that the map is scaled down by powers of 2.
            self.img_to_features = ImgToFeatures(res_channels, map_channels)

        if self.use_lang_filter:
            self.lang_filter = MapLangSemanticFilter(embed_size, map_channels, gnd_channels)

        # Project feature maps to the global frame
        self.map_projection = PinholeCameraProjectionModule(
            source_map_size, world_size_px, world_size, source_map_size / 2, img_w, img_h)

        self.grid_sampler = GridSampler()

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        self.actual_images = None
Example #2
    def __init__(self,
                 img_in_size=256,
                 world_size_in_img=256,
                 feature_channels=32,
                 ground_channels=3,
                 embed_size=40,
                 aux_ground=False,
                 freeze=False):
        super(TopDownToEgoMap, self).__init__(img_in_size, world_size_in_img)

        # Process images using a resnet to get a feature map
        self.feature_net = ResNet13Light(feature_channels, down_pad=True)

        self.aux_ground = aux_ground
        if aux_ground:
            self.lang_filter = MapLangSemanticFilter(embed_size,
                                                     feature_channels,
                                                     ground_channels)
            enable_weight_saving(self.lang_filter,
                                 "ground_filter",
                                 alwaysfreeze=freeze)

        enable_weight_saving(self.feature_net,
                             "feature_resnet_light",
                             alwaysfreeze=freeze)
Example #3
    def __init__(self, text_embed_size, channels=16, c_out=None):
        super(ResBlockConditional, self).__init__()
        if c_out is None:
            c_out = channels
        self.c_in = channels
        self.c_out = c_out
        if self.c_in != self.c_out:
            print("WARNING: ResBlockConditional is not residual")
        self.lf = MapLangSemanticFilter(text_embed_size, channels, c_out)
Example #4
class LangFilterMapProcessor(nn.Module):
    def __init__(self,
                 embed_size,
                 in_channels,
                 out_channels,
                 spatial=False,
                 cat_out=False):
        super(LangFilterMapProcessor, self).__init__()
        self.embed_size = embed_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.cat_out = cat_out

        if spatial:
            self.lang_filter = MapLangSpatialFilter(embed_size, in_channels,
                                                    out_channels)
        else:
            self.lang_filter = MapLangSemanticFilter(embed_size, in_channels,
                                                     out_channels)

    def init_weights(self):
        self.lang_filter.init_weights()

    def forward(self,
                images,
                sentence_embeddings,
                map_poses,
                proc_mask=None,
                show=""):

        # If we are supposed to use fewer channels than the input map has, just grab the first N channels
        if images.size(1) > self.in_channels:
            images_in = images[:, 0:self.in_channels, :, :]
        else:
            images_in = images

        # Apply the language-conditioned convolutional filter
        self.lang_filter.precompute_conv_weights(sentence_embeddings)
        images_out = self.lang_filter(images_in)

        if show != "":
            Presenter().show_image(images_out.data[0, 0:3],
                                   show,
                                   torch=True,
                                   scale=4,
                                   waitkey=1)

        # If requested, concatenate with the prior input, such that the first feature maps come from the output.
        # That allows chaining these modules and slicing.
        if self.cat_out:
            images_out = torch.cat([images_out, images_in], dim=1)

        return images_out, map_poses
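
A minimal usage sketch (not part of the original example), assuming the project's LangFilterMapProcessor above and its MapLangSemanticFilter/MapLangSpatialFilter dependencies are importable; the channel counts, embedding width and tensor shapes are made-up placeholders. It illustrates the chaining described in the comment: with cat_out=True the first out_channels feature maps of the output are the filtered ones, so a downstream processor whose in_channels matches them slices exactly those.

import torch

# Hypothetical sizes, chosen only for illustration.
proc_a = LangFilterMapProcessor(embed_size=40, in_channels=32, out_channels=3, cat_out=True)
proc_b = LangFilterMapProcessor(embed_size=40, in_channels=3, out_channels=2, spatial=True)

feature_maps = torch.zeros(1, 32, 64, 64)   # BxCxHxW feature maps
sent_emb = torch.zeros(1, 40)               # sentence embeddings
poses = None                                # map poses are passed through untouched

maps_a, poses = proc_a(feature_maps, sent_emb, poses)   # (1, 3 + 32, 64, 64): [filtered | original]
maps_b, poses = proc_b(maps_a, sent_emb, poses)         # internally slices the first 3 channels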
Example #5
    def __init__(self,
                 embed_size,
                 in_channels,
                 out_channels,
                 spatial=False,
                 cat_out=False):
        super(LangFilterMapProcessor, self).__init__()
        self.embed_size = embed_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.cat_out = cat_out

        if spatial:
            self.lang_filter = MapLangSpatialFilter(embed_size, in_channels,
                                                    out_channels)
        else:
            self.lang_filter = MapLangSemanticFilter(embed_size, in_channels,
                                                     out_channels)
Example #6
class ResBlockConditional(torch.nn.Module):
    def __init__(self, text_embed_size, channels=16, c_out=None):
        super(ResBlockConditional, self).__init__()
        if c_out is None:
            c_out = channels
        self.c_in = channels
        self.c_out = c_out
        if self.c_in != self.c_out:
            print("WARNING: ResBlockConditional is not residual")
        self.lf = MapLangSemanticFilter(text_embed_size, channels, c_out)

    def cuda(self, device=None):
        super(ResBlockConditional, self).cuda()
        self.lf.cuda(device)

    def init_weights(self):
        self.lf.init_weights()

    def forward(self, images, contexts):
        self.lf.precompute_conv_weights(contexts)
        x = self.lf(images)
        if self.c_in == self.c_out:
            out = x + images
        else:
            out = x
        return out
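
A minimal usage sketch (not part of the original example), assuming ResBlockConditional above and the project's MapLangSemanticFilter are available; the sizes are illustrative placeholders. It shows the two paths of forward: a residual sum when c_in == c_out, and a plain filtered output (with the construction-time warning) otherwise.

import torch

# Illustrative sizes only; the real embedding width and channel counts come from the project config.
block_res = ResBlockConditional(text_embed_size=40, channels=16)             # c_in == c_out: residual
block_proj = ResBlockConditional(text_embed_size=40, channels=16, c_out=8)   # prints the non-residual warning

x = torch.zeros(2, 16, 32, 32)    # BxCxHxW feature maps
ctx = torch.zeros(2, 40)          # conditioning text embeddings

y_res = block_res(x, ctx)     # lf(x) + x, same shape as x
y_proj = block_proj(x, ctx)   # lf(x) only, shape (2, 8, 32, 32)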
Example #7
    def __init__(self,
                 run_name="",
                 aux_class_features=False,
                 aux_grounding_features=False,
                 aux_lang=False,
                 recurrence=False):

        super(ModelGSFPV, self).__init__()
        self.model_name = "gs_fpv" + "_mem" if recurrence else ""
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Auxiliary Objectives
        self.use_aux_class_features = aux_class_features
        self.use_aux_grounding_features = aux_grounding_features
        self.use_aux_lang = aux_lang
        self.use_recurrence = recurrence

        self.img_to_features_w = FPVToFPVMap(self.params["img_w"],
                                             self.params["img_h"],
                                             self.params["resnet_channels"],
                                             self.params["feature_channels"])

        self.lang_filter_gnd = MapLangSemanticFilter(
            self.params["emb_size"], self.params["feature_channels"],
            self.params["relevance_channels"])

        self.lang_filter_goal = MapLangSpatialFilter(
            self.params["emb_size"], self.params["relevance_channels"],
            self.params["goal_channels"])

        self.map_downsample = DownsampleResidual(
            self.params["map_to_act_channels"], 2)

        self.recurrence = RecurrentEmbedding(
            self.params["gs_fpv_feature_map_size"],
            self.params["gs_fpv_recurrence_size"])

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"],
            self.params["emb_layers"])

        in_features_size = self.params[
            "gs_fpv_feature_map_size"] + self.params["emb_size"]
        if self.use_recurrence:
            in_features_size += self.params["gs_fpv_recurrence_size"]

        self.features_to_action = DenseMlpBlock2(in_features_size,
                                                 self.params["mlp_hidden"], 4)

        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        self.add_auxiliary(
            ClassAuxiliary2D("aux_class", None,
                             self.params["feature_channels"],
                             self.params["num_landmarks"], "fpv_features",
                             "lm_pos_fpv", "lm_indices"))
        self.add_auxiliary(
            ClassAuxiliary2D("aux_ground", None,
                             self.params["relevance_channels"], 2,
                             "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
        if self.params["templates"]:
            self.add_auxiliary(
                ClassAuxiliary("aux_lang_lm", self.params["emb_size"],
                               self.params["num_landmarks"], 1,
                               "sentence_embed", "lm_mentioned_tplt"))
            self.add_auxiliary(
                ClassAuxiliary("aux_lang_side", self.params["emb_size"],
                               self.params["num_sides"], 1, "sentence_embed",
                               "side_mentioned_tplt"))
        else:
            self.add_auxiliary(
                ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2,
                               self.params["num_landmarks"], "sentence_embed",
                               "lang_lm_mentioned"))

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0
Example #8
class ModelGSFPV(nn.Module):
    def __init__(self,
                 run_name="",
                 aux_class_features=False,
                 aux_grounding_features=False,
                 aux_lang=False,
                 recurrence=False):

        super(ModelGSFPV, self).__init__()
        self.model_name = "gs_fpv" + "_mem" if recurrence else ""
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Auxiliary Objectives
        self.use_aux_class_features = aux_class_features
        self.use_aux_grounding_features = aux_grounding_features
        self.use_aux_lang = aux_lang
        self.use_recurrence = recurrence

        self.img_to_features_w = FPVToFPVMap(self.params["img_w"],
                                             self.params["img_h"],
                                             self.params["resnet_channels"],
                                             self.params["feature_channels"])

        self.lang_filter_gnd = MapLangSemanticFilter(
            self.params["emb_size"], self.params["feature_channels"],
            self.params["relevance_channels"])

        self.lang_filter_goal = MapLangSpatialFilter(
            self.params["emb_size"], self.params["relevance_channels"],
            self.params["goal_channels"])

        self.map_downsample = DownsampleResidual(
            self.params["map_to_act_channels"], 2)

        self.recurrence = RecurrentEmbedding(
            self.params["gs_fpv_feature_map_size"],
            self.params["gs_fpv_recurrence_size"])

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"],
            self.params["emb_layers"])

        in_features_size = self.params[
            "gs_fpv_feature_map_size"] + self.params["emb_size"]
        if self.use_recurrence:
            in_features_size += self.params["gs_fpv_recurrence_size"]

        self.features_to_action = DenseMlpBlock2(in_features_size,
                                                 self.params["mlp_hidden"], 4)

        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        self.add_auxiliary(
            ClassAuxiliary2D("aux_class", None,
                             self.params["feature_channels"],
                             self.params["num_landmarks"], "fpv_features",
                             "lm_pos_fpv", "lm_indices"))
        self.add_auxiliary(
            ClassAuxiliary2D("aux_ground", None,
                             self.params["relevance_channels"], 2,
                             "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
        if self.params["templates"]:
            self.add_auxiliary(
                ClassAuxiliary("aux_lang_lm", self.params["emb_size"],
                               self.params["num_landmarks"], 1,
                               "sentence_embed", "lm_mentioned_tplt"))
            self.add_auxiliary(
                ClassAuxiliary("aux_lang_side", self.params["emb_size"],
                               self.params["num_sides"], 1, "sentence_embed",
                               "side_mentioned_tplt"))
        else:
            self.add_auxiliary(
                ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2,
                               self.params["num_landmarks"], "sentence_embed",
                               "lang_lm_mentioned"))

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0

    # TODO: Try to hide these in a superclass or something. They take up a lot of space:
    def cuda(self, device=None):
        ModuleWithAuxiliaries.cuda(self, device)
        self.sentence_embedding.cuda(device)
        self.img_to_features_w.cuda(device)
        self.lang_filter_gnd.cuda(device)
        self.lang_filter_goal.cuda(device)
        self.action_loss.cuda(device)
        self.recurrence.cuda(device)
        return self

    def get_iter(self):
        return int(self.iter.data[0])

    def inc_iter(self):
        self.iter += 1

    def init_weights(self):
        self.img_to_features_w.init_weights()
        self.lang_filter_gnd.init_weights()
        self.lang_filter_goal.init_weights()
        self.sentence_embedding.init_weights()

    def reset(self):
        # TODO: This is error prone. Create a class StatefulModule, iterate submodules and reset all stateful modules
        super(ModelGSFPV, self).reset()
        self.sentence_embedding.reset()
        self.img_to_features_w.reset()
        self.recurrence.reset()
        self.prev_instruction = None
        print("GS_FPV_MEM_RESET")

    def setEnvContext(self, context):
        print("Set env context to: " + str(context))
        self.env_id = context["env_id"]

    def start_segment_rollout(self, *args):
        self.reset()

    def get_action(self, state, instruction):
        """
        Given a DroneState (from PomdpInterface) and instruction, produce a numpy 4D action (x, y, theta, pstop)
        :param state: DroneState object with the raw image from the simulator
        :param instruction: Instruction, tokenized using the corpus vocabulary
        #TODO: Absorb corpus within model
        :return:
        """
        # TODO: Simplify this
        self.eval()
        images_np_pure = state.image
        state_np = state.state

        #print("Act: " + debug_untokenize_instruction(instruction))

        images_np = standardize_image(images_np_pure)
        image_fpv = Variable(none_padded_seq_to_tensor([images_np]))
        state = Variable(none_padded_seq_to_tensor([state_np]))
        self.prev_instruction = instruction

        img_in_t = image_fpv
        img_in_t.volatile = True

        instr_len = [len(instruction)] if instruction is not None else None
        instruction = torch.LongTensor(instruction).unsqueeze(0)
        instruction = cuda_var(instruction, self.is_cuda, self.cuda_device)

        state.volatile = True

        if self.is_cuda:
            img_in_t = img_in_t.cuda(self.cuda_device)
            state = state.cuda(self.cuda_device)

        self.seq_step += 1

        action = self(img_in_t, state, instruction, instr_len)

        output_action = action.squeeze().data.cpu().numpy()
        print("action: ", output_action)

        stop_prob = output_action[3]
        output_stop = 1 if stop_prob > self.params["stop_threshold"] else 0
        output_action[3] = output_stop

        return output_action

    def deterministic_action(self, action_mean, action_std, stop_prob):
        batch_size = action_mean.size(0)
        action = Variable(
            empty_float_tensor((batch_size, 4), self.is_cuda,
                               self.cuda_device))
        action[:, 0:3] = action_mean[:, 0:3]
        action[:, 3] = stop_prob
        return action

    def sample_action(self, action_mean, action_std, stop_prob):
        action = torch.normal(action_mean, action_std)
        stop = torch.bernoulli(stop_prob)
        return action, stop

    # This is called before beginning an execution sequence
    def start_sequence(self):
        self.seq_step = 0
        self.reset()
        print("RESETTED!")
        return

    # TODO: Move this somewhere and standardize
    def cam_poses_from_states(self, states):
        cam_pos = states[:, 9:12]
        cam_rot = states[:, 12:16]
        pose = Pose(cam_pos, cam_rot)
        return pose

    def forward(self, images, states, instructions, instr_lengths):
        """
        :param images: BxCxHxW batch of images (observations)
        :param states: BxK batch of drone states
        :param instructions: BxM LongTensor where M is the maximum length of any instruction
        :param instr_lengths: list of len B of integers, indicating length of each instruction
        :return:
        """
        cam_poses = self.cam_poses_from_states(states)
        self.prof.tick("out")

        #print("Trn: " + debug_untokenize_instruction(instructions[0].data[:instr_lengths[0]]))

        # Calculate the instruction embedding
        if instructions is not None:
            # TODO: Take batch of instructions and their lengths, return batch of embeddings. Store the last one as internal state
            sent_embeddings = self.sentence_embedding(instructions,
                                                      instr_lengths)
            self.keep_inputs("sentence_embed", sent_embeddings)
        else:
            sent_embeddings = self.sentence_embedding.get()

        self.prof.tick("embed")

        seq_size = len(images)

        # Extract and project features onto the egocentric frame for each image
        fpv_features = self.img_to_features_w(images,
                                              cam_poses,
                                              sent_embeddings,
                                              self,
                                              show="")

        self.keep_inputs("fpv_features", fpv_features)
        self.prof.tick("img_to_map_frame")

        self.lang_filter_gnd.precompute_conv_weights(sent_embeddings)
        self.lang_filter_goal.precompute_conv_weights(sent_embeddings)

        gnd_features = self.lang_filter_gnd(fpv_features)
        goal_features = self.lang_filter_goal(gnd_features)

        self.keep_inputs("fpv_features_g", gnd_features)
        visual_features = torch.cat([gnd_features, goal_features], dim=1)

        lstm_in_features = visual_features.view([seq_size, 1, -1])

        catlist = [lstm_in_features.view([seq_size, -1]), sent_embeddings]

        if self.use_recurrence:
            memory_features = self.recurrence(lstm_in_features)
            catlist.append(memory_features[:, 0, :])

        action_features = torch.cat(catlist, dim=1)

        # Output the final action given the processed map
        action_pred = self.features_to_action(action_features)
        action_pred[:, 3] = torch.sigmoid(action_pred[:, 3])
        out_action = self.deterministic_action(action_pred[:, 0:3], None,
                                               action_pred[:, 3])
        self.prof.tick("map_to_action")

        return out_action

    def maybe_cuda(self, tensor):
        if self.is_cuda:
            return tensor.cuda()
        else:
            return tensor

    def cuda_var(self, tensor):
        return cuda_var(tensor, self.is_cuda, self.cuda_device)

    # Forward pass for training (with batch optimizations)
    def sup_loss_on_batch(self, batch, eval):
        self.prof.tick("out")

        action_loss_total = Variable(
            empty_float_tensor([1], self.is_cuda, self.cuda_device))

        if batch is None:
            print("Skipping None Batch")
            return action_loss_total

        images = self.maybe_cuda(batch["images"])

        instructions = self.maybe_cuda(batch["instr"])
        instr_lengths = batch["instr_len"]
        states = self.maybe_cuda(batch["states"])
        actions = self.maybe_cuda(batch["actions"])

        # Auxiliary labels
        lm_pos_fpv = batch["lm_pos_fpv"]
        lm_indices = batch["lm_indices"]
        lm_mentioned = batch["lm_mentioned"]
        lang_lm_mentioned = batch["lang_lm_mentioned"]

        templates = get_current_parameters()["Environment"]["Templates"]
        if templates:
            lm_mentioned_tplt = batch["lm_mentioned_tplt"]
            side_mentioned_tplt = batch["side_mentioned_tplt"]

        # stops = self.maybe_cuda(batch["stops"])
        masks = self.maybe_cuda(batch["masks"])
        metadata = batch["md"]

        seq_len = images.size(1)
        batch_size = images.size(0)
        count = 0
        correct_goal_count = 0
        goal_count = 0

        # Loop thru batch
        for b in range(batch_size):
            seg_idx = -1

            self.reset()

            self.prof.tick("out")
            b_seq_len = len_until_nones(metadata[b])

            # TODO: Generalize this
            # Slice the data according to the sequence length
            b_metadata = metadata[b][:b_seq_len]
            b_images = images[b][:b_seq_len]
            b_instructions = instructions[b][:b_seq_len]
            b_instr_len = instr_lengths[b][:b_seq_len]
            b_states = states[b][:b_seq_len]
            b_actions = actions[b][:b_seq_len]
            b_lm_pos_fpv = lm_pos_fpv[b][:b_seq_len]
            b_lm_indices = lm_indices[b][:b_seq_len]
            b_lm_mentioned = lm_mentioned[b][:b_seq_len]

            b_lm_pos_fpv = [
                self.cuda_var(
                    (s / RESNET_FACTOR).long()) if s is not None else None
                for s in b_lm_pos_fpv
            ]
            b_lm_indices = [
                self.cuda_var(s) if s is not None else None
                for s in b_lm_indices
            ]
            b_lm_mentioned = [
                self.cuda_var(s) if s is not None else None
                for s in b_lm_mentioned
            ]

            # TODO: Figure out how to keep these properly. Perhaps as a whole batch is best
            # TODO: Introduce a key-value store (encapsulate instead of inherit)
            self.keep_inputs("lm_pos_fpv", b_lm_pos_fpv)
            self.keep_inputs("lm_indices", b_lm_indices)
            self.keep_inputs("lm_mentioned", b_lm_mentioned)

            # TODO: Abstract all of these if-elses in a modular way once we know which ones are necessary
            if templates:
                b_lm_mentioned_tplt = lm_mentioned_tplt[b][:b_seq_len]
                b_side_mentioned_tplt = side_mentioned_tplt[b][:b_seq_len]
                b_side_mentioned_tplt = self.cuda_var(b_side_mentioned_tplt)
                b_lm_mentioned_tplt = self.cuda_var(b_lm_mentioned_tplt)
                self.keep_inputs("lm_mentioned_tplt", b_lm_mentioned_tplt)
                self.keep_inputs("side_mentioned_tplt", b_side_mentioned_tplt)
            else:
                b_lang_lm_mentioned = self.cuda_var(
                    lang_lm_mentioned[b][:b_seq_len])
                self.keep_inputs("lang_lm_mentioned", b_lang_lm_mentioned)

            # ----------------------------------------------------------------------------

            self.prof.tick("inputs")

            actions = self(b_images, b_states, b_instructions, b_instr_len)

            action_losses, _ = self.action_loss(b_actions,
                                                actions,
                                                batchreduce=False)

            self.prof.tick("call")
            action_losses = self.action_loss.batch_reduce_loss(action_losses)
            action_loss = self.action_loss.reduce_loss(action_losses)
            action_loss_total = action_loss
            count += b_seq_len

            self.prof.tick("loss")

        action_loss_avg = action_loss_total / (count + 1e-9)

        self.prof.tick("out")

        # Done at the end (outside of the per-sequence loop)
        aux_losses = self.calculate_aux_loss(reduce_average=True)
        aux_loss = self.combine_aux_losses(aux_losses, self.aux_weights)

        prefix = self.model_name + ("/eval" if eval else "/train")

        self.writer.add_dict(prefix, get_current_meters(), self.get_iter())
        self.writer.add_dict(prefix, aux_losses, self.get_iter())
        self.writer.add_scalar(prefix + "/action_loss",
                               action_loss_avg.data.cpu()[0], self.get_iter())

        self.prof.tick("auxiliaries")

        total_loss = action_loss_avg + aux_loss

        self.inc_iter()

        self.prof.tick("summaries")
        self.prof.loop()
        self.prof.print_stats(1)

        return total_loss

    def get_dataset(self,
                    data=None,
                    envs=None,
                    dataset_names=None,
                    dataset_prefix=None,
                    eval=False):
        # TODO: Maybe use eval here
        #if self.fpv:
        data_sources = []
        data_sources.append(aup.PROVIDER_LM_POS_DATA)
        data_sources.append(aup.PROVIDER_LANDMARKS_MENTIONED)

        templates = get_current_parameters()["Environment"]["Templates"]
        if templates:
            data_sources.append(aup.PROVIDER_LANG_TEMPLATE)

        return SegmentDataset(data=data,
                              env_list=envs,
                              dataset_names=dataset_names,
                              dataset_prefix=dataset_prefix,
                              aux_provider_names=data_sources,
                              segment_level=True)
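
A minimal, self-contained sketch (not from the original source) of the action post-processing performed in get_action above: the model outputs a 4D action (x, y, theta, pstop), and only the stop probability is binarized against a threshold before the action is returned.

import numpy as np

def postprocess_action(raw_action: np.ndarray, stop_threshold: float) -> np.ndarray:
    # raw_action holds (x, y, theta, pstop); the motion components stay as-is.
    action = raw_action.copy()
    action[3] = 1.0 if action[3] > stop_threshold else 0.0
    return action

# postprocess_action(np.array([0.4, 0.0, 0.1, 0.7]), stop_threshold=0.5)
# -> array([0.4, 0. , 0.1, 1. ])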
Example #9
class FPVToEgoMap(MapTransformerBase):
    def __init__(self,
                 source_map_size, world_size_px,
                 world_size, img_w, img_h,
                 embed_size, map_channels, gnd_channels, res_channels=32,
                 lang_filter=False, img_dbg=False):
        super(FPVToEgoMap, self).__init__(source_map_size, world_size_px)

        self.image_debug = img_dbg
        self.use_lang_filter = lang_filter

        # Process images using a resnet to get a feature map
        if self.image_debug:
            self.img_to_features = nn.MaxPool2d(8)
        else:
            # Provide enough padding so that the map is scaled down by powers of 2.
            self.img_to_features = ImgToFeatures(res_channels, map_channels)

        if self.use_lang_filter:
            self.lang_filter = MapLangSemanticFilter(embed_size, map_channels, gnd_channels)

        # Project feature maps to the global frame
        self.map_projection = PinholeCameraProjectionModule(
            source_map_size, world_size_px, world_size, source_map_size / 2, img_w, img_h)

        self.grid_sampler = GridSampler()

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        self.actual_images = None

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.map_projection.cuda(device)
        self.grid_sampler.cuda(device)
        self.img_to_features.cuda(device)
        if self.use_lang_filter:
            self.lang_filter.cuda(device)

    def init_weights(self):
        if not self.image_debug:
            self.img_to_features.init_weights()

    def reset(self):
        self.actual_images = None
        super(FPVToEgoMap, self).reset()

    def forward_fpv_features(self, images, sentence_embeds, parent=None):
        """
        Compute the first-person image features given the first-person images
        If grounding loss is enabled, will also return sentence_embedding conditioned image features
        :param images: images to compute features on
        :param sentence_embeds: sentence embeddings for each image
        :param parent:
        :return: features_fpv_vis - the visual features extracted using the ResNet
                 features_fpv_gnd - the grounded visual features obtained after applying a 1x1 language-conditioned conv
        """
        # Extract image features. If they've been precomputed ahead of time, just grab them by the provided index
        features_fpv_vis = self.img_to_features(images)

        if parent is not None:
            parent.keep_inputs("fpv_features", features_fpv_vis)
        self.prof.tick("feat")

        # If required, pre-process image features by grounding them in language
        if self.use_lang_filter:
            self.lang_filter.precompute_conv_weights(sentence_embeds)
            features_gnd = self.lang_filter(features_fpv_vis)
            if parent is not None:
                parent.keep_inputs("fpv_features_g", features_gnd)
            self.prof.tick("gnd")
            return features_fpv_vis, features_gnd

        return features_fpv_vis, None

    def forward(self, images, poses, sentence_embeds, parent=None, show=""):

        self.prof.tick("out")

        features_fpv_vis_only, features_fpv_gnd_only = self.forward_fpv_features(images, sentence_embeds, parent)

        # If we have grounding features, the overall features are a concatenation of grounded and non-grounded features
        if features_fpv_gnd_only is not None:
            features_fpv_all = torch.cat([features_fpv_gnd_only, features_fpv_vis_only], dim=1)
        else:
            features_fpv_all = features_fpv_vis_only

        # Project first-person view features on to the map in egocentric frame
        grid_maps = self.map_projection(poses)
        self.prof.tick("proj_map")
        features_r = self.grid_sampler(features_fpv_all, grid_maps)

        # Obtain an ego-centric map mask of where we have new information
        ones_size = list(features_fpv_all.size())
        ones_size[1] = 1
        tmp_ones = empty_float_tensor(ones_size, self.is_cuda, self.cuda_device).fill_(1.0)
        new_coverages = self.grid_sampler(tmp_ones, grid_maps)

        # Make sure that new_coverage is a 0/1 mask (grid_sampler applies bilinear interpolation)
        new_coverages = new_coverages - torch.min(new_coverages)
        new_coverages = new_coverages / torch.max(new_coverages)

        self.prof.tick("gsample")

        if show != "":
            Presenter().show_image(images.data[0, 0:3], show + "_img", torch=True, scale=1, waitkey=1)
            Presenter().show_image(features_r.data[0, 0:3], show, torch=True, scale=6, waitkey=1)
            Presenter().show_image(new_coverages.data[0], show + "_covg", torch=True, scale=6, waitkey=1)

        self.prof.loop()
        self.prof.print_stats(10)

        return features_r, new_coverages
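
A short sketch (not part of the original example) of the coverage-mask normalization done near the end of forward: the grid sampler bilinearly interpolates the all-ones map, so the result is rescaled back into the [0, 1] range with a min-max normalization (the epsilon guard here is an extra safety measure not present in the original).

import torch

def normalize_coverage(coverage: torch.Tensor, eps: float = 1e-9) -> torch.Tensor:
    # Shift the minimum to zero, then divide by the maximum so values lie in [0, 1].
    coverage = coverage - coverage.min()
    return coverage / (coverage.max() + eps)

# normalize_coverage(torch.tensor([[0.2, 0.2], [0.2, 1.0]]))
# -> tensor([[0., 0.], [0., 1.]]) (up to the epsilon)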
Example #10
    def __init__(self,
                 run_name,
                 ignore_lang=False,
                 class_loss=True,
                 ground_loss=True):
        super(ModelTopDownPathGoalPredictor, self).__init__()
        self.run_name = run_name
        self.model_name = "top_down_path_pred_pretrain"
        self.writer = SummaryWriter(log_dir="runs/" + run_name)

        self.ignore_lang = ignore_lang
        self.class_loss = class_loss
        self.ground_loss = ground_loss

        # The feature net extracts the 2D feature map from the input image.
        # The label_pool down-sizes the ground-truth labels, which are input at the same size as the input image
        # The output predicted labels are the size of the feature map
        self.feature_net = ResNet13Light(32, down_pad=True)
        self.label_pool = nn.MaxPool2d(8)

        if self.ground_loss:
            self.lang_filter = MapLangSemanticFilter(sentence_embedding_size,
                                                     32, 3)
            self.aux_ground_linear = nn.Linear(3, 2)
            enable_weight_saving(self.lang_filter, "ground_filter")
            enable_weight_saving(self.aux_ground_linear, "ground_aux_linear")

        if RESNET:
            self.unet = ResNetConditional(sentence_embedding_size, 35, 2)
        else:
            unet_c_in = 35 if self.ground_loss else 32
            unet_hc1 = 48
            unet_hb1 = 24
            self.unet = Unet5ContextualBneck(unet_c_in,
                                             2,
                                             sentence_embedding_size,
                                             hc1=unet_hc1,
                                             hb1=unet_hb1,
                                             hc2=128,
                                             split_embedding=splitemb)

        if attention:
            self.sentence_embedding = SentenceEmbeddingSelfAttention(
                word_embedding_size,
                lstm_size,
                sentence_embedding_layers,
                attention_heads=attention_heads)
        else:
            self.sentence_embedding = SentenceEmbeddingSimple(
                word_embedding_size, sentence_embedding_size,
                sentence_embedding_layers)

        self.gather2d = Gather2D()

        if self.class_loss:
            self.aux_class_linear = nn.Linear(32, 64)
            enable_weight_saving(self.aux_class_linear, "class_aux_linear")

        print("Sentence Embedding #Params: ",
              get_n_params(self.sentence_embedding))
        print("U-Net #Params: ", get_n_params(self.unet))
        print("Class auxiliary: ", self.class_loss)
        print("Ground auxiliary: ", self.ground_loss)

        # Enable saving of pre-trained weights
        enable_weight_saving(self.feature_net, "feature_resnet_light")
        enable_weight_saving(self.unet, "unet")
        enable_weight_saving(self.sentence_embedding, "sentence_embedding")

        if NLL:
            #self.mask_loss = nn.BCELoss()
            self.mask_loss = nn.NLLLoss2d()
        elif BCE:
            self.mask_loss = nn.BCEWithLogitsLoss()
        elif CE:
            self.spatialsoftmax = SpatialSoftmax2d()
            self.mask_loss = CrossEntropy2d()
        else:
            self.mask_loss = nn.MSELoss()

        self.aux_loss = nn.CrossEntropyLoss(reduce=True, size_average=True)
        self.epoch_numbers = {"train": 0, "eval": 0}
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.dropout = nn.Dropout(0.5)
        self.dropout2d = nn.Dropout2d(0.5)
        self.dropout3d = nn.Dropout3d(0.5)

        self.viz_images = []
        self.instructions = []
Example #11
class ModelTopDownPathGoalPredictor(CudaModule):
    def __init__(self,
                 run_name,
                 ignore_lang=False,
                 class_loss=True,
                 ground_loss=True):
        super(ModelTopDownPathGoalPredictor, self).__init__()
        self.run_name = run_name
        self.model_name = "top_down_path_pred_pretrain"
        self.writer = SummaryWriter(log_dir="runs/" + run_name)

        self.ignore_lang = ignore_lang
        self.class_loss = class_loss
        self.ground_loss = ground_loss

        # The feature net extracts the 2D feature map from the input image.
        # The label_pool down-sizes the ground-truth labels, which are input at the same size as the input image
        # The output predicted labels are the size of the feature map
        self.feature_net = ResNet13Light(32, down_pad=True)
        self.label_pool = nn.MaxPool2d(8)

        if self.ground_loss:
            self.lang_filter = MapLangSemanticFilter(sentence_embedding_size,
                                                     32, 3)
            self.aux_ground_linear = nn.Linear(3, 2)
            enable_weight_saving(self.lang_filter, "ground_filter")
            enable_weight_saving(self.aux_ground_linear, "ground_aux_linear")

        if RESNET:
            self.unet = ResNetConditional(sentence_embedding_size, 35, 2)
        else:
            unet_c_in = 35 if self.ground_loss else 32
            unet_hc1 = 48
            unet_hb1 = 24
            self.unet = Unet5ContextualBneck(unet_c_in,
                                             2,
                                             sentence_embedding_size,
                                             hc1=unet_hc1,
                                             hb1=unet_hb1,
                                             hc2=128,
                                             split_embedding=splitemb)

        if attention:
            self.sentence_embedding = SentenceEmbeddingSelfAttention(
                word_embedding_size,
                lstm_size,
                sentence_embedding_layers,
                attention_heads=attention_heads)
        else:
            self.sentence_embedding = SentenceEmbeddingSimple(
                word_embedding_size, sentence_embedding_size,
                sentence_embedding_layers)

        self.gather2d = Gather2D()

        if self.class_loss:
            self.aux_class_linear = nn.Linear(32, 64)
            enable_weight_saving(self.aux_class_linear, "class_aux_linear")

        print("Sentence Embedding #Params: ",
              get_n_params(self.sentence_embedding))
        print("U-Net #Params: ", get_n_params(self.unet))
        print("Class auxiliary: ", self.class_loss)
        print("Ground auxiliary: ", self.ground_loss)

        # Enable saving of pre-trained weights
        enable_weight_saving(self.feature_net, "feature_resnet_light")
        enable_weight_saving(self.unet, "unet")
        enable_weight_saving(self.sentence_embedding, "sentence_embedding")

        if NLL:
            #self.mask_loss = nn.BCELoss()
            self.mask_loss = nn.NLLLoss2d()
        elif BCE:
            self.mask_loss = nn.BCEWithLogitsLoss()
        elif CE:
            self.spatialsoftmax = SpatialSoftmax2d()
            self.mask_loss = CrossEntropy2d()
        else:
            self.mask_loss = nn.MSELoss()

        self.aux_loss = nn.CrossEntropyLoss(reduce=True, size_average=True)
        self.epoch_numbers = {"train": 0, "eval": 0}
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.dropout = nn.Dropout(0.5)
        self.dropout2d = nn.Dropout2d(0.5)
        self.dropout3d = nn.Dropout3d(0.5)

        self.viz_images = []
        self.instructions = []

    def get_iter(self):
        return int(self.iter.data[0])

    def inc_iter(self):
        self.iter += 1

    def init_weights(self):
        self.sentence_embedding.init_weights()
        self.unet.init_weights()
        if self.ground_loss:
            self.aux_ground_linear.weight.data.normal_(0.001)
            self.aux_ground_linear.bias.data.fill_(0)
        if self.class_loss:
            self.aux_class_linear.weight.data.normal_(0.001)
            self.aux_class_linear.bias.data.fill_(0)

    def cuda(self, device=None):
        CudaModule.cuda(self, device)
        self.sentence_embedding.cuda(device)
        self.unet.cuda(device)
        if self.ground_loss:
            self.lang_filter.cuda(device)
        return self

    def write_eoe_summaries(self, inference_type, epoch_num):
        pass

    def write_summaries(self, prefix, idx, total_loss, main_loss, emb_loss,
                        class_loss, gnd_loss):
        full_prefix = self.model_name + "/" + prefix + "/"
        if self.writer is None:
            return

        self.writer.add_scalar(full_prefix + "total_loss", total_loss.data[0],
                               idx)
        self.writer.add_scalar(full_prefix + "main_loss", main_loss.data[0],
                               idx)
        self.writer.add_scalar(full_prefix + "class_loss", class_loss.data[0],
                               idx)
        if class_loss is not None:
            self.writer.add_scalar(full_prefix + "emb_loss", emb_loss.data[0],
                                   idx)
        if gnd_loss is not None:
            self.writer.add_scalar(full_prefix + "gnd_loss", gnd_loss.data[0],
                                   idx)

    def get_dataset(self,
                    data=None,
                    envs=None,
                    eval=False,
                    dataset_name=None,
                    seg_level=True):
        return TopDownDataset(env_list=envs,
                              instr_negatives=False,
                              instr_negatives_similar_only=False,
                              seg_level=seg_level,
                              yaw_rand_range=0.0 if eval else YAW_RANGE,
                              img_w=512,
                              img_h=512,
                              map_w=256,
                              map_h=256,
                              incl_path=True,
                              incl_endpoint=True)

    def get_viz(self):
        presenter = Presenter()
        out = {"viz_img": []}
        for i, img in enumerate(self.viz_images):
            instruction = self.instructions[i]
            if len(instruction.view([-1])) < 2:
                instruction = [0]
            else:
                instruction = list(instruction.data.cpu().numpy().squeeze())
            instruction_str = debug_untokenize_instruction(instruction)
            viz_img = presenter.overlay_text(img, instruction_str)
            out["viz_img"].append(viz_img)
        return out

    def forward(self, images, instructions, instruction_masks):
        emb = self.sentence_embedding(instructions,
                                      torch.sum(instruction_masks, 1))

        # If the embedding returns an internal auxiliary loss, pass it along
        emb_loss = cuda_var(torch.zeros([1]), self.is_cuda, self.cuda_device)
        if type(emb) is tuple:
            emb, emb_loss = emb

        feature_map = self.feature_net(images)
        feature_map = self.dropout2d(feature_map)

        if self.ground_loss:
            self.lang_filter.precompute_conv_weights(emb)
            ground_map = self.lang_filter(feature_map)
            feature_map = torch.cat([feature_map, ground_map], dim=1)

        # TODO: Testing breaking of gradients between ResNet and UNet
        if cut_gradients:
            feature_map_fwd = Variable(feature_map.data)
        else:
            feature_map_fwd = feature_map

        #if self.ground_loss:
        #    feature_map_fwd = feature_map_fwd[:, 0:3, :, :]

        pred_mask = self.unet(feature_map_fwd, emb)

        return pred_mask, feature_map, emb_loss

    def sup_loss_on_batch(self, batch, eval=False, viz=False):

        if eval:
            self.eval()
        else:
            self.train()

        images = cuda_var(batch["images"], self.is_cuda, self.cuda_device)
        instructions = cuda_var(batch["instr"], self.is_cuda, self.cuda_device)
        instruction_masks = cuda_var(batch["instr_mask"], self.is_cuda,
                                     self.cuda_device)
        label_masks = cuda_var(batch["traj_labels"], self.is_cuda,
                               self.cuda_device)

        # Each of the above is a list of lists of tensors, where the outer list is over the batch and the inner list
        # is over the segments. Loop through and accumulate loss for each batch element sequentially, and for each segment.
        # Reset model state (embedding etc.) between batches, but not between segments.
        # We don't process each batch in batch mode, because the varying number of segments makes that complicated.

        batch_size = len(images)
        total_class_loss = Variable(empty_float_tensor([1], self.is_cuda,
                                                       self.cuda_device),
                                    requires_grad=True)
        total_ground_loss = Variable(empty_float_tensor([1], self.is_cuda,
                                                        self.cuda_device),
                                     requires_grad=True)
        count = 0

        label_masks = self.label_pool(label_masks)
        mask_pred, features, emb_loss = self(images, instructions,
                                             instruction_masks)

        if BCE:
            mask_pred_flat = mask_pred.view(-1, 1)
            label_masks_flat = label_masks - torch.min(label_masks)
            label_masks_flat = label_masks_flat / (
                torch.max(label_masks_flat) + 1e-9)
            label_masks_flat = label_masks_flat.view(-1, 1).clamp(0, 1)
            main_loss = self.mask_loss(mask_pred_flat, label_masks_flat)

        elif NLL:
            mask_pred_1 = F.softmax(mask_pred, 1, _stacklevel=5)
            mask_pred_2 = 1 - mask_pred_1
            mask_pred_1 = mask_pred_1.unsqueeze(1)
            mask_pred_2 = mask_pred_2.unsqueeze(1)
            mask_pred = torch.cat((mask_pred_1, mask_pred_2), dim=1)
            label_masks = label_masks.clamp(0, 1)
            if self.is_cuda:
                label_masks = label_masks.type(torch.cuda.LongTensor)
            else:
                label_masks = label_masks.type(torch.LongTensor)
            main_loss = self.mask_loss(mask_pred, label_masks)

        elif CE:
            # Crossentropy2D internally applies logsoftmax to mask_pred,
            # but labels are already assumed to be a valid probability distribution, so no softmax is applied
            main_loss = self.mask_loss(mask_pred, label_masks)
            # So for nice plotting, we must manually do it
            mask_pred = self.spatialsoftmax(mask_pred)
        else:
            main_loss = self.mask_loss(mask_pred, label_masks)

        # sum emb loss if batch size > 1
        if type(emb_loss) == tuple:
            emb_loss = sum(emb_loss)

        # Extract the feature vectors corresponding to every landmark's location in the map
        # Apply a linear layer to classify which of the 64 landmarks it is
        # The landmark positions have to be divided by the same factor as the ResNet scaling factor
        lcount = 0
        for i in range(batch_size):
            if self.class_loss and len(batch["lm_pos"][i]) > 0:
                lcount += 1
                landmark_pos = cuda_var(batch["lm_pos"][i], self.is_cuda,
                                        self.cuda_device)
                landmark_indices = cuda_var(batch["lm_indices"][i],
                                            self.is_cuda, self.cuda_device)
                landmark_coords = (landmark_pos / 8).long()
                lm_features = self.gather2d(features[i:i + 1, 0:32],
                                            landmark_coords)
                lm_pred = self.aux_class_linear(lm_features)
                class_loss = self.aux_loss(lm_pred, landmark_indices)
                total_class_loss = total_class_loss + class_loss

            if self.ground_loss and len(batch["lm_pos"][i]) > 0:
                landmark_pos = cuda_var(batch["lm_pos"][i], self.is_cuda,
                                        self.cuda_device)
                landmark_mentioned = cuda_var(batch["lm_mentioned"][i],
                                              self.is_cuda, self.cuda_device)
                landmark_coords = (landmark_pos / 8).long()
                g_features = self.gather2d(features[i:i + 1, 32:35],
                                           landmark_coords)
                lm_pred = self.aux_ground_linear(g_features)
                ground_loss = self.aux_loss(lm_pred, landmark_mentioned)
                total_ground_loss = total_ground_loss + ground_loss

        total_class_loss = total_class_loss / (lcount + 1e-9)
        total_ground_loss = total_ground_loss / (lcount + 1e-9)
        count += 1

        # Just visualization and debugging code
        if self.get_iter() % 50 == 0:
            presenter = Presenter()
            pred_viz_np = presenter.overlaid_image(images[0].data,
                                                   mask_pred[0].data)
            labl_viz_np = presenter.overlaid_image(images[0].data,
                                                   label_masks[0].data)
            comp = np.concatenate((pred_viz_np, labl_viz_np), axis=1)
            presenter.show_image(comp, "path_pred")

            if hasattr(self.sentence_embedding, "save_att_map"):
                self.sentence_embedding.save_att_map(self.get_iter(), i)

        total_loss = main_loss + 0.1 * total_class_loss + 0.001 * emb_loss + 0.1 * total_ground_loss
        total_loss = total_loss / (count + 1e-9)

        self.write_summaries("eval" if eval else "train", self.get_iter(),
                             total_loss, main_loss, emb_loss, total_class_loss,
                             total_ground_loss)
        self.inc_iter()

        return total_loss
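
A small sketch (not from the original source) of the landmark-coordinate scaling used by the class and grounding auxiliaries above: landmark positions are given at input-image resolution and are divided by the ResNet down-scaling factor (8, matching the MaxPool2d(8) label_pool) before indexing into the feature map.

import torch

def to_feature_coords(landmark_pos: torch.Tensor, resnet_factor: int = 8) -> torch.Tensor:
    # Integer feature-map cell containing each landmark pixel position.
    return (landmark_pos / resnet_factor).long()

# A landmark at input pixel (130, 250) lands in feature cell (16, 31).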
Example #12
class TopDownToEgoMap(MapTransformerBase):
    def __init__(self,
                 img_in_size=256,
                 world_size_in_img=256,
                 feature_channels=32,
                 ground_channels=3,
                 embed_size=40,
                 aux_ground=False,
                 freeze=False):
        super(TopDownToEgoMap, self).__init__(img_in_size, world_size_in_img)

        # Process images using a resnet to get a feature map
        self.feature_net = ResNet13Light(feature_channels, down_pad=True)

        self.aux_ground = aux_ground
        if aux_ground:
            self.lang_filter = MapLangSemanticFilter(embed_size,
                                                     feature_channels,
                                                     ground_channels)
            enable_weight_saving(self.lang_filter,
                                 "ground_filter",
                                 alwaysfreeze=freeze)

        enable_weight_saving(self.feature_net,
                             "feature_resnet_light",
                             alwaysfreeze=freeze)

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.map_affine.cuda(device)
        if self.aux_ground:
            self.lang_filter.cuda(device)
        return self

    def init_weights(self):
        self.feature_net.init_weights()

    def forward(self, image_g, pose, sentence_embed, parent=None, show=""):

        # scale to 0-1 range
        #image_g = image_g - torch.min(image_g)
        #image_g = image_g / (torch.max(image_g) + 1e-9)

        # rotate to robot frame
        # TODO: Temporarily changed to local pose
        self.set_map(image_g, pose)
        image_r, _ = self.get_map(pose)
        """
        # normalize mean-0 std-1
        image_r = image_r - torch.mean(image_r)
        image_r = image_r / (torch.std(image_r) + 1e-9)

        ones = torch.ones_like(image_g)
        self.set_map(ones, None)
        cov_r, _ = self.get_map(pose)
        cov_r = cov_r - torch.min(cov_r)
        cov_r /= (torch.max(cov_r) + 1e-9)
        cov_rl = cov_r > 1e-8

        blackcolor = torch.min(image_g)

        #image_r[cov_rl] = blackcolor
        """

        features_r = self.feature_net(image_r)

        if parent is not None:
            parent.keep_inputs("fpv_features", features_r)

        if self.aux_ground:
            self.lang_filter.precompute_conv_weights(sentence_embed)
            features_g = self.lang_filter(features_r)
            if parent is not None:
                parent.keep_inputs("fpv_features_g", features_g)

            features_all = torch.cat([features_g, features_r], dim=1)
        else:
            features_all = features_r

        coverage = torch.ones_like(features_all)

        if show != "":
            Presenter().show_image(image_r.data[0, 0:3],
                                   show + "_img",
                                   torch=True,
                                   scale=1,
                                   waitkey=20)
            Presenter().show_image(features_r.data[0, 0:3],
                                   show,
                                   torch=True,
                                   scale=12,
                                   waitkey=20)
            #Presenter().show_image(cov_r.data[0, 0:3], show+ "_convg", torch=True, scale=1, waitkey=20)

        return features_all, coverage