class LangFilterMapProcessor(nn.Module):
    def __init__(self, embed_size, in_channels, out_channels, spatial=False, cat_out=False):
        super(LangFilterMapProcessor, self).__init__()
        self.embed_size = embed_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.cat_out = cat_out

        if spatial:
            self.lang_filter = MapLangSpatialFilter(embed_size, in_channels, out_channels)
        else:
            self.lang_filter = MapLangSemanticFilter(embed_size, in_channels, out_channels)

    def init_weights(self):
        self.lang_filter.init_weights()

    def forward(self, images, sentence_embeddings, map_poses, proc_mask=None, show=""):
        # If we are supposed to use fewer channels than the input map has, just grab the first N channels
        if images.size(1) > self.in_channels:
            images_in = images[:, 0:self.in_channels, :, :]
        else:
            images_in = images

        # Apply the language-conditioned convolutional filter
        self.lang_filter.precompute_conv_weights(sentence_embeddings)
        images_out = self.lang_filter(images_in)

        if show != "":
            Presenter().show_image(images_out.data[0, 0:3], show, torch=True, scale=4, waitkey=1)

        # If requested, concatenate with the prior input, such that the first feature maps come from the output.
        # That allows chaining these modules and slicing.
        if self.cat_out:
            images_out = torch.cat([images_out, images_in], dim=1)

        return images_out, map_poses
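# Illustrative usage sketch (not part of the original module): shows the intended calling
# pattern of LangFilterMapProcessor. The batch size, channel counts, embedding size and the
# zero-filled tensors below are placeholders; torch is assumed to be imported at module level.
def _example_lang_filter_map_processor_usage():
    proc = LangFilterMapProcessor(embed_size=40, in_channels=32, out_channels=3, spatial=False, cat_out=False)
    proc.init_weights()
    feature_maps = torch.zeros(2, 32, 64, 64)   # B x in_channels x H x W top-down feature maps
    sentence_embeddings = torch.zeros(2, 40)    # B x embed_size instruction embeddings
    map_poses = None                            # poses are passed through unchanged
    out_maps, out_poses = proc(feature_maps, sentence_embeddings, map_poses)
    return out_maps.size()                      # expected: (2, 3, 64, 64) since cat_out=False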
class ResBlockConditional(torch.nn.Module):
    def __init__(self, text_embed_size, channels=16, c_out=None):
        super(ResBlockConditional, self).__init__()
        if c_out is None:
            c_out = channels
        self.c_in = channels
        self.c_out = c_out
        if self.c_in != self.c_out:
            print("WARNING: ResBlockConditional is not residual")
        self.lf = MapLangSemanticFilter(text_embed_size, channels, c_out)

    def cuda(self, device=None):
        super(ResBlockConditional, self).cuda()
        self.lf.cuda(device)

    def init_weights(self):
        self.lf.init_weights()

    def forward(self, images, contexts):
        self.lf.precompute_conv_weights(contexts)
        x = self.lf(images)
        if self.c_in == self.c_out:
            out = x + images
        else:
            out = x
        return out
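# Illustrative usage sketch (not part of the original module): ResBlockConditional applies a
# language-conditioned filter and, when the input and output channel counts match, adds the
# input back as a residual connection. The tensor shapes below are placeholders.
def _example_res_block_conditional_usage():
    block = ResBlockConditional(text_embed_size=40, channels=16)
    block.init_weights()
    images = torch.zeros(2, 16, 32, 32)     # B x channels x H x W feature maps
    contexts = torch.zeros(2, 40)           # B x text_embed_size conditioning vectors
    out = block(images, contexts)           # residual path: out = filter(images) + images
    return out.size()                       # expected: (2, 16, 32, 32)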
class ModelGSFPV(ModuleWithAuxiliaries):

    def __init__(self, run_name="",
                 aux_class_features=False, aux_grounding_features=False, aux_lang=False, recurrence=False):
        super(ModelGSFPV, self).__init__()
        self.model_name = "gs_fpv" + ("_mem" if recurrence else "")
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Auxiliary Objectives
        self.use_aux_class_features = aux_class_features
        self.use_aux_grounding_features = aux_grounding_features
        self.use_aux_lang = aux_lang
        self.use_recurrence = recurrence

        self.img_to_features_w = FPVToFPVMap(self.params["img_w"], self.params["img_h"],
                                             self.params["resnet_channels"], self.params["feature_channels"])

        self.lang_filter_gnd = MapLangSemanticFilter(self.params["emb_size"], self.params["feature_channels"],
                                                     self.params["relevance_channels"])

        self.lang_filter_goal = MapLangSpatialFilter(self.params["emb_size"], self.params["relevance_channels"],
                                                     self.params["goal_channels"])

        self.map_downsample = DownsampleResidual(self.params["map_to_act_channels"], 2)

        self.recurrence = RecurrentEmbedding(self.params["gs_fpv_feature_map_size"],
                                             self.params["gs_fpv_recurrence_size"])

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"])

        in_features_size = self.params["gs_fpv_feature_map_size"] + self.params["emb_size"]
        if self.use_recurrence:
            in_features_size += self.params["gs_fpv_recurrence_size"]

        self.features_to_action = DenseMlpBlock2(in_features_size, self.params["mlp_hidden"], 4)

        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------
        self.add_auxiliary(ClassAuxiliary2D("aux_class", None, self.params["feature_channels"],
                                            self.params["num_landmarks"],
                                            "fpv_features", "lm_pos_fpv", "lm_indices"))
        self.add_auxiliary(ClassAuxiliary2D("aux_ground", None, self.params["relevance_channels"], 2,
                                            "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
        if self.params["templates"]:
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm", self.params["emb_size"],
                                              self.params["num_landmarks"], 1,
                                              "sentence_embed", "lm_mentioned_tplt"))
            self.add_auxiliary(ClassAuxiliary("aux_lang_side", self.params["emb_size"],
                                              self.params["num_sides"], 1,
                                              "sentence_embed", "side_mentioned_tplt"))
        else:
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2,
                                              self.params["num_landmarks"],
                                              "sentence_embed", "lang_lm_mentioned"))

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0

    # TODO: Try to hide these in a superclass or something. They take up a lot of space:
    def cuda(self, device=None):
        ModuleWithAuxiliaries.cuda(self, device)
        self.sentence_embedding.cuda(device)
        self.img_to_features_w.cuda(device)
        self.lang_filter_gnd.cuda(device)
        self.lang_filter_goal.cuda(device)
        self.action_loss.cuda(device)
        self.recurrence.cuda(device)
        return self

    def get_iter(self):
        return int(self.iter.data[0])

    def inc_iter(self):
        self.iter += 1

    def init_weights(self):
        self.img_to_features_w.init_weights()
        self.lang_filter_gnd.init_weights()
        self.lang_filter_goal.init_weights()
        self.sentence_embedding.init_weights()

    def reset(self):
        # TODO: This is error prone. Create a class StatefulModule, iterate submodules and reset all stateful modules
        super(ModelGSFPV, self).reset()
        self.sentence_embedding.reset()
        self.img_to_features_w.reset()
        self.recurrence.reset()
        self.prev_instruction = None
        print("GS_FPV_MEM_RESET")

    def setEnvContext(self, context):
        print("Set env context to: " + str(context))
        self.env_id = context["env_id"]

    def start_segment_rollout(self, *args):
        self.reset()

    def get_action(self, state, instruction):
        """
        Given a DroneState (from PomdpInterface) and instruction, produce a numpy 4D action (x, y, theta, pstop)
        :param state: DroneState object with the raw image from the simulator
        :param instruction: Tokenized instruction given the corpus
        #TODO: Absorb corpus within model
        :return:
        """
        # TODO: Simplify this
        self.eval()
        images_np_pure = state.image
        state_np = state.state

        #print("Act: " + debug_untokenize_instruction(instruction))

        images_np = standardize_image(images_np_pure)
        image_fpv = Variable(none_padded_seq_to_tensor([images_np]))
        state = Variable(none_padded_seq_to_tensor([state_np]))
        self.prev_instruction = instruction

        img_in_t = image_fpv
        img_in_t.volatile = True

        instr_len = [len(instruction)] if instruction is not None else None
        instruction = torch.LongTensor(instruction).unsqueeze(0)
        instruction = cuda_var(instruction, self.is_cuda, self.cuda_device)

        state.volatile = True

        if self.is_cuda:
            img_in_t = img_in_t.cuda(self.cuda_device)
            state = state.cuda(self.cuda_device)

        self.seq_step += 1

        action = self(img_in_t, state, instruction, instr_len)

        output_action = action.squeeze().data.cpu().numpy()
        print("action: ", output_action)

        stop_prob = output_action[3]
        output_stop = 1 if stop_prob > self.params["stop_threshold"] else 0
        output_action[3] = output_stop

        return output_action

    def deterministic_action(self, action_mean, action_std, stop_prob):
        batch_size = action_mean.size(0)
        action = Variable(empty_float_tensor((batch_size, 4), self.is_cuda, self.cuda_device))
        action[:, 0:3] = action_mean[:, 0:3]
        action[:, 3] = stop_prob
        return action

    def sample_action(self, action_mean, action_std, stop_prob):
        action = torch.normal(action_mean, action_std)
        stop = torch.bernoulli(stop_prob)
        return action, stop

    # This is called before beginning an execution sequence
    def start_sequence(self):
        self.seq_step = 0
        self.reset()
        print("RESETTED!")
        return

    # TODO: Move this somewhere and standardize
    def cam_poses_from_states(self, states):
        cam_pos = states[:, 9:12]
        cam_rot = states[:, 12:16]
        pose = Pose(cam_pos, cam_rot)
        return pose

    def forward(self, images, states, instructions, instr_lengths):
        """
        :param images: BxCxHxW batch of images (observations)
        :param states: BxK batch of drone states
        :param instructions: BxM LongTensor where M is the maximum length of any instruction
        :param instr_lengths: list of len B of integers, indicating length of each instruction
        :param has_obs: list of booleans of length B indicating whether the given element in the sequence has an observation
        :param yield_semantic_maps: If true, will not compute actions (full model), but return the semantic maps that
            were built along the way in response to the images. This is ugly, but allows code reuse
        :return:
        """
        cam_poses = self.cam_poses_from_states(states)
        self.prof.tick("out")

        #print("Trn: " + debug_untokenize_instruction(instructions[0].data[:instr_lengths[0]]))

        # Calculate the instruction embedding
        if instructions is not None:
            # TODO: Take batch of instructions and their lengths, return batch of embeddings.
            # Store the last one as internal state
            sent_embeddings = self.sentence_embedding(instructions, instr_lengths)
            self.keep_inputs("sentence_embed", sent_embeddings)
        else:
            sent_embeddings = self.sentence_embedding.get()

        self.prof.tick("embed")

        seq_size = len(images)

        # Extract and project features onto the egocentric frame for each image
        fpv_features = self.img_to_features_w(images, cam_poses, sent_embeddings, self, show="")

        self.keep_inputs("fpv_features", fpv_features)
        self.prof.tick("img_to_map_frame")

        self.lang_filter_gnd.precompute_conv_weights(sent_embeddings)
        self.lang_filter_goal.precompute_conv_weights(sent_embeddings)

        gnd_features = self.lang_filter_gnd(fpv_features)
        goal_features = self.lang_filter_goal(gnd_features)

        self.keep_inputs("fpv_features_g", gnd_features)
        visual_features = torch.cat([gnd_features, goal_features], dim=1)

        lstm_in_features = visual_features.view([seq_size, 1, -1])

        catlist = [lstm_in_features.view([seq_size, -1]), sent_embeddings]

        if self.use_recurrence:
            memory_features = self.recurrence(lstm_in_features)
            catlist.append(memory_features[:, 0, :])

        action_features = torch.cat(catlist, dim=1)

        # Output the final action given the processed map
        action_pred = self.features_to_action(action_features)
        action_pred[:, 3] = torch.sigmoid(action_pred[:, 3])
        out_action = self.deterministic_action(action_pred[:, 0:3], None, action_pred[:, 3])

        self.prof.tick("map_to_action")

        return out_action

    def maybe_cuda(self, tensor):
        if self.is_cuda:
            return tensor.cuda()
        else:
            return tensor

    def cuda_var(self, tensor):
        return cuda_var(tensor, self.is_cuda, self.cuda_device)

    # Forward pass for training (with batch optimizations)
    def sup_loss_on_batch(self, batch, eval):
        self.prof.tick("out")

        action_loss_total = Variable(empty_float_tensor([1], self.is_cuda, self.cuda_device))

        if batch is None:
            print("Skipping None Batch")
            return action_loss_total

        images = self.maybe_cuda(batch["images"])
        instructions = self.maybe_cuda(batch["instr"])
        instr_lengths = batch["instr_len"]
        states = self.maybe_cuda(batch["states"])
        actions = self.maybe_cuda(batch["actions"])

        # Auxiliary labels
        lm_pos_fpv = batch["lm_pos_fpv"]
        lm_indices = batch["lm_indices"]
        lm_mentioned = batch["lm_mentioned"]
        lang_lm_mentioned = batch["lang_lm_mentioned"]

        templates = get_current_parameters()["Environment"]["Templates"]
        if templates:
            lm_mentioned_tplt = batch["lm_mentioned_tplt"]
            side_mentioned_tplt = batch["side_mentioned_tplt"]

        # stops = self.maybe_cuda(batch["stops"])
        masks = self.maybe_cuda(batch["masks"])
        metadata = batch["md"]

        seq_len = images.size(1)
        batch_size = images.size(0)
        count = 0
        correct_goal_count = 0
        goal_count = 0

        # Loop through the batch
        for b in range(batch_size):
            seg_idx = -1

            self.reset()

            self.prof.tick("out")
            b_seq_len = len_until_nones(metadata[b])

            # TODO: Generalize this
            # Slice the data according to the sequence length
            b_metadata = metadata[b][:b_seq_len]
            b_images = images[b][:b_seq_len]
            b_instructions = instructions[b][:b_seq_len]
            b_instr_len = instr_lengths[b][:b_seq_len]
            b_states = states[b][:b_seq_len]
            b_actions = actions[b][:b_seq_len]
            b_lm_pos_fpv = lm_pos_fpv[b][:b_seq_len]
            b_lm_indices = lm_indices[b][:b_seq_len]
            b_lm_mentioned = lm_mentioned[b][:b_seq_len]

            b_lm_pos_fpv = [self.cuda_var((s / RESNET_FACTOR).long()) if s is not None else None for s in b_lm_pos_fpv]
            b_lm_indices = [self.cuda_var(s) if s is not None else None for s in b_lm_indices]
            b_lm_mentioned = [self.cuda_var(s) if s is not None else None for s in b_lm_mentioned]

            # TODO: Figure out how to keep these properly. Perhaps as a whole batch is best
            # TODO: Introduce a key-value store (encapsulate instead of inherit)
            self.keep_inputs("lm_pos_fpv", b_lm_pos_fpv)
            self.keep_inputs("lm_indices", b_lm_indices)
            self.keep_inputs("lm_mentioned", b_lm_mentioned)

            # TODO: Abstract all of these if-elses in a modular way once we know which ones are necessary
            if templates:
                b_lm_mentioned_tplt = lm_mentioned_tplt[b][:b_seq_len]
                b_side_mentioned_tplt = side_mentioned_tplt[b][:b_seq_len]
                b_side_mentioned_tplt = self.cuda_var(b_side_mentioned_tplt)
                b_lm_mentioned_tplt = self.cuda_var(b_lm_mentioned_tplt)
                self.keep_inputs("lm_mentioned_tplt", b_lm_mentioned_tplt)
                self.keep_inputs("side_mentioned_tplt", b_side_mentioned_tplt)
            else:
                b_lang_lm_mentioned = self.cuda_var(lang_lm_mentioned[b][:b_seq_len])
                self.keep_inputs("lang_lm_mentioned", b_lang_lm_mentioned)

            # ----------------------------------------------------------------------------
            self.prof.tick("inputs")

            actions = self(b_images, b_states, b_instructions, b_instr_len)

            action_losses, _ = self.action_loss(b_actions, actions, batchreduce=False)

            self.prof.tick("call")

            action_losses = self.action_loss.batch_reduce_loss(action_losses)
            action_loss = self.action_loss.reduce_loss(action_losses)
            action_loss_total = action_loss
            count += b_seq_len

            self.prof.tick("loss")

        action_loss_avg = action_loss_total / (count + 1e-9)

        self.prof.tick("out")

        # Doing this in the end (outside of the loop over the batch)
        aux_losses = self.calculate_aux_loss(reduce_average=True)
        aux_loss = self.combine_aux_losses(aux_losses, self.aux_weights)

        prefix = self.model_name + ("/eval" if eval else "/train")

        self.writer.add_dict(prefix, get_current_meters(), self.get_iter())
        self.writer.add_dict(prefix, aux_losses, self.get_iter())
        self.writer.add_scalar(prefix + "/action_loss", action_loss_avg.data.cpu()[0], self.get_iter())

        self.prof.tick("auxiliaries")

        total_loss = action_loss_avg + aux_loss

        self.inc_iter()

        self.prof.tick("summaries")
        self.prof.loop()
        self.prof.print_stats(1)

        return total_loss

    def get_dataset(self, data=None, envs=None, dataset_names=None, dataset_prefix=None, eval=False):
        # TODO: Maybe use eval here
        #if self.fpv:
        data_sources = []
        data_sources.append(aup.PROVIDER_LM_POS_DATA)
        data_sources.append(aup.PROVIDER_LANDMARKS_MENTIONED)

        templates = get_current_parameters()["Environment"]["Templates"]
        if templates:
            data_sources.append(aup.PROVIDER_LANG_TEMPLATE)

        return SegmentDataset(data=data, env_list=envs, dataset_names=dataset_names, dataset_prefix=dataset_prefix,
                              aux_provider_names=data_sources, segment_level=True)
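# Illustrative rollout sketch (not part of the original module): the intended interaction
# pattern around ModelGSFPV.get_action(). `env` stands in for a PomdpInterface-style
# environment with reset()/step() and `tokenize` for the corpus tokenizer; both are
# assumptions introduced for this example, not APIs defined in this file.
def _example_gs_fpv_rollout(env, tokenize, instruction_str, max_steps=100):
    model = ModelGSFPV(run_name="example_run", recurrence=True)
    model.init_weights()
    model.start_sequence()                              # resets internal state and step counter
    state = env.reset()
    instruction = tokenize(instruction_str)             # list of token ids
    for _ in range(max_steps):
        action = model.get_action(state, instruction)   # numpy [vx, vy, yaw_rate, stop]
        if action[3] >= 1.0:                            # stop flag is already thresholded inside get_action
            break
        state = env.step(action)
    return state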
class FPVToEgoMap(MapTransformerBase):
    def __init__(self, source_map_size, world_size_px, world_size, img_w, img_h,
                 embed_size, map_channels, gnd_channels, res_channels=32, lang_filter=False, img_dbg=False):
        super(FPVToEgoMap, self).__init__(source_map_size, world_size_px)

        self.image_debug = img_dbg
        self.use_lang_filter = lang_filter

        # Process images using a resnet to get a feature map
        if self.image_debug:
            self.img_to_features = nn.MaxPool2d(8)
        else:
            # Provide enough padding so that the map is scaled down by powers of 2.
            self.img_to_features = ImgToFeatures(res_channels, map_channels)

        if self.use_lang_filter:
            self.lang_filter = MapLangSemanticFilter(embed_size, map_channels, gnd_channels)

        # Project feature maps to the global frame
        self.map_projection = PinholeCameraProjectionModule(
            source_map_size, world_size_px, world_size, source_map_size / 2, img_w, img_h)

        self.grid_sampler = GridSampler()

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        self.actual_images = None

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.map_projection.cuda(device)
        self.grid_sampler.cuda(device)
        self.img_to_features.cuda(device)
        if self.use_lang_filter:
            self.lang_filter.cuda(device)

    def init_weights(self):
        if not self.image_debug:
            self.img_to_features.init_weights()

    def reset(self):
        self.actual_images = None
        super(FPVToEgoMap, self).reset()

    def forward_fpv_features(self, images, sentence_embeds, parent=None):
        """
        Compute the first-person image features given the first-person images.
        If grounding loss is enabled, will also return sentence_embedding conditioned image features.
        :param images: images to compute features on
        :param sentence_embeds: sentence embeddings for each image
        :param parent:
        :return: features_fpv_vis - the visual features extracted using the ResNet
                 features_fpv_gnd - the grounded visual features obtained after applying a 1x1 language-conditioned conv
        """
        # Extract image features. If they've been precomputed ahead of time, just grab them by the provided index
        features_fpv_vis = self.img_to_features(images)

        if parent is not None:
            parent.keep_inputs("fpv_features", features_fpv_vis)
        self.prof.tick("feat")

        # If required, pre-process image features by grounding them in language
        if self.use_lang_filter:
            self.lang_filter.precompute_conv_weights(sentence_embeds)
            features_gnd = self.lang_filter(features_fpv_vis)
            if parent is not None:
                parent.keep_inputs("fpv_features_g", features_gnd)
            self.prof.tick("gnd")
            return features_fpv_vis, features_gnd

        return features_fpv_vis, None

    def forward(self, images, poses, sentence_embeds, parent=None, show=""):

        self.prof.tick("out")

        features_fpv_vis_only, features_fpv_gnd_only = self.forward_fpv_features(images, sentence_embeds, parent)

        # If we have grounding features, the overall features are a concatenation of grounded and non-grounded features
        if features_fpv_gnd_only is not None:
            features_fpv_all = torch.cat([features_fpv_gnd_only, features_fpv_vis_only], dim=1)
        else:
            features_fpv_all = features_fpv_vis_only

        # Project first-person view features on to the map in egocentric frame
        grid_maps = self.map_projection(poses)
        self.prof.tick("proj_map")
        features_r = self.grid_sampler(features_fpv_all, grid_maps)

        # Obtain an ego-centric map mask of where we have new information
        ones_size = list(features_fpv_all.size())
        ones_size[1] = 1
        tmp_ones = empty_float_tensor(ones_size, self.is_cuda, self.cuda_device).fill_(1.0)
        new_coverages = self.grid_sampler(tmp_ones, grid_maps)

        # Make sure that new_coverage is a 0/1 mask (grid_sampler applies bilinear interpolation)
        new_coverages = new_coverages - torch.min(new_coverages)
        new_coverages = new_coverages / torch.max(new_coverages)

        self.prof.tick("gsample")

        if show != "":
            Presenter().show_image(images.data[0, 0:3], show + "_img", torch=True, scale=1, waitkey=1)
            Presenter().show_image(features_r.data[0, 0:3], show, torch=True, scale=6, waitkey=1)
            Presenter().show_image(new_coverages.data[0], show + "_covg", torch=True, scale=6, waitkey=1)

        self.prof.loop()
        self.prof.print_stats(10)

        return features_r, new_coverages
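# Illustrative usage sketch (not part of the original module): FPVToEgoMap consumes a batch of
# first-person images together with camera poses and returns features projected into the
# egocentric map frame plus a coverage mask. All constructor sizes below are placeholders, and
# the `images`, `cam_poses` and `sentence_embeds` arguments are assumed to be provided by the caller.
def _example_fpv_to_ego_map_usage(images, cam_poses, sentence_embeds):
    mapper = FPVToEgoMap(source_map_size=32, world_size_px=32, world_size=30.0,
                         img_w=128, img_h=72, embed_size=40,
                         map_channels=32, gnd_channels=3, lang_filter=True)
    mapper.init_weights()
    # features_ego: grounded + visual features in the egocentric frame
    # coverage_ego: 0/1 mask marking map cells observed by the current image
    features_ego, coverage_ego = mapper(images, cam_poses, sentence_embeds)
    return features_ego, coverage_ego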
class ModelTopDownPathGoalPredictor(CudaModule):

    def __init__(self, run_name, ignore_lang=False, class_loss=True, ground_loss=True):
        super(ModelTopDownPathGoalPredictor, self).__init__()
        self.run_name = run_name
        self.model_name = "top_down_path_pred_pretrain"
        self.writer = SummaryWriter(log_dir="runs/" + run_name)

        self.ignore_lang = ignore_lang
        self.class_loss = class_loss
        self.ground_loss = ground_loss

        # The feature net extracts the 2D feature map from the input image.
        # The label_pool down-sizes the ground-truth labels, which are input at the same size as the input image.
        # The output predicted labels are the size of the feature map.
        self.feature_net = ResNet13Light(32, down_pad=True)
        self.label_pool = nn.MaxPool2d(8)

        if self.ground_loss:
            self.lang_filter = MapLangSemanticFilter(sentence_embedding_size, 32, 3)
            self.aux_ground_linear = nn.Linear(3, 2)
            enable_weight_saving(self.lang_filter, "ground_filter")
            enable_weight_saving(self.aux_ground_linear, "ground_aux_linear")

        if RESNET:
            self.unet = ResNetConditional(sentence_embedding_size, 35, 2)
        else:
            unet_c_in = 35 if self.ground_loss else 32
            unet_hc1 = 48 if self.ground_loss else 48
            unet_hb1 = 24 if self.ground_loss else 24
            self.unet = Unet5ContextualBneck(unet_c_in, 2, sentence_embedding_size,
                                             hc1=unet_hc1, hb1=unet_hb1, hc2=128, split_embedding=splitemb)

        if attention:
            self.sentence_embedding = SentenceEmbeddingSelfAttention(
                word_embedding_size, lstm_size, sentence_embedding_layers, attention_heads=attention_heads)
        else:
            self.sentence_embedding = SentenceEmbeddingSimple(
                word_embedding_size, sentence_embedding_size, sentence_embedding_layers)

        self.gather2d = Gather2D()

        if self.class_loss:
            self.aux_class_linear = nn.Linear(32, 64)
            enable_weight_saving(self.aux_class_linear, "class_aux_linear")

        print("Sentence Embedding #Params: ", get_n_params(self.sentence_embedding))
        print("U-Net #Params: ", get_n_params(self.unet))
        print("Class auxiliary: ", self.class_loss)
        print("Ground auxiliary: ", self.ground_loss)

        # Enable saving of pre-trained weights
        enable_weight_saving(self.feature_net, "feature_resnet_light")
        enable_weight_saving(self.unet, "unet")
        enable_weight_saving(self.sentence_embedding, "sentence_embedding")

        if NLL:
            #self.mask_loss = nn.BCELoss()
            self.mask_loss = nn.NLLLoss2d()
        elif BCE:
            self.mask_loss = nn.BCEWithLogitsLoss()
        elif CE:
            self.spatialsoftmax = SpatialSoftmax2d()
            self.mask_loss = CrossEntropy2d()
        else:
            self.mask_loss = nn.MSELoss()

        self.aux_loss = nn.CrossEntropyLoss(reduce=True, size_average=True)
        self.epoch_numbers = {"train": 0, "eval": 0}
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.dropout = nn.Dropout(0.5)
        self.dropout2d = nn.Dropout2d(0.5)
        self.dropout3d = nn.Dropout3d(0.5)

        self.viz_images = []
        self.instructions = []

    def get_iter(self):
        return int(self.iter.data[0])

    def inc_iter(self):
        self.iter += 1

    def init_weights(self):
        self.sentence_embedding.init_weights()
        self.unet.init_weights()

        if self.ground_loss:
            self.aux_ground_linear.weight.data.normal_(0.001)
            self.aux_ground_linear.bias.data.fill_(0)

        if self.class_loss:
            self.aux_class_linear.weight.data.normal_(0.001)
            self.aux_class_linear.bias.data.fill_(0)

    def cuda(self, device=None):
        CudaModule.cuda(self, device)
        self.sentence_embedding.cuda(device)
        self.unet.cuda(device)
        if self.ground_loss:
            self.lang_filter.cuda(device)
        return self

    def write_eoe_summaries(self, inference_type, epoch_num):
        pass

    def write_summaires(self, prefix, idx, total_loss, main_loss, emb_loss, class_loss, gnd_loss):
        full_prefix = self.model_name + "/" + prefix + "/"
        if self.writer is None:
            return

        self.writer.add_scalar(full_prefix + "total_loss", total_loss.data[0], idx)
        self.writer.add_scalar(full_prefix + "main_loss", main_loss.data[0], idx)
        self.writer.add_scalar(full_prefix + "class_loss", class_loss.data[0], idx)
        if class_loss is not None:
            self.writer.add_scalar(full_prefix + "emb_loss", emb_loss.data[0], idx)
        if gnd_loss is not None:
            self.writer.add_scalar(full_prefix + "gnd_loss", gnd_loss.data[0], idx)

    def get_dataset(self, data=None, envs=None, eval=False, dataset_name=None, seg_level=True):
        return TopDownDataset(env_list=envs,
                              instr_negatives=False,
                              instr_negatives_similar_only=False,
                              seg_level=seg_level,
                              yaw_rand_range=0.0 if eval else YAW_RANGE,
                              img_w=512,
                              img_h=512,
                              map_w=256,
                              map_h=256,
                              incl_path=True,
                              incl_endpoint=True)

    def get_viz(self):
        presenter = Presenter()
        out = {"viz_img": []}
        for i, img in enumerate(self.viz_images):
            instruction = self.instructions[i]
            if len(instruction.view([-1])) < 2:
                instruction = [0]
            else:
                instruction = list(instruction.data.cpu().numpy().squeeze())
            instruction_str = debug_untokenize_instruction(instruction)
            viz_img = presenter.overlay_text(img, instruction_str)
            out["viz_img"].append(viz_img)
        return out

    def forward(self, images, instructions, instruction_masks):
        emb = self.sentence_embedding(instructions, torch.sum(instruction_masks, 1))

        # If the embedding returns an internal auxiliary loss, pass it along
        emb_loss = cuda_var(torch.zeros([1]), self.is_cuda, self.cuda_device)
        if type(emb) is tuple:
            emb, emb_loss = emb

        feature_map = self.feature_net(images)
        feature_map = self.dropout2d(feature_map)

        if self.ground_loss:
            self.lang_filter.precompute_conv_weights(emb)
            ground_map = self.lang_filter(feature_map)
            feature_map = torch.cat([feature_map, ground_map], dim=1)

        # TODO: Testing breaking of gradients between ResNet and UNet
        if cut_gradients:
            feature_map_fwd = Variable(feature_map.data)
        else:
            feature_map_fwd = feature_map

        #if self.ground_loss:
        #    feature_map_fwd = feature_map_fwd[:, 0:3, :, :]

        pred_mask = self.unet(feature_map_fwd, emb)

        return pred_mask, feature_map, emb_loss

    def sup_loss_on_batch(self, batch, eval=False, viz=False):

        if eval:
            self.eval()
        else:
            self.train()

        images = cuda_var(batch["images"], self.is_cuda, self.cuda_device)
        instructions = cuda_var(batch["instr"], self.is_cuda, self.cuda_device)
        instruction_masks = cuda_var(batch["instr_mask"], self.is_cuda, self.cuda_device)
        label_masks = cuda_var(batch["traj_labels"], self.is_cuda, self.cuda_device)

        # Each of the above is a list of lists of tensors, where the outer list is over the batch and the inner list
        # is over the segments. Loop through and accumulate loss for each batch sequentially, and for each segment.
        # Reset model state (embedding etc) between batches, but not between segments.
        # We don't process each batch in batch-mode, because it's complicated, with the varying number of segments and all.
        batch_size = len(images)
        total_class_loss = Variable(empty_float_tensor([1], self.is_cuda, self.cuda_device), requires_grad=True)
        total_ground_loss = Variable(empty_float_tensor([1], self.is_cuda, self.cuda_device), requires_grad=True)
        count = 0

        label_masks = self.label_pool(label_masks)
        mask_pred, features, emb_loss = self(images, instructions, instruction_masks)

        if BCE:
            mask_pred_flat = mask_pred.view(-1, 1)
            label_masks_flat = label_masks - torch.min(label_masks)
            label_masks_flat = label_masks_flat / (torch.max(label_masks_flat) + 1e-9)
            label_masks_flat = label_masks_flat.view(-1, 1).clamp(0, 1)
            main_loss = self.mask_loss(mask_pred_flat, label_masks_flat)

        elif NLL:
            mask_pred_1 = F.softmax(mask_pred, 1, _stacklevel=5)
            mask_pred_2 = 1 - mask_pred_1
            mask_pred_1 = mask_pred_1.unsqueeze(1)
            mask_pred_2 = mask_pred_2.unsqueeze(1)
            mask_pred = torch.cat((mask_pred_1, mask_pred_2), dim=1)
            label_masks = label_masks.clamp(0, 1)
            if self.is_cuda:
                label_masks = label_masks.type(torch.cuda.LongTensor)
            else:
                label_masks = label_masks.type(torch.LongTensor)
            main_loss = self.mask_loss(mask_pred, label_masks)

        elif CE:
            # CrossEntropy2d internally applies logsoftmax to mask_pred,
            # but labels are already assumed to be a valid probability distribution, so no softmax is applied
            main_loss = self.mask_loss(mask_pred, label_masks)
            # So for nice plotting, we must manually do it
            mask_pred = self.spatialsoftmax(mask_pred)

        else:
            main_loss = self.mask_loss(mask_pred, label_masks)

        # Sum the embedding loss if batch size > 1
        if type(emb_loss) == tuple:
            emb_loss = sum(emb_loss)

        # Extract the feature vectors corresponding to every landmark's location in the map.
        # Apply a linear layer to classify which of the 64 landmarks it is.
        # The landmark positions have to be divided by the same factor as the ResNet scaling factor.
        lcount = 0
        for i in range(batch_size):
            if self.class_loss and len(batch["lm_pos"][i]) > 0:
                lcount += 1
                landmark_pos = cuda_var(batch["lm_pos"][i], self.is_cuda, self.cuda_device)
                landmark_indices = cuda_var(batch["lm_indices"][i], self.is_cuda, self.cuda_device)
                landmark_coords = (landmark_pos / 8).long()
                lm_features = self.gather2d(features[i:i + 1, 0:32], landmark_coords)
                lm_pred = self.aux_class_linear(lm_features)
                class_loss = self.aux_loss(lm_pred, landmark_indices)
                total_class_loss = total_class_loss + class_loss

            if self.ground_loss and len(batch["lm_pos"][i]) > 0:
                landmark_pos = cuda_var(batch["lm_pos"][i], self.is_cuda, self.cuda_device)
                landmark_mentioned = cuda_var(batch["lm_mentioned"][i], self.is_cuda, self.cuda_device)
                landmark_coords = (landmark_pos / 8).long()
                g_features = self.gather2d(features[i:i + 1, 32:35], landmark_coords)
                lm_pred = self.aux_ground_linear(g_features)
                ground_loss = self.aux_loss(lm_pred, landmark_mentioned)
                total_ground_loss = total_ground_loss + ground_loss

        total_class_loss = total_class_loss / (lcount + 1e-9)
        total_ground_loss = total_ground_loss / (lcount + 1e-9)
        count += 1

        # Just visualization and debugging code
        if self.get_iter() % 50 == 0:
            presenter = Presenter()
            pred_viz_np = presenter.overlaid_image(images[0].data, mask_pred[0].data)
            labl_viz_np = presenter.overlaid_image(images[0].data, label_masks[0].data)
            comp = np.concatenate((pred_viz_np, labl_viz_np), axis=1)
            presenter.show_image(comp, "path_pred")

            if hasattr(self.sentence_embedding, "save_att_map"):
                self.sentence_embedding.save_att_map(self.get_iter(), i)

        total_loss = main_loss + 0.1 * total_class_loss + 0.001 * emb_loss + 0.1 * total_ground_loss
        total_loss = total_loss / (count + 1e-9)

        self.write_summaires("eval" if eval else "train", self.get_iter(),
                             total_loss, main_loss, emb_loss, total_class_loss, total_ground_loss)
        self.inc_iter()

        return total_loss
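# Illustrative training-step sketch (not part of the original module): one supervised update
# with ModelTopDownPathGoalPredictor. The `batch` is assumed to come from the dataset returned
# by get_dataset(), and the Adam optimizer and learning rate are placeholders.
def _example_top_down_training_step(model, batch):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    optimizer.zero_grad()
    loss = model.sup_loss_on_batch(batch, eval=False)   # combined mask + auxiliary losses
    loss.backward()
    optimizer.step()
    return loss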
class TopDownToEgoMap(MapTransformerBase):
    def __init__(self, img_in_size=256, world_size_in_img=256, feature_channels=32, ground_channels=3,
                 embed_size=40, aux_ground=False, freeze=False):
        super(TopDownToEgoMap, self).__init__(img_in_size, world_size_in_img)

        # Process images using a resnet to get a feature map
        self.feature_net = ResNet13Light(feature_channels, down_pad=True)

        self.aux_ground = aux_ground
        if aux_ground:
            self.lang_filter = MapLangSemanticFilter(embed_size, feature_channels, ground_channels)
            enable_weight_saving(self.lang_filter, "ground_filter", alwaysfreeze=freeze)

        enable_weight_saving(self.feature_net, "feature_resnet_light", alwaysfreeze=freeze)

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.map_affine.cuda(device)
        if self.aux_ground:
            self.lang_filter.cuda(device)
        return self

    def init_weights(self):
        self.feature_net.init_weights()

    def forward(self, image_g, pose, sentence_embed, parent=None, show=""):

        # scale to 0-1 range
        #image_g = image_g - torch.min(image_g)
        #image_g = image_g / (torch.max(image_g) + 1e-9)

        # rotate to robot frame
        # TODO: Temporarily changed to local pose
        self.set_map(image_g, pose)
        image_r, _ = self.get_map(pose)

        """
        # normalize mean-0 std-1
        image_r = image_r - torch.mean(image_r)
        image_r = image_r / (torch.std(image_r) + 1e-9)

        ones = torch.ones_like(image_g)
        self.set_map(ones, None)
        cov_r, _ = self.get_map(pose)
        cov_r = cov_r - torch.min(cov_r)
        cov_r /= (torch.max(cov_r) + 1e-9)
        cov_rl = cov_r > 1e-8
        blackcolor = torch.min(image_g)
        #image_r[cov_rl] = blackcolor
        """

        features_r = self.feature_net(image_r)
        if parent is not None:
            parent.keep_inputs("fpv_features", features_r)

        if self.aux_ground:
            self.lang_filter.precompute_conv_weights(sentence_embed)
            features_g = self.lang_filter(features_r)
            if parent is not None:
                parent.keep_inputs("fpv_features_g", features_g)
            features_all = torch.cat([features_g, features_r], dim=1)
        else:
            features_all = features_r

        coverage = torch.ones_like(features_all)

        if show != "":
            Presenter().show_image(image_r.data[0, 0:3], show + "_img", torch=True, scale=1, waitkey=20)
            Presenter().show_image(features_r.data[0, 0:3], show, torch=True, scale=12, waitkey=20)
            #Presenter().show_image(cov_r.data[0, 0:3], show + "_covg", torch=True, scale=1, waitkey=20)

        return features_all, coverage
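# Illustrative usage sketch (not part of the original module): TopDownToEgoMap rotates a global
# top-down image into the robot frame and extracts (optionally language-grounded) features.
# The `image_g`, `pose` and `sentence_embed` arguments are assumed to be supplied by the caller,
# and the constructor arguments below are placeholders matching the defaults.
def _example_top_down_to_ego_map_usage(image_g, pose, sentence_embed):
    mapper = TopDownToEgoMap(img_in_size=256, world_size_in_img=256,
                             feature_channels=32, ground_channels=3,
                             embed_size=40, aux_ground=True)
    mapper.init_weights()
    # features: grounded + visual feature maps in the robot frame; coverage: all-ones mask
    features, coverage = mapper(image_g, pose, sentence_embed)
    return features, coverage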