def __init__(self, observation_space, hidden_size):
    super().__init__()

    if (
        IntegratedPointGoalGPSAndCompassSensor.cls_uuid
        in observation_space.spaces
    ):
        self._n_input_goal = observation_space.spaces[
            IntegratedPointGoalGPSAndCompassSensor.cls_uuid
        ].shape[0]
    elif PointGoalSensor.cls_uuid in observation_space.spaces:
        self._n_input_goal = observation_space.spaces[
            PointGoalSensor.cls_uuid
        ].shape[0]
    elif ImageGoalSensor.cls_uuid in observation_space.spaces:
        goal_observation_space = spaces.Dict(
            {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]}
        )
        self.goal_visual_encoder = SimpleCNN(goal_observation_space, hidden_size)
        self._n_input_goal = hidden_size

    self._hidden_size = hidden_size

    self.visual_encoder = SimpleCNN(observation_space, hidden_size)

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + self._n_input_goal,
        self._hidden_size,
    )

    self.train()
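# A minimal usage sketch for the constructor above (hedged: the enclosing class
# name `PointNavBaselineNet` is an assumption, as is the uuid string matching
# IntegratedPointGoalGPSAndCompassSensor.cls_uuid). With an integrated
# pointgoal sensor, the goal vector feeds the RNN alongside the CNN features.
import numpy as np
from gym import spaces

obs_space = spaces.Dict(
    {
        "rgb": spaces.Box(low=0, high=255, shape=(256, 256, 3), dtype=np.uint8),
        "pointgoal_with_gps_compass": spaces.Box(
            low=-np.inf, high=np.inf, shape=(2,), dtype=np.float32
        ),
    }
)
net = PointNavBaselineNet(obs_space, hidden_size=512)
# RNNStateEncoder input size: 512 (visual features) + 2 (goal vector) = 514.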
def __init__(
    self,
    observation_space,
    action_space,
    goal_sensor_uuid,
    hidden_size,
    num_recurrent_layers,
    rnn_type,
    backbone,
    resnet_baseplanes,
    normalize_visual_inputs,
    obs_transform=ResizeCenterCropper(size=(256, 256)),
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid

    self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
    self._n_prev_action = 32

    self._n_input_goal = (
        observation_space.spaces[self.goal_sensor_uuid].shape[0] + 1
    )
    self.tgt_embeding = nn.Linear(self._n_input_goal, 32)
    self._n_input_goal = 32

    self._hidden_size = hidden_size

    rnn_input_size = self._n_input_goal + self._n_prev_action

    self.visual_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=normalize_visual_inputs,
        obs_transform=obs_transform,
    )

    if not self.visual_encoder.is_blind:
        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(self.visual_encoder.output_shape), hidden_size),
            nn.ReLU(True),
        )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + rnn_input_size,
        self._hidden_size,
        rnn_type=rnn_type,
        num_layers=num_recurrent_layers,
    )

    self.train()
def __init__(self, observation_space: Space, model_config: Config, num_actions):
    super().__init__()
    self.model_config = model_config

    # Init the depth encoder
    assert (
        model_config.DEPTH_ENCODER.cnn_type
        in model_config.DEPTH_ENCODER.supported_encoders
    ), f"DEPTH_ENCODER.cnn_type must be in {model_config.DEPTH_ENCODER.supported_encoders}"
    if model_config.DEPTH_ENCODER.cnn_type == "DepthEncoderResnet50":
        self.depth_encoder = DepthEncoderResnet50(
            observation_space,
            output_size=model_config.DEPTH_ENCODER.output_size,
            checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
            backbone=model_config.DEPTH_ENCODER.backbone,
        )

    # Init the RGB visual encoder
    assert (
        model_config.RGB_ENCODER.cnn_type
        in model_config.RGB_ENCODER.supported_encoders
    ), f"RGB_ENCODER.cnn_type must be in {model_config.RGB_ENCODER.supported_encoders}"
    if model_config.RGB_ENCODER.cnn_type == "RGBEncoderResnet50":
        device = (
            torch.device("cuda", model_config.TORCH_GPU_ID)
            if torch.cuda.is_available()
            else torch.device("cpu")
        )
        self.rgb_encoder = RGBEncoderResnet50(
            observation_space, model_config.RGB_ENCODER.output_size, device
        )

    if model_config.SEQ2SEQ.use_prev_action:
        self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

    # Init the RNN state decoder
    rnn_input_size = (
        model_config.DEPTH_ENCODER.output_size
        + model_config.RGB_ENCODER.output_size
    )
    if model_config.SEQ2SEQ.use_pointgoal:
        rnn_input_size += observation_space.spaces[
            "pointgoal_with_gps_compass"
        ].shape[0]
    if model_config.SEQ2SEQ.use_heading:
        rnn_input_size += observation_space.spaces["heading"].shape[0]
    if model_config.SEQ2SEQ.use_prev_action:
        rnn_input_size += self.prev_action_embedding.embedding_dim

    self.state_encoder = RNNStateEncoder(
        input_size=rnn_input_size,
        hidden_size=model_config.STATE_ENCODER.hidden_size,
        num_layers=1,
        rnn_type=model_config.STATE_ENCODER.rnn_type,
    )

    self.train()
def __init__(self, observation_space, hidden_size, goal_sensor_uuid):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self._n_input_goal = observation_space.spaces[
        self.goal_sensor_uuid
    ].shape[0]
    self._hidden_size = hidden_size

    self.visual_encoder = SimpleCNN(observation_space, hidden_size)

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + self._n_input_goal,
        self._hidden_size,
    )

    self.train()
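# A sizing sketch for the blind case (hedged: the class name
# `PointNavBaselineNet` is an assumption). With no visual keys in the
# observation space, SimpleCNN reports is_blind and the RNN input reduces to
# the goal vector alone:
import numpy as np
from gym import spaces

blind_space = spaces.Dict(
    {
        "pointgoal": spaces.Box(
            low=-np.inf, high=np.inf, shape=(2,), dtype=np.float32
        )
    }
)
net = PointNavBaselineNet(
    blind_space, hidden_size=512, goal_sensor_uuid="pointgoal"
)
# RNNStateEncoder input size: 0 (blind) + 2 (goal vector) = 2.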
def __init__(
    self,
    cfg,
    observation_space,
    hidden_size,
    goal_sensor_uuid,
    with_target_encoding,
    device,
    visual_encoder="SimpleCNN",
    drop_prob=0.5,
    channel_scale=1,
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self.with_target_encoding = with_target_encoding

    num_recurrent_layers = getattr(cfg, "num_recurrent_layers", 1)
    rnn_type = getattr(cfg, "rnn_type", "GRU")

    self._n_input_goal = observation_space.spaces[
        self.goal_sensor_uuid
    ].shape[0]
    self._hidden_size = hidden_size

    self.visual_encoder = VISUAL_ENCODER_MODELS[visual_encoder](
        observation_space,
        hidden_size,
        drop_prob=drop_prob,
        channel_scale=channel_scale,
    )

    visual_feat_size = 0 if self.is_blind else self._hidden_size
    rnn_out_size = self._hidden_size
    t_enc_size = self._n_input_goal if with_target_encoding else 0

    # Auxiliary heads, keyed by task type
    self.aux_models = aux_models = torch.nn.ModuleDict({})
    for aux_type in cfg.aux:
        aux_cfg = getattr(cfg, AUX_CLASSES[aux_type].__name__)
        aux_models[aux_type] = AUX_CLASSES[aux_type](
            aux_cfg,
            visual_feat_size,
            t_enc_size,
            rnn_out_size,
            observation_space=observation_space,
        )

    self.state_encoder = RNNStateEncoder(
        visual_feat_size + t_enc_size,
        self._hidden_size,
        num_layers=num_recurrent_layers,
        rnn_type=rnn_type,
    )

    self.train()
def __init__(
    self,
    observation_space,
    action_space,
    goal_sensor_uuid,
    hidden_size,
    num_recurrent_layers,
    rnn_type,
    backbone,
    normalize_visual_inputs,
    pretrained=False,
    finetune=False,
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid

    self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
    self._n_prev_action = 32

    self._n_input_goal = (
        observation_space.spaces[self.goal_sensor_uuid].shape[0] + 1
    )
    self.tgt_embeding = nn.Linear(self._n_input_goal, 32)
    self._n_input_goal = 32

    self._hidden_size = hidden_size

    rnn_input_size = self._n_input_goal + self._n_prev_action

    self.visual_encoder = EfficientNetEncoder(
        observation_space,
        hidden_size=hidden_size,
        backbone_name=backbone,
        pretrained=pretrained,
        finetune=finetune,
        normalize_visual_inputs=normalize_visual_inputs,
    )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + rnn_input_size,
        self._hidden_size,
        rnn_type=rnn_type,
        num_layers=num_recurrent_layers,
    )

    self.train()
def __init__(
    self, observation_space, hidden_size, goal_sensor_uuid, detector_config, device
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self._n_input_goal = observation_space.spaces[
        self.goal_sensor_uuid
    ].shape[0]
    self._hidden_size = hidden_size

    self.detector = detector = YoloDetector(detector_config, device)
    self.visual_encoder = AimasCNN(observation_space, hidden_size, detector)

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + self._n_input_goal,
        self._hidden_size,
    )

    self.train()
def __init__(
    self,
    observation_space,
    hidden_size,
    goal_sensor_uuid=None,
    additional_sensors=None,  # low-dim sensors, by registered name
):
    super().__init__()
    if additional_sensors is None:  # avoid a mutable default argument
        additional_sensors = []
    self.goal_sensor_uuid = goal_sensor_uuid
    self.additional_sensors = additional_sensors
    self._n_input_goal = 0
    if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
        self._initialize_goal_encoder(observation_space)
    self._hidden_size = hidden_size

    resnet_baseplanes = 32
    backbone = "resnet18"
    visual_resnet = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=False,
    )
    self.visual_encoder = nn.Sequential(
        visual_resnet,
        Flatten(),
        nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
        nn.ReLU(True),
    )

    final_embedding_size = (
        0 if self.is_blind else self._hidden_size
    ) + self._n_input_goal
    for sensor in additional_sensors:
        final_embedding_size += observation_space.spaces[sensor].shape[0]

    self.state_encoder = RNNStateEncoder(final_embedding_size, self._hidden_size)
    self.train()
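# Sizing sketch for the constructor above (sensor names and shapes below are
# illustrative, not from the original source): each additional low-dim sensor
# appends its flat dimension to the visual/goal embedding.
import numpy as np
from gym import spaces

obs_space = spaces.Dict(
    {
        "rgb": spaces.Box(low=0, high=255, shape=(256, 256, 3), dtype=np.uint8),
        "gps": spaces.Box(low=-np.inf, high=np.inf, shape=(2,), dtype=np.float32),
        "compass": spaces.Box(
            low=-np.pi, high=np.pi, shape=(1,), dtype=np.float32
        ),
    }
)
final_embedding_size = 512 + 0 + sum(
    obs_space.spaces[s].shape[0] for s in ["gps", "compass"]
)  # 512 (visual) + 0 (no goal) + 2 + 1 = 515 -> RNNStateEncoder(515, 512)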
def __init__(
    self,
    observation_space,
    hidden_size,
    goal_sensor_uuid,
    with_target_encoding,
    device,
    visual_encoder="SimpleCNN",
    drop_prob=0.5,
    channel_scale=1,
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self.with_target_encoding = with_target_encoding
    self._n_input_goal = observation_space.spaces[
        self.goal_sensor_uuid
    ].shape[0]
    self._hidden_size = hidden_size

    self.visual_encoder = VISUAL_ENCODER_MODELS[visual_encoder](
        observation_space,
        hidden_size,
        drop_prob=drop_prob,
        channel_scale=channel_scale,
    )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size)
        + (self._n_input_goal if with_target_encoding else 0),
        self._hidden_size,
    )

    self.train()
def __init__(
    self,
    observation_space: Space,
    num_actions: int,
    num_sub_tasks: int,
    model_config: Config,
    batch_size: int,
):
    super().__init__()
    self.model_config = model_config
    self.batch_size = batch_size
    self.num_sub_tasks = num_sub_tasks
    device = (
        torch.device("cuda", model_config.TORCH_GPU_ID)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )

    # Init the instruction encoder
    if model_config.INSTRUCTION_ENCODER.is_bert:
        self.instruction_encoder = LanguageEncoder(
            model_config.INSTRUCTION_ENCODER, device
        )
    else:
        self.instruction_encoder = InstructionEncoder(
            model_config.INSTRUCTION_ENCODER
        )

    # Init the depth encoder
    assert model_config.DEPTH_ENCODER.cnn_type in [
        "SimpleDepthCNN",
        "VlnResnetDepthEncoder",
    ], "DEPTH_ENCODER.cnn_type must be SimpleDepthCNN or VlnResnetDepthEncoder"
    if model_config.DEPTH_ENCODER.cnn_type == "SimpleDepthCNN":
        self.depth_encoder = SimpleDepthCNN(
            observation_space, model_config.DEPTH_ENCODER.output_size
        )
    elif model_config.DEPTH_ENCODER.cnn_type == "VlnResnetDepthEncoder":
        self.depth_encoder = VlnResnetDepthEncoder(
            observation_space,
            output_size=model_config.DEPTH_ENCODER.output_size,
            checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
            backbone=model_config.DEPTH_ENCODER.backbone,
        )

    # Init the RGB visual encoder
    assert model_config.RGB_ENCODER.cnn_type in [
        "SimpleRGBCNN",
        "TorchVisionResNet50",
    ], "RGB_ENCODER.cnn_type must be either 'SimpleRGBCNN' or 'TorchVisionResNet50'."
    if model_config.RGB_ENCODER.cnn_type == "SimpleRGBCNN":
        self.rgb_encoder = SimpleRGBCNN(
            observation_space, model_config.RGB_ENCODER.output_size
        )
    elif model_config.RGB_ENCODER.cnn_type == "TorchVisionResNet50":
        self.rgb_encoder = TorchVisionResNet50(
            observation_space,
            model_config.RGB_ENCODER.output_size,
            model_config.RGB_ENCODER.resnet_output_size,
            device,
        )

    if model_config.SEQ2SEQ.use_prev_action:
        self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

    # Init the RNN state decoder
    rnn_input_size = (
        self.instruction_encoder.output_size
        + model_config.DEPTH_ENCODER.output_size
        + model_config.RGB_ENCODER.output_size
    )
    if model_config.SEQ2SEQ.use_prev_action:
        rnn_input_size += self.prev_action_embedding.embedding_dim

    self.state_encoder = RNNStateEncoder(
        input_size=rnn_input_size,
        hidden_size=model_config.STATE_ENCODER.hidden_size,
        num_layers=1,
        rnn_type=model_config.STATE_ENCODER.rnn_type,
    )

    self.progress_monitor = nn.Linear(
        self.model_config.STATE_ENCODER.hidden_size, 1
    )
    self.linear = nn.Linear(
        self.model_config.STATE_ENCODER.hidden_size, num_actions
    )
    self.sub_goal_linear = nn.Linear(
        self.model_config.STATE_ENCODER.hidden_size, self.num_sub_tasks
    )
    self.stop_linear = nn.Linear(self.model_config.STATE_ENCODER.hidden_size, 1)

    self._init_layers()
def __init__(
    self,
    observation_space,
    action_space,
    goal_sensor_uuid,
    hidden_size,
    num_recurrent_layers,
    rnn_type,
    backbone,
    resnet_baseplanes,
    normalize_visual_inputs,
    use_info_bot,
    use_odometry,
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self._hidden_size = hidden_size

    self.prev_action_embedding = nn.Embedding(action_space.n + 1, hidden_size)
    self._n_prev_action = self.prev_action_embedding.embedding_dim

    self._n_input_goal = observation_space.spaces[
        self.goal_sensor_uuid
    ].shape[0]
    self._tgt_proj = nn.Linear(self._n_input_goal, hidden_size)
    self._n_input_goal = 32

    self.ib = True
    self.use_info_bot = use_info_bot
    self.use_odometry = use_odometry
    if self.ib:
        self.bottleneck = VIBCompleteLayer(
            self._hidden_size,
            self._n_input_goal,
            self.use_info_bot,
            self.use_odometry,
        )

    self.visual_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=normalize_visual_inputs,
    )

    if not self.visual_encoder.is_blind:
        after_compression_flat_size = 2048
        num_compression_channels = int(
            round(
                after_compression_flat_size
                / (
                    self.visual_encoder.output_shape[1]
                    * self.visual_encoder.output_shape[2]
                )
            )
        )
        self.compression = nn.Sequential(
            resnet.BasicBlock(
                self.visual_encoder.output_shape[0],
                self.visual_encoder.output_shape[0],
                1,
            ),
            resnet.BasicBlock(
                self.visual_encoder.output_shape[0],
                num_compression_channels,
                1,
                downsample=nn.Conv2d(
                    self.visual_encoder.output_shape[0],
                    num_compression_channels,
                    1,
                ),
            ),
        )

        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(
                np.prod(self.visual_encoder.compression_shape),
                self._hidden_size - self._hidden_size // 4,
                bias=False,
            ),
            nn.LayerNorm(self._hidden_size - self._hidden_size // 4),
            nn.ReLU(True),
        )
        self.visual_flow_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(
                np.prod(self.visual_encoder.compression_shape),
                self._hidden_size // 2,
                bias=False,
            ),
            nn.LayerNorm(self._hidden_size // 2),
            nn.ReLU(True),
            nn.Linear(self._hidden_size // 2, self._hidden_size // 4, bias=False),
            nn.LayerNorm(self._hidden_size // 4),
            nn.ReLU(True),
        )
        self.delta_egomotion_predictor = nn.Linear(self._hidden_size // 4, 3)

    if rnn_type != "transformer":
        self.state_encoder = RNNStateEncoder(
            self._hidden_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )
    else:
        self.state_encoder = TransformerStateEncoder(
            input_size=self._hidden_size, d_model=self._hidden_size
        )

    self.goal_mem_layer = nn.Sequential(
        nn.Linear(
            self._hidden_size + (self._n_input_goal if self.ib else 0),
            self.output_size,
        ),
        nn.ReLU(True),
    )
    self.pg_with_gps_pred = nn.Sequential(
        nn.Linear(self._hidden_size, self._hidden_size // 2),
        nn.ReLU(True),
        nn.Linear(self._hidden_size // 2, 3),
    )

    self.train()
    self.register_buffer("ego_error_threshold", torch.tensor([[0.01]]))
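# Worked example of the compression sizing above, under an assumed encoder
# output shape of (256, 4, 4) as (channels, height, width):
after_compression_flat_size = 2048
output_shape = (256, 4, 4)
num_compression_channels = int(
    round(after_compression_flat_size / (output_shape[1] * output_shape[2]))
)  # round(2048 / 16) = 128
# The compressed map is (128, 4, 4), which flattens back to 2048 features.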
def __init__(self, observation_space: Space, num_actions: int, model_config: Config):
    super().__init__()
    self.model_config = model_config
    model_config.defrost()
    model_config.INSTRUCTION_ENCODER.final_state_only = False
    model_config.freeze()

    # Init the instruction encoder
    self.instruction_encoder = InstructionEncoder(
        model_config.INSTRUCTION_ENCODER
    )

    # Init the depth encoder
    assert model_config.DEPTH_ENCODER.cnn_type in [
        "VlnResnetDepthEncoder"
    ], "DEPTH_ENCODER.cnn_type must be VlnResnetDepthEncoder"
    self.depth_encoder = VlnResnetDepthEncoder(
        observation_space,
        output_size=model_config.DEPTH_ENCODER.output_size,
        checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
        backbone=model_config.DEPTH_ENCODER.backbone,
        spatial_output=True,
    )

    # Init the RGB encoder
    assert model_config.RGB_ENCODER.cnn_type in [
        "TorchVisionResNet50"
    ], "RGB_ENCODER.cnn_type must be TorchVisionResNet50"
    device = (
        torch.device("cuda", model_config.TORCH_GPU_ID)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )
    self.rgb_encoder = TorchVisionResNet50(
        observation_space,
        model_config.RGB_ENCODER.output_size,
        model_config.RGB_ENCODER.resnet_output_size,
        device,
        spatial_output=True,
    )

    if model_config.CMA.use_prev_action:
        self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

    self.rcm_state_encoder = model_config.CMA.rcm_state_encoder
    hidden_size = model_config.STATE_ENCODER.hidden_size
    self._hidden_size = hidden_size

    if self.rcm_state_encoder:
        self.state_encoder = RCMStateEncoder(
            self.rgb_encoder.output_shape[0],
            self.depth_encoder.output_shape[0],
            model_config.STATE_ENCODER.hidden_size,
            self.prev_action_embedding.embedding_dim,
        )
    else:
        self.rgb_linear = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(
                self.rgb_encoder.output_shape[0],
                model_config.RGB_ENCODER.output_size,
            ),
            nn.ReLU(True),
        )
        self.depth_linear = nn.Sequential(
            nn.Flatten(),
            nn.Linear(
                np.prod(self.depth_encoder.output_shape),
                model_config.DEPTH_ENCODER.output_size,
            ),
            nn.ReLU(True),
        )

        # Init the RNN state decoder. This stays in the else branch so it
        # does not overwrite the RCM state encoder built above.
        rnn_input_size = model_config.DEPTH_ENCODER.output_size
        rnn_input_size += model_config.RGB_ENCODER.output_size
        if model_config.CMA.use_prev_action:
            rnn_input_size += self.prev_action_embedding.embedding_dim

        self.state_encoder = RNNStateEncoder(
            input_size=rnn_input_size,
            hidden_size=model_config.STATE_ENCODER.hidden_size,
            num_layers=1,
            rnn_type=model_config.STATE_ENCODER.rnn_type,
        )

    self._output_size = (
        model_config.STATE_ENCODER.hidden_size
        + model_config.RGB_ENCODER.output_size
        + model_config.DEPTH_ENCODER.output_size
        + self.instruction_encoder.output_size
    )

    self.rgb_kv = nn.Conv1d(
        self.rgb_encoder.output_shape[0],
        hidden_size // 2 + model_config.RGB_ENCODER.output_size,
        1,
    )
    self.depth_kv = nn.Conv1d(
        self.depth_encoder.output_shape[0],
        hidden_size // 2 + model_config.DEPTH_ENCODER.output_size,
        1,
    )

    self.state_q = nn.Linear(hidden_size, hidden_size // 2)
    self.text_k = nn.Conv1d(
        self.instruction_encoder.output_size, hidden_size // 2, 1
    )
    self.text_q = nn.Linear(
        self.instruction_encoder.output_size, hidden_size // 2
    )

    self.register_buffer(
        "_scale", torch.tensor(1.0 / ((hidden_size // 2) ** 0.5))
    )

    if model_config.CMA.use_prev_action:
        self.second_state_compress = nn.Sequential(
            nn.Linear(
                self._output_size + self.prev_action_embedding.embedding_dim,
                self._hidden_size,
            ),
            nn.ReLU(True),
        )
    else:
        self.second_state_compress = nn.Sequential(
            nn.Linear(self._output_size, self._hidden_size),
            nn.ReLU(True),
        )

    self.second_state_encoder = RNNStateEncoder(
        input_size=self._hidden_size,
        hidden_size=self._hidden_size,
        num_layers=1,
        rnn_type=model_config.STATE_ENCODER.rnn_type,
    )
    self._output_size = model_config.STATE_ENCODER.hidden_size

    self.progress_monitor = nn.Linear(self.output_size, 1)
    self.linear = nn.Linear(
        self.model_config.STATE_ENCODER.hidden_size, num_actions
    )
    self.stop_linear = nn.Linear(self.model_config.STATE_ENCODER.hidden_size, 1)

    self._init_layers()
    self.train()
def _initialize_state_encoder(self):
    self.state_encoder = RNNStateEncoder(self._embedding_size, self._hidden_size)
def _initialize_state_encoder(self):
    self.state_encoders = nn.ModuleList(
        [
            RNNStateEncoder(self._embedding_size, self._hidden_size)
            for _ in range(self.num_tasks)
        ]
    )
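# A hedged sketch of how the per-task encoders might be consumed in a forward
# pass (the method name, the task_id argument, and the (x, hidden, masks) call
# signature of RNNStateEncoder are assumptions, not from the original source):
def forward_for_task(self, embedding, hidden_states, masks, task_id: int):
    # Route the shared embedding through this task's recurrent encoder.
    x, hidden_states = self.state_encoders[task_id](
        embedding, hidden_states, masks
    )
    return x, hidden_states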
def __init__(
    self,
    observation_space,
    hidden_size,
    goal_sensor_uuid=None,
    detach=False,
    imagenet=False,
    additional_sensors=None,  # low-dim sensors, by registered name
):
    super().__init__()
    if additional_sensors is None:  # avoid a mutable default argument
        additional_sensors = []
    self.detach = detach
    self.imagenet = imagenet
    self.goal_sensor_uuid = goal_sensor_uuid
    self.additional_sensors = additional_sensors
    self._n_input_goal = 0
    if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
        self._initialize_goal_encoder(observation_space)
    self._hidden_size = hidden_size

    resnet_baseplanes = 64
    backbone = "resnet50"  # alternative: "resnet18"
    if imagenet:
        visual_resnet = TorchVisionResNet50()
        visual_resnet.eval()
    else:
        visual_resnet = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
        )

    self.model_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=False,
        dense=True,
    )
    self.target_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=False,
        dense=True,
    )
    self.visual_resnet = visual_resnet

    if imagenet:
        # TorchVision ResNet-50 features are 2048-dim after pooling
        self.visual_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(2048, hidden_size),
            nn.ReLU(True),
        )
        self.target_image_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(2048, hidden_size),
            nn.ReLU(True),
        )
    else:
        self.visual_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
            nn.ReLU(True),
        )
        self.target_image_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
            nn.ReLU(True),
        )

    final_embedding_size = (
        0 if self.is_blind else self._hidden_size
    ) + self._n_input_goal
    for sensor in additional_sensors:
        final_embedding_size += observation_space.spaces[sensor].shape[0]
    if self.goal_sensor_uuid == "imagegoal":
        final_embedding_size = 1024

    self.state_encoder = nn.Sequential(
        nn.Linear(final_embedding_size, hidden_size),
        nn.ReLU(True),
        nn.Linear(hidden_size, hidden_size),
    )
    self.state_policy_encoder = RNNStateEncoder(
        final_embedding_size, self._hidden_size
    )
    self.train()
def __init__(self, observation_space: Space, model_config: Config, num_actions):
    super().__init__()
    self.model_config = model_config

    # Init the instruction encoder
    self.instruction_encoder = InstructionEncoder2(
        model_config.INSTRUCTION_ENCODER
    )

    # Init the depth encoder
    assert model_config.DEPTH_ENCODER.cnn_type in [
        "SimpleDepthCNN",
        "VlnResnetDepthEncoder",
    ], "DEPTH_ENCODER.cnn_type must be SimpleDepthCNN or VlnResnetDepthEncoder"
    if model_config.DEPTH_ENCODER.cnn_type == "SimpleDepthCNN":
        self.depth_encoder = SimpleDepthCNN(
            observation_space, model_config.DEPTH_ENCODER.output_size
        )
    elif model_config.DEPTH_ENCODER.cnn_type == "VlnResnetDepthEncoder":
        self.depth_encoder = VlnResnetDepthEncoder(
            observation_space,
            output_size=model_config.DEPTH_ENCODER.output_size,
            checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
            backbone=model_config.DEPTH_ENCODER.backbone,
        )

    # Init the RGB visual encoder
    assert model_config.RGB_ENCODER.cnn_type in [
        "SimpleRGBCNN",
        "TorchVisionResNet50",
    ], "RGB_ENCODER.cnn_type must be either 'SimpleRGBCNN' or 'TorchVisionResNet50'."
    if model_config.RGB_ENCODER.cnn_type == "SimpleRGBCNN":
        self.rgb_encoder = SimpleRGBCNN(
            observation_space, model_config.RGB_ENCODER.output_size
        )
    elif model_config.RGB_ENCODER.cnn_type == "TorchVisionResNet50":
        device = (
            torch.device("cuda", model_config.TORCH_GPU_ID)
            if torch.cuda.is_available()
            else torch.device("cpu")
        )
        self.rgb_encoder = TorchVisionResNet50(
            observation_space, model_config.RGB_ENCODER.output_size, device
        )

    if model_config.SEQ2SEQ.use_prev_action:
        self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

    # Init the RNN state decoder
    rnn_input_size = (
        self.instruction_encoder.output_size
        + model_config.DEPTH_ENCODER.output_size
        + model_config.RGB_ENCODER.output_size
    )
    if model_config.SEQ2SEQ.use_prev_action:
        rnn_input_size += self.prev_action_embedding.embedding_dim

    self.state_encoder = RNNStateEncoder(
        input_size=rnn_input_size,
        hidden_size=model_config.STATE_ENCODER.hidden_size,
        num_layers=1,
        rnn_type=model_config.STATE_ENCODER.rnn_type,
    )

    self.progress_monitor = nn.Linear(
        self.model_config.STATE_ENCODER.hidden_size, 1
    )

    self._init_layers()

    # Pretrained BERT for instruction tokens
    import json

    with open("index_to_word.json") as f:
        self.idx_to_word = list(json.load(f)["word"].keys())
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    self.bert = BertModel.from_pretrained("bert-base-uncased")

    self.train()
def __init__(
    self,
    observation_space,
    action_space,
    hidden_size,
    num_recurrent_layers,
    rnn_type,
    backbone,
    resnet_baseplanes,
    normalize_visual_inputs,
    obs_transform=ResizeCenterCropper(size=(256, 256)),
    force_blind_policy=False,
):
    super().__init__()

    self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
    self._n_prev_action = 32
    rnn_input_size = self._n_prev_action

    if (
        IntegratedPointGoalGPSAndCompassSensor.cls_uuid
        in observation_space.spaces
    ):
        n_input_goal = (
            observation_space.spaces[
                IntegratedPointGoalGPSAndCompassSensor.cls_uuid
            ].shape[0]
            + 1
        )
        self.tgt_embeding = nn.Linear(n_input_goal, 32)
        rnn_input_size += 32

    if ObjectGoalSensor.cls_uuid in observation_space.spaces:
        self._n_object_categories = (
            int(observation_space.spaces[ObjectGoalSensor.cls_uuid].high[0]) + 1
        )
        self.obj_categories_embedding = nn.Embedding(
            self._n_object_categories, 32
        )
        rnn_input_size += 32

    if EpisodicGPSSensor.cls_uuid in observation_space.spaces:
        input_gps_dim = observation_space.spaces[
            EpisodicGPSSensor.cls_uuid
        ].shape[0]
        self.gps_embedding = nn.Linear(input_gps_dim, 32)
        rnn_input_size += 32

    if PointGoalSensor.cls_uuid in observation_space.spaces:
        input_pointgoal_dim = observation_space.spaces[
            PointGoalSensor.cls_uuid
        ].shape[0]
        self.pointgoal_embedding = nn.Linear(input_pointgoal_dim, 32)
        rnn_input_size += 32

    if HeadingSensor.cls_uuid in observation_space.spaces:
        input_heading_dim = (
            observation_space.spaces[HeadingSensor.cls_uuid].shape[0] + 1
        )
        assert input_heading_dim == 2, "Expected heading with 2D rotation."
        self.heading_embedding = nn.Linear(input_heading_dim, 32)
        rnn_input_size += 32

    if ProximitySensor.cls_uuid in observation_space.spaces:
        input_proximity_dim = observation_space.spaces[
            ProximitySensor.cls_uuid
        ].shape[0]
        self.proximity_embedding = nn.Linear(input_proximity_dim, 32)
        rnn_input_size += 32

    if EpisodicCompassSensor.cls_uuid in observation_space.spaces:
        assert (
            observation_space.spaces[EpisodicCompassSensor.cls_uuid].shape[0]
            == 1
        ), "Expected compass with 2D rotation."
        input_compass_dim = 2  # cos and sin of the angle
        self.compass_embedding = nn.Linear(input_compass_dim, 32)
        rnn_input_size += 32

    if ImageGoalSensor.cls_uuid in observation_space.spaces:
        goal_observation_space = spaces.Dict(
            {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]}
        )
        self.goal_visual_encoder = ResNetEncoder(
            goal_observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=obs_transform,
        )
        self.goal_visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(
                np.prod(self.goal_visual_encoder.output_shape), hidden_size
            ),
            nn.ReLU(True),
        )
        rnn_input_size += hidden_size

    self._hidden_size = hidden_size

    self.visual_encoder = ResNetEncoder(
        observation_space if not force_blind_policy else spaces.Dict({}),
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=normalize_visual_inputs,
        obs_transform=obs_transform,
    )

    if not self.visual_encoder.is_blind:
        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(self.visual_encoder.output_shape), hidden_size),
            nn.ReLU(True),
        )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + rnn_input_size,
        self._hidden_size,
        rnn_type=rnn_type,
        num_layers=num_recurrent_layers,
    )

    self.train()
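# Sizing sketch for the constructor above: every low-dim sensor present adds a
# 32-d embedding on top of the 32-d previous-action embedding, and an image
# goal would add a full hidden_size block. For a hypothetical suite with the
# integrated pointgoal, objectgoal, gps, and compass sensors at hidden_size=512:
rnn_input_size = 32 + 4 * 32  # prev action + four 32-d sensor embeddings = 160
state_encoder_input = 512 + rnn_input_size  # 512 visual (not blind) + 160 = 672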
def __init__(
    self,
    observation_space: Space,
    num_actions: int,
    model_config: Config,
    batch_size: int,
):
    super().__init__()
    self.model_config = model_config
    self.batch_size = batch_size
    device = (
        torch.device("cuda", model_config.TORCH_GPU_ID)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )

    # BERT embedding for the instruction
    self.embedding_layer = BertModel.from_pretrained("bert-base-uncased")
    self.ins_fc = nn.Linear(
        model_config.TRANSFORMER_INSTRUCTION_ENCODER.d_in,
        model_config.TRANSFORMER_INSTRUCTION_ENCODER.d_model,
    )

    # Init the depth encoder
    assert model_config.DEPTH_ENCODER.cnn_type in [
        "SimpleDepthCNN",
        "VlnResnetDepthEncoder",
    ], "DEPTH_ENCODER.cnn_type must be SimpleDepthCNN or VlnResnetDepthEncoder"
    if model_config.DEPTH_ENCODER.cnn_type == "SimpleDepthCNN":
        self.depth_encoder = SimpleDepthCNN(
            observation_space, model_config.DEPTH_ENCODER.output_size
        )
    elif model_config.DEPTH_ENCODER.cnn_type == "VlnResnetDepthEncoder":
        self.depth_encoder = VlnResnetDepthEncoder(
            observation_space,
            output_size=model_config.DEPTH_ENCODER.output_size,
            checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
            backbone=model_config.DEPTH_ENCODER.backbone,
            spatial_output=True,
        )

    # Init the RGB visual encoder
    assert model_config.RGB_ENCODER.cnn_type in [
        "SimpleRGBCNN",
        "TorchVisionResNet50",
    ], "RGB_ENCODER.cnn_type must be either 'SimpleRGBCNN' or 'TorchVisionResNet50'."
    if model_config.RGB_ENCODER.cnn_type == "SimpleRGBCNN":
        self.rgb_encoder = SimpleRGBCNN(
            observation_space, model_config.RGB_ENCODER.output_size
        )
    elif model_config.RGB_ENCODER.cnn_type == "TorchVisionResNet50":
        self.rgb_encoder = TorchVisionResNet50(
            observation_space,
            model_config.RGB_ENCODER.output_size,
            model_config.RGB_ENCODER.resnet_output_size,
            device,
            spatial_output=True,
        )

    self.rgb_linear = nn.Sequential(
        nn.AdaptiveAvgPool1d(1),
        nn.Flatten(),
        nn.Linear(
            self.rgb_encoder.output_shape[0],
            model_config.RGB_ENCODER.output_size,
        ),
        nn.ReLU(True),
    )
    self.depth_linear = nn.Sequential(
        nn.Flatten(),
        nn.Linear(
            np.prod(self.depth_encoder.output_shape),
            model_config.DEPTH_ENCODER.output_size,
        ),
        nn.ReLU(True),
    )

    self.rgb_kv = nn.Conv1d(
        self.rgb_encoder.output_shape[0],
        model_config.VISUAL_LING_ATTN.vis_in_features,
        1,
    )
    self.depth_kv = nn.Conv1d(
        self.depth_encoder.output_shape[0],
        model_config.VISUAL_LING_ATTN.vis_in_features,
        1,
    )

    self.image_cm_encoder = Visual_Ling_Attn(model_config.VISUAL_LING_ATTN)
    self.cross_pooler = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten())

    if model_config.SEQ2SEQ.use_prev_action:
        self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

    # Init the RNN state decoder
    rnn_input_size = (
        self.model_config.IMAGE_CROSS_MODAL_ENCODER.d_model * 2
        + model_config.DEPTH_ENCODER.output_size
        + model_config.RGB_ENCODER.output_size
    )
    if model_config.SEQ2SEQ.use_prev_action:
        rnn_input_size += self.prev_action_embedding.embedding_dim

    self.state_encoder = RNNStateEncoder(
        input_size=rnn_input_size,
        hidden_size=model_config.STATE_ENCODER.hidden_size,
        num_layers=1,
        rnn_type=model_config.STATE_ENCODER.rnn_type,
    )

    self.progress_monitor = nn.Linear(
        self.model_config.STATE_ENCODER.hidden_size, 1
    )
    self.linear = nn.Linear(
        self.model_config.STATE_ENCODER.hidden_size, num_actions
    )

    self._init_layers()