def __init__(
    self,
    observation_space,
    hidden_size,
    goal_sensor_uuid=None,
    detach=False,
    additional_sensors=[],  # low dim sensors corresponding to registered name
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self.additional_sensors = additional_sensors
    self._n_input_goal = 0
    # if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
    #     self.goal_sensor_uuid = goal_sensor_uuid
    #     self._initialize_goal_encoder(observation_space)
    self._hidden_size = hidden_size

    resnet_baseplanes = 32
    backbone = "resnet18"
    visual_resnet = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=False,
        obs_transform=ResizeCenterCropper(size=(256, 256)),
        backbone_only=True,
    )
    self.detach = detach
    self.visual_resnet = visual_resnet
    self.visual_encoder = nn.Sequential(
        Flatten(),
        nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
        nn.Sigmoid(),
    )

    self.visual_decoder = nn.Sequential(
        nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Upsample(scale_factor=2),
        nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Upsample(scale_factor=2),
        nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Upsample(scale_factor=2),
        nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Upsample(scale_factor=2),
        nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.Upsample(scale_factor=2),
        nn.Conv2d(32, 3, kernel_size=3, stride=1, padding=1),
    )

    self.train()
def __init__(
    self,
    observation_space,
    output_size=128,
    checkpoint="NONE",
    backbone="resnet50",
    resnet_baseplanes=32,
    normalize_visual_inputs=False,
    trainable=False,
    spatial_output: bool = False,
):
    super().__init__()
    self.visual_encoder = ResNetEncoder(
        spaces.Dict({"depth": observation_space.spaces["depth"]}),
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=normalize_visual_inputs,
        obs_transform=None,
    )

    for param in self.visual_encoder.parameters():
        param.requires_grad_(trainable)

    if checkpoint != "NONE":
        ddppo_weights = torch.load(checkpoint)
        weights_dict = {}
        for k, v in ddppo_weights["state_dict"].items():
            split_layer_name = k.split(".")[2:]
            if split_layer_name[0] != "visual_encoder":
                continue
            layer_name = ".".join(split_layer_name[1:])
            weights_dict[layer_name] = v
        del ddppo_weights
        self.visual_encoder.load_state_dict(weights_dict, strict=True)

    self.spatial_output = spatial_output

    if not self.spatial_output:
        self.output_shape = (output_size,)
        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(self.visual_encoder.output_shape), output_size),
            nn.ReLU(True),
        )
    else:
        self.spatial_embeddings = nn.Embedding(
            self.visual_encoder.output_shape[1]
            * self.visual_encoder.output_shape[2],
            64,
        )
        self.output_shape = list(self.visual_encoder.output_shape)
        self.output_shape[0] += self.spatial_embeddings.embedding_dim
        self.output_shape = tuple(self.output_shape)
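# Hypothetical illustration (not part of the original module) of the prefix
# stripping performed when loading DD-PPO weights above. It assumes checkpoint
# keys of the form "actor_critic.net.visual_encoder.<layer>"; only the suffix
# after "visual_encoder." is kept so it matches self.visual_encoder's state_dict.
def _strip_ddppo_prefix_sketch(state_dict_keys):
    kept = {}
    for k in state_dict_keys:
        parts = k.split(".")[2:]  # drop the leading "actor_critic" and "net"
        if parts[0] != "visual_encoder":
            continue
        kept[".".join(parts[1:])] = k
    return kept

# _strip_ddppo_prefix_sketch(["actor_critic.net.visual_encoder.backbone.conv1.weight"])
# -> {"backbone.conv1.weight": "actor_critic.net.visual_encoder.backbone.conv1.weight"}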
def __init__(
    self,
    observation_space,
    action_space,
    hidden_size,
    net=SingleBelief,
    aux_tasks=[],
    config=None,
    **kwargs,
):
    super().__init__(
        observation_space,
        action_space,
        hidden_size,
        net,
        aux_tasks=aux_tasks,
        config=config,
        **kwargs,
    )
    self.medium = config.midlevel_medium
    self.visual_encoder = None
    self.visual_resize = nn.Sequential(
        Flatten(),
        nn.Linear(2048, hidden_size),
        nn.ReLU(True),
    )
def _setup_net(self, rgb_input, feature_dim):
    self.cnn = nn.Sequential(
        nn.Conv2d(in_channels=rgb_input, out_channels=32, kernel_size=3, stride=2),
        nn.ReLU(True),
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2),
        nn.ReLU(True),
        nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2),
        nn.ReLU(True),
        nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2),
        nn.ReLU(True),
    )
    self.flatten = nn.Sequential(
        Flatten(),
        nn.Linear(15 * 15 * 256, feature_dim),
        # nn.ReLU(True),
    )
    self._init_weight()
    print(self.cnn)
    _print_model_parameters(self.cnn)
def forward(self, x):  # 21 1 240 240
    x = self.maxpool(x)  # 21 1 120 120
    x = self.conv1(x)  # 21 8 30 30
    x = nn.ReLU()(x)
    x = self.maxpool(x)  # 21 8 15 15
    x = self.conv2(x)  # 21 16 8 8
    x = nn.ReLU()(x)
    x = self.maxpool(x)  # 21 16 4 4
    x = self.conv3(x)  # 21 32 2 2
    x = nn.ReLU()(x)
    x = Flatten()(x.contiguous())  # 21 128
    x = self.fc(x)  # 21 256
    return x
def __init__(
    self,
    observation_space,
    action_space,
    goal_sensor_uuid,
    hidden_size,
    num_recurrent_layers,
    rnn_type,
    backbone,
    resnet_baseplanes,
    normalize_visual_inputs,
    obs_transform=ResizeCenterCropper(size=(256, 256)),
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid

    self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
    self._n_prev_action = 32

    self._n_input_goal = (
        observation_space.spaces[self.goal_sensor_uuid].shape[0] + 1
    )
    self.tgt_embeding = nn.Linear(self._n_input_goal, 32)
    self._n_input_goal = 32

    self._hidden_size = hidden_size

    rnn_input_size = self._n_input_goal + self._n_prev_action

    self.visual_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=normalize_visual_inputs,
        obs_transform=obs_transform,
    )

    if not self.visual_encoder.is_blind:
        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(self.visual_encoder.output_shape), hidden_size),
            nn.ReLU(True),
        )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + rnn_input_size,
        self._hidden_size,
        rnn_type=rnn_type,
        num_layers=num_recurrent_layers,
    )

    self.train()
def _init_model(self, cnn_dims, output_size):
    r"""cnn_dims: initial cnn dimensions."""
    if self.is_blind:
        self.cnn = nn.Sequential()
        return

    # kernel size for different CNN layers
    self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
    # strides for different CNN layers
    self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

    for kernel_size, stride in zip(
        self._cnn_layers_kernel_size, self._cnn_layers_stride
    ):
        cnn_dims = self._conv_output_dim(
            dimension=cnn_dims,
            padding=np.array([0, 0], dtype=np.float32),
            dilation=np.array([1, 1], dtype=np.float32),
            kernel_size=np.array(kernel_size, dtype=np.float32),
            stride=np.array(stride, dtype=np.float32),
        )

    self.cnn = nn.Sequential(
        nn.Conv2d(
            in_channels=self._n_input_rgb + self._n_input_depth,
            out_channels=32,
            kernel_size=self._cnn_layers_kernel_size[0],
            stride=self._cnn_layers_stride[0],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=self._cnn_layers_kernel_size[1],
            stride=self._cnn_layers_stride[1],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=64,
            out_channels=32,
            kernel_size=self._cnn_layers_kernel_size[2],
            stride=self._cnn_layers_stride[2],
        ),
        Contiguous(),
        Flatten(),
        nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
        nn.ReLU(True),
    )

    self.layer_init()
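# Reference sketch (a hypothetical standalone helper, not the original
# _conv_output_dim): the standard Conv2d output-size arithmetic assumed above,
#   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1)
def _conv_output_dim_sketch(dimension, padding, dilation, kernel_size, stride):
    out = []
    for i in range(len(dimension)):
        out.append(
            int(
                np.floor(
                    (dimension[i] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1)
                    / stride[i]
                    + 1
                )
            )
        )
    return tuple(out)

# e.g. a 256x256 input through the (8,8)/4, (4,4)/2, (3,3)/1 stack above:
# 256 -> 63 -> 30 -> 28, so the flattened feature fed to nn.Linear is 32 * 28 * 28.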
def forward(self, x):
    # print("x: ", x.shape)  # 1 23 480 480
    # x = self.maxpool(x)  # 1 23 240 240
    x = self.conv1(x)
    x = nn.ReLU()(x)
    # print("x: ", x.shape)  # 1 32 240 240
    x = self.maxpool(x)
    x = self.conv2(x)
    x = nn.ReLU()(x)
    # print("x: ", x.shape)  # 1 64 120 120
    x = self.maxpool(x)
    x = self.conv3(x)
    x = nn.ReLU()(x)
    # print("x: ", x.shape)  # 1 128 60 60
    x = self.maxpool(x)
    x = self.conv4(x)
    x = nn.ReLU()(x)
    # print("x: ", x.shape)  # 1 64 30 30
    x = self.maxpool(x)
    x = self.conv5(x)
    x = nn.ReLU()(x)
    # print("x: ", x.shape)  # 1 32 15 15
    # print("x: ", x.shape)  # 1 32 7 7
    x = Flatten()(x.contiguous())
    # print("x: ", x.shape)  # 1*7200
    x = self.fc(x)  # 1*512
    x = nn.ReLU()(x)
    return x
def __init__(
    self,
    observation_space,
    hidden_size,
    goal_sensor_uuid=None,
    additional_sensors=[],  # low dim sensors corresponding to registered name
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self.additional_sensors = additional_sensors
    self._n_input_goal = 0
    if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
        self.goal_sensor_uuid = goal_sensor_uuid
        self._initialize_goal_encoder(observation_space)
    self._hidden_size = hidden_size

    resnet_baseplanes = 32
    backbone = "resnet18"
    visual_resnet = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=False,
    )
    self.visual_encoder = nn.Sequential(
        visual_resnet,
        Flatten(),
        nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
        nn.ReLU(True),
    )

    final_embedding_size = (
        0 if self.is_blind else self._hidden_size
    ) + self._n_input_goal
    for sensor in additional_sensors:
        final_embedding_size += observation_space.spaces[sensor].shape[0]

    self.state_encoder = RNNStateEncoder(final_embedding_size, self._hidden_size)
    self.train()
def __init__(
    self,
    observation_space,
    action_space,
    hidden_size,
    net=SingleBelief,
    aux_tasks=[],  # bruh are we even forwarding these things...
    config=None,
    **kwargs,  # Note, we forward kwargs to the net
):
    assert issubclass(net, SingleBelief), "Belief policy must use belief net"
    super().__init__(
        net(
            observation_space=observation_space,
            hidden_size=hidden_size,
            config=config,  # Forward
            **kwargs,
        ),
        action_space.n,
    )
    self.aux_tasks = aux_tasks

    resnet_baseplanes = 32
    backbone = "resnet18"
    visual_resnet = resnet.ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=config.use_mean_and_var,
    )
    self.visual_encoder = nn.Sequential(
        visual_resnet,
        Flatten(),
        nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
        nn.ReLU(True),
    )
def __init__(
    self,
    observation_space,
    action_space,
    goal_sensor_uuid,
    hidden_size,
    num_recurrent_layers,
    rnn_type,
    backbone,
    resnet_baseplanes,
    normalize_visual_inputs,
    use_info_bot,
    use_odometry,
):
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self._hidden_size = hidden_size

    self.prev_action_embedding = nn.Embedding(action_space.n + 1, hidden_size)
    self._n_prev_action = self.prev_action_embedding.embedding_dim

    self._n_input_goal = observation_space.spaces[self.goal_sensor_uuid].shape[0]
    self._tgt_proj = nn.Linear(self._n_input_goal, hidden_size)
    self._n_input_goal = 32

    self.ib = True
    self.use_info_bot = use_info_bot
    self.use_odometry = use_odometry
    if self.ib:
        self.bottleneck = VIBCompleteLayer(
            self._hidden_size,
            self._n_input_goal,
            self.use_info_bot,
            self.use_odometry,
        )

    self.visual_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=normalize_visual_inputs,
    )

    if not self.visual_encoder.is_blind:
        after_compression_flat_size = 2048
        num_compression_channels = int(
            round(
                after_compression_flat_size
                / (
                    self.visual_encoder.output_shape[1]
                    * self.visual_encoder.output_shape[2]
                )
            )
        )
        self.compression = nn.Sequential(
            resnet.BasicBlock(
                self.visual_encoder.output_shape[0],
                self.visual_encoder.output_shape[0],
                1,
            ),
            resnet.BasicBlock(
                self.visual_encoder.output_shape[0],
                num_compression_channels,
                1,
                downsample=nn.Conv2d(
                    self.visual_encoder.output_shape[0],
                    num_compression_channels,
                    1,
                ),
            ),
        )

        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(
                np.prod(self.visual_encoder.compression_shape),
                self._hidden_size - self._hidden_size // 4,
                bias=False,
            ),
            nn.LayerNorm(self._hidden_size - self._hidden_size // 4),
            nn.ReLU(True),
        )

        self.visual_flow_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(
                np.prod(self.visual_encoder.compression_shape),
                self._hidden_size // 2,
                bias=False,
            ),
            nn.LayerNorm(self._hidden_size // 2),
            nn.ReLU(True),
            nn.Linear(self._hidden_size // 2, self._hidden_size // 4, bias=False),
            nn.LayerNorm(self._hidden_size // 4),
            nn.ReLU(True),
        )

        self.delta_egomotion_predictor = nn.Linear(self._hidden_size // 4, 3)

    if rnn_type != "transformer":
        self.state_encoder = RNNStateEncoder(
            self._hidden_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )
    else:
        self.state_encoder = TransformerStateEncoder(
            input_size=self._hidden_size, d_model=self._hidden_size
        )

    self.goal_mem_layer = nn.Sequential(
        nn.Linear(
            self._hidden_size + (self._n_input_goal if self.ib else 0),
            self.output_size,
        ),
        nn.ReLU(True),
    )

    self.pg_with_gps_pred = nn.Sequential(
        nn.Linear(self._hidden_size, self._hidden_size // 2),
        nn.ReLU(True),
        nn.Linear(self._hidden_size // 2, 3),
    )

    self.train()
    self.register_buffer("ego_error_threshold", torch.tensor([[0.01]]))
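# Worked example of the compression sizing above, assuming (hypothetically) that
# the visual encoder emits a feature map with a 4 x 4 spatial extent: the channel
# count is chosen so the flattened compressed feature stays near 2048 values.
after_compression_flat_size = 2048
spatial_h, spatial_w = 4, 4  # stand-ins for self.visual_encoder.output_shape[1:]
num_compression_channels = int(round(after_compression_flat_size / (spatial_h * spatial_w)))
# num_compression_channels == 128, and 128 * 4 * 4 == 2048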
def __init__(self, observation_space, output_size, detector):
    super().__init__()
    self.detector = detector

    if "rgb" in observation_space.spaces:
        self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
    else:
        self._n_input_rgb = 0

    if "depth" in observation_space.spaces:
        self._n_input_depth = observation_space.spaces["depth"].shape[2]
    else:
        self._n_input_depth = 0

    self._no_classes = observation_space.spaces["goalclass"].shape[0]
    self._detector_channels = 765 // (3 * 3)

    # kernel size for different CNN layers
    self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
    # strides for different CNN layers
    self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

    if self._n_input_rgb > 0:
        cnn_dims = np.array(
            observation_space.spaces["rgb"].shape[:2], dtype=np.float32
        )
    elif self._n_input_depth > 0:
        cnn_dims = np.array(
            observation_space.spaces["depth"].shape[:2], dtype=np.float32
        )

    if self.is_blind:
        self.cnn_1 = nn.Sequential()
        self.cnn_2 = nn.Sequential()
    else:
        for kernel_size, stride in zip(
            self._cnn_layers_kernel_size, self._cnn_layers_stride
        ):
            cnn_dims = self._conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        self.cnn_1 = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_rgb + self._n_input_depth,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(True),
        )

        self.detector_cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._detector_channels + self._no_classes,
                out_channels=64,
                kernel_size=1,
                stride=1,
            ),
            nn.ReLU(True),
        )

        self.cnn_2 = nn.Sequential(
            nn.Conv2d(
                in_channels=64 + 64,
                out_channels=128,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=128,
                out_channels=32,
                kernel_size=1,
                stride=1,
            ),
            # nn.ReLU(True),
            Flatten(),
            nn.Linear(32 * (cnn_dims[0] + 2) * (cnn_dims[1] + 2), output_size),
            nn.ReLU(True),
        )

    self.layer_init()
def __init__(
    self,
    observation_space,
    hidden_size,
    goal_sensor_uuid=None,
    detach=False,
    imagenet=False,
    additional_sensors=[],  # low dim sensors corresponding to registered name
):
    self.detach = detach
    self.imagenet = imagenet
    super().__init__()
    self.goal_sensor_uuid = goal_sensor_uuid
    self.additional_sensors = additional_sensors
    self._n_input_goal = 0
    if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
        self.goal_sensor_uuid = goal_sensor_uuid
        self._initialize_goal_encoder(observation_space)
    self._hidden_size = hidden_size

    resnet_baseplanes = 64
    backbone = "resnet50"
    # backbone = "resnet18"
    if imagenet:
        visual_resnet = TorchVisionResNet50()
        visual_resnet.eval()
    else:
        visual_resnet = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
        )

    self.model_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=False,
        dense=True,
    )
    self.target_encoder = ResNetEncoder(
        observation_space,
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=False,
        dense=True,
    )

    self.visual_resnet = visual_resnet
    if imagenet:
        self.visual_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(2048, hidden_size),
            nn.ReLU(True),
        )
        self.target_image_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(2048, hidden_size),
            nn.ReLU(True),
        )
    else:
        self.visual_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
            nn.ReLU(True),
        )
        self.target_image_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
            nn.ReLU(True),
        )

    final_embedding_size = (
        0 if self.is_blind else self._hidden_size
    ) + self._n_input_goal
    for sensor in additional_sensors:
        final_embedding_size += observation_space.spaces[sensor].shape[0]
    if self.goal_sensor_uuid == "imagegoal":
        final_embedding_size = 1024

    self.state_encoder = nn.Sequential(
        nn.Linear(final_embedding_size, hidden_size),
        nn.ReLU(True),
        nn.Linear(hidden_size, hidden_size),
    )
    self.state_policy_encoder = RNNStateEncoder(
        final_embedding_size, self._hidden_size
    )
    self.train()
def _init_perception_model(self, observation_space):
    if "rgb" in observation_space.spaces:
        self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
    else:
        self._n_input_rgb = 0

    if "depth" in observation_space.spaces:
        self._n_input_depth = observation_space.spaces["depth"].shape[2]
    else:
        self._n_input_depth = 0

    # kernel size for different CNN layers
    self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
    # strides for different CNN layers
    self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

    if self._n_input_rgb > 0:
        cnn_dims = np.array(
            observation_space.spaces["rgb"].shape[:2], dtype=np.float32
        )
    elif self._n_input_depth > 0:
        cnn_dims = np.array(
            observation_space.spaces["depth"].shape[:2], dtype=np.float32
        )

    if self.is_blind:
        return nn.Sequential()
    else:
        for kernel_size, stride in zip(
            self._cnn_layers_kernel_size, self._cnn_layers_stride
        ):
            cnn_dims = self._conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        return nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_rgb + self._n_input_depth,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            Flatten(),
            nn.Linear(32 * cnn_dims[0] * cnn_dims[1], self._hidden_size),
            nn.ReLU(),
        )
def __init__(self, observation_space, output_size, drop_prob=0.5, channel_scale=1):
    super().__init__()

    if "rgb" in observation_space.spaces:
        self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
    else:
        self._n_input_rgb = 0

    if "depth" in observation_space.spaces:
        self._n_input_depth = observation_space.spaces["depth"].shape[2]
    else:
        self._n_input_depth = 0

    # kernel size for different CNN layers
    self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
    # strides for different CNN layers
    self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]
    self._drop_prob = drop_prob

    if self._n_input_rgb > 0:
        cnn_dims = np.array(
            observation_space.spaces["rgb"].shape[:2], dtype=np.float32
        )
    elif self._n_input_depth > 0:
        cnn_dims = np.array(
            observation_space.spaces["depth"].shape[:2], dtype=np.float32
        )

    if self.is_blind:
        self.cnn = nn.Sequential()
    else:
        for kernel_size, stride in zip(
            self._cnn_layers_kernel_size, self._cnn_layers_stride
        ):
            cnn_dims = self._conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        ds = channel_scale
        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_rgb + self._n_input_depth,
                out_channels=32 * ds,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.BatchNorm2d(32 * ds, affine=False),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32 * ds,
                out_channels=64 * ds,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.BatchNorm2d(64 * ds, affine=False),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=64 * ds,
                out_channels=32 * ds,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            nn.BatchNorm2d(32 * ds, affine=False),
            nn.ELU(True),
            Flatten(),
            nn.Linear(32 * ds * cnn_dims[0] * cnn_dims[1], output_size),
            nn.BatchNorm1d(output_size, affine=False),
            nn.ReLU(True),
        )

    self.layer_init()
def __init__(
    self,
    observation_space,
    action_space,
    hidden_size,
    num_recurrent_layers,
    rnn_type,
    backbone,
    resnet_baseplanes,
    normalize_visual_inputs,
    obs_transform=ResizeCenterCropper(size=(256, 256)),
    force_blind_policy=False,
):
    super().__init__()

    self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
    self._n_prev_action = 32
    rnn_input_size = self._n_prev_action

    if (
        IntegratedPointGoalGPSAndCompassSensor.cls_uuid
        in observation_space.spaces
    ):
        n_input_goal = (
            observation_space.spaces[
                IntegratedPointGoalGPSAndCompassSensor.cls_uuid
            ].shape[0]
            + 1
        )
        self.tgt_embeding = nn.Linear(n_input_goal, 32)
        rnn_input_size += 32

    if ObjectGoalSensor.cls_uuid in observation_space.spaces:
        self._n_object_categories = (
            int(observation_space.spaces[ObjectGoalSensor.cls_uuid].high[0]) + 1
        )
        self.obj_categories_embedding = nn.Embedding(
            self._n_object_categories, 32
        )
        rnn_input_size += 32

    if EpisodicGPSSensor.cls_uuid in observation_space.spaces:
        input_gps_dim = observation_space.spaces[
            EpisodicGPSSensor.cls_uuid
        ].shape[0]
        self.gps_embedding = nn.Linear(input_gps_dim, 32)
        rnn_input_size += 32

    if PointGoalSensor.cls_uuid in observation_space.spaces:
        input_pointgoal_dim = observation_space.spaces[
            PointGoalSensor.cls_uuid
        ].shape[0]
        self.pointgoal_embedding = nn.Linear(input_pointgoal_dim, 32)
        rnn_input_size += 32

    if HeadingSensor.cls_uuid in observation_space.spaces:
        input_heading_dim = (
            observation_space.spaces[HeadingSensor.cls_uuid].shape[0] + 1
        )
        assert input_heading_dim == 2, "Expected heading with 2D rotation."
        self.heading_embedding = nn.Linear(input_heading_dim, 32)
        rnn_input_size += 32

    if ProximitySensor.cls_uuid in observation_space.spaces:
        input_proximity_dim = observation_space.spaces[
            ProximitySensor.cls_uuid
        ].shape[0]
        self.proximity_embedding = nn.Linear(input_proximity_dim, 32)
        rnn_input_size += 32

    if EpisodicCompassSensor.cls_uuid in observation_space.spaces:
        assert (
            observation_space.spaces[EpisodicCompassSensor.cls_uuid].shape[0] == 1
        ), "Expected compass with 2D rotation."
        input_compass_dim = 2  # cos and sin of the angle
        self.compass_embedding = nn.Linear(input_compass_dim, 32)
        rnn_input_size += 32

    if ImageGoalSensor.cls_uuid in observation_space.spaces:
        goal_observation_space = spaces.Dict(
            {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]}
        )
        self.goal_visual_encoder = ResNetEncoder(
            goal_observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=obs_transform,
        )
        self.goal_visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(
                np.prod(self.goal_visual_encoder.output_shape), hidden_size
            ),
            nn.ReLU(True),
        )
        rnn_input_size += hidden_size

    self._hidden_size = hidden_size

    self.visual_encoder = ResNetEncoder(
        observation_space if not force_blind_policy else spaces.Dict({}),
        baseplanes=resnet_baseplanes,
        ngroups=resnet_baseplanes // 2,
        make_backbone=getattr(resnet, backbone),
        normalize_visual_inputs=normalize_visual_inputs,
        obs_transform=obs_transform,
    )

    if not self.visual_encoder.is_blind:
        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(self.visual_encoder.output_shape), hidden_size),
            nn.ReLU(True),
        )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + rnn_input_size,
        self._hidden_size,
        rnn_type=rnn_type,
        num_layers=num_recurrent_layers,
    )

    self.train()
def __init__(
    self,
    observation_space,
    output_size,
    obs_transform: nn.Module = ResizeCenterCropper(size=(256, 256)),
):
    super().__init__()

    self.obs_transform = obs_transform
    if self.obs_transform is not None:
        observation_space = obs_transform.transform_observation_space(
            observation_space
        )

    if "rgb" in observation_space.spaces:
        self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
    else:
        self._n_input_rgb = 0

    if "depth" in observation_space.spaces:
        self._n_input_depth = observation_space.spaces["depth"].shape[2]
    else:
        self._n_input_depth = 0

    # kernel size for different CNN layers
    self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
    # strides for different CNN layers
    self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

    if self._n_input_rgb > 0:
        cnn_dims = np.array(
            observation_space.spaces["rgb"].shape[:2], dtype=np.float32
        )
    elif self._n_input_depth > 0:
        cnn_dims = np.array(
            observation_space.spaces["depth"].shape[:2], dtype=np.float32
        )

    if self.is_blind:
        self.cnn = nn.Sequential()
    else:
        for kernel_size, stride in zip(
            self._cnn_layers_kernel_size, self._cnn_layers_stride
        ):
            cnn_dims = self._conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_rgb + self._n_input_depth,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=64,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            # nn.ReLU(True),
            Flatten(),
            nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
            nn.ReLU(True),
        )

    self.layer_init()