def __init__(
    self,
    action_space: gym.spaces.Discrete,
    observation_space: SpaceDict,
    hidden_size=512,
    obj_state_embedding_size=512,
    trainable_masked_hidden_state: bool = False,
    num_rnn_layers=1,
    rnn_type="GRU",
):
    """Initializer.

    See class documentation for parameter definitions.
    """
    super().__init__(action_space=action_space, observation_space=observation_space)

    self._hidden_size = hidden_size
    self.object_type_embedding_size = obj_state_embedding_size

    self.visual_encoder_pick = SimpleCNN(
        self.observation_space,
        self._hidden_size,
        rgb_uuid=None,
        depth_uuid="depth_lowres",
    )
    self.visual_encoder_drop = SimpleCNN(
        self.observation_space,
        self._hidden_size,
        rgb_uuid=None,
        depth_uuid="depth_lowres",
    )

    self.state_encoder = RNNStateEncoder(
        self._hidden_size + obj_state_embedding_size,
        self._hidden_size,
        trainable_masked_hidden_state=trainable_masked_hidden_state,
        num_layers=num_rnn_layers,
        rnn_type=rnn_type,
    )

    self.actor_pick = LinearActorHeadNoCategory(self._hidden_size, action_space.n)
    self.critic_pick = LinearCriticHead(self._hidden_size)
    self.actor_drop = LinearActorHeadNoCategory(self._hidden_size, action_space.n)
    self.critic_drop = LinearCriticHead(self._hidden_size)

    # self.object_state_embedding = nn.Embedding(num_embeddings=6, embedding_dim=obj_state_embedding_size)

    # Layer widths for the relative-distance embedding MLP: 3 -> 100 -> obj_state_embedding_size.
    relative_dist_embedding_sizes = [3, 100, obj_state_embedding_size]
    self.relative_dist_embedding_pick = input_embedding_net(
        relative_dist_embedding_sizes, dropout=0
    )
    self.relative_dist_embedding_drop = input_embedding_net(
        relative_dist_embedding_sizes, dropout=0
    )

    self.train()
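
# --- Hedged sketch (not part of the original model) ---
# `input_embedding_net` is defined elsewhere in this codebase. Assuming it builds a
# plain fully-connected net whose layer widths follow the size list (here
# 3 -> 100 -> obj_state_embedding_size), a minimal equivalent would look like the
# following; `_sketch_embedding_mlp` is a hypothetical name.
def _sketch_embedding_mlp(sizes, dropout=0.0):
    import torch.nn as nn

    layers = []
    for i, (in_dim, out_dim) in enumerate(zip(sizes[:-1], sizes[1:])):
        layers.append(nn.Linear(in_dim, out_dim))
        if i < len(sizes) - 2:  # no activation/dropout after the final linear layer
            layers.append(nn.ReLU(inplace=True))
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
    return nn.Sequential(*layers)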
def __init__(
    self,
    action_space: gym.spaces.Tuple,
    observation_space: SpaceDict,
    rgb_uuid: Optional[str] = "rgb",
    hidden_size=512,
    num_rnn_layers=1,
    rnn_type="GRU",
):
    super().__init__(action_space=action_space, observation_space=observation_space)

    self._hidden_size = hidden_size
    self.rgb_uuid = rgb_uuid

    self.visual_encoder = SimpleCNN(
        observation_space=observation_space,
        output_size=hidden_size,
        rgb_uuid=self.rgb_uuid,
        depth_uuid=None,
    )

    self.state_encoder = RNNStateEncoder(
        0 if self.is_blind else self.recurrent_hidden_state_size,
        self._hidden_size,
        num_layers=num_rnn_layers,
        rnn_type=rnn_type,
    )

    self.actor_critic = TupleLinearActorCriticHead(
        self._hidden_size, action_space[0].n
    )

    self.train()
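
# --- Hedged sketch (not part of the original model) ---
# The head above sizes its logits from the first sub-space of the Tuple action
# space. A tiny standalone illustration of that indexing, mirroring the
# `action_space[0].n` access in the constructor (purely illustrative):
def _sketch_tuple_action_space():
    import gym

    space = gym.spaces.Tuple((gym.spaces.Discrete(4), gym.spaces.Discrete(4)))
    assert space[0].n == 4  # same access pattern as `action_space[0].n` above
    return space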
def __init__(
    self,
    action_space: gym.spaces.Discrete,
    observation_space: SpaceDict,
    rgb_uuid: Optional[str],
    depth_uuid: Optional[str],
    goal_sensor_uuid: str,
    hidden_size=512,
    embed_coordinates=False,
    coordinate_embedding_dim=8,
    coordinate_dims=2,
    num_rnn_layers=1,
    rnn_type="GRU",
):
    super().__init__(action_space=action_space, observation_space=observation_space)

    self.goal_sensor_uuid = goal_sensor_uuid
    self._hidden_size = hidden_size
    self.embed_coordinates = embed_coordinates
    if self.embed_coordinates:
        self.coordinate_embedding_size = coordinate_embedding_dim
    else:
        self.coordinate_embedding_size = coordinate_dims

    self.sensor_fusion = False
    if rgb_uuid is not None and depth_uuid is not None:
        # Both encoders are active: project the concatenated features back to hidden_size.
        self.sensor_fuser = nn.Linear(hidden_size * 2, hidden_size)
        self.sensor_fusion = True

    self.visual_encoder = SimpleCNN(
        observation_space=observation_space,
        output_size=hidden_size,
        rgb_uuid=rgb_uuid,
        depth_uuid=depth_uuid,
    )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self.recurrent_hidden_state_size)
        + self.coordinate_embedding_size,
        self._hidden_size,
        num_layers=num_rnn_layers,
        rnn_type=rnn_type,
    )

    self.actor = LinearActorHead(self._hidden_size, action_space.n)
    self.critic = LinearCriticHead(self._hidden_size)

    if self.embed_coordinates:
        self.coordinate_embedding = nn.Linear(
            coordinate_dims, coordinate_embedding_dim
        )

    self.train()
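
# --- Hedged sketch (not part of the original model) ---
# How `sensor_fuser` is presumably applied in forward(): the RGB and depth feature
# vectors are concatenated and projected back to hidden_size. Only the
# 2*hidden -> hidden Linear mirrors the layer built above; all other names here
# are hypothetical stand-ins.
def _sketch_sensor_fusion(hidden_size=512, batch=2):
    import torch
    import torch.nn as nn

    sensor_fuser = nn.Linear(hidden_size * 2, hidden_size)
    rgb_feats = torch.randn(batch, hidden_size)    # stand-in for the RGB CNN output
    depth_feats = torch.randn(batch, hidden_size)  # stand-in for the depth CNN output
    fused = sensor_fuser(torch.cat([rgb_feats, depth_feats], dim=-1))
    assert fused.shape == (batch, hidden_size)
    return fused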
def __init__(
    self,
    action_space: gym.spaces.Discrete,
    observation_space: SpaceDict,
    hidden_size=512,
    obj_state_embedding_size=512,
    trainable_masked_hidden_state: bool = False,
    num_rnn_layers=1,
    rnn_type="GRU",
):
    """Initializer.

    See class documentation for parameter definitions.
    """
    super().__init__(action_space=action_space, observation_space=observation_space)

    self._hidden_size = hidden_size
    self.object_type_embedding_size = obj_state_embedding_size

    sensor_names = self.observation_space.spaces.keys()
    self.visual_encoder = SimpleCNN(
        self.observation_space,
        self._hidden_size,
        rgb_uuid="rgb_lowres" if "rgb_lowres" in sensor_names else None,
        depth_uuid="depth_lowres" if "depth_lowres" in sensor_names else None,
    )

    if "rgb_lowres" in sensor_names and "depth_lowres" in sensor_names:
        input_visual_feature_num = 2
    elif "rgb_lowres" in sensor_names or "depth_lowres" in sensor_names:
        input_visual_feature_num = 1
    else:
        # Neither visual sensor is present; fail fast instead of hitting a
        # NameError when building the state encoder below.
        raise ValueError(
            "Expected at least one of 'rgb_lowres' or 'depth_lowres' in the observation space."
        )

    self.state_encoder = RNNStateEncoder(
        self._hidden_size * input_visual_feature_num + obj_state_embedding_size,
        self._hidden_size,
        trainable_masked_hidden_state=trainable_masked_hidden_state,
        num_layers=num_rnn_layers,
        rnn_type=rnn_type,
    )

    self.actor = LinearActorHead(self._hidden_size, action_space.n)
    self.critic = LinearCriticHead(self._hidden_size)

    # Layer widths for the relative-distance embedding MLP: 3 -> 100 -> obj_state_embedding_size.
    self.relative_dist_embedding = input_embedding_net(
        [3, 100, obj_state_embedding_size], dropout=0
    )

    self.train()
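
# --- Hedged sketch (not part of the original model) ---
# The recurrent input concatenates one CNN feature vector per active visual sensor
# with the relative-distance embedding, giving a width of
# hidden_size * input_visual_feature_num + obj_state_embedding_size. A standalone
# shape check for the two-sensor case (all names hypothetical):
def _sketch_rnn_input_width(hidden_size=512, obj_state_embedding_size=512, batch=2):
    import torch

    visual_feats = [torch.randn(batch, hidden_size) for _ in range(2)]  # rgb + depth
    dist_embed = torch.randn(batch, obj_state_embedding_size)
    rnn_input = torch.cat(visual_feats + [dist_embed], dim=-1)
    assert rnn_input.shape[-1] == hidden_size * 2 + obj_state_embedding_size
    return rnn_input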
def __init__(
    self,
    action_space: gym.spaces.Discrete,
    observation_space: SpaceDict,
    goal_sensor_uuid: str,
    rgb_uuid: Optional[str],
    depth_uuid: Optional[str],
    hidden_size=512,
    object_type_embedding_dim=8,
    trainable_masked_hidden_state: bool = False,
    num_rnn_layers=1,
    rnn_type="GRU",
):
    """Initializer.

    See class documentation for parameter definitions.
    """
    super().__init__(action_space=action_space, observation_space=observation_space)

    self.goal_sensor_uuid = goal_sensor_uuid
    self._n_object_types = self.observation_space.spaces[self.goal_sensor_uuid].n
    self._hidden_size = hidden_size
    self.object_type_embedding_size = object_type_embedding_dim

    self.visual_encoder = SimpleCNN(
        observation_space=self.observation_space,
        output_size=self._hidden_size,
        rgb_uuid=rgb_uuid,
        depth_uuid=depth_uuid,
    )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + object_type_embedding_dim,
        self._hidden_size,
        trainable_masked_hidden_state=trainable_masked_hidden_state,
        num_layers=num_rnn_layers,
        rnn_type=rnn_type,
    )

    self.actor = LinearActorHead(self._hidden_size, action_space.n)
    self.critic = LinearCriticHead(self._hidden_size)

    self.object_type_embedding = nn.Embedding(
        num_embeddings=self._n_object_types,
        embedding_dim=object_type_embedding_dim,
    )

    self.train()
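
# --- Hedged sketch (not part of the original model) ---
# The goal sensor yields a discrete object-type index; the nn.Embedding above maps
# it to a dense vector whose width matches the `+ object_type_embedding_dim` term
# in the state-encoder input. Standalone illustration; `_sketch_goal_embedding`
# is a hypothetical name.
def _sketch_goal_embedding(n_object_types=10, embedding_dim=8, batch=2):
    import torch
    import torch.nn as nn

    object_type_embedding = nn.Embedding(n_object_types, embedding_dim)
    goal_ids = torch.randint(0, n_object_types, (batch,))
    goal_embed = object_type_embedding(goal_ids)
    assert goal_embed.shape == (batch, embedding_dim)
    return goal_embed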
def _create_visual_encoder(self) -> nn.Module:
    img_space: gym.spaces.Box = self.observation_space[self.rgb_uuid]
    # Double the channel dimension so two RGB frames can be stacked and encoded together.
    return SimpleCNN(
        observation_space=gym.spaces.Dict(
            {
                self.concat_rgb_uuid: gym.spaces.Box(
                    low=np.tile(img_space.low, (1, 1, 2)),
                    high=np.tile(img_space.high, (1, 1, 2)),
                    shape=img_space.shape[:2] + (img_space.shape[2] * 2,),
                )
            }
        ),
        output_size=self._hidden_size,
        rgb_uuid=self.concat_rgb_uuid,
        depth_uuid=None,
    )
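
# --- Hedged sketch (not part of the original model) ---
# A standalone check that two frames stacked along the channel axis fit the tiled
# Box built above. The 64x64x3 space and all names here are hypothetical; only the
# tiling and shape arithmetic mirror `_create_visual_encoder`.
def _sketch_concat_rgb_space(h=64, w=64, c=3):
    import gym
    import numpy as np

    img_space = gym.spaces.Box(low=0.0, high=1.0, shape=(h, w, c), dtype=np.float32)
    concat_space = gym.spaces.Box(
        low=np.tile(img_space.low, (1, 1, 2)),
        high=np.tile(img_space.high, (1, 1, 2)),
        shape=img_space.shape[:2] + (img_space.shape[2] * 2,),
    )
    stacked = np.concatenate([img_space.sample(), img_space.sample()], axis=-1)
    assert concat_space.contains(stacked)
    return concat_space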
def __init__(
    self,
    action_space: gym.spaces.Discrete,
    observation_space: SpaceDict,
    goal_sensor_uuid: str,
    # RNN
    hidden_size=512,
    num_rnn_layers=1,
    rnn_type="GRU",
    add_prev_actions=False,
    action_embed_size=6,
    # Aux loss
    multiple_beliefs=False,
    beliefs_fusion: Optional[FusionType] = None,
    auxiliary_uuids: Optional[List[str]] = None,
    # Custom params
    rgb_uuid: Optional[str] = None,
    depth_uuid: Optional[str] = None,
    object_type_embedding_dim=8,
    trainable_masked_hidden_state: bool = False,
    # Perception backbone params
    backbone="gnresnet18",
    resnet_baseplanes=32,
):
    """Initializer.

    See class documentation for parameter definitions.
    """
    super().__init__(
        action_space=action_space,
        observation_space=observation_space,
        hidden_size=hidden_size,
        multiple_beliefs=multiple_beliefs,
        beliefs_fusion=beliefs_fusion,
        auxiliary_uuids=auxiliary_uuids,
    )

    self.goal_sensor_uuid = goal_sensor_uuid
    self._n_object_types = self.observation_space.spaces[self.goal_sensor_uuid].n
    self.object_type_embedding_size = object_type_embedding_dim

    self.backbone = backbone
    if backbone == "simple_cnn":
        self.visual_encoder = SimpleCNN(
            observation_space=observation_space,
            output_size=hidden_size,
            rgb_uuid=rgb_uuid,
            depth_uuid=depth_uuid,
        )
    else:  # resnet family
        self.visual_encoder = resnet.GroupNormResNetEncoder(
            observation_space=observation_space,
            output_size=hidden_size,
            rgb_uuid=rgb_uuid,
            depth_uuid=depth_uuid,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
        )

    self.create_state_encoders(
        obs_embed_size=self.goal_visual_encoder_output_dims,
        num_rnn_layers=num_rnn_layers,
        rnn_type=rnn_type,
        add_prev_actions=add_prev_actions,
        prev_action_embed_size=action_embed_size,
        trainable_masked_hidden_state=trainable_masked_hidden_state,
    )

    self.create_actorcritic_head()

    self.create_aux_models(
        obs_embed_size=self.goal_visual_encoder_output_dims,
        action_embed_size=action_embed_size,
    )

    self.object_type_embedding = nn.Embedding(
        num_embeddings=self._n_object_types,
        embedding_dim=object_type_embedding_dim,
    )

    self.train()
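
# --- Hedged sketch (not part of the original model) ---
# `getattr(resnet, backbone)` resolves the backbone constructor from the resnet
# module by its string name (e.g. "gnresnet18"). The same dispatch pattern with a
# stand-in module, purely illustrative (all names hypothetical):
def _sketch_backbone_dispatch(backbone="gnresnet18"):
    import types

    resnet_like = types.SimpleNamespace(gnresnet18=lambda: "gnresnet18-backbone")
    make_backbone = getattr(resnet_like, backbone)  # raises AttributeError on a bad name
    return make_backbone()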
def __init__(
    self,
    # Base params
    action_space: gym.spaces.Discrete,
    observation_space: SpaceDict,
    goal_sensor_uuid: str,
    hidden_size=512,
    num_rnn_layers=1,
    rnn_type="GRU",
    add_prev_actions=False,
    action_embed_size=4,
    multiple_beliefs=False,
    beliefs_fusion: Optional[FusionType] = None,
    auxiliary_uuids: Optional[List[str]] = None,
    # Custom params
    rgb_uuid: Optional[str] = None,
    depth_uuid: Optional[str] = None,
    embed_coordinates=False,
    coordinate_embedding_dim=8,
    coordinate_dims=2,
    # Perception backbone params
    backbone="gnresnet18",
    resnet_baseplanes=32,
):
    super().__init__(
        action_space=action_space,
        observation_space=observation_space,
        hidden_size=hidden_size,
        multiple_beliefs=multiple_beliefs,
        beliefs_fusion=beliefs_fusion,
        auxiliary_uuids=auxiliary_uuids,
    )

    self.goal_sensor_uuid = goal_sensor_uuid
    self.embed_coordinates = embed_coordinates
    if self.embed_coordinates:
        self.coordinate_embedding_size = coordinate_embedding_dim
    else:
        self.coordinate_embedding_size = coordinate_dims

    self.sensor_fusion = False
    if rgb_uuid is not None and depth_uuid is not None:
        self.sensor_fuser = nn.Linear(hidden_size * 2, hidden_size)
        self.sensor_fusion = True

    self.backbone = backbone
    if backbone == "simple_cnn":
        self.visual_encoder = SimpleCNN(
            observation_space=observation_space,
            output_size=hidden_size,
            rgb_uuid=rgb_uuid,
            depth_uuid=depth_uuid,
        )
    else:  # resnet family
        self.visual_encoder = resnet.GroupNormResNetEncoder(
            observation_space=observation_space,
            output_size=hidden_size,
            rgb_uuid=rgb_uuid,
            depth_uuid=depth_uuid,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
        )

    if self.embed_coordinates:
        self.coordinate_embedding = nn.Linear(
            coordinate_dims, coordinate_embedding_dim
        )

    self.create_state_encoders(
        obs_embed_size=self.goal_visual_encoder_output_dims,
        num_rnn_layers=num_rnn_layers,
        rnn_type=rnn_type,
        add_prev_actions=add_prev_actions,
        prev_action_embed_size=action_embed_size,
    )

    self.create_actorcritic_head()

    self.create_aux_models(
        obs_embed_size=self.goal_visual_encoder_output_dims,
        action_embed_size=action_embed_size,
    )

    self.train()
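
# --- Hedged sketch (not part of the original model) ---
# With add_prev_actions=True, `create_state_encoders` (defined in the base class)
# is assumed to embed the previous action and append it to the recurrent input.
# A standalone illustration of that pattern; the extra row for a "no previous
# action" start token is a common convention, not confirmed by this file, and all
# names here are hypothetical.
def _sketch_prev_action_embedding(n_actions=4, action_embed_size=4, batch=2):
    import torch
    import torch.nn as nn

    prev_action_embedder = nn.Embedding(n_actions + 1, action_embed_size)
    prev_actions = torch.randint(0, n_actions + 1, (batch,))
    embedded = prev_action_embedder(prev_actions)
    assert embedded.shape == (batch, action_embed_size)
    return embedded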