import torch
import torch.nn as nn
import torch.nn.functional as F

from utils import helpers as utl  # project-local helpers; exact import path assumed
# `init`, `init_normc_`, `Categorical`, `DiagGaussian`, and `Multinomial` used below
# are likewise project-local (initialisation and distribution helpers).


def __init__(self,
             latent_dim,
             layers,
             action_dim,
             action_embed_dim,
             state_dim,
             state_embed_dim,
             pred_type='deterministic'):
    super(StateTransitionDecoder, self).__init__()

    self.state_encoder = utl.FeatureExtractor(state_dim, state_embed_dim, F.relu)
    self.action_encoder = utl.FeatureExtractor(action_dim, action_embed_dim, F.relu)

    curr_input_dim = latent_dim + state_embed_dim + action_embed_dim
    self.fc_layers = nn.ModuleList([])
    for i in range(len(layers)):
        self.fc_layers.append(nn.Linear(curr_input_dim, layers[i]))
        curr_input_dim = layers[i]

    # output layer
    if pred_type == 'gaussian':
        self.fc_out = nn.Linear(curr_input_dim, 2 * state_dim)
    else:
        self.fc_out = nn.Linear(curr_input_dim, state_dim)
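
# Sketch (not part of the original file): with pred_type='gaussian' the head above
# emits 2 * state_dim values, i.e. a mean and a log-variance per state dimension,
# so a forward pass would typically split them like this. `out` is a hypothetical
# head output.
def _split_gaussian_head(out):
    mu, logvar = torch.chunk(out, 2, dim=-1)  # split [mu, logvar] along the last dim
    return mu, logvar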

def __init__(self,
             task_embedding_size,
             layers,
             action_size,
             action_embed_size,
             state_size,
             state_embed_size,
             pred_type='deterministic'):
    super(StateTransitionDecoder, self).__init__()

    self.state_encoder = utl.FeatureExtractor(state_size, state_embed_size, F.relu)
    self.action_encoder = utl.FeatureExtractor(action_size, action_embed_size, F.relu)

    curr_input_size = task_embedding_size + state_embed_size + action_embed_size
    self.fc_layers = nn.ModuleList([])
    for i in range(len(layers)):
        self.fc_layers.append(nn.Linear(curr_input_size, layers[i]))
        curr_input_size = layers[i]

    # output layer (predicts one dimension fewer than the raw state)
    outsize = state_size - 1
    if pred_type == 'gaussian':
        self.fc_out = nn.Linear(curr_input_size, 2 * outsize)
    else:
        self.fc_out = nn.Linear(curr_input_size, outsize)
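
# Sketch (assumption; the original forward() is not in this excerpt): a ModuleList
# built as above is usually walked layer by layer with a nonlinearity in between,
# with the final prediction coming from fc_out.
def _mlp_forward(fc_layers, fc_out, h):
    for layer in fc_layers:
        h = F.relu(layer(h))  # hidden layers use the same ReLU as the encoders
    return fc_out(h)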

def __init__(self,
             args,
             layers,
             latent_dim,
             action_dim,
             action_embed_dim,
             state_dim,
             state_embed_dim,
             num_states,
             multi_head=False,
             pred_type='deterministic',
             input_prev_state=True,
             input_action=True,
             ):
    super(RewardDecoder, self).__init__()

    self.args = args
    self.pred_type = pred_type
    self.multi_head = multi_head
    self.input_prev_state = input_prev_state
    self.input_action = input_action

    if self.multi_head:
        # one output head per state to predict rewards
        curr_input_dim = latent_dim
        self.fc_layers = nn.ModuleList([])
        for i in range(len(layers)):
            self.fc_layers.append(nn.Linear(curr_input_dim, layers[i]))
            curr_input_dim = layers[i]
        self.fc_out = nn.Linear(curr_input_dim, num_states)
    else:
        # get state as input and predict reward prob
        self.state_encoder = utl.FeatureExtractor(state_dim, state_embed_dim, F.relu)
        if self.input_action:
            self.action_encoder = utl.FeatureExtractor(action_dim, action_embed_dim, F.relu)
        else:
            self.action_encoder = None
        curr_input_dim = latent_dim + state_embed_dim
        if input_prev_state:
            curr_input_dim += state_embed_dim
        if input_action:
            curr_input_dim += action_embed_dim
        self.fc_layers = nn.ModuleList([])
        for i in range(len(layers)):
            self.fc_layers.append(nn.Linear(curr_input_dim, layers[i]))
            curr_input_dim = layers[i]
        if pred_type == 'gaussian':
            self.fc_out = nn.Linear(curr_input_dim, 2)
        else:
            self.fc_out = nn.Linear(curr_input_dim, 1)
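
# Sketch (hypothetical helper): in the multi_head branch above, fc_out produces one
# reward prediction per discrete state, so a single state's prediction is read by index.
def _reward_for_state(multi_head_output, state_idx):
    # multi_head_output: tensor of shape (..., num_states)
    return multi_head_output[..., state_idx]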

def __init__(self,
             # network size
             layers_before_gru=(),
             hidden_size=64,
             layers_after_gru=(),
             latent_dim=32,
             # actions, states, rewards
             action_dim=2,
             action_embed_dim=10,
             state_dim=2,
             state_embed_dim=10,
             reward_size=1,
             reward_embed_size=5,
             ):
    super(RNNEncoder, self).__init__()

    self.latent_dim = latent_dim
    self.hidden_size = hidden_size
    self.reparameterise = self._sample_gaussian

    # embed action, state, reward
    self.state_encoder = utl.FeatureExtractor(state_dim, state_embed_dim, F.relu)
    self.action_encoder = utl.FeatureExtractor(action_dim, action_embed_dim, F.relu)
    self.reward_encoder = utl.FeatureExtractor(reward_size, reward_embed_size, F.relu)

    # fully connected layers before the recurrent cell
    curr_input_dim = action_embed_dim + state_embed_dim + reward_embed_size
    self.fc_before_gru = nn.ModuleList([])
    for i in range(len(layers_before_gru)):
        self.fc_before_gru.append(nn.Linear(curr_input_dim, layers_before_gru[i]))
        curr_input_dim = layers_before_gru[i]

    # recurrent unit
    # TODO: TEST RNN vs GRU vs LSTM
    self.gru = nn.GRU(input_size=curr_input_dim,
                      hidden_size=hidden_size,
                      num_layers=1,
                      )

    for name, param in self.gru.named_parameters():
        if 'bias' in name:
            nn.init.constant_(param, 0)
        elif 'weight' in name:
            nn.init.orthogonal_(param)

    # fully connected layers after the recurrent cell
    curr_input_dim = hidden_size
    self.fc_after_gru = nn.ModuleList([])
    for i in range(len(layers_after_gru)):
        self.fc_after_gru.append(nn.Linear(curr_input_dim, layers_after_gru[i]))
        curr_input_dim = layers_after_gru[i]

    # output layer
    self.fc_mu = nn.Linear(curr_input_dim, latent_dim)
    self.fc_logvar = nn.Linear(curr_input_dim, latent_dim)
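
# Sketch of the standard Gaussian reparameterisation that `self.reparameterise`
# points to (the original _sample_gaussian is outside this excerpt; this is an
# assumption based on the fc_mu / fc_logvar heads above).
def _sample_gaussian_sketch(mu, logvar):
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return mu + eps * std  # differentiable sample: mu + sigma * eps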

def __init__(self,
             layers,
             task_embedding_size,
             action_size,
             action_embed_size,
             state_size,
             state_embed_size,
             num_states,
             multi_head=False,
             pred_type='deterministic',
             input_prev_state=True,
             input_next_state=True,
             input_action=True,
             ):
    super(RewardDecoder, self).__init__()

    self.pred_type = pred_type
    self.multi_head = multi_head
    self.input_prev_state = input_prev_state
    self.input_next_state = input_next_state
    self.input_action = input_action

    if self.multi_head:
        # one output head per state to predict rewards
        curr_input_size = task_embedding_size
        self.fc_layers = nn.ModuleList([])
        for i in range(len(layers)):
            self.fc_layers.append(nn.Linear(curr_input_size, layers[i]))
            curr_input_size = layers[i]
        self.fc_out = nn.Linear(curr_input_size, num_states)
    else:
        # get state as input and predict reward prob
        self.state_encoder = utl.FeatureExtractor(state_size, state_embed_size, F.relu)
        self.action_encoder = utl.FeatureExtractor(action_size, action_embed_size, F.relu)
        curr_input_size = task_embedding_size
        if input_next_state:
            curr_input_size += state_embed_size
        if input_prev_state:
            curr_input_size += state_embed_size
        if input_action:
            curr_input_size += action_embed_size
        self.fc_layers = nn.ModuleList([])
        for i in range(len(layers)):
            self.fc_layers.append(nn.Linear(curr_input_size, layers[i]))
            curr_input_size = layers[i]
        if pred_type == 'gaussian':
            self.fc_out = nn.Linear(curr_input_size, 2)
        else:
            self.fc_out = nn.Linear(curr_input_size, 1)
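
# Sketch (illustrative; forward() is not shown here): the reward-decoder input is
# the task embedding plus whichever embeddings the input_* flags enabled, mirroring
# how curr_input_size is accumulated above. All arguments are hypothetical tensors.
def _build_reward_decoder_input(task_embedding, next_state_embed, prev_state_embed,
                                action_embed, input_next_state=True,
                                input_prev_state=True, input_action=True):
    parts = [task_embedding]
    if input_next_state:
        parts.append(next_state_embed)
    if input_prev_state:
        parts.append(prev_state_embed)
    if input_action:
        parts.append(action_embed)
    return torch.cat(parts, dim=-1)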

def __init__(self,
             args,
             # input
             pass_state_to_policy,
             pass_latent_to_policy,
             pass_belief_to_policy,
             pass_task_to_policy,
             dim_state,
             dim_latent,
             dim_belief,
             dim_task,
             # hidden
             hidden_layers,
             activation_function,  # tanh, relu, leaky-relu
             policy_initialisation,  # orthogonal / normc
             # output
             action_space,
             init_std,
             norm_actions_of_policy,
             action_low,
             action_high,
             ):
    """
    The policy can get any of these as input:
    - state (given by environment)
    - task (in the (belief) oracle setting)
    - latent variable (from VAE)
    """
    super(Policy, self).__init__()

    self.args = args

    if activation_function == 'tanh':
        self.activation_function = nn.Tanh()
    elif activation_function == 'relu':
        self.activation_function = nn.ReLU()
    elif activation_function == 'leaky-relu':
        self.activation_function = nn.LeakyReLU()
    else:
        raise ValueError

    if policy_initialisation == 'normc':
        init_ = lambda m: init(m, init_normc_, lambda x: nn.init.constant_(x, 0),
                               nn.init.calculate_gain(activation_function))
    elif policy_initialisation == 'orthogonal':
        init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0),
                               nn.init.calculate_gain(activation_function))
    else:
        raise ValueError

    self.pass_state_to_policy = pass_state_to_policy
    self.pass_latent_to_policy = pass_latent_to_policy
    self.pass_task_to_policy = pass_task_to_policy
    self.pass_belief_to_policy = pass_belief_to_policy

    # set normalisation parameters for the inputs
    # (will be updated from outside using the RL batches)
    self.norm_state = self.args.norm_state_for_policy and (dim_state is not None)
    if self.pass_state_to_policy and self.norm_state:
        self.state_rms = utl.RunningMeanStd(shape=(dim_state,))
    self.norm_latent = self.args.norm_latent_for_policy and (dim_latent is not None)
    if self.pass_latent_to_policy and self.norm_latent:
        self.latent_rms = utl.RunningMeanStd(shape=(dim_latent,))
    self.norm_belief = self.args.norm_belief_for_policy and (dim_belief is not None)
    if self.pass_belief_to_policy and self.norm_belief:
        self.belief_rms = utl.RunningMeanStd(shape=(dim_belief,))
    self.norm_task = self.args.norm_task_for_policy and (dim_task is not None)
    if self.pass_task_to_policy and self.norm_task:
        self.task_rms = utl.RunningMeanStd(shape=(dim_task,))

    curr_input_dim = dim_state * int(self.pass_state_to_policy) + \
                     dim_latent * int(self.pass_latent_to_policy) + \
                     dim_belief * int(self.pass_belief_to_policy) + \
                     dim_task * int(self.pass_task_to_policy)

    # initialise encoders for separate inputs
    self.use_state_encoder = self.args.policy_state_embedding_dim is not None
    if self.pass_state_to_policy and self.use_state_encoder:
        self.state_encoder = utl.FeatureExtractor(dim_state,
                                                  self.args.policy_state_embedding_dim,
                                                  self.activation_function)
        curr_input_dim = curr_input_dim - dim_state + self.args.policy_state_embedding_dim
    self.use_latent_encoder = self.args.policy_latent_embedding_dim is not None
    if self.pass_latent_to_policy and self.use_latent_encoder:
        self.latent_encoder = utl.FeatureExtractor(dim_latent,
                                                   self.args.policy_latent_embedding_dim,
                                                   self.activation_function)
        curr_input_dim = curr_input_dim - dim_latent + self.args.policy_latent_embedding_dim
    self.use_belief_encoder = self.args.policy_belief_embedding_dim is not None
    if self.pass_belief_to_policy and self.use_belief_encoder:
        self.belief_encoder = utl.FeatureExtractor(dim_belief,
                                                   self.args.policy_belief_embedding_dim,
                                                   self.activation_function)
        curr_input_dim = curr_input_dim - dim_belief + self.args.policy_belief_embedding_dim
    self.use_task_encoder = self.args.policy_task_embedding_dim is not None
    if self.pass_task_to_policy and self.use_task_encoder:
        self.task_encoder = utl.FeatureExtractor(dim_task,
                                                 self.args.policy_task_embedding_dim,
                                                 self.activation_function)
        curr_input_dim = curr_input_dim - dim_task + self.args.policy_task_embedding_dim

    # initialise actor and critic
    hidden_layers = [int(h) for h in hidden_layers]
    self.actor_layers = nn.ModuleList()
    self.critic_layers = nn.ModuleList()
    for i in range(len(hidden_layers)):
        fc = init_(nn.Linear(curr_input_dim, hidden_layers[i]))
        self.actor_layers.append(fc)
        fc = init_(nn.Linear(curr_input_dim, hidden_layers[i]))
        self.critic_layers.append(fc)
        curr_input_dim = hidden_layers[i]
    self.critic_linear = nn.Linear(hidden_layers[-1], 1)

    # output distributions of the policy
    if action_space.__class__.__name__ == "Discrete":
        num_outputs = action_space.n
        self.dist = Categorical(hidden_layers[-1], num_outputs)
    elif action_space.__class__.__name__ == "Box":
        num_outputs = action_space.shape[0]
        self.dist = DiagGaussian(hidden_layers[-1], num_outputs, init_std, min_std=1e-6,
                                 action_low=action_low, action_high=action_high,
                                 norm_actions_of_policy=norm_actions_of_policy)
    elif action_space.__class__.__name__ == "MultiDiscrete":
        num_outputs = action_space.nvec[0]
        self.dist = Multinomial(hidden_layers[-1], num_outputs, action_space.shape[0])
    else:
        raise NotImplementedError
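
# Sketch (hypothetical; the original forward() lies outside this excerpt): the
# parallel actor/critic ModuleLists above are typically applied in lockstep, with
# the critic head producing the value and the actor features feeding self.dist.
def _actor_critic_forward(actor_layers, critic_layers, critic_linear, activation, x):
    h_actor, h_critic = x, x
    for fc_a, fc_c in zip(actor_layers, critic_layers):
        h_actor = activation(fc_a(h_actor))
        h_critic = activation(fc_c(h_critic))
    return h_actor, critic_linear(h_critic)  # actor features, state-value estimate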

def __init__(self,
             state_dim,
             action_space,
             init_std,
             hidden_layers,
             activation_function,
             action_low,
             action_high,
             normalise_actions,
             min_std=1e-6,
             use_task_encoder=False,
             state_embed_dim=None,
             task_dim=0,
             latent_dim=0,
             ):
    super(Policy, self).__init__()

    hidden_layers = [int(h) for h in hidden_layers]
    curr_input_dim = state_dim

    # output distributions of the policy
    if action_space.__class__.__name__ == "Discrete":
        num_outputs = action_space.n
        self.dist = Categorical(hidden_layers[-1], num_outputs)
    elif action_space.__class__.__name__ == "Box":
        num_outputs = action_space.shape[0]
        self.dist = DiagGaussian(hidden_layers[-1], num_outputs, init_std, min_std,
                                 action_low=action_low, action_high=action_high,
                                 normalise_actions=normalise_actions)
    else:
        raise NotImplementedError

    if activation_function == 'tanh':
        self.activation_function = nn.Tanh()
    elif activation_function == 'relu':
        self.activation_function = nn.ReLU()
    elif activation_function == 'leaky-relu':
        self.activation_function = nn.LeakyReLU()
    else:
        raise ValueError

    # initialise task encoder (for the oracle)
    self.use_task_encoder = use_task_encoder
    self.task_dim = task_dim
    self.latent_dim = latent_dim
    if self.use_task_encoder:
        self.task_encoder = utl.FeatureExtractor(self.task_dim, self.latent_dim,
                                                 self.activation_function)
        self.state_encoder = utl.FeatureExtractor(state_dim - self.task_dim,
                                                  state_embed_dim,
                                                  self.activation_function)
        curr_input_dim = state_embed_dim + latent_dim

    # initialise actor and critic
    init_ = lambda m: init(m, init_normc_, lambda x: nn.init.constant_(x, 0))
    self.actor_layers = nn.ModuleList()
    self.critic_layers = nn.ModuleList()
    for i in range(len(hidden_layers)):
        fc = init_(nn.Linear(curr_input_dim, hidden_layers[i]))
        self.actor_layers.append(fc)
        fc = init_(nn.Linear(curr_input_dim, hidden_layers[i]))
        self.critic_layers.append(fc)
        curr_input_dim = hidden_layers[i]
    self.critic_linear = nn.Linear(hidden_layers[-1], 1)
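
# Sketch of a column-normalised ("normc") initialiser matching what init_normc_
# is commonly defined as in PyTorch RL codebases (assumption; the project's own
# helper may differ). Each output row is rescaled to unit norm times `gain`.
def init_normc_sketch(weight, gain=1.0):
    with torch.no_grad():
        weight.normal_(0, 1)
        weight *= gain / torch.sqrt(weight.pow(2).sum(dim=1, keepdim=True))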