Example #1
    def _create_networks(env, config):
        """ Creates all networks necessary for SAC.

        These networks have to be created before instantiating this class and
        used in the constructor.

        TODO: Maybe this should be reworked one day...

        Args:
            config: A configuration dictonary.

        Returns:
            A dictonary which contains the networks.
        """
        obs_dim = int(np.prod(env.observation_space.shape))
        action_dim = int(np.prod(env.action_space.shape))
        net_size = config['rl_algorithm_config']['net_size']
        hidden_sizes = [net_size] * config['rl_algorithm_config']['network_depth']
        # hidden_sizes = [net_size, net_size, net_size]
        qf1 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        ).to(device=ptu.device)
        qf2 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        ).to(device=ptu.device)
        qf1_target = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        ).to(device=ptu.device)
        qf2_target = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        ).to(device=ptu.device)
        policy = TanhGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        ).to(device=ptu.device)

        # Clamp each parameter's gradient element-wise during backward(),
        # before the optimizer step.
        clip_value = 1.0
        for p in qf1.parameters():
            p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))
        for p in qf2.parameters():
            p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))
        for p in policy.parameters():
            p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))

        return {
            'qf1': qf1,
            'qf2': qf2,
            'qf1_target': qf1_target,
            'qf2_target': qf2_target,
            'policy': policy,
        }
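
The register_hook calls above clamp each parameter's gradient element-wise during backward(), before any optimizer step. A self-contained sketch of that mechanism in plain PyTorch follows; the linear layer and the loss are illustrative only and not part of the example.

import torch
import torch.nn as nn

net = nn.Linear(4, 1)
clip_value = 1.0
for p in net.parameters():
    # The hook runs during backward() and replaces the raw gradient.
    p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))

x = torch.randn(8, 4)
loss = (net(x) ** 2).sum() * 1e3  # inflate the loss to force large raw gradients
loss.backward()
for p in net.parameters():
    assert p.grad.abs().max() <= clip_value  # gradients arrive already clamped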
Example #2
class MlpModel(DynamicsModel):
    def __init__(self,
                 env,
                 n_layers=3,
                 hidden_layer_size=64,
                 optimizer_class=optim.Adam,
                 learning_rate=1e-3,
                 reward_weight=1,
                 **kwargs):
        super().__init__(env=env, **kwargs)
        self.env = env
        obs_dim = int(np.prod(env.observation_space.shape))
        action_dim = int(np.prod(env.action_space.shape))

        self.input_dim = obs_dim
        self.action_dim = action_dim
        self.next_obs_dim = obs_dim

        self.n_layers = n_layers
        self.hidden_layer_size = hidden_layer_size
        self.learning_rate = learning_rate
        self.reward_weight = reward_weight

        self.reset()

        self.reward_dim = 1
        #terminal_dim = 1

        # Predict the next observation and the reward jointly from [obs, action].
        self.net = FlattenMlp(
            hidden_sizes=[hidden_layer_size] * n_layers,
            input_size=self.input_dim + self.action_dim,
            output_size=self.next_obs_dim + self.reward_dim,
        )
        self.net_optimizer = optimizer_class(self.net.parameters(),
                                             lr=learning_rate)

    def to(self, device=None):
        # Default to the globally configured device from pytorch_util.
        if device is None:
            device = ptu.device

        self.net.to(device)

    def _forward(self, state, action):
        # FlattenMlp concatenates its inputs, so the network maps
        # [state, action] to a joint [next_state, reward] prediction.
        output = self.net(state, action)
        next_state = output[:, :-self.reward_dim]
        reward = output[:, -self.reward_dim:]

        # Terminals are not modelled; every step is reported as non-terminal.
        terminal = 0
        env_info = {}

        return next_state, reward, terminal, env_info

    def step(self, action):
        # Gym-style step(): batch the single action, run the learned model,
        # and return numpy outputs.
        action = ptu.from_numpy(action[np.newaxis, :])
        next_state, reward, terminal, env_info = self._forward(
            self.state, action)
        self.state = next_state

        next_state = np.squeeze(ptu.get_numpy(next_state))
        reward = np.squeeze(ptu.get_numpy(reward))

        return next_state, reward, terminal, env_info

    def train(self, paths):
        states = ptu.from_numpy(paths["observations"])
        actions = ptu.from_numpy(paths["actions"])
        rewards = ptu.from_numpy(paths["rewards"])
        next_states = ptu.from_numpy(paths["next_observations"])
        terminals = paths["terminals"]  # currently unused

        next_state_preds, reward_preds, terminal_preds, env_infos = self._forward(
            states, actions)
        self.net_optimizer.zero_grad()

        # MSE on predicted next states plus weighted MSE on predicted rewards.
        self.transition_model_loss = torch.mean(
            (next_state_preds - next_states)**2)
        self.reward_model_loss = torch.mean((reward_preds - rewards)**2)
        self.net_loss = (self.transition_model_loss
                         + self.reward_weight * self.reward_model_loss)
        self.net_loss.backward()
        self.net_optimizer.step()
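
A minimal usage sketch for MlpModel is shown below. The make_env helper, the batch shapes, and the assumption that the base class's reset() initializes self.state are illustrative only and not taken from the example above.

import numpy as np

env = make_env()  # any gym-style env with Box spaces (assumed helper, not shown above)
model = MlpModel(env, n_layers=3, hidden_layer_size=64)
model.to()  # move the underlying FlattenMlp to ptu.device

batch = 128
obs_dim = int(np.prod(env.observation_space.shape))
act_dim = int(np.prod(env.action_space.shape))
paths = {
    "observations":      np.random.randn(batch, obs_dim).astype(np.float32),
    "actions":           np.random.randn(batch, act_dim).astype(np.float32),
    "rewards":           np.random.randn(batch, 1).astype(np.float32),
    "next_observations": np.random.randn(batch, obs_dim).astype(np.float32),
    "terminals":         np.zeros((batch, 1), dtype=np.float32),
}
model.train(paths)  # one gradient step on the joint transition/reward loss

# Rolling the learned model forward assumes self.state has been initialized
# (e.g. by the base class's reset()); the action comes from the real action space.
next_obs, reward, terminal, info = model.step(env.action_space.sample())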