Пример #1
0
    def build(self, mdp_info):
        """
        Build a SAC agent with networks sized from the given MDP info.

        The stored actor parameter dictionaries (mean and sigma) have their
        ``input_shape`` and ``output_shape`` entries overwritten in place,
        and the stored critic parameters have ``input_shape`` overwritten.

        Args:
            mdp_info: object exposing ``observation_space.shape`` and
                ``action_space.shape``.

        Returns:
            The ``SAC`` instance created from the stored parameters.

        """
        obs_shape = mdp_info.observation_space.shape
        act_shape = mdp_info.action_space.shape

        # Both actor networks take the observation shape as input and the
        # action shape as output.
        for actor_params in (self.actor_mu_params, self.actor_sigma_params):
            actor_params['input_shape'] = obs_shape
            actor_params['output_shape'] = act_shape

        # Critic input size: first observation dimension plus first action
        # dimension, wrapped in a one-element tuple.
        self.critic_params["input_shape"] = (obs_shape[0] + act_shape[0],)

        return SAC(
            mdp_info,
            self.actor_mu_params,
            self.actor_sigma_params,
            self.actor_optimizer,
            self.critic_params,
            **self.alg_params
        )
Пример #2
0
    def build(self, mdp_info):
        """
        Create a SAC agent from the stored parameters and the MDP shapes.

        The ``input_shape`` / ``output_shape`` entries of both actor
        parameter dictionaries and the ``input_shape`` entry of the critic
        parameters are overwritten in place. After construction, the
        agent's ``_target_entropy`` attribute is printed to stdout.

        Args:
            mdp_info: object exposing ``observation_space.shape`` and
                ``action_space.shape``.

        Returns:
            The newly constructed ``SAC`` agent.

        """
        state_shape = mdp_info.observation_space.shape
        action_shape = mdp_info.action_space.shape

        # Mean and sigma networks use the same input/output shapes.
        self.actor_mu_params['input_shape'] = state_shape
        self.actor_mu_params['output_shape'] = action_shape
        self.actor_sigma_params['input_shape'] = state_shape
        self.actor_sigma_params['output_shape'] = action_shape

        # The critic input size is the sum of the leading state and action
        # dimensions.
        n_critic_inputs = state_shape[0] + action_shape[0]
        self.critic_params["input_shape"] = (n_critic_inputs,)

        agent = SAC(
            mdp_info,
            self.actor_mu_params,
            self.actor_sigma_params,
            self.actor_optimizer,
            self.critic_params,
            **self.alg_params
        )
        print("TARGET_ENTROPY", agent._target_entropy)
        return agent
Пример #3
0
def learn_sac():
    """
    Run a short, seeded SAC training session on ``Pendulum-v0``.

    The environment, NumPy and torch (CPU and CUDA) generators are all
    seeded with 1 before any network is created. Training runs for twice
    the initial replay size in steps, fitting once every
    ``initial_replay_size`` steps.

    Returns:
        The ``SAC`` agent after ``Core.learn`` has completed.

    """
    # MDP
    horizon, gamma = 200, 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)

    # Seed every source of randomness with the same value.
    seed = 1
    mdp.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Approximator
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape

    actor_mu_params = {
        'network': ActorNetwork,
        'n_features': n_features,
        'input_shape': obs_shape,
        'output_shape': act_shape,
        'use_cuda': False,
    }
    # Same configuration for the sigma network, held in a distinct dict.
    actor_sigma_params = dict(actor_mu_params)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_params = {
        'network': CriticNetwork,
        'optimizer': {'class': optim.Adam, 'params': {'lr': 3e-4}},
        'loss': F.mse_loss,
        'n_features': n_features,
        # First observation dimension plus first action dimension.
        'input_shape': (obs_shape[0] + act_shape[0],),
        'output_shape': (1,),
        'use_cuda': False,
    }

    # Agent
    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)
    core.learn(n_steps=2 * initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    return agent
Пример #4
0
def create_SAC_agent(mdp, use_cuda=None):
    """
    Construct a SAC agent for ``mdp`` with a fixed set of hyperparameters.

    Args:
        mdp: environment object exposing ``info.observation_space.shape``
            and ``info.action_space.shape``; ``mdp.info`` is forwarded to
            ``SAC``.
        use_cuda: value passed as ``use_cuda`` to every network parameter
            dictionary. When ``None``, ``torch.cuda.is_available()`` is
            used instead.

    Returns:
        The constructed ``SAC`` agent.

    """
    use_cuda = torch.cuda.is_available() if use_cuda is None else use_cuda

    # Network settings
    actor_mu_network = ActorNetwork
    actor_sigma_network = ActorNetwork
    network_layers_actor_mu = (512, 256)
    network_layers_actor_sigma = (512, 256)
    network_layers_critic = (512, 256)

    # Replay / update settings
    initial_replay_size = 3000
    max_replay_size = 100000
    batch_size = 256
    warmup_transitions = 5000
    tau = 0.005

    # Optimizer settings
    lr_alpha = 2e-6
    lr_actor = 2e-5
    lr_critic = 4e-5
    weight_decay_actor = 0.0
    weight_decay_critic = 0.0

    target_entropy = -22.0

    # Approximator
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape

    actor_mu_params = {
        'network': actor_mu_network,
        'n_features': network_layers_actor_mu,
        'input_shape': obs_shape,
        'output_shape': act_shape,
        'use_cuda': use_cuda,
    }
    actor_sigma_params = {
        'network': actor_sigma_network,
        'n_features': network_layers_actor_sigma,
        'input_shape': obs_shape,
        'output_shape': act_shape,
        'use_cuda': use_cuda,
    }

    actor_optimizer = {
        'class': optim.Adam,
        'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor},
    }
    critic_optimizer = {
        'class': optim.Adam,
        'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic},
    }

    critic_params = {
        'network': CriticNetwork,
        'optimizer': critic_optimizer,
        'loss': F.mse_loss,
        'n_features': network_layers_critic,
        # First observation dimension plus first action dimension.
        'input_shape': (obs_shape[0] + act_shape[0],),
        'output_shape': (1,),
        'use_cuda': use_cuda,
    }

    # create SAC agent
    return SAC(
        mdp_info=mdp.info,
        actor_mu_params=actor_mu_params,
        actor_sigma_params=actor_sigma_params,
        actor_optimizer=actor_optimizer,
        critic_params=critic_params,
        batch_size=batch_size,
        initial_replay_size=initial_replay_size,
        max_replay_size=max_replay_size,
        warmup_transitions=warmup_transitions,
        tau=tau,
        lr_alpha=lr_alpha,
        target_entropy=target_entropy,
        critic_fit_params=None,
    )
Пример #5
0
def test_sac():
    """
    Check that a short seeded SAC run reproduces stored policy weights.

    A SAC agent is trained on ``Pendulum-v0`` with all seeds set to 1 for
    ``2 * initial_replay_size`` steps; the resulting policy weights are
    then compared with hard-coded reference values using ``np.allclose``.
    """
    # MDP
    horizon, gamma = 200, 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)

    # Fix all random generators before any network is created.
    seed = 1
    mdp.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Approximator
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape

    def actor_params():
        # Fresh dict per call so mean and sigma do not share one object.
        return dict(network=ActorNetwork,
                    n_features=n_features,
                    input_shape=obs_shape,
                    output_shape=act_shape,
                    use_cuda=False)

    actor_mu_params = actor_params()
    actor_sigma_params = actor_params()

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}
    # First observation dimension plus first action dimension.
    critic_input_shape = (obs_shape[0] + act_shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer=critic_optimizer,
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)
    core.learn(n_steps=2 * initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # Reference weights the seeded run is expected to reproduce.
    expected_weights = np.array([
        1.6998193, -0.732528, 1.2986078, -0.26860124, 0.5094043, -0.5001421,
        -0.18989229, -0.30646914
    ])
    actual_weights = agent.policy.get_weights()

    assert np.allclose(actual_weights, expected_weights)