def build(self, mdp_info):
    actor_input_shape = mdp_info.observation_space.shape
    self.actor_mu_params['input_shape'] = actor_input_shape
    self.actor_mu_params['output_shape'] = mdp_info.action_space.shape
    self.actor_sigma_params['input_shape'] = actor_input_shape
    self.actor_sigma_params['output_shape'] = mdp_info.action_space.shape

    critic_input_shape = (actor_input_shape[0] + mdp_info.action_space.shape[0],)
    self.critic_params["input_shape"] = critic_input_shape

    sac = SAC(mdp_info, self.actor_mu_params, self.actor_sigma_params,
              self.actor_optimizer, self.critic_params, **self.alg_params)

    return sac
def build(self, mdp_info):
    actor_input_shape = mdp_info.observation_space.shape
    self.actor_mu_params['input_shape'] = actor_input_shape
    self.actor_mu_params['output_shape'] = mdp_info.action_space.shape
    self.actor_sigma_params['input_shape'] = actor_input_shape
    self.actor_sigma_params['output_shape'] = mdp_info.action_space.shape

    critic_input_shape = (actor_input_shape[0] + mdp_info.action_space.shape[0],)
    self.critic_params["input_shape"] = critic_input_shape

    sac = SAC(mdp_info, self.actor_mu_params, self.actor_sigma_params,
              self.actor_optimizer, self.critic_params, **self.alg_params)

    print("TARGET_ENTROPY", sac._target_entropy)

    return sac
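The two build() variants above assume a surrounding builder object whose constructor stores the network and algorithm parameter dicts that build() later completes with the shapes taken from mdp_info. As a minimal sketch only (the SACBuilder class name and its constructor signature are assumptions for illustration, not part of the snippets above), the enclosing class might look like this:

# Hypothetical builder class around the build() method above; name and
# constructor signature are assumptions, not taken from the original code.
class SACBuilder:
    def __init__(self, actor_mu_params, actor_sigma_params, actor_optimizer,
                 critic_params, **alg_params):
        # Store the parameter dicts; build() fills in 'input_shape' and
        # 'output_shape' from mdp_info before constructing the SAC agent.
        self.actor_mu_params = actor_mu_params
        self.actor_sigma_params = actor_sigma_params
        self.actor_optimizer = actor_optimizer
        self.critic_params = critic_params
        self.alg_params = alg_params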
def learn_sac():
    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=False)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=False)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=2 * initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    return agent
def create_SAC_agent(mdp, use_cuda=None):
    if use_cuda is None:
        use_cuda = torch.cuda.is_available()

    # Settings
    actor_mu_network = ActorNetwork
    actor_sigma_network = ActorNetwork
    network_layers_actor_mu = (512, 256)
    network_layers_actor_sigma = (512, 256)
    network_layers_critic = (512, 256)

    initial_replay_size = 3000
    max_replay_size = 100000
    batch_size = 256
    warmup_transitions = 5000
    tau = 0.005
    lr_alpha = 2e-6
    lr_actor = 2e-5
    lr_critic = 4e-5
    weight_decay_actor = 0.0
    weight_decay_critic = 0.0
    target_entropy = -22.0

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=actor_mu_network,
                           n_features=network_layers_actor_mu,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=actor_sigma_network,
                              n_features=network_layers_actor_sigma,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': lr_actor,
                                  'weight_decay': weight_decay_actor}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': lr_critic,
                                               'weight_decay': weight_decay_critic}},
                         loss=F.mse_loss,
                         n_features=network_layers_critic,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # create SAC agent
    agent = SAC(mdp_info=mdp.info,
                batch_size=batch_size,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                warmup_transitions=warmup_transitions,
                tau=tau,
                lr_alpha=lr_alpha,
                actor_mu_params=actor_mu_params,
                actor_sigma_params=actor_sigma_params,
                actor_optimizer=actor_optimizer,
                critic_params=critic_params,
                target_entropy=target_entropy,
                critic_fit_params=None)

    return agent
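A possible way to drive create_SAC_agent is the usual mushroom_rl Core loop. The sketch below is illustrative only: it assumes the same Gym wrapper and the ActorNetwork/CriticNetwork classes defined alongside these snippets, the import paths of a recent mushroom_rl release, and step/episode counts chosen arbitrarily (the initial 3000-step fill matches the initial_replay_size used above).

# Illustrative training loop for create_SAC_agent; step and epoch counts
# are assumptions, not values from the original snippets.
import numpy as np

from mushroom_rl.core import Core
from mushroom_rl.environments.gym_env import Gym
from mushroom_rl.utils.dataset import compute_J

mdp = Gym('Pendulum-v0', horizon=200, gamma=0.99)
agent = create_SAC_agent(mdp)

core = Core(agent, mdp)

# Fill the replay buffer before any policy update (matches initial_replay_size).
core.learn(n_steps=3000, n_steps_per_fit=3000)

for epoch in range(10):
    # Alternate learning steps with a short evaluation of the current policy.
    core.learn(n_steps=1000, n_steps_per_fit=1)
    dataset = core.evaluate(n_episodes=5, render=False)
    print('Epoch', epoch, 'mean J:', np.mean(compute_J(dataset, mdp.info.gamma)))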
def test_sac():
    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=False)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=False)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=2 * initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    w = agent.policy.get_weights()
    w_test = np.array([1.6998193, -0.732528, 1.2986078, -0.26860124,
                       0.5094043, -0.5001421, -0.18989229, -0.30646914])

    assert np.allclose(w, w_test)