# Imports assumed by the code below (mushroom_rl 1.x-era API); the Network
# torch module used throughout is defined elsewhere in this file.
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm, trange

from mushroom_rl.algorithms.actor_critic import A2C
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.policy import BoltzmannTorchPolicy, GaussianTorchPolicy
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.parameters import Parameter


def build(self, mdp_info):
    # Builder method: relies on self.policy_params, self.critic_params,
    # self.alg_params and self.actor_optimizer being set on the enclosing
    # builder class.
    policy = GaussianTorchPolicy(Network,
                                 mdp_info.observation_space.shape,
                                 mdp_info.action_space.shape,
                                 **self.policy_params)

    self.critic_params["input_shape"] = mdp_info.observation_space.shape
    self.alg_params['critic_params'] = self.critic_params
    self.alg_params['actor_optimizer'] = self.actor_optimizer

    return A2C(mdp_info, policy, **self.alg_params)
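# A minimal sketch of the builder context that build() assumes; the class
# name and constructor signature here are hypothetical, not part of any
# library API.
class A2CBuilderSketch:
    def __init__(self, policy_params, critic_params, alg_params,
                 actor_optimizer):
        self.policy_params = policy_params
        self.critic_params = critic_params
        self.alg_params = alg_params
        self.actor_optimizer = actor_optimizer

    # Reuse the module-level build() above as the builder's method.
    build = build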
def test_a2c():
    # Regression test: the reference weights below are tied to the seeds set
    # in learn_a2c() and to the library/PyTorch versions in use.
    agent = learn_a2c()

    w = agent.policy.get_weights()
    w_test = np.array([-1.6307759, 1.0356185, -0.34508315, 0.27108294,
                       -0.01047843])

    assert np.allclose(w, w_test)
def learn_a2c():
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    policy_params = dict(std_0=1.,
                         n_features=64,
                         use_cuda=False)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    algorithm_params = dict(critic_params=critic_params,
                            actor_optimizer={'class': optim.RMSprop,
                                             'params': {'lr': 7e-4,
                                                        'eps': 3e-3}},
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = A2C(mdp.info, policy, **algorithm_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent
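# Hypothetical helper (not in the original file): evaluate the agent returned
# by learn_a2c() on a fresh copy of the training environment, using only the
# APIs imported above.
def evaluate_a2c(n_episodes=5):
    agent = learn_a2c()
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=n_episodes, render=False)
    return np.mean(compute_J(dataset, gamma=.99))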
def experiment(n_epochs, n_steps, n_steps_per_fit, n_step_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    policy_params = dict(n_features=32,
                         use_cuda=False)

    beta = Parameter(1e0)
    pi = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n,),
                              beta=beta,
                              **policy_params)

    # Agent
    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 1e-3,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 1e-3,
                                                  'eps': 3e-3}},
                      critic_params=critic_params,
                      # max_grad_norm=10.0,
                      ent_coeff=0.01)

    agent = A2C(mdp.info, pi, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)

    # RUN
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in trange(n_epochs):
        tqdm.write('Epoch: ' + str(n))
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = compute_J(dataset, gamma_eval)
        tqdm.write('J: ' + str(np.mean(J)))

    # core.evaluate(n_episodes=2, render=True)
    print('Press Enter to visualize the acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
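# Illustrative entry point; the hyperparameter values below are assumptions,
# not values prescribed by experiment() itself.
if __name__ == '__main__':
    experiment(n_epochs=40, n_steps=1000, n_steps_per_fit=5, n_step_test=2000)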