# Assumed imports for these test methods (module paths follow the common
# chainerrl layout and may differ by version):
#   import os
#   import warnings
#   import chainer
#   import chainer.functions as F
#   import chainer.links as L
#   from chainerrl import policies
#   from chainerrl.agents import acer
#   from chainerrl.optimizers import rmsprop_async
#   from chainerrl.replay_buffer import EpisodicReplayBuffer
#   from chainerrl.experiments import train_agent_async
# `q_function`, `v_function`, and the ABC test environment are assumed to
# be importable under those names as well.

def make_model(self, env):
    n_dim_obs = env.observation_space.low.size
    n_dim_action = env.action_space.low.size
    n_hidden_channels = 50
    policy = policies.FCGaussianPolicy(
        n_input_channels=n_dim_obs,
        n_hidden_layers=2,
        n_hidden_channels=n_hidden_channels,
        action_size=n_dim_action,
        min_action=env.action_space.low,
        max_action=env.action_space.high)
    q_func = q_function.FCSAQFunction(
        n_dim_obs=n_dim_obs,
        n_dim_action=n_dim_action,
        n_hidden_layers=2,
        n_hidden_channels=n_hidden_channels)
    # Register both networks as children of a single Chain so they can
    # share one optimizer.
    return chainer.Chain(policy=policy, q_function=q_func)
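# A minimal sketch (hypothetical usage, not part of the original tests) of
# how the Chain returned by make_model can be consumed: both child links
# are reachable as attributes, and a single optimizer set up on the Chain
# updates the parameters of the policy and the Q-function together.
#
#     model = self.make_model(env)
#     pi, q = model.policy, model.q_function
#     opt = chainer.optimizers.Adam()
#     opt.setup(model)  # covers the parameters of both children
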
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):

    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(size=size, discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    n_hidden_layers = 1
    nonlinearity = F.leaky_relu
    replay_buffer = EpisodicReplayBuffer(10 ** 4)
    if use_lstm:
        if discrete:
            model = acer.ACERSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    min_prob=1e-1),
                q=q_function.FCStateQFunctionWithDiscreteAction(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
        else:
            model = acer.ACERSDNSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                    min_var=1e-1),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
                adv=q_function.FCSAQFunction(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
    else:
        if discrete:
            model = acer.ACERSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    min_prob=1e-1),
                q=q_function.FCStateQFunctionWithDiscreteAction(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
        else:
            model = acer.ACERSDNSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                    min_var=1e-1),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
                adv=q_function.FCSAQFunction(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
    eps = 1e-8
    opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=eps, alpha=0.99)
    opt.setup(model)
    gamma = 0.5
    beta = 1e-5
    if self.n_times_replay == 0 and self.disable_online_update:
        # At least one of online updates and replay updates must be
        # enabled, so skip this parameter combination.
        return
    agent = acer.ACER(
        model, opt, replay_buffer=replay_buffer,
        t_max=t_max, gamma=gamma, beta=beta, phi=phi,
        n_times_replay=self.n_times_replay,
        act_deterministically=True,
        disable_online_update=self.disable_online_update,
        replay_start_size=100,
        use_trust_region=self.use_trust_region)

    max_episode_len = None if episodic else 2

    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir, processes=nproc, make_env=make_env,
            agent=agent, steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1)
        assert len(warns) == 0, warns[0]
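    # Added commentary (not in the original test): train_agent_async runs
    # `nproc` worker processes that update the shared model asynchronously
    # and stops early once an evaluation reaches successful_score=1; the
    # warnings captured above turn any warning raised during training into
    # a test failure.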
    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, 'successful'))

    agent.stop_episode()

    # Test the trained agent on a deterministic instance of the env.
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
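
# A hypothetical caller, sketched for illustration. It assumes the
# parameterized attributes t_max and use_lstm exist on the test class, as
# the self.* references above suggest; the original suite's actual test
# methods may differ.
#
#     def test_abc_discrete(self):
#         self._test_abc(self.t_max, self.use_lstm, discrete=True)
#
#     def test_abc_gaussian(self):
#         self._test_abc(self.t_max, self.use_lstm, discrete=False)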