def make_model(self, env):
    n_hidden_channels = 20
    obs_size = env.observation_space.low.size
    if self.recurrent:
        v = StatelessRecurrentSequential(
            L.NStepLSTM(1, obs_size, n_hidden_channels, 0),
            L.Linear(
                None, 1, initialW=chainer.initializers.LeCunNormal(1e-1)),
        )
        if self.discrete:
            n_actions = env.action_space.n
            pi = StatelessRecurrentSequential(
                L.NStepLSTM(1, obs_size, n_hidden_channels, 0),
                policies.FCSoftmaxPolicy(
                    n_hidden_channels,
                    n_actions,
                    n_hidden_layers=0,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                )
            )
        else:
            action_size = env.action_space.low.size
            pi = StatelessRecurrentSequential(
                L.NStepLSTM(1, obs_size, n_hidden_channels, 0),
                policies.FCGaussianPolicy(
                    n_hidden_channels,
                    action_size,
                    n_hidden_layers=0,
                    nonlinearity=F.tanh,
                    mean_wscale=1e-1,
                )
            )
        return StatelessRecurrentBranched(pi, v)
    else:
        v = chainer.Sequential(
            L.Linear(None, n_hidden_channels),
            F.tanh,
            L.Linear(
                None, 1, initialW=chainer.initializers.LeCunNormal(1e-1)),
        )
        if self.discrete:
            n_actions = env.action_space.n
            pi = policies.FCSoftmaxPolicy(
                obs_size,
                n_actions,
                n_hidden_layers=1,
                n_hidden_channels=n_hidden_channels,
                nonlinearity=F.tanh,
                last_wscale=1e-1,
            )
        else:
            action_size = env.action_space.low.size
            pi = policies.FCGaussianPolicy(
                obs_size,
                action_size,
                n_hidden_layers=1,
                n_hidden_channels=n_hidden_channels,
                nonlinearity=F.tanh,
                mean_wscale=1e-1,
            )
        return A3CSeparateModel(pi=pi, v=v)
def make_model(self, env):
    n_hidden_channels = 50
    n_dim_obs = env.observation_space.low.size
    v = v_functions.FCVFunction(
        n_dim_obs,
        n_hidden_layers=2,
        n_hidden_channels=n_hidden_channels)
    if self.discrete:
        n_actions = env.action_space.n
        pi = policies.FCSoftmaxPolicy(
            n_dim_obs,
            n_actions,
            n_hidden_layers=2,
            n_hidden_channels=n_hidden_channels)
    else:
        n_dim_actions = env.action_space.low.size
        pi = policies.FCGaussianPolicy(
            n_dim_obs,
            n_dim_actions,
            n_hidden_layers=2,
            n_hidden_channels=n_hidden_channels)
    return A3CSeparateModel(pi=pi, v=v)
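# Hedged usage sketch (not part of the original snippet above): wiring the
# model returned by make_model into an A3C agent, mirroring the optimizer and
# agent construction used in the async test snippets later in this file.
# `self`, `env`, `chainer`, and the `a3c` module are assumed to be available
# as in the surrounding code; the hyperparameter values are illustrative only.
model = self.make_model(env)
opt = chainer.optimizers.Adam()
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(1))
agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, beta=1e-2,
                phi=lambda x: x, act_deterministically=False)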
def __init__(self, env, feature_transformer, gamma=0.99,
             optimizer='adam', max_memory=10000):
    BaseAgent.__init__(self, env=env,
                       feature_transformer=feature_transformer,
                       gamma=gamma, optimizer=optimizer)
    self.model = policies.FCSoftmaxPolicy(self.n_dims,
                                          self.n_actions,
                                          n_hidden_layers=2,
                                          n_hidden_channels=100,
                                          nonlinearity=F.relu)
    self.optimizer.setup(self.model)
    # self.optimizer.add_hook(chainer.optimizer.GradientClipping(40))
    self.replay_buffer = PrioritizedEpisodicReplayBuffer(
        capacity=max_memory,
        uniform_ratio=0.1,
        default_priority_func=exp_return_of_episode,
        wait_priority_after_sampling=False,
        return_sample_weights=False)
    self.agent = reinforce.REINFORCE(model=self.model,
                                     optimizer=self.optimizer,
                                     phi=phi,
                                     batchsize=1,
                                     act_deterministically=False)
def __init__(self, trial, width, height, action_size, lstm_size=128):
    obs_size = width * height
    self.head = MyHead(trial, width=width, height=height)
    self.lstm = L.LSTM(self.head.n_output_channels, lstm_size)
    self.pi = policies.FCSoftmaxPolicy(lstm_size, action_size)
    self.v = v_function.FCVFunction(lstm_size)
    super().__init__(self.head, self.lstm, self.pi, self.v)
def create_stochastic_policy_for_env(env):
    assert isinstance(env.observation_space, gym.spaces.Box)
    ndim_obs = env.observation_space.low.size
    if isinstance(env.action_space, gym.spaces.Discrete):
        return policies.FCSoftmaxPolicy(ndim_obs, env.action_space.n)
    elif isinstance(env.action_space, gym.spaces.Box):
        return policies.FCGaussianPolicy(
            ndim_obs, env.action_space.low.size, bound_mean=False)
    else:
        raise NotImplementedError()
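# Minimal usage sketch (not in the original source): build a softmax policy
# for a discrete-action gym environment and sample one action from it.
# Assumes `gym`, `numpy`, and `chainerrl` are installed and that
# `create_stochastic_policy_for_env` is the helper defined above.
import gym
import numpy as np

env = gym.make('CartPole-v0')             # Box observations, Discrete actions
pi = create_stochastic_policy_for_env(env)
obs = env.reset().astype(np.float32)
action_distrib = pi(obs[None])            # returns a chainerrl Distribution
action = int(action_distrib.sample().array[0])  # .data on older Chainer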
def __init__(self, env, feature_transformer, gamma=0.99,
             optimizer='adam', max_memory=10000):
    BaseAgent.__init__(self, env=env,
                       feature_transformer=feature_transformer,
                       gamma=gamma, optimizer=optimizer)
    self.model = agents.pcl.PCLSeparateModel(
        pi=policies.FCSoftmaxPolicy(self.n_dims, self.n_actions,
                                    n_hidden_channels=100,
                                    n_hidden_layers=2),
        v=v_functions.FCVFunction(
            self.n_dims,
            n_hidden_channels=100,
            n_hidden_layers=2,
        ),
    )
    self.optimizer.setup(self.model)
    # self.optimizer.add_hook(chainer.optimizer.GradientClipping(40))
    self.replay_buffer = \
        chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(
            capacity=max_memory,
            uniform_ratio=0.1,
            default_priority_func=exp_return_of_episode,
            wait_priority_after_sampling=False,
            return_sample_weights=False)
    self.agent = agents.pcl.PCL(model=self.model,
                                optimizer=self.optimizer,
                                replay_buffer=self.replay_buffer,
                                t_max=1,
                                gamma=self.gamma,
                                tau=1e-2,
                                phi=phi,
                                rollout_len=10,
                                batchsize=1,
                                disable_online_update=False,
                                n_times_replay=1,
                                replay_start_size=1000,
                                normalize_loss_by_steps=True,
                                act_deterministically=False,
                                backprop_future_values=False,
                                train_async=True)
def make_model(self, env):
    n_hidden_channels = 20
    n_dim_obs = env.observation_space.low.size
    v = v_functions.FCVFunction(
        n_dim_obs,
        n_hidden_layers=1,
        n_hidden_channels=n_hidden_channels,
        nonlinearity=F.tanh,
        last_wscale=0.01,
    )
    if self.discrete:
        n_actions = env.action_space.n
        pi = policies.FCSoftmaxPolicy(
            n_dim_obs,
            n_actions,
            n_hidden_layers=1,
            n_hidden_channels=n_hidden_channels,
            nonlinearity=F.tanh,
            last_wscale=0.01,
        )
    else:
        n_dim_actions = env.action_space.low.size
        pi = policies.FCGaussianPolicyWithStateIndependentCovariance(
            n_dim_obs,
            n_dim_actions,
            n_hidden_layers=1,
            n_hidden_channels=n_hidden_channels,
            nonlinearity=F.tanh,
            mean_wscale=0.01,
            var_type='diagonal',
        )
    # Check if KL div supports double-backprop
    fake_obs = np.zeros_like(env.observation_space.low, dtype=np.float32)
    action_distrib = pi(fake_obs[None])
    kl = action_distrib.kl(action_distrib)
    old_style_funcs = trpo._find_old_style_function([kl])
    if old_style_funcs:
        self.skipTest("\
Chainer v{} does not support double backprop of these functions: {}.".format(
            chainer.__version__, old_style_funcs))
    return pi, v
def __init__(self, n_dims, n_actions):
    self.head = links.Sequence(
        L.ConvolutionND(ndim=1, in_channels=n_dims, out_channels=100,
                        ksize=3, stride=1, pad=1, cover_all=True),
        F.relu)
    self.pi = policies.FCSoftmaxPolicy(n_input_channels=100,
                                       n_actions=n_actions,
                                       n_hidden_layers=2,
                                       n_hidden_channels=100)
    self.v = v_functions.FCVFunction(n_input_channels=100,
                                     n_hidden_layers=2,
                                     n_hidden_channels=100)
    super(A3CFF, self).__init__(self.head, self.pi, self.v)
def __init__(self, ndim_obs, n_discrete_entries, n_hidden_layers=2,
             n_hidden_channels=400, beta=1.0):
    self.pi = policies.FCSoftmaxPolicy(ndim_obs, n_discrete_entries,
                                       n_hidden_layers, n_hidden_channels,
                                       beta=beta)
    self.v = chainerrl.v_functions.FCVFunction(
        ndim_obs,
        n_hidden_channels=n_hidden_channels,
        n_hidden_layers=n_hidden_layers,
        last_wscale=0.01,
    )
    super().__init__(self.pi, self.v)
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):

    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    n_hidden_layers = 2
    nonlinearity = F.relu
    if use_lstm:
        if discrete:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )
        else:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    var_wscale=1e-2,
                    var_bias=1,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    min_var=1e-1,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )
    else:
        if discrete:
            model = a3c.A3CSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )
        else:
            model = a3c.A3CSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    var_wscale=1e-2,
                    var_bias=1,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    min_var=1e-1,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )

    eps = 1e-8 if self.backprop_future_values else 1e-1
    opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
    opt.setup(model)
    gamma = 0.5
    tau = 1e-2
    replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(10**5)
    agent = pcl.PCL(model, opt, replay_buffer=replay_buffer,
                    t_max=t_max, gamma=gamma, tau=tau,
                    phi=phi,
                    n_times_replay=1,
                    batchsize=self.batchsize,
                    train_async=self.train_async,
                    backprop_future_values=self.backprop_future_values,
                    act_deterministically=True)

    if self.train_async:
        with warnings.catch_warnings(record=True) as warns:
            chainerrl.experiments.train_agent_async(
                outdir=self.outdir, processes=nproc, make_env=make_env,
                agent=agent, steps=steps,
                max_episode_len=2,
                eval_interval=200,
                eval_n_runs=5,
                successful_score=1)
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other
        # processes after success. Thus here the successful model is loaded
        # explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
    else:
        agent.process_idx = 0
        chainerrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(0, False),
            eval_env=make_env(0, True),
            outdir=self.outdir,
            steps=steps,
            max_episode_len=2,
            eval_interval=200,
            eval_n_runs=5,
            successful_score=1)

    agent.stop_episode()

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):

    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    n_hidden_layers = 1
    nonlinearity = F.leaky_relu
    replay_buffer = EpisodicReplayBuffer(10**4)
    if use_lstm:
        if discrete:
            model = acer.ACERSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    min_prob=1e-1),
                q=q_function.FCStateQFunctionWithDiscreteAction(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
        else:
            model = acer.ACERSDNSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                    min_var=1e-1),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
                adv=q_function.FCSAQFunction(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
    else:
        if discrete:
            model = acer.ACERSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    min_prob=1e-1),
                q=q_function.FCStateQFunctionWithDiscreteAction(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
        else:
            model = acer.ACERSDNSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                    min_var=1e-1),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
                adv=q_function.FCSAQFunction(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )

    eps = 1e-8
    opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=eps, alpha=0.99)
    opt.setup(model)
    gamma = 0.5
    beta = 1e-5
    if self.n_times_replay == 0 and self.disable_online_update:
        # At least one of them must be enabled
        return
    agent = acer.ACER(model, opt, replay_buffer=replay_buffer,
                      t_max=t_max, gamma=gamma, beta=beta,
                      phi=phi,
                      n_times_replay=self.n_times_replay,
                      act_deterministically=True,
                      disable_online_update=self.disable_online_update,
                      replay_start_size=100,
                      use_trust_region=self.use_trust_region)

    max_episode_len = None if episodic else 2

    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir, processes=nproc, make_env=make_env,
            agent=agent, steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1)
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, 'successful'))

    agent.stop_episode()

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):

    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    if use_lstm:
        if discrete:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )
        else:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    mean_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )
    else:
        if discrete:
            model = a3c.A3CSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )
        else:
            model = a3c.A3CSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    mean_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )

    opt = chainer.optimizers.Adam()
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(1))
    gamma = 0.8
    beta = 1e-2
    agent = a3c.A3C(model, opt, t_max=t_max, gamma=gamma, beta=beta,
                    phi=phi,
                    act_deterministically=True)

    max_episode_len = None if episodic else 2

    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir, processes=nproc, make_env=make_env,
            agent=agent, steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1)
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, 'successful'))

    agent.stop_episode()

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def __init__(self, n_actions):
    self.head = links.NIPSDQNHead()
    self.pi = policies.FCSoftmaxPolicy(
        self.head.n_output_channels, n_actions)
    self.v = v_functions.FCVFunction(self.head.n_output_channels)
    super().__init__(self.head, self.pi, self.v)
def _test_abc(self, use_lstm, discrete=True, steps=1000000,
              require_success=True, gpu=-1):

    def make_env(process_idx, test):
        size = 2
        return ABC(size=size,
                   discrete=discrete,
                   episodic=True,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    n_hidden_layers = 1
    nonlinearity = F.leaky_relu
    if use_lstm:
        if discrete:
            model = chainerrl.links.Sequence(
                L.LSTM(obs_space.low.size, n_hidden_channels,
                       forget_bias_init=1),
                policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
        else:
            model = chainerrl.links.Sequence(
                L.LSTM(obs_space.low.size, n_hidden_channels,
                       forget_bias_init=1),
                policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                ))
    else:
        if discrete:
            model = policies.FCSoftmaxPolicy(
                obs_space.low.size, action_space.n,
                n_hidden_channels=n_hidden_channels,
                n_hidden_layers=n_hidden_layers,
                nonlinearity=nonlinearity)
        else:
            model = policies.FCGaussianPolicy(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=n_hidden_channels,
                n_hidden_layers=n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high,
                nonlinearity=nonlinearity,
            )
    if gpu >= 0:
        chainer.cuda.get_device_from_id(gpu).use()
        model.to_gpu()
    opt = optimizers.Adam()
    opt.setup(model)
    beta = 1e-2
    agent = chainerrl.agents.REINFORCE(
        model, opt, beta=beta, phi=phi,
        batchsize=self.batchsize,
        backward_separately=self.backward_separately,
        act_deterministically=True,
    )

    chainerrl.experiments.train_agent_with_evaluation(
        agent=agent,
        env=make_env(0, False),
        eval_env=make_env(0, True),
        outdir=self.outdir,
        steps=steps,
        train_max_episode_len=2,
        eval_interval=500,
        eval_n_steps=None,
        eval_n_episodes=5,
        successful_score=1)

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=1000000):

    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    if use_lstm:
        if discrete:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
        else:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
    else:
        if discrete:
            model = a3c.A3CSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
        else:
            model = a3c.A3CSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
    eps = 1e-1 if discrete else 1e-2
    opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
    opt.setup(model)
    gamma = 0.9
    beta = 1e-2
    agent = a3c.A3C(model, opt, t_max=t_max, gamma=gamma, beta=beta,
                    phi=phi,
                    act_deterministically=True)

    max_episode_len = None if episodic else 2

    train_agent_async(
        outdir=self.outdir, processes=nproc, make_env=make_env,
        agent=agent, steps=steps,
        max_episode_len=max_episode_len,
        eval_interval=500,
        eval_n_runs=5,
        successful_score=1)

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    agent.load(os.path.join(self.outdir, 'successful'))

    agent.stop_episode()

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()