def test_needs_reset(self):
    # MagicMock can mock eval_mode while Mock cannot
    agent = mock.MagicMock()
    env = mock.Mock()
    # First episode: 0 -> 1 -> 2 -> 3 (reset)
    # Second episode: 4 -> 5 -> 6 -> 7 (done)
    env.reset.side_effect = [("state", 0), ("state", 4)]
    env.step.side_effect = [
        (("state", 1), 0, False, {}),
        (("state", 2), 0, False, {}),
        (("state", 3), 0, False, {"needs_reset": True}),
        (("state", 5), -0.5, False, {}),
        (("state", 6), 0, False, {}),
        (("state", 7), 1, True, {}),
    ]
    # run_evaluation_episodes returns (scores, lengths); unpack both so the
    # assertions below check the scores, not the returned tuple itself.
    scores, lengths = evaluator.run_evaluation_episodes(
        env, agent, n_steps=None, n_episodes=2
    )
    assert len(scores) == 2
    np.testing.assert_allclose(scores[0], 0)
    np.testing.assert_allclose(scores[1], 0.5)
    assert agent.act.call_count == 6
    assert agent.observe.call_count == 6
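
# A minimal sketch (hypothetical, not pfrl's actual implementation) of the
# "needs_reset" behavior exercised above: an evaluation episode ends either
# when the env reports done or when the info dict contains needs_reset=True,
# in which case the loop resets the env instead of treating the transition
# as terminal. Names here are illustrative only.
def _sketch_eval_episode(env, agent):
    obs = env.reset()
    episode_return = 0.0
    done, info = False, {}
    while not done and not info.get("needs_reset", False):
        obs, reward, done, info = env.step(agent.act(obs))
        agent.observe(obs, reward, done, reset=info.get("needs_reset", False))
        episode_return += reward
    return episode_return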
# Parametrization restored; values are inferred from the branches below.
@pytest.mark.parametrize("n_episodes", [None, 1])
@pytest.mark.parametrize("n_steps", [2, 5, 6])
def test_run_evaluation_episodes_with_n_steps(n_episodes, n_steps):
    # MagicMock can mock eval_mode while Mock cannot
    agent = mock.MagicMock()
    env = mock.Mock()
    # First episode: 0 -> 1 -> 2 -> 3 (reset)
    # Second episode: 4 -> 5 -> 6 -> 7 (done)
    env.reset.side_effect = [("state", 0), ("state", 4)]
    env.step.side_effect = [
        (("state", 1), 0.1, False, {}),
        (("state", 2), 0.2, False, {}),
        (("state", 3), 0.3, False, {"needs_reset": True}),
        (("state", 5), -0.5, False, {}),
        (("state", 6), 0, False, {}),
        (("state", 7), 1, True, {}),
    ]
    if n_episodes:
        # Specifying both n_steps and n_episodes is invalid.
        with pytest.raises(AssertionError):
            evaluator.run_evaluation_episodes(
                env, agent, n_steps=n_steps, n_episodes=n_episodes
            )
    else:
        scores, lengths = evaluator.run_evaluation_episodes(
            env, agent, n_steps=n_steps, n_episodes=n_episodes
        )
        assert agent.act.call_count == n_steps
        assert agent.observe.call_count == n_steps
        if n_steps == 2:
            assert len(scores) == 1
            assert len(lengths) == 1
            np.testing.assert_allclose(scores[0], 0.3)
            np.testing.assert_allclose(lengths[0], 2)
        elif n_steps == 5:
            assert len(scores) == 1
            assert len(lengths) == 1
            np.testing.assert_allclose(scores[0], 0.6)
            np.testing.assert_allclose(lengths[0], 3)
        else:
            assert len(scores) == 2
            assert len(lengths) == 2
            np.testing.assert_allclose(scores[0], 0.6)
            np.testing.assert_allclose(scores[1], 0.5)
            np.testing.assert_allclose(lengths[0], 3)
            np.testing.assert_allclose(lengths[1], 3)
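
# A minimal sketch, assuming pfrl's convention, of the argument check behind
# the pytest.raises branch above: run_evaluation_episodes expects exactly one
# of n_steps / n_episodes to be given, so passing both raises AssertionError
# before any environment step is taken.
def _sketch_check_eval_args(n_steps, n_episodes):
    assert (n_steps is None) != (n_episodes is None), (
        "Exactly one of n_steps or n_episodes must be specified"
    )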
def _test_abc(self, steps=100000, require_success=True, gpu=-1, load_model=False):
    env, _ = self.make_env_and_successful_return(test=False)
    test_env, successful_return = self.make_env_and_successful_return(test=True)
    agent = self.make_agent(env, gpu)
    if load_model:
        print("Load agent from", self.agent_dirname)
        agent.load(self.agent_dirname)

    max_episode_len = None if self.episodic else 2

    # Train
    train_agent_with_evaluation(
        agent=agent,
        env=env,
        eval_env=test_env,
        steps=steps,
        outdir=self.tmpdir,
        eval_interval=200,
        eval_n_steps=None,
        eval_n_episodes=5,
        successful_score=successful_return,
        train_max_episode_len=max_episode_len,
    )

    # Test
    n_test_runs = 5
    # run_evaluation_episodes returns (scores, lengths); only scores are used.
    eval_returns, _ = run_evaluation_episodes(
        test_env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs

    # Save
    agent.save(self.agent_dirname)
def _test_training(self, gpu, steps=5000, load_model=False, require_success=True):
    random_seed.set_random_seed(1)
    logging.basicConfig(level=logging.DEBUG)

    env = self.make_env_and_successful_return(test=False)[0]
    test_env, successful_return = self.make_env_and_successful_return(test=True)
    agent = self.make_agent(env, gpu)
    if load_model:
        print("Load agent from", self.agent_dirname)
        agent.load(self.agent_dirname)
        agent.replay_buffer.load(self.rbuf_filename)

    # Train
    train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=steps,
        outdir=self.tmpdir,
        eval_interval=200,
        eval_n_steps=None,
        eval_n_episodes=5,
        successful_score=1,
        eval_env=test_env,
    )

    # Test
    n_test_runs = 5
    eval_returns, _ = run_evaluation_episodes(
        test_env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
    )
    n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
    if require_success:
        assert n_succeeded == n_test_runs

    # Save
    agent.save(self.agent_dirname)
    agent.replay_buffer.save(self.rbuf_filename)
def _test_abc(
    self,
    t_max,
    recurrent,
    discrete=True,
    episodic=True,
    steps=100000,
    require_success=True,
):
    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(
            size=size,
            discrete=discrete,
            episodic=episodic or test,
            partially_observable=self.recurrent,
            deterministic=test,
        )

    env = make_env(0, False)
    model = self.make_model(env)
    from pfrl.optimizers import SharedRMSpropEpsInsideSqrt

    opt = SharedRMSpropEpsInsideSqrt(model.parameters())
    gamma = 0.8
    beta = 1e-2
    agent = a3c.A3C(
        model,
        opt,
        t_max=t_max,
        gamma=gamma,
        beta=beta,
        act_deterministically=True,
        max_grad_norm=1.0,
        recurrent=recurrent,
    )

    max_episode_len = None if episodic else 2

    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir,
            processes=nproc,
            make_env=make_env,
            agent=agent,
            steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus the successful model is loaded explicitly here.
    if require_success:
        agent.load(os.path.join(self.outdir, "successful"))

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    eval_returns, _ = run_evaluation_episodes(
        env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    successful_return = 1
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs
def _test_abc(
    self, use_lstm, discrete=True, steps=1000000, require_success=True, gpu=-1
):
    def make_env(process_idx, test):
        size = 2
        return ABC(
            size=size,
            discrete=discrete,
            episodic=True,
            partially_observable=self.use_lstm,
            deterministic=test,
        )

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    hidden_size = 20
    obs_size = obs_space.low.size
    if discrete:
        output_size = action_space.n
        head = SoftmaxCategoricalHead()
    else:
        output_size = action_space.low.size
        head = GaussianHeadWithStateIndependentCovariance(
            output_size, var_type="diagonal"
        )
    if use_lstm:
        model = pfrl.nn.RecurrentSequential(
            nn.LSTM(
                num_layers=1,
                input_size=obs_size,
                hidden_size=hidden_size,
            ),
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, output_size),
            head,
        )
    else:
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, output_size),
            head,
        )
    opt = torch.optim.Adam(model.parameters())
    beta = 1e-2
    agent = pfrl.agents.REINFORCE(
        model,
        opt,
        gpu=gpu,
        beta=beta,
        batchsize=self.batchsize,
        backward_separately=self.backward_separately,
        act_deterministically=True,
        recurrent=use_lstm,
    )

    pfrl.experiments.train_agent_with_evaluation(
        agent=agent,
        env=make_env(0, False),
        eval_env=make_env(0, True),
        outdir=self.outdir,
        steps=steps,
        train_max_episode_len=2,
        eval_interval=500,
        eval_n_steps=None,
        eval_n_episodes=5,
        successful_score=1,
    )

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    eval_returns, _ = run_evaluation_episodes(
        env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
    )
    if require_success:
        successful_return = 1
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs
def _test_abc(
    self,
    t_max,
    use_lstm,
    discrete=True,
    episodic=True,
    steps=100000,
    require_success=True,
):
    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(
            size=size,
            discrete=discrete,
            episodic=episodic or test,
            partially_observable=self.use_lstm,
            deterministic=test,
        )

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    replay_buffer = EpisodicReplayBuffer(10**4)
    obs_size = obs_space.low.size
    hidden_size = 20
    if discrete:
        n_actions = action_space.n
        head = acer.ACERDiscreteActionHead(
            pi=nn.Sequential(
                nn.Linear(hidden_size, n_actions),
                SoftmaxCategoricalHead(),
            ),
            q=nn.Sequential(
                nn.Linear(hidden_size, n_actions),
                DiscreteActionValueHead(),
            ),
        )
    else:
        action_size = action_space.low.size
        head = acer.ACERContinuousActionHead(
            pi=nn.Sequential(
                nn.Linear(hidden_size, action_size * 2),
                GaussianHeadWithDiagonalCovariance(),
            ),
            v=nn.Sequential(
                nn.Linear(hidden_size, 1),
            ),
            adv=nn.Sequential(
                ConcatObsAndAction(),
                nn.Linear(hidden_size + action_size, 1),
            ),
        )
    if use_lstm:
        model = pfrl.nn.RecurrentSequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            nn.LSTM(num_layers=1, input_size=hidden_size, hidden_size=hidden_size),
            head,
        )
    else:
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            head,
        )
    eps = 1e-8
    opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(
        model.parameters(), lr=1e-3, eps=eps, alpha=0.99
    )
    gamma = 0.5
    beta = 1e-5
    if self.n_times_replay == 0 and self.disable_online_update:
        # At least one of them must be enabled
        pytest.skip()
    agent = acer.ACER(
        model,
        opt,
        replay_buffer=replay_buffer,
        t_max=t_max,
        gamma=gamma,
        beta=beta,
        n_times_replay=self.n_times_replay,
        act_deterministically=True,
        disable_online_update=self.disable_online_update,
        replay_start_size=100,
        use_trust_region=self.use_trust_region,
        recurrent=use_lstm,
    )

    max_episode_len = None if episodic else 2

    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir,
            processes=nproc,
            make_env=make_env,
            agent=agent,
            steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus the successful model is loaded explicitly here.
    if require_success:
        agent.load(os.path.join(self.outdir, "successful"))

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    eval_returns, _ = run_evaluation_episodes(
        env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    successful_return = 1
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs