Example No. 1
    def test_needs_reset(self):
        # MagicMock can mock eval_mode while Mock cannot
        agent = mock.MagicMock()
        env = mock.Mock()
        # First episode: 0 -> 1 -> 2 -> 3 (reset)
        # Second episode: 4 -> 5 -> 6 -> 7 (done)
        env.reset.side_effect = [("state", 0), ("state", 4)]
        env.step.side_effect = [
            (("state", 1), 0, False, {}),
            (("state", 2), 0, False, {}),
            (("state", 3), 0, False, {"needs_reset": True}),
            (("state", 5), -0.5, False, {}),
            (("state", 6), 0, False, {}),
            (("state", 7), 1, True, {}),
        ]
        scores = evaluator.run_evaluation_episodes(
            env, agent, n_steps=None, n_episodes=2
        )
        assert len(scores) == 2
        np.testing.assert_allclose(scores[0], 0)
        np.testing.assert_allclose(scores[1], 0.5)
        assert agent.act.call_count == 6
        assert agent.observe.call_count == 6
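The fake environment above is a plain `unittest.mock.Mock` whose `reset` and `step` replay scripted return values. A minimal, self-contained sketch of that pattern, independent of the evaluator (the `"state"` observations and rewards here are arbitrary placeholders):

from unittest import mock

# Each call to a mocked method returns the next item of its side_effect list,
# so the fake env replays a fixed trajectory of (obs, reward, done, info) tuples.
env = mock.Mock()
env.reset.side_effect = [("state", 0)]
env.step.side_effect = [
    (("state", 1), 0.0, False, {}),
    (("state", 2), 1.0, True, {}),
]

obs = env.reset()                      # ("state", 0)
obs, reward, done, info = env.step(0)  # (("state", 1), 0.0, False, {})
obs, reward, done, info = env.step(0)  # (("state", 2), 1.0, True, {})
assert done and reward == 1.0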
Example No. 2
def test_run_evaluation_episodes_with_n_steps(n_episodes, n_steps):
    # MagicMock can mock eval_mode while Mock cannot
    agent = mock.MagicMock()
    env = mock.Mock()
    # First episode: 0 -> 1 -> 2 -> 3 (reset)
    # Second episode: 4 -> 5 -> 6 -> 7 (done)
    env.reset.side_effect = [("state", 0), ("state", 4)]
    env.step.side_effect = [
        (("state", 1), 0.1, False, {}),
        (("state", 2), 0.2, False, {}),
        (("state", 3), 0.3, False, {"needs_reset": True}),
        (("state", 5), -0.5, False, {}),
        (("state", 6), 0, False, {}),
        (("state", 7), 1, True, {}),
    ]

    if n_episodes:
        with pytest.raises(AssertionError):
            scores, lengths = evaluator.run_evaluation_episodes(
                env, agent, n_steps=n_steps, n_episodes=n_episodes
            )
    else:
        scores, lengths = evaluator.run_evaluation_episodes(
            env, agent, n_steps=n_steps, n_episodes=n_episodes
        )
        assert agent.act.call_count == n_steps
        assert agent.observe.call_count == n_steps
        if n_steps == 2:
            assert len(scores) == 1
            assert len(lengths) == 1
            np.testing.assert_allclose(scores[0], 0.3)
            np.testing.assert_allclose(lengths[0], 2)
        elif n_steps == 5:
            assert len(scores) == 1
            assert len(lengths) == 1
            np.testing.assert_allclose(scores[0], 0.6)
            np.testing.assert_allclose(lengths[0], 3)
        else:
            assert len(scores) == 2
            assert len(lengths) == 2
            np.testing.assert_allclose(scores[0], 0.6)
            np.testing.assert_allclose(scores[1], 0.5)
            np.testing.assert_allclose(lengths[0], 3)
            np.testing.assert_allclose(lengths[1], 3)
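Since this test takes `n_episodes` and `n_steps` as arguments, it is presumably driven by `pytest.mark.parametrize`. One plausible parametrization consistent with the branches above; the exact values are an assumption, not taken from the source:

import pytest

# Hypothetical parametrization: n_steps of 2, 5 and 6 hit the three score/length
# branches, and a non-None n_episodes combined with n_steps triggers the
# AssertionError branch. The real values may differ.
@pytest.mark.parametrize("n_episodes", [None, 1])
@pytest.mark.parametrize("n_steps", [2, 5, 6])
def test_run_evaluation_episodes_with_n_steps(n_episodes, n_steps):
    ...  # body as in Example No. 2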
Example No. 3
    def _test_abc(self,
                  steps=100000,
                  require_success=True,
                  gpu=-1,
                  load_model=False):

        env, _ = self.make_env_and_successful_return(test=False)
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)

        agent = self.make_agent(env, gpu)

        if load_model:
            print("Load agent from", self.agent_dirname)
            agent.load(self.agent_dirname)

        max_episode_len = None if self.episodic else 2

        # Train
        train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=test_env,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=successful_return,
            train_max_episode_len=max_episode_len,
        )

        # Test
        n_test_runs = 5
        eval_returns = run_evaluation_episodes(
            test_env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs

        # Save
        agent.save(self.agent_dirname)
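The success check near the end is plain NumPy arithmetic: compare every evaluation return against the target and count how many pass. A self-contained sketch with made-up returns:

import numpy as np

# Illustrative values only, not taken from any actual run.
eval_returns = [1.0, 1.0, 0.9, 1.0, 1.0]
successful_return = 1

# Elementwise comparison yields a boolean array; summing it counts successes.
n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
assert n_succeeded == 4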
Example No. 4
    def _test_training(self,
                       gpu,
                       steps=5000,
                       load_model=False,
                       require_success=True):

        random_seed.set_random_seed(1)
        logging.basicConfig(level=logging.DEBUG)

        env = self.make_env_and_successful_return(test=False)[0]
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)
        agent = self.make_agent(env, gpu)

        if load_model:
            print("Load agent from", self.agent_dirname)
            agent.load(self.agent_dirname)
            agent.replay_buffer.load(self.rbuf_filename)

        # Train
        train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
            eval_env=test_env,
        )

        # Test
        n_test_runs = 5
        eval_returns, _ = run_evaluation_episodes(
            test_env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
        )
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        if require_success:
            assert n_succeeded == n_test_runs

        # Save
        agent.save(self.agent_dirname)
        agent.replay_buffer.save(self.rbuf_filename)
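Example No. 4 seeds the RNGs before training so failures are reproducible. A minimal sketch of what a helper like `random_seed.set_random_seed` typically covers; the actual pfrl implementation may seed more sources (e.g. CUDA) or differently:

import logging
import random

import numpy as np
import torch

# Sketch only: seed the three RNG sources this kind of test touches.
def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_random_seed(1)
logging.basicConfig(level=logging.DEBUG)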
Example No. 5
    def _test_abc(
        self,
        t_max,
        recurrent,
        discrete=True,
        episodic=True,
        steps=100000,
        require_success=True,
    ):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(
                size=size,
                discrete=discrete,
                episodic=episodic or test,
                partially_observable=self.recurrent,
                deterministic=test,
            )

        env = make_env(0, False)

        model = self.make_model(env)

        from pfrl.optimizers import SharedRMSpropEpsInsideSqrt

        opt = SharedRMSpropEpsInsideSqrt(model.parameters())
        gamma = 0.8
        beta = 1e-2
        agent = a3c.A3C(
            model,
            opt,
            t_max=t_max,
            gamma=gamma,
            beta=beta,
            act_deterministically=True,
            max_grad_norm=1.0,
            recurrent=recurrent,
        )

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(
                outdir=self.outdir,
                processes=nproc,
                make_env=make_env,
                agent=agent,
                steps=steps,
                max_episode_len=max_episode_len,
                eval_interval=500,
                eval_n_steps=None,
                eval_n_episodes=5,
                successful_score=1,
            )
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, "successful"))

        # Test
        env = make_env(0, True)
        n_test_runs = 5
        eval_returns = run_evaluation_episodes(
            env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        successful_return = 1
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs
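The `warnings.catch_warnings(record=True)` block used here (and in Example No. 7) collects every warning raised during training instead of printing it, so the test can assert that none were emitted. A standalone sketch of the idiom, with a stub in place of `train_agent_async`:

import warnings

with warnings.catch_warnings(record=True) as warns:
    warnings.simplefilter("always")  # make sure no filter swallows a warning

    def training_stub():
        pass  # stands in for the training call; raises no warnings

    training_stub()
    # Same assertion as in the tests above; the message is only evaluated
    # (and warns[0] only accessed) if a warning was actually recorded.
    assert len(warns) == 0, warns[0]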
Example No. 6
    def _test_abc(
        self, use_lstm, discrete=True, steps=1000000, require_success=True, gpu=-1
    ):
        def make_env(process_idx, test):
            size = 2
            return ABC(
                size=size,
                discrete=discrete,
                episodic=True,
                partially_observable=self.use_lstm,
                deterministic=test,
            )

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        hidden_size = 20
        obs_size = obs_space.low.size
        if discrete:
            output_size = action_space.n
            head = SoftmaxCategoricalHead()
        else:
            output_size = action_space.low.size
            head = GaussianHeadWithStateIndependentCovariance(
                output_size, var_type="diagonal"
            )
        if use_lstm:
            model = pfrl.nn.RecurrentSequential(
                nn.LSTM(
                    num_layers=1,
                    input_size=obs_size,
                    hidden_size=hidden_size,
                ),
                nn.Linear(hidden_size, hidden_size),
                nn.LeakyReLU(),
                nn.Linear(hidden_size, output_size),
                head,
            )
        else:
            model = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                nn.Linear(hidden_size, output_size),
                head,
            )
        opt = torch.optim.Adam(model.parameters())
        beta = 1e-2
        agent = pfrl.agents.REINFORCE(
            model,
            opt,
            gpu=gpu,
            beta=beta,
            batchsize=self.batchsize,
            backward_separately=self.backward_separately,
            act_deterministically=True,
            recurrent=use_lstm,
        )

        pfrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(0, False),
            eval_env=make_env(0, True),
            outdir=self.outdir,
            steps=steps,
            train_max_episode_len=2,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )

        # Test
        env = make_env(0, True)
        n_test_runs = 5
        eval_returns, _ = run_evaluation_episodes(
            env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
        )
        if require_success:
            successful_return = 1
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs
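A quick shape note on the `nn.LSTM(num_layers=1, input_size=..., hidden_size=...)` layer used in the recurrent branch: with the default `batch_first=False` it consumes `(seq_len, batch, input_size)` tensors and returns per-step outputs of size `hidden_size` plus the `(h, c)` state. A standalone check with arbitrary example sizes:

import torch
import torch.nn as nn

obs_size, hidden_size = 5, 20  # example values, not the test's fixtures
lstm = nn.LSTM(num_layers=1, input_size=obs_size, hidden_size=hidden_size)

x = torch.zeros(3, 4, obs_size)            # seq_len=3, batch=4
out, (h, c) = lstm(x)
assert out.shape == (3, 4, hidden_size)    # per-step hidden states
assert h.shape == (1, 4, hidden_size)      # (num_layers, batch, hidden_size)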
Example No. 7
    def _test_abc(
        self,
        t_max,
        use_lstm,
        discrete=True,
        episodic=True,
        steps=100000,
        require_success=True,
    ):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(
                size=size,
                discrete=discrete,
                episodic=episodic or test,
                partially_observable=self.use_lstm,
                deterministic=test,
            )

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        replay_buffer = EpisodicReplayBuffer(10**4)
        obs_size = obs_space.low.size
        hidden_size = 20
        if discrete:
            n_actions = action_space.n
            head = acer.ACERDiscreteActionHead(
                pi=nn.Sequential(
                    nn.Linear(hidden_size, n_actions),
                    SoftmaxCategoricalHead(),
                ),
                q=nn.Sequential(
                    nn.Linear(hidden_size, n_actions),
                    DiscreteActionValueHead(),
                ),
            )
        else:
            action_size = action_space.low.size
            head = acer.ACERContinuousActionHead(
                pi=nn.Sequential(
                    nn.Linear(hidden_size, action_size * 2),
                    GaussianHeadWithDiagonalCovariance(),
                ),
                v=nn.Sequential(nn.Linear(hidden_size, 1)),
                adv=nn.Sequential(
                    ConcatObsAndAction(),
                    nn.Linear(hidden_size + action_size, 1),
                ),
            )
        if use_lstm:
            model = pfrl.nn.RecurrentSequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                nn.LSTM(num_layers=1,
                        input_size=hidden_size,
                        hidden_size=hidden_size),
                head,
            )
        else:
            model = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                head,
            )
        eps = 1e-8
        opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(model.parameters(),
                                                         lr=1e-3,
                                                         eps=eps,
                                                         alpha=0.99)
        gamma = 0.5
        beta = 1e-5
        if self.n_times_replay == 0 and self.disable_online_update:
            # At least one of them must be enabled
            pytest.skip()
        agent = acer.ACER(
            model,
            opt,
            replay_buffer=replay_buffer,
            t_max=t_max,
            gamma=gamma,
            beta=beta,
            n_times_replay=self.n_times_replay,
            act_deterministically=True,
            disable_online_update=self.disable_online_update,
            replay_start_size=100,
            use_trust_region=self.use_trust_region,
            recurrent=use_lstm,
        )

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(
                outdir=self.outdir,
                processes=nproc,
                make_env=make_env,
                agent=agent,
                steps=steps,
                max_episode_len=max_episode_len,
                eval_interval=500,
                eval_n_steps=None,
                eval_n_episodes=5,
                successful_score=1,
            )
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, "successful"))

        # Test
        env = make_env(0, True)
        n_test_runs = 5
        eval_returns = run_evaluation_episodes(
            env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        successful_return = 1
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs