Example No. 1 — ACER asynchronous training test on the ABC environment
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=100000,
                  require_success=True):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        n_hidden_layers = 1
        nonlinearity = F.leaky_relu
        replay_buffer = EpisodicReplayBuffer(10**4)
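        # Build one of four model variants: a shared LSTM torso for the
        # partially observable setting vs. separate feed-forward networks,
        # and a softmax policy (discrete actions) vs. a Gaussian policy.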
        if use_lstm:
            if discrete:
                model = acer.ACERSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        min_prob=1e-1),
                    q=q_function.FCStateQFunctionWithDiscreteAction(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
            else:
                model = acer.ACERSDNSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high,
                        nonlinearity=nonlinearity,
                        min_var=1e-1),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                    adv=q_function.FCSAQFunction(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
        else:
            if discrete:
                model = acer.ACERSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity,
                        min_prob=1e-1),
                    q=q_function.FCStateQFunctionWithDiscreteAction(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
            else:
                model = acer.ACERSDNSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high,
                        nonlinearity=nonlinearity,
                        min_var=1e-1),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                    adv=q_function.FCSAQFunction(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=n_hidden_layers,
                        nonlinearity=nonlinearity),
                )
        eps = 1e-8
        opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=eps, alpha=0.99)
        opt.setup(model)
        gamma = 0.5
        beta = 1e-5
        if self.n_times_replay == 0 and self.disable_online_update:
            # At least one of them must be enabled
            return
        agent = acer.ACER(model,
                          opt,
                          replay_buffer=replay_buffer,
                          t_max=t_max,
                          gamma=gamma,
                          beta=beta,
                          phi=phi,
                          n_times_replay=self.n_times_replay,
                          act_deterministically=True,
                          disable_online_update=self.disable_online_update,
                          replay_start_size=100,
                          use_trust_region=self.use_trust_region)

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(outdir=self.outdir,
                              processes=nproc,
                              make_env=make_env,
                              agent=agent,
                              steps=steps,
                              max_episode_len=max_episode_len,
                              eval_interval=500,
                              eval_n_steps=None,
                              eval_n_episodes=5,
                              successful_score=1)
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
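
The helper above reads several attributes from self (outdir, use_lstm, n_times_replay, disable_online_update, use_trust_region) that are not among its parameters, so it is meant to live on a test class that provides them. Below is a minimal sketch of such wiring, assuming a plain unittest.TestCase; the class name, attribute values, and test method are hypothetical and not taken from the source.

import tempfile
import unittest


class ACERTestCase(unittest.TestCase):
    # Attributes read by _test_abc but not passed as arguments; a
    # parameterized test runner would normally sweep over combinations.
    use_lstm = False
    n_times_replay = 2
    disable_online_update = False
    use_trust_region = True

    def setUp(self):
        # train_agent_async writes the 'successful' snapshot under this directory.
        self.outdir = tempfile.mkdtemp()

    # ... the _test_abc method shown above goes here, on the same class ...

    def test_abc_discrete(self):
        self._test_abc(t_max=2, use_lstm=self.use_lstm, discrete=True)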
Example No. 2 — A3C asynchronous training test (Adam optimizer with gradient clipping)
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=100000,
                  require_success=True):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        if use_lstm:
            if discrete:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
            else:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        mean_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
        else:
            if discrete:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
            else:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        mean_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
        opt = chainer.optimizers.Adam()
        opt.setup(model)
        opt.add_hook(chainer.optimizer.GradientClipping(1))
        gamma = 0.8
        beta = 1e-2
        agent = a3c.A3C(model,
                        opt,
                        t_max=t_max,
                        gamma=gamma,
                        beta=beta,
                        phi=phi,
                        act_deterministically=True)

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(outdir=self.outdir,
                              processes=nproc,
                              make_env=make_env,
                              agent=agent,
                              steps=steps,
                              max_episode_len=max_episode_len,
                              eval_interval=500,
                              eval_n_steps=None,
                              eval_n_episodes=5,
                              successful_score=1)
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
Example No. 3 — NSQ (asynchronous n-step Q-learning) training test
    def _test_abc(self, steps=100000, require_success=True):

        nproc = 8

        def make_env(process_idx, test):
            return ABC(episodic=self.episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space
        ndim_obs = obs_space.low.size
        n_actions = action_space.n

        def random_action_func():
            return np.random.randint(n_actions)

        def make_agent(process_idx):
            n_hidden_channels = 50
            if self.use_lstm:
                q_func = FCLSTMStateQFunction(
                    ndim_obs,
                    n_actions,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2)
            else:
                q_func = FCStateQFunctionWithDiscreteAction(
                    ndim_obs,
                    n_actions,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2)
            opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=1e-2, alpha=0.99)
            opt.setup(q_func)
            if self.explorer == 'epsilon_greedy':
                explorer = chainerrl.explorers.ConstantEpsilonGreedy(
                    process_idx / 10, random_action_func)
            else:
                explorer = chainerrl.explorers.Boltzmann()

            return nsq.NSQ(q_func,
                           opt,
                           t_max=self.t_max,
                           gamma=0.9,
                           i_target=100,
                           explorer=explorer)

        with warnings.catch_warnings(record=True) as warns:
            agent = train_agent_async(
                outdir=self.outdir,
                processes=nproc,
                make_env=make_env,
                make_agent=make_agent,
                steps=steps,
                max_episode_len=5,
                eval_interval=500,
                eval_n_runs=5,
                successful_score=1,
            )
            # There should be no AbnormalExitWarning
            self.assertEqual(
                sum(1 for w in warns
                    if issubclass(w.category, async_.AbnormalExitWarning)),
                0)

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        n_test_runs = 5
        env = make_env(0, True)
        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            print('test run offset:', env._offset)
            done = False
            r = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, r, done, _ = env.step(action)
                total_r += r
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
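
A detail worth noting: this example builds a separate agent in each worker via make_agent, and the ConstantEpsilonGreedy explorer is seeded with process_idx / 10, so with nproc = 8 every worker explores at a different rate. A small illustration of the resulting schedule, computed directly from the expression above:

# Epsilon used by each of the 8 worker processes when
# self.explorer == 'epsilon_greedy'.
for process_idx in range(8):
    print('process', process_idx, 'epsilon', process_idx / 10)
# prints epsilons 0.0, 0.1, 0.2, ..., 0.7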
Example No. 4 — A3C asynchronous training test (RMSpropAsync optimizer)
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=1000000):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        if use_lstm:
            if discrete:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
            else:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
        else:
            if discrete:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
            else:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
        eps = 1e-1 if discrete else 1e-2
        opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
        opt.setup(model)
        gamma = 0.9
        beta = 1e-2
        agent = a3c.A3C(model,
                        opt,
                        t_max=t_max,
                        gamma=gamma,
                        beta=beta,
                        phi=phi,
                        act_deterministically=True)

        max_episode_len = None if episodic else 2

        train_agent_async(outdir=self.outdir,
                          processes=nproc,
                          make_env=make_env,
                          agent=agent,
                          steps=steps,
                          max_episode_len=max_episode_len,
                          eval_interval=500,
                          eval_n_runs=5,
                          successful_score=1)

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()