Example #1
def run(config, seed, device):
    set_global_seeds(seed)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)
    
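    # Build the vectorized training env; optionally standardize observations and rewards.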
    env = make_env(config, seed)
    env = VecMonitor(env)
    if config['env.standardize_obs']:
        env = VecStandardizeObservation(env, clip=5.)
    if config['env.standardize_reward']:
        env = VecStandardizeReward(env, clip=10., gamma=config['agent.gamma'])
    
    agent = Agent(config, env, device)
    runner = EpisodeRunner(reset_on_call=False)
    engine = Engine(config, agent=agent, env=env, runner=runner)
    train_logs = []
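    # Run training iterations until the configured timestep budget is reached.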
    for i in count():
        if agent.total_timestep >= config['train.timestep']:
            break
        train_logger = engine.train(i)
        train_logs.append(train_logger.logs)
        if i == 0 or (i+1) % config['log.freq'] == 0:
            train_logger.dump(keys=None, index=0, indent=0, border='-'*50)
        if i == 0 or (i+1) % config['checkpoint.freq'] == 0:
            agent.checkpoint(logdir, i + 1)
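    # Save a final checkpoint once the budget is exhausted.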
    agent.checkpoint(logdir, i + 1)
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None
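A minimal invocation sketch for run() above; only the key names come from the function body, every value here is hypothetical, and make_env plus the lagom helpers are assumed to be importable:

config = {
    'log.dir': 'logs',
    'ID': 'example-run',
    'env.standardize_obs': True,
    'env.standardize_reward': True,
    'agent.gamma': 0.99,
    'train.timestep': 1_000_000,
    'log.freq': 10,
    'checkpoint.freq': 100,
}
run(config, seed=0, device='cpu')  # device could also be a torch.device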
Example #2
def test_episode_runner(env_id, num_env, init_seed, T):    
    if env_id == 'Sanity':
        make_env = lambda: TimeLimit(SanityEnv())
    else:
        make_env = lambda: gym.make(env_id)
    env = make_vec_env(make_env, num_env, init_seed)
    env = VecStepInfo(env)
    agent = RandomAgent(None, env, None)
    runner = EpisodeRunner()
    
    if num_env > 1:
        with pytest.raises(AssertionError):
            D = runner(agent, env, T)
    else:
        with pytest.raises(AssertionError):
            runner(agent, env.env, T)  # must be VecStepInfo
        D = runner(agent, env, T)
        for traj in D:
            assert isinstance(traj, Trajectory)
            assert len(traj) <= env.spec.max_episode_steps
            assert traj.numpy_observations.shape == (len(traj) + 1, *env.observation_space.shape)
            if isinstance(env.action_space, gym.spaces.Discrete):
                assert traj.numpy_actions.shape == (len(traj),)
            else:
                assert traj.numpy_actions.shape == (len(traj), *env.action_space.shape)
            assert traj.numpy_rewards.shape == (len(traj),)
            assert traj.numpy_dones.shape == (len(traj),)
            assert traj.numpy_masks.shape == (len(traj),)
            assert len(traj.step_infos) == len(traj)
            if traj.completed:
                assert np.allclose(traj.observations[-1], traj.step_infos[-1]['last_observation'])
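This test receives its arguments from pytest parametrization. A minimal sketch of decorators that would drive it (the concrete values are assumptions, not from the source):

@pytest.mark.parametrize('env_id', ['Sanity', 'CartPole-v1'])
@pytest.mark.parametrize('num_env', [1, 3])
@pytest.mark.parametrize('init_seed', [0, 10])
@pytest.mark.parametrize('T', [1, 50, 100])
def test_episode_runner(env_id, num_env, init_seed, T):
    ...  # body as above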
Example #3
    def eval(self, n):
        self.agent.eval()

        start_time = time()

        if self.config['env.standardize']:
            # Reuse the training env's running statistics as fixed constants,
            # and leave rewards untouched (no training during evaluation).
            eval_env = VecStandardize(
                venv=self.eval_env,
                use_obs=True,
                use_reward=False,  # do not process rewards, no training
                clip_obs=self.runner.env.clip_obs,
                clip_reward=self.runner.env.clip_reward,
                gamma=self.runner.env.gamma,
                eps=self.runner.env.eps,
                constant_obs_mean=self.runner.env.obs_runningavg.mu,
                constant_obs_std=self.runner.env.obs_runningavg.sigma)
        else:
            eval_env = self.eval_env  # otherwise eval_env would be undefined below
        eval_runner = EpisodeRunner(self.config, self.agent, eval_env)
        T = eval_env.T
        D = eval_runner(T)

        eval_output = {}
        eval_output['D'] = D
        eval_output['n'] = n
        eval_output['T'] = T
        eval_output['num_sec'] = time() - start_time

        return eval_output
Example #4
    def eval(self, n):
        self.agent.eval()
        eval_runner = EpisodeRunner(self.config, self.agent, self.eval_env)
        T = self.eval_env.T
        D = eval_runner(T)

        eval_output = {}
        eval_output['D'] = D
        eval_output['n'] = n
        eval_output['T'] = T

        return eval_output
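The dict returned here is consumed by a separate logging step (Example #7 calls engine.log_eval(eval_output)). A minimal sketch of such a consumer, assuming D exposes numpy_rewards as in Example #6; the log keys are hypothetical:

    def log_eval(self, eval_output):
        # Per-episode return is the row-sum of rewards, the same
        # reduction Example #6 uses for its fitness value.
        returns = eval_output['D'].numpy_rewards.sum(-1)
        return {'eval/n': eval_output['n'],
                'eval/mean_return': float(returns.mean())}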
Example #5
def test_episode_runner(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30

    agent = StickyAgent(None, env_spec, sticky_action)
    runner = EpisodeRunner(None, agent, env)
    D = runner(T)

    assert D.N == 3
    assert D.maxT == max(D.Ts)

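    # Re-create each sub-environment with the same per-env seeds so the
    # recorded rollout can be replayed and verified independently.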
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)

    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)

            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]

            if done:
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
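StickyAgent is not defined in this snippet. The test only needs an agent that emits the same action at every step; a minimal sketch, assuming the runner calls choose_action(obs) on a batch of per-env observations and reads the action from a 'raw_action' entry (both interface details are assumptions about this lagom version):

class StickyAgent(object):
    """Always chooses the same fixed action, one copy per sub-environment."""
    def __init__(self, config, env_spec, sticky_action):
        self.config = config
        self.env_spec = env_spec
        self.sticky_action = sticky_action

    def choose_action(self, obs):
        return {'raw_action': [self.sticky_action] * len(obs)}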
Example #6
    def f(self, config, solution):
        if self.agent is None:
            self._prepare(config)

        solution = torch.from_numpy(np.asarray(solution)).float().to(self.device)
        assert solution.numel() == self.agent.num_params

        # Load solution params to agent
        self.agent.from_vec(solution)

        runner = EpisodeRunner(config, self.agent, self.env)
        with torch.no_grad():
            D = runner(self.env_spec.T)
        mean_return = D.numpy_rewards.sum(-1).mean()

        # ES does minimization, so use negative returns
        function_value = -mean_return

        return function_value
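f() returns the negative mean return, so any black-box minimizer can drive it. As an illustration only (problem, config, and num_params are assumed names; the surrounding class construction is not shown here), a loop with the third-party cma package might look like:

import cma

es = cma.CMAEvolutionStrategy(x0=[0.0] * num_params, sigma0=0.5)
while not es.stop():
    solutions = es.ask()  # sample candidate parameter vectors
    values = [problem.f(config, s) for s in solutions]  # negative mean returns
    es.tell(solutions, values)  # CMA-ES minimizes, matching the sign flip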
Example #7
    def __call__(self, config, seed, device):
        set_global_seeds(seed)
        logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

        if config['env.time_aware_obs']:
            kwargs = {'extra_wrapper': [TimeAwareObservation]}
        else:
            kwargs = {}
        env = make_vec_env(SerialVecEnv,
                           make_gym_env,
                           config['env.id'],
                           config['train.N'],
                           seed,
                           monitor=True,
                           **kwargs)
        if config['eval.independent']:
            eval_env = make_vec_env(SerialVecEnv, make_gym_env,
                                    config['env.id'], config['eval.N'], seed)
        if config['env.clip_action']:
            env = VecClipAction(env)
            if config['eval.independent']:
                eval_env = VecClipAction(eval_env)
        if config['env.standardize']:  # running averages of observation and reward
            env = VecStandardize(venv=env,
                                 use_obs=True,
                                 use_reward=True,
                                 clip_obs=10.,
                                 clip_reward=10.,
                                 gamma=0.99,
                                 eps=1e-8)
        env_spec = EnvSpec(env)

        agent = Agent(config, env_spec, device)

        runner = EpisodeRunner(config, agent, env)

        if config['eval.independent']:
            engine = Engine(agent, runner, config, eval_env=eval_env)
        else:
            engine = Engine(agent, runner, config)

        train_logs = []
        eval_logs = []
        for i in count():
            if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
                break
            elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
                break

            train_output = engine.train(i)

            if i == 0 or (i + 1) % config['log.interval'] == 0:
                train_log = engine.log_train(train_output)
                train_logs.append(train_log)

                if config['eval.independent']:
                    with torch.no_grad():  # disable grad, save memory
                        eval_output = engine.eval(n=i)
                    eval_log = engine.log_eval(eval_output)
                    eval_logs.append(eval_log)

        pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
        pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

        return None
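The (config, seed, device) signature lets the same experiment be repeated across random seeds. A minimal manual sweep, with the class and config names hypothetical (torch is already imported in this example's scope):

algo = Algorithm()  # the class that defines __call__ above; name is illustrative
for seed in [0, 1, 2]:
    algo(config, seed, device=torch.device('cpu'))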