Example #1
 def eval(self, n=None, **kwargs):
     start_time = perf_counter()
     returns = []
     horizons = []
     for _ in range(self.config['eval.num_episode']):
         observation = self.eval_env.reset()
         for _ in range(self.eval_env.spec.max_episode_steps):
             with torch.no_grad():
                 action = self.agent.choose_action(observation, mode='eval')['action']
             next_observation, reward, done, info = self.eval_env.step(action)
             if done[0]:  # [0] single environment
                 returns.append(info[0]['episode']['return'])
                 horizons.append(info[0]['episode']['horizon'])
                 break
             observation = next_observation
     logger = Logger()
     logger('num_seconds', round(perf_counter() - start_time, 1))
     logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
     logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
     logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
     
     monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
     logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
     return logger.logs
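For the running_return and running_horizon entries above to work, self.eval_env must carry a VecMonitor somewhere in its wrapper stack; get_wrapper then retrieves it by class name regardless of wrapping order. A minimal construction sketch, assuming gym, make_vec_env, and VecMonitor are imported as in Example #3 (the environment id and worker count are placeholders):

def make_eval_env():
    # Build a single-worker vectorized env and attach VecMonitor so that
    # eval() can later fetch it via get_wrapper(env, 'VecMonitor').
    venv = make_vec_env(lambda: gym.make('CartPole-v1'), 1, 0)
    return VecMonitor(venv)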
Example #2
 def checkpoint(self, logdir, num_iter):
     self.save(logdir / f'agent_{num_iter}.pth')
     obs_env = get_wrapper(self.env, 'VecStandardizeObservation')
     if obs_env is not None:
         pickle_dump(obj=(obs_env.mean, obs_env.var),
                     f=logdir / f'obs_moments_{num_iter}',
                     ext='.pth')
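The saved moments can later be loaded back into a freshly built environment so that evaluation uses the same observation standardization. A minimal restore sketch, assuming get_wrapper is importable as above and that mean and var are plain writable attributes of VecStandardizeObservation; note that pickle_dump was called with ext='.pth', so the file name carries that suffix:

import pickle

def restore_obs_moments(env, logdir, num_iter):
    # Hypothetical counterpart to checkpoint(); not part of lagom itself.
    obs_env = get_wrapper(env, 'VecStandardizeObservation')
    if obs_env is not None:
        with open(logdir / f'obs_moments_{num_iter}.pth', 'rb') as f:
            obs_env.mean, obs_env.var = pickle.load(f)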
Example #3
File: test_envs.py  Project: jlqzzz/lagom
def test_get_wrapper(env_id):
    def make_env():
        return gym.make(env_id)

    env = make_env()
    env = ClipReward(env, 0.1, 0.5)
    env = FlattenObservation(env)
    env = FrameStack(env, 4)

    assert get_wrapper(env, 'ClipReward').__class__.__name__ == 'ClipReward'
    assert get_wrapper(
        env, 'FlattenObservation').__class__.__name__ == 'FlattenObservation'
    assert get_wrapper(env, 'Env') is None

    del env

    # vec_env
    env = make_vec_env(make_env, 3, 0)
    env = VecMonitor(env)
    assert get_wrapper(env, 'VecMonitor').__class__.__name__ == 'VecMonitor'
    assert get_wrapper(env, 'ClipReward') is None
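A rough mental model of the lookup being tested: walk the chain of wrappers, compare each wrapper's class name against the requested one, and return None once the chain is exhausted. The sketch below is illustrative only and is not lagom's actual implementation; it assumes gym-style wrappers expose the inner env as .env and vectorized wrappers expose it as .venv:

def find_wrapper(env, name):
    # Hypothetical stand-in for get_wrapper, for illustration only.
    while env is not None:
        if env.__class__.__name__ == name:
            return env
        env = getattr(env, 'env', None) or getattr(env, 'venv', None)
    return None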
Example #4
    def train(self, n=None, **kwargs):
        self.agent.train()
        start_time = perf_counter()

        D = self.runner(self.agent, self.env,
                        self.config['train.timestep_per_iter'])
        out_agent = self.agent.learn(D)

        logger = Logger()
        logger('train_iteration', n + 1)
        logger('num_seconds', round(perf_counter() - start_time, 1))
        [logger(key, value) for key, value in out_agent.items()]
        logger('num_trajectories', len(D))
        logger('num_timesteps', sum([len(traj) for traj in D]))
        logger('accumulated_trained_timesteps', self.agent.total_timestep)
        G = [traj.numpy_rewards.sum() for traj in D]
        logger('return', describe(G, axis=-1, repr_indent=1, repr_prefix='\n'))

        infos = [
            info for info in chain.from_iterable([traj.infos for traj in D])
            if 'episode' in info
        ]
        online_returns = [info['episode']['return'] for info in infos]
        online_horizons = [info['episode']['horizon'] for info in infos]
        logger(
            'online_return',
            describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger(
            'online_horizon',
            describe(online_horizons, axis=-1, repr_indent=1,
                     repr_prefix='\n'))

        monitor_env = get_wrapper(self.env, 'VecMonitor')
        logger(
            'running_return',
            describe(monitor_env.return_queue,
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        logger(
            'running_horizon',
            describe(monitor_env.horizon_queue,
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        return logger
Example #5
def evaluator(config, logdir, seed, make_env, learner_agent):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    eval_logs = []
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, torch.device('cpu'))
    runner = EpisodeRunner(reset_on_call=True)
    evaluated_steps = config['eval.freq']
    while learner_agent.total_timestep < config['train.timestep']:
        if learner_agent.total_timestep < evaluated_steps:
            time.sleep(1.0)
        else:
            t0 = time.perf_counter()
            agent.load_state_dict(
                learner_agent.state_dict())  # copy to CPU by default
            with torch.no_grad():
                D = []
                for _ in range(config['eval.num_episode']):
                    D += runner(agent, env, env.spec.max_episode_steps)
            logger = Logger()
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('num_trajectories', len(D))
            logger('num_timesteps', sum([len(traj) for traj in D]))
            logger('accumulated_trained_timesteps',
                   learner_agent.total_timestep)

            infos = [
                info
                for info in chain.from_iterable([traj.infos for traj in D])
                if 'episode' in info
            ]
            online_returns = [info['episode']['return'] for info in infos]
            online_horizons = [info['episode']['horizon'] for info in infos]
            logger(
                'online_return',
                describe(online_returns,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'online_horizon',
                describe(online_horizons,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))

            monitor_env = get_wrapper(env, 'VecMonitor')
            logger(
                'running_return',
                describe(monitor_env.return_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'running_horizon',
                describe(monitor_env.horizon_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger.dump(keys=None,
                        index=0,
                        indent=0,
                        border=color_str('+' * 50, color='green'))
            eval_logs.append(logger.logs)

            evaluated_steps += config['eval.freq']
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')
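Since pickle_dump is called with ext='.pkl', the evaluation logs end up in logdir / 'eval_logs.pkl' and can be read back with the standard library for offline inspection, assuming logger.logs behaves as a mapping keyed by the names passed to logger(...):

import pickle

with open(logdir / 'eval_logs.pkl', 'rb') as f:
    eval_logs = pickle.load(f)
for log in eval_logs:
    # Each entry is one evaluation dump, e.g. trained timesteps and online returns.
    print(log['accumulated_trained_timesteps'], log['online_return'])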