Example #1
def run(config, seed, device, logdir):
    set_global_seeds(seed)

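    # bounded queue shared between the learner and actor processes (presumably for passing rollouts from actors to the learner)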
    queue = mp.Queue(maxsize=100)
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, device)
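    # place the agent's parameters in shared memory so updates are visible to every process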
    agent.share_memory()
    runner = EpisodeRunner(reset_on_call=False)
    engine = Engine(config, agent=agent, env=env, runner=runner)

    learner_process = mp.Process(target=learner,
                                 args=(config, logdir, agent, engine, queue))
    actor_processes = [
        mp.Process(target=actor,
                   args=(config, seed, make_env, agent, runner, queue))
        for _ in range(config['agent.num_actors'])
    ]
    evaluator_process = mp.Process(target=evaluator,
                                   args=(config, logdir, seed, make_env,
                                         agent))

    learner_process.start()
    print('Learner started!')
    for p in actor_processes:
        p.start()
    print('Actors started!')
    evaluator_process.start()
    print('Evaluator started!')
    evaluator_process.join()
    for p in actor_processes:
        p.join()
    learner_process.join()
    return None
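
A hypothetical entry point for the run() function above (not from the source). The only config key this snippet reads directly is 'agent.num_actors'; a real config would also carry whatever the learner, actor, and evaluator processes expect.

import torch
from pathlib import Path

if __name__ == '__main__':
    config = {'agent.num_actors': 4}  # placeholder; real runs need the full config
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    run(config, seed=0, device=device, logdir=Path('logs/run_0'))
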
Example #2
def run(config, seed, device, logdir):
    set_global_seeds(seed)

    env = make_env(config, seed, 'train')
    eval_env = make_env(config, seed, 'eval')
    random_agent = RandomAgent(config, env, device)
    if config['agent.use_td3']:
        agent = TD3Agent(config, env, device)
    else:
        agent = DDPGAgent(config, env, device)
    runner = EpisodeRunner()
    replay = ReplayBuffer(env, config['replay.capacity'], device)
    engine = Engine(config,
                    agent=agent,
                    random_agent=random_agent,
                    env=env,
                    eval_env=eval_env,
                    runner=runner,
                    replay=replay,
                    logdir=logdir)

    train_logs, eval_logs = engine.train()
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')
    return None
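
A minimal sketch for reading the dumped logs back, assuming pickle_dump writes standard pickle files named '<f><ext>' (e.g. train_logs.pkl) under logdir:

import pickle
from pathlib import Path

def load_logs(logdir):
    logdir = Path(logdir)
    with open(logdir / 'train_logs.pkl', 'rb') as f:
        train_logs = pickle.load(f)
    with open(logdir / 'eval_logs.pkl', 'rb') as f:
        eval_logs = pickle.load(f)
    return train_logs, eval_logs
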
Example #3
def fitness(data):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    config, seed, device, param = data
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, device)
    agent.from_vec(tensorify(param, 'cpu'))
    runner = EpisodeRunner()
    with torch.no_grad():
        D = runner(agent, env, 10)
    R = np.mean([sum(traj.rewards) for traj in D])
    H = np.mean([traj.T for traj in D])
    return R, H
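
A usage sketch (an assumption, not from the source): mapping the fitness() function above over a population of flat parameter vectors with a worker pool, which is the situation where torch.set_num_threads(1) inside the worker matters.

import torch
from multiprocessing import Pool

def evaluate_population(config, seed, solutions, num_workers=8):
    device = torch.device('cpu')
    data = [(config, seed, device, param) for param in solutions]
    with Pool(processes=num_workers) as pool:
        results = pool.map(fitness, data)  # list of (mean_return, mean_horizon)
    # ES optimizers typically minimize, so the negated mean return is a common cost
    return [-R for R, H in results]
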
Example #4
def test_episode_runner(env_id, N):
    env = gym.make(env_id)
    env = TimeStepEnv(env)
    agent = RandomAgent(None, env, None)
    runner = EpisodeRunner()
    D = runner(agent, env, N)
    assert len(D) == N
    assert all([isinstance(d, Trajectory) for d in D])
    assert all([traj.finished for traj in D])
    assert all([traj[0].first() for traj in D])
    assert all([traj[-1].last() for traj in D])
    for traj in D:
        for timestep in traj[1:-1]:
            assert timestep.mid()
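
The test takes env_id and N as arguments, which suggests it is driven by pytest parametrization; a plausible setup (the decorators and values below are assumptions) would sit directly on the definition above:

import pytest

@pytest.mark.parametrize('N', [1, 3, 10])
@pytest.mark.parametrize('env_id', ['CartPole-v1', 'Pendulum-v1'])
def test_episode_runner(env_id, N):
    ...  # body exactly as in the example above
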
Example #5
def evaluator(config, logdir, seed, make_env, learner_agent):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    eval_logs = []
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, torch.device('cpu'))
    runner = EpisodeRunner(reset_on_call=True)
    evaluated_steps = config['eval.freq']
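    # poll the learner and run an evaluation every `eval.freq` trained timesteps until the training budget is reached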
    while learner_agent.total_timestep < config['train.timestep']:
        if learner_agent.total_timestep < evaluated_steps:
            time.sleep(1.0)
        else:
            t0 = time.perf_counter()
            agent.load_state_dict(
                learner_agent.state_dict())  # copy to CPU by default
            with torch.no_grad():
                D = []
                for _ in range(config['eval.num_episode']):
                    D += runner(agent, env, env.spec.max_episode_steps)
            logger = Logger()
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('num_trajectories', len(D))
            logger('num_timesteps', sum([len(traj) for traj in D]))
            logger('accumulated_trained_timesteps',
                   learner_agent.total_timestep)

            infos = [
                info
                for info in chain.from_iterable([traj.infos for traj in D])
                if 'episode' in info
            ]
            online_returns = [info['episode']['return'] for info in infos]
            online_horizons = [info['episode']['horizon'] for info in infos]
            logger(
                'online_return',
                describe(online_returns,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'online_horizon',
                describe(online_horizons,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))

            monitor_env = get_wrapper(env, 'VecMonitor')
            logger(
                'running_return',
                describe(monitor_env.return_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'running_horizon',
                describe(monitor_env.horizon_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger.dump(keys=None,
                        index=0,
                        indent=0,
                        border=color_str('+' * 50, color='green'))
            eval_logs.append(logger.logs)

            evaluated_steps += config['eval.freq']
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')