Example #1
File: train.py Project: xlnwel/d2rl
def main(env_config, model_config, agent_config, buffer_config, train=train):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])

    create_model, Agent = pkg.import_agent(config=agent_config)
    Buffer = pkg.import_module('buffer', config=agent_config).Buffer

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, force_envvec=True)
    eval_env_config = env_config.copy()
    if 'num_levels' in eval_env_config:
        eval_env_config['num_levels'] = 0
    if 'seed' in eval_env_config:
        eval_env_config['seed'] += 1000
    eval_env_config['n_workers'] = 1
    for k in list(eval_env_config.keys()):
        # pop reward hacks
        if 'reward' in k:
            eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    def sigint_handler(sig, frame):
        signal.signal(sig, signal.SIG_IGN)
        env.close()
        eval_env.close()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    models = create_model(model_config, env)

    buffer_config['n_envs'] = env.n_envs
    buffer_config['state_keys'] = models.state_keys
    buffer = Buffer(buffer_config)

    agent = Agent(config=agent_config, models=models, dataset=buffer, env=env)

    agent.save_config(
        dict(env=env_config,
             model=model_config,
             agent=agent_config,
             buffer=buffer_config))

    train(agent, env, eval_env, buffer)

    if use_ray:
        env.close()
        eval_env.close()
        ray.shutdown()
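
Each `main` above takes plain config dictionaries. The snippet below is a minimal, hypothetical driver for Example #1; the YAML file name and the top-level key layout are assumptions made for illustration, not the project's actual format.

# Hypothetical driver for the main() in Example #1.
# 'config.yaml' and the env/model/agent/buffer key layout are assumed for illustration.
import yaml

with open('config.yaml') as f:
    config = yaml.safe_load(f)

main(env_config=config['env'],      # e.g. {'name': ..., 'n_workers': 1, 'n_envs': 1, 'seed': 0}
     model_config=config['model'],
     agent_config=config['agent'],  # must include 'precision' for configure_precision()
     buffer_config=config['buffer'])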
Example #2
File: train.py Project: xlnwel/d2rl
def main(env_config, model_config, agent_config, replay_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config)
    eval_env_config = env_config.copy()
    eval_env_config['n_workers'] = 1
    eval_env_config['n_envs'] = 1
    # pop reward hacks
    reward_keys = [k for k in eval_env_config.keys() if 'reward' in k]
    for k in reward_keys:
        eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    agent_config['N_UPDATES'] *= env_config['n_workers'] * env_config['n_envs']
    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    n_workers = env_config.get('n_workers', 1)
    n_envs = env_config.get('n_envs', 1)
    replay_config['n_envs'] = n_workers * n_envs
    replay_config['seqlen'] = env.max_episode_steps
    if getattr(models, 'state_keys', ()):
        replay_config['state_keys'] = list(models.state_keys)
    replay = create_replay(replay_config)
    replay.load_data()

    am = pkg.import_module('agent', config=agent_config)
    data_format = am.get_data_format(env=env,
                                     replay_config=replay_config,
                                     agent_config=agent_config,
                                     model=models)
    dataset = create_dataset(replay, env, data_format=data_format)

    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(
        dict(env=env_config,
             model=model_config,
             agent=agent_config,
             replay=replay_config))

    train(agent, env, eval_env, replay)

    if use_ray:
        ray.shutdown()
Example #3
def main(env_config,
         model_config,
         agent_config,
         replay_config,
         n,
         record=False,
         size=(128, 128),
         video_len=1000,
         fps=30,
         save=False):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    try:
        make_env = pkg.import_module('env', algo_name, place=-1).make_env
    except Exception:
        # fall back to the default environment builder
        make_env = None
    env_config.pop('reward_clip', False)
    env = create_env(env_config, env_fn=make_env)
    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(env,
                                      agent,
                                      n,
                                      record=record,
                                      size=size,
                                      video_len=video_len)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()
Example #4
File: train.py Project: xlnwel/d2rl
def main(env_config, model_config, agent_config, replay_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, make_env, force_envvec=True)
    eval_env_config = env_config.copy()
    eval_env_config['n_envs'] = 1
    eval_env_config['n_workers'] = 1
    eval_env = create_env(eval_env_config, make_env)

    replay_config['dir'] = agent_config['root_dir'].replace('logs', 'data')
    replay = create_replay(replay_config)
    replay.load_data()
    dtype = global_policy().compute_dtype
    data_format = pkg.import_module(
        'agent', config=agent_config).get_data_format(
            env=env,
            batch_size=agent_config['batch_size'],
            sample_size=agent_config['sample_size'],
            dtype=dtype)
    process = functools.partial(process_with_env,
                                env=env,
                                obs_range=[-.5, .5],
                                one_hot_action=True,
                                dtype=dtype)
    dataset = Dataset(replay, data_format, process)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(
        dict(env=env_config,
             model=model_config,
             agent=agent_config,
             replay=replay_config))

    train(agent, env, eval_env, replay)
Example #5
File: train.py Project: xlnwel/d2rl
def main(env_config, model_config, agent_config, buffer_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])

    create_model, Agent = pkg.import_agent(config=agent_config)
    Buffer = pkg.import_module('buffer', config=agent_config).Buffer

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, force_envvec=True)
    eval_env_config = env_config.copy()
    eval_env_config['seed'] += 1000
    eval_env_config['n_workers'] = 1
    eval_env_config['n_envs'] = 1
    for k in list(eval_env_config.keys()):
        # pop reward hacks
        if 'reward' in k:
            eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    models = create_model(model_config, env)

    buffer_config['n_envs'] = env.n_envs
    buffer = Buffer(buffer_config)

    agent = Agent(config=agent_config, models=models, dataset=buffer, env=env)

    agent.save_config(
        dict(env=env_config,
             model=model_config,
             agent=agent_config,
             buffer=buffer_config))

    train(agent, env, eval_env, buffer)

    if use_ray:
        ray.shutdown()
Example #6
def main(env_config, model_config, agent_config, replay_config):
    gpus = tf.config.list_physical_devices('GPU')
    ray.init(num_cpus=os.cpu_count(), num_gpus=len(gpus))

    sigint_shutdown_ray()

    default_agent_config.update(agent_config)
    agent_config = default_agent_config

    replay = create_replay_center(replay_config)

    model_fn, Agent = pkg.import_agent(config=agent_config)
    am = pkg.import_module('actor', config=agent_config)
    fm = pkg.import_module('func', config=agent_config)

    monitor = fm.create_monitor(config=agent_config)

    Worker = am.get_worker_class(Agent)
    workers = []
    for wid in range(agent_config['n_workers']):
        worker = fm.create_worker(Worker=Worker,
                                  worker_id=wid,
                                  model_fn=model_fn,
                                  config=agent_config,
                                  model_config=model_config,
                                  env_config=env_config,
                                  buffer_config=replay_config)
        worker.prefill_replay.remote(replay)
        workers.append(worker)

    Evaluator = am.get_evaluator_class(Agent)
    evaluator = fm.create_evaluator(Evaluator=Evaluator,
                                    model_fn=model_fn,
                                    config=agent_config,
                                    model_config=model_config,
                                    env_config=env_config)

    Learner = am.get_learner_class(Agent)
    learner = fm.create_learner(Learner=Learner,
                                model_fn=model_fn,
                                replay=replay,
                                config=agent_config,
                                model_config=model_config,
                                env_config=env_config,
                                replay_config=replay_config)

    learner.start_learning.remote()
    for w in workers:
        w.run.remote(learner, replay, monitor)
    evaluator.run.remote(learner, monitor)

    elapsed_time = 0
    interval = 10
    while not ray.get(monitor.is_over.remote()):
        time.sleep(interval)
        elapsed_time += interval
        if elapsed_time % agent_config['LOG_PERIOD'] == 0:
            monitor.record_train_stats.remote(learner)

    ray.get(learner.save.remote())

    ray.shutdown()
Example #7
File: train.py Project: xlnwel/d2rl
def main(env_config, model_config, agent_config, replay_config):
    ray.init(num_cpus=os.cpu_count(), num_gpus=1)

    sigint_shutdown_ray()

    default_agent_config.update(agent_config)
    agent_config = default_agent_config

    replay = create_replay_center(replay_config)

    model_fn, Agent = pkg.import_agent(config=agent_config)
    am = pkg.import_module('actor', config=agent_config)
    fm = pkg.import_module('func', config=agent_config)

    # create the monitor
    monitor = fm.create_monitor(config=agent_config)

    # create workers
    Worker = am.get_worker_class()
    workers = []
    for wid in range(agent_config['n_workers']):
        worker = fm.create_worker(Worker=Worker,
                                  worker_id=wid,
                                  config=agent_config,
                                  env_config=env_config,
                                  buffer_config=replay_config)
        worker.set_handler.remote(replay=replay)
        worker.set_handler.remote(monitor=monitor)
        workers.append(worker)

    # create the learner
    Learner = am.get_learner_class(Agent)
    learner = fm.create_learner(Learner=Learner,
                                model_fn=model_fn,
                                replay=replay,
                                config=agent_config,
                                model_config=model_config,
                                env_config=env_config,
                                replay_config=replay_config)
    learner.start_learning.remote()

    # create the evaluator
    Evaluator = am.get_evaluator_class(Agent)
    evaluator = fm.create_evaluator(Evaluator=Evaluator,
                                    model_fn=model_fn,
                                    config=agent_config,
                                    model_config=model_config,
                                    env_config=env_config)
    evaluator.run.remote(learner, monitor)

    Actor = am.get_actor_class(Agent)
    actors = []
    na = agent_config['n_actors']
    nw = agent_config['n_workers']
    assert nw % na == 0, f"n_workers({nw}) is not divisible by n_actors({na})"
    wpa = nw // na
    for aid in range(agent_config['n_actors']):
        actor = fm.create_actor(Actor=Actor,
                                actor_id=aid,
                                model_fn=model_fn,
                                config=agent_config,
                                model_config=model_config,
                                env_config=env_config)
        actor.start.remote(workers[aid * wpa:(aid + 1) * wpa], learner,
                           monitor)
        actors.append(actor)

    elapsed_time = 0
    interval = 10
    # put the main thread to sleep;
    # the monitor records training stats once in a while
    while not ray.get(monitor.is_over.remote()):
        time.sleep(interval)
        elapsed_time += interval
        if elapsed_time % agent_config['LOG_PERIOD'] == 0:
            monitor.record_train_stats.remote(learner)

    ray.get(learner.save.remote())

    ray.shutdown()
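
Example #7 splits the worker pool evenly across the actors: `wpa = nw // na` workers per actor, and actor `aid` receives the slice `workers[aid * wpa:(aid + 1) * wpa]`. A small standalone sketch of that partitioning, with made-up counts, follows.

# Standalone sketch of the worker-to-actor partitioning used in Example #7.
# The counts are made up; only the slicing arithmetic mirrors the example.
n_workers = 8
n_actors = 2
assert n_workers % n_actors == 0, 'n_workers must be divisible by n_actors'
wpa = n_workers // n_actors  # workers per actor

workers = [f'worker_{i}' for i in range(n_workers)]
for aid in range(n_actors):
    assigned = workers[aid * wpa:(aid + 1) * wpa]
    print(aid, assigned)
# 0 ['worker_0', 'worker_1', 'worker_2', 'worker_3']
# 1 ['worker_4', 'worker_5', 'worker_6', 'worker_7']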
Example #8
def main(env_config,
         model_config,
         agent_config,
         replay_config,
         n,
         record=False,
         size=(128, 128),
         video_len=1000,
         fps=30,
         save=False):
    logging.basicConfig(level=logging.DEBUG)
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    if record:
        env_config['log_episode'] = True
        env_config['n_workers'] = env_config['n_envs'] = 1

    env = create_env(env_config)

    create_model, Agent = pkg.import_agent(config=agent_config)

    models = create_model(model_config, env)

    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if save:
        n_workers = env_config.get('n_workers', 1)
        n_envs = env_config.get('n_envs', 1)
        replay_config['n_envs'] = n_workers * n_envs
        replay_config['replay_type'] = 'uniform'
        replay_config['dir'] = f'data/{agent.name.lower()}-{env.name.lower()}'
        replay_config['n_steps'] = 1
        replay_config['save'] = True
        replay_config['save_temp'] = True
        replay_config['capacity'] = int(1e6)
        replay_config['has_next_obs'] = True
        replay = create_replay(replay_config)

        def collect(obs, action, reward, discount, next_obs, logpi, **kwargs):
            replay.add(obs=obs,
                       action=action,
                       reward=reward,
                       discount=discount,
                       next_obs=next_obs,
                       logpi=logpi)
    else:

        def collect(**kwargs):
            pass

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(env,
                                      agent,
                                      n,
                                      record=record,
                                      size=size,
                                      video_len=video_len,
                                      step_fn=collect)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if save:
        replay.save()

    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()
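
Example #8 passes a `collect` callback to `evaluate` as `step_fn`, so transitions are only stored when `save=True`. Below is a minimal, project-independent sketch of the same callback pattern, with a plain list standing in for the project's replay buffer.

# Minimal sketch of the step_fn/collect pattern from Example #8.
# A plain list stands in for the project's replay buffer; the calling loop is hypothetical.
transitions = []

def collect(obs, action, reward, discount, next_obs, logpi, **kwargs):
    transitions.append(dict(obs=obs, action=action, reward=reward,
                            discount=discount, next_obs=next_obs, logpi=logpi))

# An evaluation loop would invoke the callback once per environment step, e.g.:
# collect(obs=o, action=a, reward=r, discount=1.0, next_obs=o2, logpi=lp)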