from baselines.acktr import acktr
from baselines.common.cmd_util import make_vec_env


def main():
    num_env = 1
    env_id = "CartPole-v1"
    env_type = "classic_control"
    seed = None

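    # make_vec_env with num_env=1 wraps the single CartPole env in a DummyVecEnv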
    env = make_vec_env(env_id,
                       env_type,
                       num_env,
                       seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

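    # total_timesteps=0 skips training entirely; the weights are simply restored from load_path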
    act = acktr.learn(env=env,
                      network='mlp',
                      total_timesteps=0,
                      load_path="cartpole_model.pkl")

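    # roll out the restored policy indefinitely, rendering every step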
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            # acktr.learn returns a Model; step() yields (actions, values, states, neglogp)
            actions = act.step(obs)[0]
            obs, rew, done, _ = env.step(actions)
            # the vectorized env returns length-1 arrays; unwrap them for the scalar bookkeeping
            episode_rew += rew[0]
            done = done[0]
        print("Episode reward", episode_rew)
Example #2
from baselines.acktr import acktr
from baselines.common.cmd_util import make_vec_env


def main():
    num_env = 1
    env_id = "Pendulum-v0"
    env_type = "classic_control"
    seed = 1

    env = make_vec_env(env_id,
                       env_type,
                       num_env,
                       seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

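    # train ACKTR for 50k timesteps; learn() returns the trained Model, which is then saved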
    act = acktr.learn(env=env, network='mlp', total_timesteps=50000, seed=seed)
    print("Saving model to pendulum_model.pkl")
    act.save("pendulum_model.pkl")
Example #3
print('value_network = ' + alg_kwargs['value_network'] + '\n' +
      'lrschedule = ' + alg_kwargs['lrschedule'] + '\n' +
      'log_interval = ' + str(alg_kwargs['log_interval']) + '\n' +
      'save_interval = ' + str(alg_kwargs['save_interval']) + '\n' +
      'env_name = ' + alg_kwargs['env_name'] + '\n' +
      'transfer_path = ' + str(alg_kwargs['transfer_path']))

env = DummyVecEnv([make_env])
transfer_path = alg_kwargs['transfer_path']

# Remove unused parameters for training
alg_kwargs.pop('env_name')
alg_kwargs.pop('trained_path')
alg_kwargs.pop('transfer_path')

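# mlp() returns a network-builder function; the remaining alg_kwargs are passed straight to acktr.learn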
network = mlp(num_layers=alg_kwargs['num_layers'],
              num_hidden=alg_kwargs['num_hidden'],
              layer_norm=alg_kwargs['layer_norm'])

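# warm-start from a previously trained checkpoint if one was supplied, otherwise train from scratch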
if transfer_path is not None:
    # Do transfer learning
    _ = acktr.learn(env=env,
                    network=network,
                    load_path=transfer_path,
                    **alg_kwargs)
else:
    _ = acktr.learn(env=env, network=network, **alg_kwargs)

env.dummy().gg2().close()
os.kill(os.getpid(), 9)
Example #4
def main():
    parser = argparse.ArgumentParser(description="Process training arguments.")
    parser.add_argument('--config',
                        type=str,
                        default="configurations/ppo_baseline_cuda.yaml",
                        help="config file name (located in config dir)")
    args = parser.parse_args()

    # create configuration
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)

    print(cfg.TRAIN.TOTAL_TIMESTEPS)

    # create experiment directory
    exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    os.makedirs(exp_dir, exist_ok=True)

    # create logger
    format_strs = ['csv', 'stdout']
    logger.configure(dir=exp_dir,
                     format_strs=format_strs,
                     log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M'))

    # create (vectorized) procgen environment
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=cfg.TRAIN.NUM_ENVS,
                      env_name="fruitbot",
                      num_levels=cfg.TRAIN.NUM_LEVELS,
                      start_level=cfg.TRAIN.LEVEL_SEED,
                      distribution_mode="easy")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
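    # ob=False: normalize returns/rewards only, leave the raw RGB observations untouched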
    venv = VecNormalize(venv=venv, ob=False)

    test_venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS,
                           env_name="fruitbot",
                           num_levels=cfg.TEST.NUM_LEVELS,
                           start_level=cfg.TEST.LEVEL_SEED,
                           distribution_mode="easy")
    test_venv = VecExtractDictObs(test_venv, "rgb")
    test_venv = VecMonitor(
        venv=test_venv,
        filename=None,
        keep_buf=100,
    )
    test_venv = VecNormalize(venv=test_venv, ob=False)

    # create tensorflow session
    logger.info("creating tf session")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()
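    # entering the session keeps it as the default for all graph construction and training below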

    # create cnn todo: make this less ugly
    conv_fn = None
    logger.info("building cnn")
    if cfg.TRAIN.NETWORK == "NATURE_CNN":
        conv_fn = lambda x: nature_cnn(x)
    elif cfg.TRAIN.NETWORK == "IMPALA_CNN":
        conv_fn = lambda x: build_impala_cnn(
            x, depths=[16, 32, 32], emb_size=256)
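    # conv_fn is handed to the learners as `network`; baselines calls it on the observation batch to build the encoder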

    # training
    logger.info("training")
    if cfg.TRAIN.POLICY == "A2C":
        a2c.learn(env=venv,
                  network=conv_fn,
                  total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                  nsteps=cfg.TRAIN.BATCH_SIZE,
                  log_interval=1,
                  eval_env=test_venv,
                  augment=cfg.TRAIN.AUGMENT)
    elif cfg.TRAIN.POLICY == "ACKTR":
        acktr.learn(env=venv,
                    network=conv_fn,
                    total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                    nsteps=cfg.TRAIN.BATCH_SIZE,
                    log_interval=1,
                    eval_env=test_venv,
                    augment=cfg.TRAIN.AUGMENT,
                    seed=None)
    elif cfg.TRAIN.POLICY == "PPO":
        ppo2.learn(env=venv,
                   eval_env=test_venv,
                   network=conv_fn,
                   total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                   save_interval=5,
                   nsteps=cfg.TRAIN.BATCH_SIZE,
                   nminibatches=cfg.TRAIN.MINIBATCHES,
                   lam=cfg.TRAIN.LAM,
                   gamma=cfg.TRAIN.GAMMA,
                   noptepochs=cfg.TRAIN.NUM_EPOCHS,
                   log_interval=1,
                   clip_vf=cfg.TRAIN.USE_VF_CLIPPING,
                   lr=cfg.TRAIN.LR,
                   cliprange=cfg.TRAIN.CLIP_RANGE,
                   update_fn=None,
                   init_fn=None,
                   vf_coef=0.5,
                   max_grad_norm=0.5,
                   augment=cfg.TRAIN.AUGMENT,
                   load_path=cfg.TRAIN.PRETRAINED)
Example #5
# Generate tensorboard file
format_strs = os.getenv('MARA_LOG_FORMAT', 'stdout,log,csv,tensorboard').split(',')
logger.configure(os.path.abspath('/tmp/acktr'), format_strs)

env = DummyVecEnv([make_env])

# Remove unused parameters for training
alg_kwargs.pop('env_name')
alg_kwargs.pop('trained_path')
alg_kwargs.pop('transfer_path')

network = mlp(num_layers=alg_kwargs['num_layers'], num_hidden=alg_kwargs['num_hidden'], layer_norm=alg_kwargs['layer_norm'])

with tf.Session(config=config) as train_sess:
    _ = acktr.learn(env=env, network=network, **alg_kwargs)

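# start from a clean graph before rebuilding the policy for evaluation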
tf.reset_default_graph()

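# acktr writes periodic checkpoints under <logdir>/checkpoints when save_interval is set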
savedir = "/tmp/acktr/checkpoints/00001"
exists = os.path.isfile(savedir)
if not exists:
    raise AssertionError("Trained NN is missing")

env.dummy().gg2().close()

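# rebuild the env and policy so the trained model can be reconstructed in a fresh session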
env = DummyVecEnv([make_env])
policy = build_policy(env, network, **alg_kwargs)

with tf.Session(config=config) as run_sess:
    make_model = lambda : acktr.Model(policy, env.observation_space, env.action_space, env.num_envs,