def main(): num_env = 1 env_id = "CartPole-v1" env_type = "classic_control" seed = None env = make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, flatten_dict_observations=True, gamestate=None) act = acktr.learn(env, network='mlp', total_timesteps=0, load_path="cartpole_model.pkl") while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)
def main(): num_env = 1 env_id = "Pendulum-v0" env_type = "classic_control" seed = 1 env = make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, flatten_dict_observations=True, gamestate=None) act = acktr.learn(env=env, network='mlp', total_timesteps=50000, seed=seed) print("Saving model to pendulum_model.pkl") act.save("pendulum_model.pkl")
# Log the chosen hyperparameters. The opening of this call was cut off in the
# original fragment; print() is assumed here.
print('value_network = ' + alg_kwargs['value_network'] + '\n'
      + 'lrschedule = ' + alg_kwargs['lrschedule'] + '\n'
      + 'log_interval = ' + str(alg_kwargs['log_interval']) + '\n'
      + 'save_interval = ' + str(alg_kwargs['save_interval']) + '\n'
      + 'env_name = ' + alg_kwargs['env_name'] + '\n'
      + 'transfer_path = ' + str(alg_kwargs['transfer_path']))

env = DummyVecEnv([make_env])
transfer_path = alg_kwargs['transfer_path']

# Remove unused parameters for training
alg_kwargs.pop('env_name')
alg_kwargs.pop('trained_path')
alg_kwargs.pop('transfer_path')

network = mlp(num_layers=alg_kwargs['num_layers'],
              num_hidden=alg_kwargs['num_hidden'],
              layer_norm=alg_kwargs['layer_norm'])

if transfer_path is not None:
    # Transfer learning: initialize the policy from a previously trained model
    _ = acktr.learn(env=env, network=network, load_path=transfer_path, **alg_kwargs)
else:
    _ = acktr.learn(env=env, network=network, **alg_kwargs)

# gym-gazebo2 teardown: close the simulated MARA environment, then force-kill
# the process so lingering ROS 2 threads cannot block the exit.
env.dummy().gg2().close()
os.kill(os.getpid(), 9)
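# Both this script and the checkpoint-test variant below hand a make_env
# factory (not defined in these fragments) to DummyVecEnv. A minimal sketch of
# such a factory, using a plain Gym environment as a stand-in for the
# gym-gazebo2 MARA env; the env id is a placeholder, not the real one:
import gym

def make_env():
    # DummyVecEnv expects a list of zero-argument callables, each returning
    # a fresh environment instance.
    return gym.make("Pendulum-v0")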
import argparse
import os
from datetime import datetime

import tensorflow as tf
from procgen import ProcgenEnv

from baselines import a2c, acktr, logger, ppo2
from baselines.common.models import build_impala_cnn, nature_cnn
from baselines.common.vec_env import VecExtractDictObs, VecMonitor, VecNormalize

# get_cfg_defaults is the project's yacs configuration module; adjust the
# import path to the repo layout.
from config import get_cfg_defaults


def main():
    parser = argparse.ArgumentParser(description="Process training arguments.")
    parser.add_argument('--config', type=str,
                        default="configurations/ppo_baseline_cuda.yaml",
                        help="config file name (located in config dir)")
    args = parser.parse_args()

    # create configuration
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)
    print(cfg.TRAIN.TOTAL_TIMESTEPS)

    # create experiment directory
    exp_dir = f"runs/{cfg.EXPERIMENT_NAME}/{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    os.makedirs(exp_dir, exist_ok=True)

    # create logger
    format_strs = ['csv', 'stdout']
    logger.configure(dir=exp_dir, format_strs=format_strs,
                     log_suffix=datetime.now().strftime('%Y-%m-%d-%H-%M'))

    # create (vectorized) procgen environments for training and testing
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=cfg.TRAIN.NUM_ENVS, env_name="fruitbot",
                      num_levels=cfg.TRAIN.NUM_LEVELS,
                      start_level=cfg.TRAIN.LEVEL_SEED,
                      distribution_mode="easy")
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    test_venv = ProcgenEnv(num_envs=cfg.TEST.NUM_ENVS, env_name="fruitbot",
                           num_levels=cfg.TEST.NUM_LEVELS,
                           start_level=cfg.TEST.LEVEL_SEED,
                           distribution_mode="easy")
    test_venv = VecExtractDictObs(test_venv, "rgb")
    test_venv = VecMonitor(venv=test_venv, filename=None, keep_buf=100)
    test_venv = VecNormalize(venv=test_venv, ob=False)

    # create tensorflow session
    logger.info("creating tf session")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # create cnn (todo: make this less ugly)
    conv_fn = None
    logger.info("building cnn")
    if cfg.TRAIN.NETWORK == "NATURE_CNN":
        conv_fn = lambda x: nature_cnn(x)
    elif cfg.TRAIN.NETWORK == "IMPALA_CNN":
        conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

    # training
    logger.info("training")
    if cfg.TRAIN.POLICY == "A2C":
        a2c.learn(env=venv, network=conv_fn,
                  total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                  nsteps=cfg.TRAIN.BATCH_SIZE, log_interval=1,
                  eval_env=test_venv, augment=cfg.TRAIN.AUGMENT)
    elif cfg.TRAIN.POLICY == "ACKTR":
        acktr.learn(env=venv, network=conv_fn,
                    total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                    nsteps=cfg.TRAIN.BATCH_SIZE, log_interval=1,
                    eval_env=test_venv, augment=cfg.TRAIN.AUGMENT, seed=None)
    elif cfg.TRAIN.POLICY == "PPO":
        ppo2.learn(env=venv, eval_env=test_venv, network=conv_fn,
                   total_timesteps=cfg.TRAIN.TOTAL_TIMESTEPS,
                   save_interval=5, nsteps=cfg.TRAIN.BATCH_SIZE,
                   nminibatches=cfg.TRAIN.MINIBATCHES,
                   lam=cfg.TRAIN.LAM, gamma=cfg.TRAIN.GAMMA,
                   noptepochs=cfg.TRAIN.NUM_EPOCHS, log_interval=1,
                   clip_vf=cfg.TRAIN.USE_VF_CLIPPING,
                   lr=cfg.TRAIN.LR, cliprange=cfg.TRAIN.CLIP_RANGE,
                   update_fn=None, init_fn=None, vf_coef=0.5,
                   max_grad_norm=0.5, augment=cfg.TRAIN.AUGMENT,
                   load_path=cfg.TRAIN.PRETRAINED)
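# The script above relies on a project-level get_cfg_defaults() whose yacs
# nodes match the keys it reads. A minimal sketch of such a config module;
# every default value here is an assumption, not taken from the original repo:
from yacs.config import CfgNode as CN

_C = CN()
_C.EXPERIMENT_NAME = "fruitbot_baseline"

_C.TRAIN = CN()
_C.TRAIN.POLICY = "PPO"            # "A2C", "ACKTR" or "PPO"
_C.TRAIN.NETWORK = "IMPALA_CNN"    # or "NATURE_CNN"
_C.TRAIN.TOTAL_TIMESTEPS = 5_000_000
_C.TRAIN.NUM_ENVS = 64
_C.TRAIN.NUM_LEVELS = 50
_C.TRAIN.LEVEL_SEED = 0
_C.TRAIN.BATCH_SIZE = 256
_C.TRAIN.MINIBATCHES = 8
_C.TRAIN.NUM_EPOCHS = 3
_C.TRAIN.LAM = 0.95
_C.TRAIN.GAMMA = 0.999
_C.TRAIN.LR = 5e-4
_C.TRAIN.CLIP_RANGE = 0.2
_C.TRAIN.USE_VF_CLIPPING = True
_C.TRAIN.AUGMENT = False
_C.TRAIN.PRETRAINED = None

_C.TEST = CN()
_C.TEST.NUM_ENVS = 64
_C.TEST.NUM_LEVELS = 0
_C.TEST.LEVEL_SEED = 500


def get_cfg_defaults():
    # Return a clone so callers can merge_from_file() without mutating defaults.
    return _C.clone()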
# Fragment from a MARA/gym-gazebo2 ACKTR checkpoint test: make_env, alg_kwargs
# and the tf.ConfigProto `config` are defined earlier in the full script.

# Generate tensorboard file
format_strs = os.getenv('MARA_LOG_FORMAT', 'stdout,log,csv,tensorboard').split(',')
logger.configure(os.path.abspath('/tmp/acktr'), format_strs)

env = DummyVecEnv([make_env])

# Remove unused parameters for training
alg_kwargs.pop('env_name')
alg_kwargs.pop('trained_path')
alg_kwargs.pop('transfer_path')

network = mlp(num_layers=alg_kwargs['num_layers'],
              num_hidden=alg_kwargs['num_hidden'],
              layer_norm=alg_kwargs['layer_norm'])

with tf.Session(config=config) as train_sess:
    _ = acktr.learn(env=env, network=network, **alg_kwargs)
# Drop the training graph before rebuilding the policy for evaluation.
tf.reset_default_graph()

# The first checkpoint written during training must exist on disk.
savedir = "/tmp/acktr/checkpoints/00001"
if not os.path.isfile(savedir):
    raise AssertionError("Trained NN is missing")

# Close the simulated MARA environment, then create a fresh one for the run.
env.dummy().gg2().close()
env = DummyVecEnv([make_env])
policy = build_policy(env, network, **alg_kwargs)

with tf.Session(config=config) as run_sess:
    make_model = lambda: acktr.Model(policy, env.observation_space,
                                     env.action_space, env.num_envs,
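# Hypothetical continuation: the acktr.Model constructor above is truncated in
# the original, so its remaining arguments are unknown. Assuming the call gets
# completed and this code sits inside the run_sess context, the checkpoint
# could then be restored with baselines' generic tf_util helper and the
# policy stepped once:
from baselines.common import tf_util

model = make_model()  # assumes the truncated constructor call is completed
tf_util.load_variables(savedir, sess=run_sess)

obs = env.reset()
actions, _, _, _ = model.step(obs)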