from functools import partial

import gym
import numpy as np
import torch

import ppo
import util
import pong

# Default device, used as the default argument of test_train_pong_long below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def test_train_1_episode():
    # Smoke test: one CartPole episode with a linear (no hidden layer) policy runs end to end.
    env = gym.make('CartPole-v0')
    n_state_dims = env.observation_space.shape[0]
    n_action_dims = env.action_space.n
    run_one_episode = partial(util.run_one_episode, env)
    policy_network = util.PolicyNetwork(n_state_dims, n_action_dims, hidden_units=[])
    _policy, _scores = ppo.train(run_one_episode, policy_network, n_episodes=1)
def test_train_pong_long(n_episodes=1000, batchnorm=False, device=device, render=False, **kwargs):
    # Long-running Pong training; parameters are exposed so the test can also be driven manually.
    env = gym.make('PongDeterministic-v4')
    run_one_episode = partial(pong.run_one_episode, env, render=render)
    policy_network = pong.PolicyNetwork(batchnorm=batchnorm, device=device)
    _policy, _scores = ppo.train(run_one_episode, policy_network, n_episodes=n_episodes, **kwargs)
def test_train_3k():
    # CartPole should be solved (mean score over the last 100 episodes above 195)
    # within 3000 training episodes.
    torch.manual_seed(0)
    env = gym.make('CartPole-v0')
    n_state_dims = env.observation_space.shape[0]
    n_action_dims = env.action_space.n
    run_one_episode = partial(util.run_one_episode, env)
    policy_network = util.PolicyNetwork(n_state_dims, n_action_dims, hidden_units=[16])
    _policy, scores = ppo.train(run_one_episode, policy_network, n_episodes=3000,
                                alpha=1e-3, gamma=1., entropy_beta=0.01, weight_decay=0)
    assert np.mean(scores[-100:]) > 195.
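# The tests above only rely on an interface: util.run_one_episode(env, policy) rolls out one
# episode and returns per-step data for ppo.train to learn from. The exact return signature
# lives in util.py; the function below is only a guessed minimal version, assuming the policy
# maps an observation tensor to categorical action logits. It is an illustration, not the
# project's actual helper.
def run_one_episode_sketch(env, policy, max_steps=1000):
    """Roll out one episode; return (states, actions, log_probs, rewards)."""
    states, actions, log_probs, rewards = [], [], [], []
    state = env.reset()
    for _ in range(max_steps):
        state_t = torch.as_tensor(state, dtype=torch.float32)
        dist = torch.distributions.Categorical(logits=policy(state_t))
        action = dist.sample()
        next_state, reward, done, _info = env.step(action.item())
        states.append(state_t)
        actions.append(action)
        log_probs.append(dist.log_prob(action))
        rewards.append(reward)
        state = next_state
        if done:
            break
    return states, actions, log_probs, rewards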
def main():
    args = mujoco_arg_parser()
    logger.configure(dir=args.logdir)

    # Build a single-process vectorized, observation-normalized environment.
    nenv = 1
    envs = []
    for i in range(nenv):
        e = gym.make(args.env)
        e.seed(args.seed + i)  # for repeatability
        e = Monitor(e, logger.get_dir(), allow_early_resets=True)
        envs.append(e)
    envs = DummyVecEnv(envs)
    envs = VecNormalize(envs)
    set_global_seeds(args.seed)  # for repeatability

    agent = MlpAgent(envs.observation_space.shape[0], envs.action_space.shape[0])
    if args.checkpoint:
        agent.load_state_dict(torch.load(args.checkpoint))

    agent = train(agent, envs,
                  N_steps=2048, N_updates=args.updates, batch_size=128,
                  lam=0.95, gamma=0.99, N_train_sample_epochs=10,
                  log_interval=1, ent_coef=0.0, lr=3e-4,
                  cliprange=0.2, save_interval=100)

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((envs.num_envs, ) + envs.observation_space.shape)
        obs[:] = envs.reset()
        while True:
            actions = agent.step(obs)[0]
            obs[:] = envs.step(actions)[0]
            envs.render()
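# lam and gamma in the train(...) call above are the standard GAE(lambda) parameters. Below is
# a minimal sketch of how advantages are usually computed from a rollout, assuming `rewards`,
# `dones` and `values` (with one bootstrap value appended) are float arrays collected over
# N_steps. This mirrors the common PPO recipe; it is not this repo's exact implementation.
import numpy as np

def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    """values has len(rewards) + 1 entries; the last one bootstraps the final state."""
    advantages = np.zeros(len(rewards), dtype=np.float64)
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        last_gae = delta + gamma * lam * nonterminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values[:-1]
    return advantages, returns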
def test_train_pong_1_episode():
    # Smoke test: a single rendered Pong episode should run end to end.
    env = gym.make('PongDeterministic-v4')
    run_one_episode = partial(pong.run_one_episode, env, render=True)
    policy_network = pong.PolicyNetwork()
    _policy, _scores = ppo.train(run_one_episode, policy_network, n_episodes=1)
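# pong.PolicyNetwork consumes PongDeterministic-v4 frames (210x160x3 uint8). The repo's own
# preprocessing lives in pong.py; the function below is only the widely used
# crop/downsample/binarize recipe for Pong, shown for context rather than as this project's code.
import numpy as np

def preprocess_pong_frame(frame):
    """210x160x3 uint8 frame -> 80x80 float array with the background zeroed out."""
    img = frame[35:195:2, ::2, 0].astype(np.float32)  # crop scoreboard, downsample, one channel
    img[img == 144] = 0   # erase background colour 1
    img[img == 109] = 0   # erase background colour 2
    img[img != 0] = 1     # paddles and ball become 1
    return img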
#demo settings
show_demo = False  #whether or not to make a video of the current progress
max_steps_in_demo_episode = 200  #number of steps to show in the demo episode

#environment parameters
starting_floor = 0
total_floors = 1
worker_id = 1
env = create_env(starting_floor, total_floors, worker_id)
policy_actions = unpickle_object('action_map')  #map going from policy actions to env actions
override_threshold = 2000  #score used to determine if the agent is stuck

#deep learning setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ActorCritic(len(policy_actions)).to(device)
#model = unpickle_object('policy_model')
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

########################################################################

"""
Train the model.
"""

experiment = Experiment(api_key="47QJ41M89a6zNXZgS9sY6NQfI",
                        project_name="unity", workspace="wbarich")

model = train(env, model, gamma, max_epochs, batch_size, epochs_before_printing,
              mini_batch_size, ppo_epochs, policy_actions, device, optimizer,
              max_steps_in_demo_episode, show_demo, override_threshold, experiment)

########################################################################
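# ppo_epochs and mini_batch_size in the train(...) call above control how the collected rollout
# is reused: PPO typically makes several passes (epochs) over the same batch of experience in
# shuffled mini-batches. A generic sketch of that loop is below, assuming `batch` is a dict of
# equally sized tensors and `update_fn` applies one gradient step; this is the usual pattern,
# not this repo's exact training loop.
import torch

def ppo_minibatch_passes(batch, update_fn, ppo_epochs, mini_batch_size):
    n = len(next(iter(batch.values())))
    for _ in range(ppo_epochs):
        for idx in torch.randperm(n).split(mini_batch_size):
            update_fn({key: value[idx] for key, value in batch.items()})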
import ppo

if __name__ == "__main__":
    train = True
    name = "LunarLander-v2"
    actor_size = 32
    critic_size = 256

    if train:
        env = ppo.train(
            environment_name=name,
            gamma=0.99,
            clip_ratio=0.1,
            pi_lr=2e-3,
            vf_lr=2e-3,
            k_epochs=4,
            update_every_j_timestep=32,
            max_episode_length=2_000,
            max_steps=500,
            critic_hidden_size=critic_size,
            actor_hidden_size=actor_size,
            render=False,
            random_seed=1,
            solved_reward=249,
            observationNormalization=False,
            actorGradientNormalization=0,
            normalizeAdvantage=False,
            initialization="orthogonal",  # "normal" = normal distribution, or None
            advantageAlgorithm="GAE",  # None = use the A2C reward calculation, or "GAE"
            # pathForBasePolicyToTrain=f"./model/ppo_{name}_policy_latest.pth",
            # pathForBaseCriticToTrain=f"./model/ppo_{name}_critic_latest.pth",
            coeficient_entropy=0.01,
        )
import ppo

if __name__ == "__main__":
    train = True
    observationNormalization = False

    if train:
        env = ppo.train(
            environment_name="CartPole-v0",
            solved_reward=200,
            gamma=0.99,
            clip_ratio=0.25,
            pi_lr=0.000_3,
            vf_lr=0.000_3,
            k_epochs=4,
            update_every_j_timestep=100,
            max_episode_length=500,
            max_steps=210,
            critic_hidden_size=200,
            actor_hidden_size=16,
            render=False,
            random_seed=1,
            observationNormalization=observationNormalization,
            actorGradientNormalization=5,
            advantageAlgorithm="GAE",
        )

    ppo.play_latest("CartPole-v0", 16, plot=False,
                    observationNormalization=observationNormalization)
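# clip_ratio above is the epsilon in PPO's clipped surrogate objective. A minimal sketch of that
# loss follows, assuming log-probabilities under the new and old policy plus already-computed
# advantages as 1-D tensors. The repo's ppo.train implements its own version of this, so treat
# the code purely as an illustration of
# L = -E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)] with r = exp(logp_new - logp_old).
import torch

def clipped_surrogate_loss(logp_new, logp_old, advantages, clip_ratio=0.25):
    ratio = torch.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * advantages
    return -torch.min(unclipped, clipped).mean()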